Python爬虫爬企查查数据
因为制作B2B网站,需要将企业信息数据入库,所以把目光锁定在企查查的数据上。废话不多说,开干!
# -*- coding: utf-8 -*-
"""Scrape company search results from qichacha.com into an .xls workbook.

Run as a script: it asks for a search keyword, the number of result pages
to fetch and a delay between requests, collects one record per result row
and writes everything to a timestamped Excel file.
"""
import requests
import lxml
import sys
from bs4 import BeautifulSoup
import xlwt
import time
import urllib
import urllib.parse  # ``import urllib`` alone does not guarantee the submodule is loaded

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Result columns, filled by craw(); index i across all lists is one company.
g_name_list = []    # company name
g_tag_list = []     # company tags
r_name_list = []    # legal representative
g_money_list = []   # registered capital
g_date_list = []    # founding date
r_email_list = []   # e-mail
r_phone_list = []   # phone
g_addr_list = []    # address
g_state_list = []   # company status


def _strip_label(text, label):
    """Return *text* without a leading field label such as ``'地址:'``.

    ``str.strip(label)`` treats its argument as a *set of characters* and
    trims both ends, which can eat real characters of the value.  This
    removes the label only as a prefix; the colon after the label name may
    be ASCII or full-width.
    """
    colons = (':', '\uff1a')
    text = text.strip()
    name = label.rstrip(''.join(colons))
    if name and text.startswith(name):
        text = text[len(name):].lstrip()
        if text.startswith(colons):
            text = text[1:]
    return text.strip()


def _parse_row(row):
    """Extract the nine output fields from one result ``<tr>``.

    Raises IndexError or AttributeError when the row does not have the
    expected structure.
    """
    cells = row.select('td')
    info = cells[2]
    paragraphs = info.select('p')
    spans = paragraphs[0].select('span')
    return (
        info.select('.ma_h1')[0].text,                                      # company name
        info.select('.search-tags')[0].text,                                # company tags
        paragraphs[0].a.text,                                               # legal representative
        _strip_label(spans[0].text, '注册资本:'),                            # registered capital
        _strip_label(spans[1].text, '成立日期:'),                            # founding date
        _strip_label(paragraphs[1].text.split('\n')[1], '邮箱:'),            # e-mail
        _strip_label(paragraphs[1].select('.m-l')[0].text, '电话:'),         # phone
        _strip_label(paragraphs[2].text, '地址:'),                           # address
        cells[3].select('.nstatus.text-success-lt.m-l-xs')[0].text.strip(),  # status
    )


def craw(url, key_word, x):
    """Fetch one search-result page and append its rows to the global lists.

    Args:
        url: Full URL of the result page to request.
        key_word: URL-quoted search keyword; only used to build the Referer.
        x: Page number (kept for interface compatibility; unused).

    Returns:
        None. Parsed values are appended to the module-level ``*_list``
        variables. Failures are reported on stdout and never raised.
    """
    referer = r'https://www.qichacha.com/search?key=' + key_word
    headers = {
        'Host': 'www.qichacha.com',
        'Connection': 'keep-alive',
        'Accept': r'text/html, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': referer,
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': r'xxxxxxxxx这里换成你的cookiexxxxxxxx这里换成你的cookiexxxxxxxxx这里换成你的cookiexxxxxxx',
    }

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('请求都不让,这企查查是想逆天吗???')
        return

    if response.status_code != 200:
        print(response.status_code)
        print('ERROR')
        return

    # Decode as UTF-8 on the success path (the original only set this on errors).
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    tables = soup.find_all(class_='m_srchList')
    if not tables or tables[0].tbody is None:
        # No result table in the page: typically a login/anti-bot page.
        print('好像被拒绝访问了呢...请稍后再试叭...')
        return

    rows = tables[0].tbody.select('tr')
    print('开始爬取数据,请勿打开excel')
    columns = (
        g_name_list, g_tag_list, r_name_list, g_money_list, g_date_list,
        r_email_list, r_phone_list, g_addr_list, g_state_list,
    )
    for row in rows:
        try:
            record = _parse_row(row)
        except (IndexError, AttributeError):
            # Skip a malformed row instead of dropping the rest of the page.
            print('错误!')
            continue
        # Append only after the whole row parsed so the lists stay aligned.
        for column, value in zip(columns, record):
            column.append(value)


def main():
    """Prompt for search parameters, crawl the pages and save the workbook."""
    key_word = input('请输入您想搜索的关键词:')
    pages = int(input('请输入您想检索的次数:'))
    sleep_time = int(input('请输入每次检索延时的秒数:'))
    key_word = urllib.parse.quote(key_word)

    print('正在搜索,请稍后')
    for x in range(1, pages + 1):
        url = r'https://www.qichacha.com/search_index?key={}&ajaxflag=1&p={}&'.format(key_word, x)
        craw(url, key_word, x)
        time.sleep(sleep_time)

    workbook = xlwt.Workbook()
    # cell_overwrite_ok lets a cell be written more than once without raising.
    sheet1 = workbook.add_sheet('企查查数据', cell_overwrite_ok=True)

    # One shared cell style: FangSong font, not bold.
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = '仿宋'
    style.font = font

    print('正在存储数据,请勿打开excel')
    name_list = ['公司名字', '公司标签', '法定法人', '注册资本', '成立日期', '法人邮箱', '法人电话', '公司地址', '公司状态']
    for col, title in enumerate(name_list):
        sheet1.write(0, col, title, style)

    records = zip(
        g_name_list, g_tag_list, r_name_list, g_money_list, g_date_list,
        r_email_list, r_phone_list, g_addr_list, g_state_list,
    )
    for row_idx, record in enumerate(records, start=1):
        print(record[0])
        for col, value in enumerate(record):
            sheet1.write(row_idx, col, value, style)

    # Timestamped file name, so an earlier export is not overwritten.
    file_name = 'wyy-qcc-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + '.xls'
    try:
        workbook.save('D:\\' + file_name)
    except OSError:
        # Drive D: missing or not writable: keep the data in the working directory.
        workbook.save(file_name)
        print('无法写入D盘,已保存到当前目录:' + file_name)
    print('保存完毕~')


if __name__ == '__main__':
    main()
千行代码,Bug何处藏。 纵使上线又怎样,朝令改,夕断肠。