# 爬取前程无忧信息,并保存到数据库中 (Scrape 51job job listings and save them to a database.)
"""Scrape job postings from 51job search result pages.

Each result page is downloaded with urllib, the job records embedded in the
page are extracted with a regular expression, and the collected records can
be written to an Excel workbook (saveExcel) or a MySQL table (saveMysql).
"""
import re
import ssl
import urllib.parse
import urllib.request

# The two output back-ends are third-party packages. They are imported
# defensively so that fetching and parsing still work when a back-end that is
# not going to be used is not installed; the save functions raise ImportError
# themselves if their package is missing.
try:
    import xlwt
except ImportError:
    xlwt = None
try:
    import pymysql
except ImportError:
    pymysql = None

# WARNING: this disables TLS certificate verification for every HTTPS request
# urllib makes in this process. Kept because the original script relies on it.
ssl._create_default_https_context = ssl._create_unverified_context

REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4503.5 Safari/537.36",
    'Connection': 'keep-alive',
}

# Seconds to wait on the server before giving up on one page.
REQUEST_TIMEOUT = 30

# %s is the (percent-encoded) search keyword, %d the 1-based page number.
SEARCH_URL_TEMPLATE = (
    "https://search.51job.com/list/000000,000000,0000,00,9,99,%s,2,%d.html"
    "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
    "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
)

# Number of result pages requested by main().
PAGE_COUNT = 201

# One match per job. Capture groups, in order:
# 0 job detail URL, 1 job name, 2 company URL, 3 company name, 4 salary,
# 5 work area, 6 update date, 7 company type, 8 degree, 9 attribute list,
# 10 company size, 11 industry.
JOB_PATTERN = re.compile(
    r'"job_href":"(.+?)","job_name":"(.+?)".+?'
    r'"company_href":"(.+?)","company_name":"(.+?)",'
    r'"providesalary_text":"(.*?)".+?'
    r'"workarea_text":"(.*?)","updatedate":"(.*?)".*?'
    r'"companytype_text":"(.*?)","degreefrom":"(.*?)".*?'
    r'"attribute_text":(.*?),"companysize_text":"(.*?)",.*?,'
    r'"companyind_text":"(.*?)".*?'
)

# Spreadsheet header; saveExcel writes the columns in this order.
EXCEL_HEADER = ["公司的名字", "公司的网址", "公司类型", "公司规模", "行业", "工作地点",
                "岗位名字", "待遇", "岗位详情", "发布时间", "学历", "招聘要求"]

# For each output column, the index of the JOB_PATTERN group that fills it.
FIELD_ORDER = (3, 2, 7, 10, 11, 5, 1, 4, 0, 6, 8, 9)

# Placeholders are filled in by the driver, so quotes in scraped text can
# neither break the statement nor inject SQL.
INSERT_SQL = (
    "insert into sjfx(name,wz,leix,gm,hy,gzdd,gwmz,dy,gwxq,fbsj,xl,zpyq) "
    "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def buildSearchUrl(name, j):
    """Return the search URL for keyword *name* and zero-based page index *j*.

    The keyword is percent-encoded so non-ASCII text and spaces produce a
    valid URL. '%' is left untouched so a keyword that the caller already
    percent-encoded is passed through unchanged.
    """
    keyword = urllib.parse.quote(name, safe="%")
    return SEARCH_URL_TEMPLATE % (keyword, j + 1)


def getContent(name: str, j: int) -> str:
    """Download one search result page and return its text.

    Args:
        name: Search keyword.
        j: Zero-based page index; page ``j + 1`` is requested.

    Returns:
        The response body decoded as GBK.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    request = urllib.request.Request(buildSearchUrl(name, j),
                                     headers=REQUEST_HEADERS)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        raw = response.read()
    # Decode explicitly as GBK so Chinese text is not garbled.
    return raw.decode("GBK")


def getItem(content: str) -> list:
    """Extract job records from the text of a search result page.

    Returns:
        A list with one 12-tuple of strings per job, in JOB_PATTERN group
        order; empty when nothing matches.
    """
    return JOB_PATTERN.findall(content)


def orderFields(job):
    """Reorder one getItem() record into the output column order."""
    return tuple(job[index] for index in FIELD_ORDER)


def saveExcel(list):
    """Write job records to the workbook ``51job2.xls``.

    Args:
        list: Iterable of records as returned by getItem(). (The parameter
            name shadows the builtin; it is kept for caller compatibility.)

    Raises:
        ImportError: If the xlwt package is not installed.
    """
    if xlwt is None:
        raise ImportError("saveExcel requires the 'xlwt' package")
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("数据分析50")
    # Header row first, then one spreadsheet row per job.
    for column, title in enumerate(EXCEL_HEADER):
        sheet.write(0, column, title)
    for row_number, job in enumerate(list, start=1):
        for column, value in enumerate(orderFields(job)):
            sheet.write(row_number, column, value)
    workbook.save("51job2.xls")


def saveMysql(list):
    """Insert job records into the ``sjfx`` table of the local MySQL server.

    Values are inserted in the same column order saveExcel uses. The sjfx
    column list lines up one-to-one with EXCEL_HEADER (name, wz, leix, ...);
    NOTE(review): confirm against the actual table schema.

    Args:
        list: Iterable of records as returned by getItem(). (The parameter
            name shadows the builtin; it is kept for caller compatibility.)

    Raises:
        ImportError: If the pymysql package is not installed.
    """
    if pymysql is None:
        raise ImportError("saveMysql requires the 'pymysql' package")
    conn = pymysql.connect(host="localhost", user="root", password="123",
                           database="xmmysql", charset="utf8")
    try:
        with conn.cursor() as cursor:
            for job in list:
                cursor.execute(INSERT_SQL, orderFields(job))
                # Commit per row, as the original did, so rows stored before
                # a failure are kept.
                conn.commit()
    finally:
        # Always release the connection, even when an insert fails.
        conn.close()


def main():
    """Prompt for a keyword, scrape PAGE_COUNT pages and return the records."""
    name = input("请输入您想要搜索的行业")
    jobs = []
    for j in range(PAGE_COUNT):
        print("正在为您查询第%s页数据,请不要进行任何操作或退出程序。" % (j + 1))
        jobs.extend(getItem(getContent(name, j)))
    # NOTE: both outputs were disabled in the original script and remain so;
    # uncomment the one you need.
    # saveExcel(jobs)
    # saveMysql(jobs)
    return jobs


if __name__ == "__main__":
    main()