小小小虎

爬取前程无忧信息,并保存到数据库中

import urllib.request ##请求
import ssl
import re
import xlwt
import pymysql


ssl._create_default_https_context = ssl._create_unverified_context

##去爬取数据,返回的是HTML页面的内容
def getContent(name,j):
    """Download one 51job search-result page and return its text.

    Args:
        name: Search keyword placed in the URL path. It is percent-encoded
            before use, so non-ASCII keywords (e.g. Chinese), spaces and
            characters such as ``#`` no longer produce an invalid or
            mis-parsed URL. ``%`` is left untouched, so a keyword that is
            already percent-encoded yields the same URL as before.
        j: Zero-based page index; the site is asked for page ``j + 1``.

    Returns:
        The response body decoded as GBK.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    ## Imported here so the block does not rely on a file-level import change.
    from urllib.parse import quote

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4503.5 Safari/537.36",
        'Connection': 'keep-alive'
    }
    page_number = j + 1
    keyword = quote(str(name), safe="%")
    url = "https://search.51job.com/list/000000,000000,0000,00,9,99,%s,2,%d.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="%(keyword,page_number)

    ## Request object (URL + headers)
    req = urllib.request.Request(url, headers=headers)

    ## Read the body inside a context manager so the connection is always
    ## released, even if reading fails.
    with urllib.request.urlopen(req) as response:
        raw = response.read()

    ## The body is decoded as GBK so Chinese text is not garbled.
    return raw.decode("GBK")

##使用正则找出 页面中 工作 相关的信息
def getItem(content):
    """Extract every job record found in a search-result page.

    Args:
        content: Page text as returned by getContent.

    Returns:
        A list of 12-tuples of strings, one per matched record, in this
        order: job_href, job_name, company_href, company_name,
        providesalary_text, workarea_text, updatedate, companytype_text,
        degreefrom, attribute_text (captured as raw text, not unquoted),
        companysize_text, companyind_text. Empty list when nothing matches.
    """
    ## One piece per captured field; adjacent literals concatenate into a
    ## single pattern.
    job_pattern = re.compile(
        r'"job_href":"(.+?)",'
        r'"job_name":"(.+?)".+?'
        r'"company_href":"(.+?)",'
        r'"company_name":"(.+?)",'
        r'"providesalary_text":"(.*?)".+?'
        r'"workarea_text":"(.*?)",'
        r'"updatedate":"(.*?)".*?'
        r'"companytype_text":"(.*?)",'
        r'"degreefrom":"(.*?)".*?'
        r'"attribute_text":(.*?),'
        r'"companysize_text":"(.*?)",.*?,'
        r'"companyind_text":"(.*?)".*?'
    )
    return job_pattern.findall(content)

##将找出的信息存储在Excel表格中
def saveExcel(list):
    """Write the scraped job records to an Excel workbook.

    A sheet named "数据分析50" gets a header row followed by one row per
    record, and the workbook is saved to the relative path ``51job2.xls``.

    Args:
        list: Iterable of records indexable from 0 to 11, in the order
            produced by getItem (0 job link, 1 job name, 2 company link,
            3 company name, 4 salary, 5 work area, 6 update date,
            7 company type, 8 degree, 9 requirements, 10 company size,
            11 industry).
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("数据分析50")

    ## Header row
    titles = ("公司的名字", "公司的网址", "公司类型", "公司规模", "行业", "工作地点",
              "岗位名字", "待遇", "岗位详情", "发布时间", "学历", "招聘要求")
    for col, title in enumerate(titles):
        sheet.write(0, col, title)

    ## source_index[col] is the record field shown in spreadsheet column col.
    source_index = (3, 2, 7, 10, 11, 5, 1, 4, 0, 6, 8, 9)
    for row, record in enumerate(list, start=1):
        for col, src in enumerate(source_index):
            sheet.write(row, col, record[src])

    ## Save
    workbook.save("51job2.xls")

## Accumulates the records extracted from every results page.
list=[]
name = input("请输入您想要搜索的行业")
## NOTE(review): this requests pages 1 through 201 — confirm the intended page count.
for j in range(201):
    page_no = j + 1
    print("正在为您查询第%s页数据,请不要进行任何操作或退出程序。"%page_no)
    ## Download page j (zero-based) and append every record found on it.
    list.extend(getItem(getContent(name,j)))

def saveMysql(list):
    """Insert the scraped job records into the ``sjfx`` MySQL table.

    Values are passed to the driver as query parameters instead of being
    formatted into the SQL text, so quotes or backslashes in scraped text
    cannot break or alter the statement.

    Args:
        list: Iterable of records indexable from 0 to 11, in the order
            produced by getItem (0 job link, 1 job name, 2 company link,
            3 company name, 4 salary, 5 work area, 6 update date,
            7 company type, 8 degree, 9 requirements, 10 company size,
            11 industry).

    Raises:
        pymysql.MySQLError: If connecting or inserting fails; nothing is
            committed in that case.
    """
    ## The column list (name, wz, leix, gm, hy, gzdd, gwmz, dy, gwxq, fbsj,
    ## xl, zpyq) follows the Excel header order used by saveExcel, not the
    ## raw record order, so the values are reordered with the same mapping.
    ## Previously record[0] (the job link) was stored in `name`, and so on.
    field_order = (3, 2, 7, 10, 11, 5, 1, 4, 0, 6, 8, 9)
    rows = [tuple(record[k] for k in field_order) for record in list]

    sql = ("insert into sjfx(name,wz,leix,gm,hy,gzdd,gwmz,dy,gwxq,fbsj,xl,zpyq) "
           "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")

    ## NOTE(review): credentials are hard-coded; consider loading them from
    ## configuration or the environment.
    conn = pymysql.connect(host="localhost",
                           user="root",
                           password="123",
                           database="xmmysql",
                           charset="utf8")
    try:
        ## The cursor is closed automatically when the with-block exits.
        with conn.cursor() as cursor:
            if rows:
                cursor.executemany(sql, rows)
        conn.commit()
    finally:
        ## Always release the connection, even when an insert fails.
        conn.close()

# saveExcel(list)
# saveMysql(list)

 

posted on 2021-06-04 23:49  小小小虎  阅读(105)  评论(0)  编辑  收藏  举报

导航