# 爬取智联招聘 — scrape Zhaopin (zhaopin.com) job listings

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import json


class ZhiLianSpider(object):
    """Scraper for Zhaopin (zhaopin.com) job search results.

    Fetches one or more search-result pages for a given location/keyword,
    extracts the job listings from the HTML, and dumps all records as a
    JSON string to ``zhilian.txt``.
    """

    url = "https://sou.zhaopin.com/?"  # search endpoint; query string appended per page

    def __init__(self, jl, kw, start_page, end_page):
        """
        :param jl: work location (city name; may contain Chinese characters)
        :param kw: job search keyword
        :param start_page: first result page to fetch (inclusive)
        :param end_page: last result page to fetch (inclusive)
        """
        self.jl = jl
        self.kw = kw
        self.start_page = start_page
        self.end_page = end_page
        self.items = []  # accumulates one dict per job posting across all pages

    def parse_content(self, content):
        """Parse one search-result HTML page and append job records to ``self.items``."""
        soup = BeautifulSoup(content, 'html.parser')
        # The first table under #listContent is the header row; real listings follow.
        table_list = soup.select('#listContent > table')[1:]
        for table in table_list:
            # Guard against page-layout drift: skip a row whose expected
            # fields are missing instead of crashing with IndexError.
            try:
                item = {
                    '职位名称': table.select('.zwmc > div > a')[0].text,
                    '公司名称': table.select('.gsmc > a')[0].text,
                    '职位月薪': table.select('.zwyx')[0].text,
                    '工作地点': table.select('.gzdd')[0].text,
                    '更新时间': table.select('.gxsj > span')[0].text,
                }
            except IndexError:
                continue
            self.items.append(item)

    def run(self):
        """Fetch every page in [start_page, end_page], then write results to zhilian.txt."""
        for page in range(self.start_page, self.end_page + 1):
            request = self.handler_request(page)  # build the Request for this page
            content = urllib.request.urlopen(request).read().decode()  # GET the HTML
            self.parse_content(content)
        # ensure_ascii=False keeps the Chinese text human-readable in the file.
        string_items = json.dumps(self.items, ensure_ascii=False)
        with open("zhilian.txt", "w", encoding="utf-8") as f:
            f.write(string_items)

    def handler_request(self, page):
        """Build a GET ``urllib.request.Request`` for one result page.

        The query parameters contain Chinese characters, so they are
        percent-encoded with ``urlencode``.
        """
        data = {
            'jl': self.jl,
            'kw': self.kw,
            'p': page,
        }
        get_url = ZhiLianSpider.url + urllib.parse.urlencode(data)
        # BUG FIX: the original split the UA value with a backslash line
        # continuation, which embedded a long run of spaces inside the header
        # ("Apple<spaces>WebKit"). Use implicit string concatenation instead.
        headers = {
            "user-agent": ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/75.0.3770.100 Safari/537.36")
        }
        return urllib.request.Request(url=get_url, headers=headers)


def main():
    """Entry point: read search parameters from stdin, then run the scraper."""
    location = input("请输入工作地点:")
    keyword = input("请输入工作关键词:")
    first_page = int(input("请输入查询起始页面:"))
    last_page = int(input("查询结束页面:"))

    # Build the spider and kick off the crawl in one go.
    ZhiLianSpider(location, keyword, first_page, last_page).run()


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

posted @ 2019-08-15 19:30  飞蝎儿  阅读(667)  评论(0编辑  收藏  举报