# 爬取智联招聘 — scrape job listings from Zhaopin (sou.zhaopin.com)
"""Scrape job listings from Zhaopin search pages and save them as JSON.

The spider requests every page in a user-supplied page range, extracts the
job rows from the returned HTML and writes all collected rows to
``zhilian.txt``.
"""
import json
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup


class ZhiLianSpider(object):
    """Fetch and parse job search result pages for one location/keyword pair.

    Attributes:
        jl: Work location, sent as the ``jl`` query parameter.
        kw: Search keyword, sent as the ``kw`` query parameter.
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive).
        items: List of dicts, one per job row collected so far.
    """

    url = "https://sou.zhaopin.com/?"

    # (output key, CSS selector relative to one result table) for every field
    # stored per job row. The keys are written to the output file verbatim.
    _FIELDS = (
        ('职位名称', '.zwmc > div > a'),
        ('公司名称', '.gsmc > a'),
        ('职位月薪', '.zwyx'),
        ('工作地点', '.gzdd'),
        ('更新时间', '.gxsj > span'),
    )

    def __init__(self, jl, kw, start_page, end_page):
        """Store the search parameters and start with an empty result list."""
        self.jl = jl
        self.kw = kw
        self.start_page = start_page
        self.end_page = end_page
        self.items = []  # accumulates every job row parsed by parse_content

    def _extract_item(self, table):
        """Return the field dict for one result table, or None if incomplete.

        A row is incomplete when any selector in ``_FIELDS`` matches nothing.
        """
        item = {}
        for key, selector in self._FIELDS:
            matches = table.select(selector)
            if not matches:
                return None
            item[key] = matches[0].text
        return item

    def parse_content(self, content):
        """Parse one HTML page and append its job rows to ``self.items``.

        Args:
            content: HTML text of a search result page.

        Rows that lack any of the expected fields are skipped so that a
        single malformed row does not abort the whole crawl.
        """
        soup = BeautifulSoup(content, 'html.parser')
        # The first table under #listContent is skipped (presumably the
        # header row -- confirm against the actual page layout).
        for table in soup.select('#listContent > table')[1:]:
            item = self._extract_item(table)
            if item is not None:
                self.items.append(item)

    def run(self):
        """Fetch every page in the range, parse it, then write the results.

        All collected rows are serialized as one JSON array into
        ``zhilian.txt`` (UTF-8). ``ensure_ascii=False`` keeps Chinese text
        readable in the file instead of ``\\uXXXX`` escapes.
        """
        for page in range(self.start_page, self.end_page + 1):
            request = self.handler_request(page)
            # Close the HTTP response deterministically once it is read.
            with urllib.request.urlopen(request) as response:
                content = response.read().decode()
            self.parse_content(content)

        with open("zhilian.txt", "w", encoding="utf-8") as f:
            json.dump(self.items, f, ensure_ascii=False)

    def handler_request(self, page):
        """Build the GET request object for one result page.

        Args:
            page: Page number, sent as the ``p`` query parameter.

        Returns:
            A ``urllib.request.Request`` with the query string and a browser
            ``user-agent`` header set.
        """
        data = {
            'jl': self.jl,
            'kw': self.kw,
            'p': page,
        }
        # The parameters may contain Chinese text, so they are URL-encoded.
        get_url = ZhiLianSpider.url + urllib.parse.urlencode(data)
        headers = {
            # Built with implicit concatenation: a backslash continuation
            # inside the literal would inject stray characters into the
            # "AppleWebKit" token.
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/75.0.3770.100 Safari/537.36"
            ),
        }
        return urllib.request.Request(url=get_url, headers=headers)


def main():
    """Prompt for the search parameters and run the spider."""
    jl = input("请输入工作地点:")
    kw = input("请输入工作关键词:")
    start_page = int(input("请输入查询起始页面:"))
    end_page = int(input("查询结束页面:"))
    # Create the spider and start crawling.
    spider = ZhiLianSpider(jl, kw, start_page, end_page)
    spider.run()


if __name__ == '__main__':
    main()