Example: a crawler using BeautifulSoup4

Using the BeautifulSoup4 parser, extract the names of the recruiting companies from a job-listing page and save them to a file. The other fields on the page can be scraped in the same way.

# -*- coding:utf-8 -*-
# Python 2 example: urllib2 handles the HTTP request

from bs4 import BeautifulSoup
import urllib2
import json  # results are stored in JSON format


def csdn():
    url = 'https://job.csdn.net/search/index'
    # Spoof a browser User-Agent so the request is not rejected
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    resHtml = response.read()
    output = open('csdnJob.json', 'w')
    # Parse the response with the built-in html.parser, decoding as UTF-8
    soup = BeautifulSoup(resHtml, 'html.parser', from_encoding='utf-8')
    # CSS selector: each job posting sits in a div with class "position_list clearfix"
    result = soup.select('div[class="position_list clearfix"]')
    print(result)
    items = []
    for site in result:
        item = {}
        print(site)
        # The company name is in an <a class="enterprise_name"> tag inside each block
        name = site.select('a[class="enterprise_name"]')[0].get_text()
        item['name'] = name
        items.append(item)

    # ensure_ascii=False keeps Chinese text as UTF-8 instead of \u escapes
    line = json.dumps(items, ensure_ascii=False)

    output.write(line.encode('utf-8'))
    output.close()


if __name__ == "__main__":
    csdn()
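
The listing above is Python 2 code (urllib2 does not exist in Python 3). As a minimal sketch, assuming the page structure and the selectors used above stay the same, a Python 3 equivalent would use urllib.request for the request and write the JSON file with an explicit encoding; any additional fields would just be further select()/get_text() calls on each block.

# -*- coding:utf-8 -*-
# Python 3 sketch of the same crawler (assumes the page structure is unchanged)

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import json


def csdn():
    url = 'https://job.csdn.net/search/index'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    request = Request(url, headers=headers)
    resHtml = urlopen(request).read()
    soup = BeautifulSoup(resHtml, 'html.parser', from_encoding='utf-8')

    items = []
    for site in soup.select('div[class="position_list clearfix"]'):
        # Same selectors as the Python 2 version; other fields follow the same pattern
        name = site.select('a[class="enterprise_name"]')[0].get_text()
        items.append({'name': name})

    # json.dump writes directly to the file; ensure_ascii=False keeps UTF-8 readable
    with open('csdnJob.json', 'w', encoding='utf-8') as output:
        json.dump(items, output, ensure_ascii=False)


if __name__ == "__main__":
    csdn()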

Result: running the script writes the scraped company names to csdnJob.json as a JSON array of {"name": ...} objects.

