Python crawler summary (Scrapy)
Install Scrapy (it pulls in quite a few dependency libraries, so installation can be a bit fiddly)
Reference: https://www.cnblogs.com/liuliliuli2017/p/6746440.html
Scrapy documentation (Chinese): http://scrapy-chs.readthedocs.io/zh_CN/0.24/index.html
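A minimal install via pip (a sketch; on Windows some compiled dependencies such as Twisted and lxml may need to be installed separately):
pip install scrapy
scrapy version   # verify the installation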
View Scrapy's basic information and available commands
scrapy
Test crawler performance
scrapy bench
Fetch a web page (Baidu homepage as an example)
scrapy fetch "http://www.baidu.com"
Interactive shell environment, usable from the command line (Baidu as an example)
scrapy shell "http://www.baidu.com" print response.body # 打印响应主体
Create a project (ITcast as an example)
scrapy startproject ITcast
In settings.py, disable ROBOTSTXT_OBEY (do not follow the robots.txt protocol)
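In other words, a one-line change in settings.py (either comment the setting out or set it to False):
# settings.py
ROBOTSTXT_OBEY = False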
Generate a spider file
# scrapy genspider <spider_name> <domain>
scrapy genspider itcast "itcast.cn"
Item fields (items.py)
import scrapy


class ItcastItem(scrapy.Item):
    # define the fields for your item here like:
    # teacher name
    name = scrapy.Field()
    # teacher title
    title = scrapy.Field()
    # teacher info
    info = scrapy.Field()
Write the spider file (itcast.py)
# -*- coding: utf-8 -*-
import scrapy
from ITcast.items import ItcastItem


class ItcastSpider(scrapy.Spider):
    # spider name (required)
    name = 'itcast'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        node_list = response.xpath("//div[@class='li_txt']")
        # store all the item objects
        items = []
        for node in node_list:
            item = ItcastItem()
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            items.append(item)
        return items
Check that the spider has no errors
scrapy check itcast
Run the spider
scrapy crawl itcast
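To dump the scraped items to a file without writing a pipeline, Scrapy's built-in feed export can be used when running the spider (the output filename here is just an example):
scrapy crawl itcast -o teachers.json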
List the project's spiders
scrapy list
If you write multiple pipelines, each of them must be added to ITEM_PIPELINES in the settings file
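For example (a sketch; ItcastJsonPipeline and ItcastCsvPipeline are hypothetical class names, and the numbers are priorities, lower values run first):
# settings.py
ITEM_PIPELINES = {
    'ITcast.pipelines.ItcastJsonPipeline': 300,   # hypothetical pipeline, runs first
    'ITcast.pipelines.ItcastCsvPipeline': 400,    # hypothetical pipeline, runs second
}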
Example: Tencent recruitment (multi-page crawling)
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # position name
    positionName = scrapy.Field()
    # position detail link
    # positionLink = scrapy.Field()
    # position type
    # positionType = scrapy.Field()
    # number of openings
    # peopleNumber = scrapy.Field()
    # work location
    # workLocation = scrapy.Field()
    # publish time
    # publishTime = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class TencentPipeline(object):
    def __init__(self):
        self.f = open("tencent.json", "w")

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # self.f.write(item['positionName'] + "\n")
        self.f.write(content)
        return item

    def close_spider(self, spider):
        self.f.close()
Enable the pipeline in settings.py
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
}
tencent.py
# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem


class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['tencent.com']
    base_url = "http://hr.tencent.com/position.php?&start="
    offset = 0
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            item = TencentItem()
            item['positionName'] = node.xpath("./td[1]/a/text()").extract()[0]
            # item['positionLink'] = node.xpath("./td[1]/a/@href").extract()[0].encode("utf-8")
            # item['positionType'] = node.xpath("./td[2]/text()").extract()[0].encode("utf-8")
            # item['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0].encode("utf-8")
            # item['workLocation'] = node.xpath("./td[4]/text()").extract()[0].encode("utf-8")
            # item['publishTime'] = node.xpath("./td[5]/text()").extract()[0].encode("utf-8")
            yield item

        # alternative: paginate over a fixed offset range
        # if self.offset < 2620:
        #     self.offset += 10
        #     url = self.base_url + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)

        # follow the "next page" link until it becomes a javascript placeholder
        next_page = response.xpath("//*[@id='next']/@href").extract()[0]
        if not next_page.startswith("java"):
            yield scrapy.Request("http://hr.tencent.com/" + next_page, callback=self.parse)
Example: Douyu streamer image crawling (downloading images)
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DouyuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # streamer nickname
    nickname = scrapy.Field()
    # image URL
    imagelink = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Douyu.settings import IMAGES_STORE as image_store


class DouyuPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # queue a download request for each image link
        image_link = item['imagelink']
        yield scrapy.Request(image_link)

    def item_completed(self, results, item, info):
        # print(results)
        # rename the downloaded file to <nickname>.jpg
        image_path = [x['path'] for ok, x in results if ok]
        os.rename(image_store + image_path[0],
                  image_store + item['nickname'] + ".jpg")
        return item
Configure IMAGES_STORE and USER_AGENT in settings.py and enable the pipeline (same as in the Tencent recruitment example)
IMAGES_STORE = "E:/PythonScrapy/Douyu/Douyu/Images/"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Linux; U; Android 4.4.2; zh-CN; HUAWEI MT7-TL00 Build/HuaweiMT7-TL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.3.8.909 Mobile Safari/537.36'
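And, as in the Tencent example, register the pipeline in ITEM_PIPELINES (DouyuPipeline is the class defined in pipelines.py above):
ITEM_PIPELINES = {
    'Douyu.pipelines.DouyuPipeline': 300,
}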
douyu.py
# -*- coding: utf-8 -*-
import json

import scrapy

from Douyu.items import DouyuItem


class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    allowed_domains = ['douyucdn.cn']
    baseURL = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        # the API returns JSON; stop when the "data" list is empty
        data_list = json.loads(response.body.decode('gbk'))['data']
        if len(data_list) == 0:
            return
        # print(data_list)
        for data in data_list:
            item = DouyuItem()
            item['nickname'] = data['nickname']
            item['imagelink'] = data['vertical_src']
            yield item

        # request the next page (commented out)
        # self.offset += 20
        # yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)