python爬虫总结

安装Scrapy(有很多依赖库要装,略麻烦)

参考: https://www.cnblogs.com/liuliliuli2017/p/6746440.html

Scrapy中文文档: http://scrapy-chs.readthedocs.io/zh_CN/0.24/index.html

查看scrapy基本信息和功能

scrapy

  测试爬虫性能

scrapy bench 

爬取网页信息(以百度首页为例)

scrapy fetch "http://www.baidu.com"

shell环境,可以在cmd进行操作(以百度为例)

scrapy shell "http://www.baidu.com"
print(response.body) # 打印响应主体

创建项目(以ITcast为例)

scrapy startproject ITcast

settings.py屏蔽ROBOTSTXT_OBEY(不遵守机器人协议)

生成爬虫文件

# scrapy genspider example example_url
scrapy genspider itcast "http://www.itcast.cn"

items字段(items.py)

import scrapy


class ItcastItem(scrapy.Item):
    """Container for one teacher profile scraped from itcast.cn."""
    # define the fields for your item here like:

    # Teacher's name (from the <h3> of each teacher card)
    name = scrapy.Field()
    # Teacher's job title (from the <h4>)
    title = scrapy.Field()
    # Teacher's bio / description text (from the <p>)
    info = scrapy.Field()

编写爬虫文件(itcast.py)

# -*- coding: utf-8 -*-

import scrapy
from ITcast.items import ItcastItem

class ItcastSpider(scrapy.Spider):
    """Scrapes name/title/bio for every teacher card on itcast.cn's teacher page."""

    # Spider name used by `scrapy crawl itcast` (required).
    name = 'itcast'
    # FIX: allowed_domains takes bare domain names, not URLs.  The original
    # value 'http://www.itcast.cn' never matches a request's host, so
    # OffsiteMiddleware would filter out any scheduled follow-up request.
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        """Return a list with one ItcastItem per teacher card.

        :param response: the downloaded teacher-list page
        :return: list of populated ItcastItem objects
        """
        node_list = response.xpath("//div[@class='li_txt']")
        items = []
        for node in node_list:
            item = ItcastItem()
            # extract_first() yields None instead of raising IndexError
            # when a card is missing one of the three fields.
            item['name'] = node.xpath("./h3/text()").extract_first()
            item['title'] = node.xpath("./h4/text()").extract_first()
            item['info'] = node.xpath("./p/text()").extract_first()
            items.append(item)
        return items

检查爬虫是否无误

scrapy check itcast

运行爬虫

scrapy crawl itcast

查看爬虫

scrapy list

编写多个管道,则需要在settings文件中的ITEM_PIPELINES添加

例: 腾讯招聘(多页抓取)

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    """Container for one Tencent job posting (only the title is collected)."""
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Job title
    positionName = scrapy.Field()
    # The remaining fields are disabled in this tutorial; re-enable them
    # together with the matching XPath lines in the spider if needed.
    # Job detail link
    #positionLink = scrapy.Field()
    # Job category
    #positionType = scrapy.Field()
    # Number of openings
    #peopleNumber = scrapy.Field()
    # Work location
    #workLocation = scrapy.Field()
    # Publish date
    #publishTime = scrapy.Field()
View Code

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
class TencentPipeline(object):
    def __init__(self):
        self.f = open("tencent.json", "w")
    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) +"\n"
        #self.f.write(item['positionName'] + "\n")
        self.f.write(content)
        return item
    def close_spider(self, spider):
        self.f.close()
View Code

settings.py开启管道

# Register the pipeline; the number (0-1000) is the execution order when
# several pipelines are enabled — lower values run first.
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
}
View Code

tencent.py

# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem

class TencentSpider(scrapy.Spider):
    """Crawls hr.tencent.com job listings, following the pager until the end."""

    name = 'tencent'
    allowed_domains = ['tencent.com']
    base_url = "http://hr.tencent.com/position.php?&start="
    offset = 0

    start_urls = [base_url + str(offset)]

    def parse(self, response):
        """Yield one TencentItem per job row, then follow the 'next' link."""
        # Job rows alternate between the 'even' and 'odd' CSS classes.
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            item = TencentItem()
            # extract_first() avoids IndexError on a malformed row.
            item['positionName'] = node.xpath("./td[1]/a/text()").extract_first()
            yield item

        # Follow the pager.  On the last page the anchor's href becomes a
        # javascript: pseudo-link, which signals the end of the listing.
        # FIX: extract_first() instead of extract()[0] — the original raised
        # IndexError whenever the '#next' anchor was absent.
        next_page = response.xpath("//*[@id='next']/@href").extract_first()
        if next_page and not next_page.startswith("java"):
            # urljoin resolves the relative href against the current page URL.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
View Code

 

例: 斗鱼主播图片爬取(图片爬取)

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DouyuItem(scrapy.Item):
    """Container for one Douyu streamer: display name plus cover-image URL."""
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Streamer's display name (used as the saved image's filename)
    nickname = scrapy.Field()
    # URL of the streamer's vertical cover image
    imagelink = scrapy.Field()
View Code

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from Douyu.settings import IMAGES_STORE as image_store
from scrapy.pipelines.images import ImagesPipeline
import scrapy
class DouyuPipeline(ImagesPipeline):
    """Downloads each streamer's cover image and renames it to <nickname>.jpg."""

    def get_media_requests(self, item, info):
        # Schedule the image download; ImagesPipeline handles storage.
        image_link = item['imagelink']
        yield scrapy.Request(image_link)

    def item_completed(self, results, item, info):
        """Rename the downloaded file after the streamer and pass the item on.

        :param results: list of (success, file_info_or_failure) tuples
        """
        image_paths = [x['path'] for ok, x in results if ok]
        # FIX: guard against a failed download — the original indexed [0]
        # unconditionally and raised IndexError when no image succeeded.
        if image_paths:
            os.rename(image_store + image_paths[0],
                      image_store + item['nickname'] + ".jpg")
        # FIX: ImagesPipeline's contract requires item_completed to return
        # the item; the original returned None, so any later pipeline (and
        # feed export) received nothing.
        return item
View Code

settings.py配置IMAGES_STORE和USER_AGENT并开启管道(同腾讯招聘)

# Directory where ImagesPipeline stores downloaded images; DouyuPipeline
# also reads this to build the rename paths.
IMAGES_STORE = "E:/PythonScrapy/Douyu/Douyu/Images/"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Mobile browser UA — presumably required because capi.douyucdn.cn is the
# mobile API endpoint; confirm before changing.
USER_AGENT = 'Mozilla/5.0 (Linux; U; Android 4.4.2; zh-CN; HUAWEI MT7-TL00 Build/HuaweiMT7-TL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.3.8.909 Mobile Safari/537.36'
View Code

douyu.py

# -*- coding: utf-8 -*-
import scrapy
import json
from Douyu.items import DouyuItem
class DouyuSpider(scrapy.Spider):
    """Pages through Douyu's vertical-room API, yielding one item per streamer."""

    name = 'douyu'
    allowed_domains = ['douyucdn.cn']
    baseURL = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Parse one API page; stop when the 'data' array comes back empty."""
        # NOTE(review): the payload is decoded as GBK here — confirm the
        # endpoint really serves GBK; JSON APIs are normally UTF-8.
        rooms = json.loads(response.body.decode('gbk'))['data']

        # An empty data array marks the end of the listing.
        if len(rooms) == 0:
            return

        for room in rooms:
            entry = DouyuItem()
            entry['nickname'] = room['nickname']
            entry['imagelink'] = room['vertical_src']
            yield entry
View Code

 

posted @ 2018-02-21 21:59  wust_ouyangli  阅读(173)  评论(0编辑  收藏  举报