Scrapy Framework Basics

Create a Scrapy project
scrapy startproject <project_name>
scrapy genspider <spider_name> <start_url>
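For example, for the myscrapy project and the Chouti spider used throughout these notes:

scrapy startproject myscrapy
cd myscrapy
scrapy genspider chouti dig.chouti.com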


Run the Scrapy spider

scrapy crawl <spider_name>

Modify the parse method in chouti.py to print the response.
To run without log output:

scrapy crawl chouti --nolog


Flow diagram

Chouti: scrape the titles and their corresponding URL links
import scrapy
from scrapy.selector import Selector

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        # print(response.text)  # print the page content
        a_list = Selector(response=response).xpath('//div[@class="link-detail"]')
        for a in a_list:
            # //div[@class="link-detail"]/a
            title = a.xpath('.//a/text()').extract()[1]
            href = a.xpath('./a/@href').extract()[0]
            print(title)
            print(href)


Hash URLs with MD5 so they can be stored under fixed-length names
def md5(self,href):
    import hashlib
    obj = hashlib.md5()
    obj.update(bytes(href,encoding = 'utf-8'))
    return obj.hexdigest()
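A quick usage sketch (the digest depends on the URL but is always 32 hex characters, so it works as a fixed-length key or file name):

key = self.md5(href)    # href is a scraped URL
print(len(key))         # 32, regardless of how long the URL is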

To issue further page requests and keep visiting URLs recursively:
yield Request(url = href,callback = self.parse)
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
# import Request

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']
    all_urls = set()

    def parse(self, response):
        a_list = Selector(response=response).xpath('//div[@class="link-detail"]')
        # select every div whose class is link-detail
        for a in a_list:
            href = a.xpath('./a/@href').extract_first()
            # the href attribute of the a tag under the div
            text = a.xpath('./a/text()').extract_first()
            # the text of the a tag under the div
            # print(href, text)
            if href not in self.all_urls:
                # not visited yet
                self.all_urls.add(href)
                print(href)
                # Request(url=href) -> sent to the scheduler
                yield Request(url=href, callback=self.parse)

        print(self.all_urls)

    def md5(self,href):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(href,encoding = 'utf-8'))
        return obj.hexdigest()


Setting the crawl depth
Add to settings.py:

DEPTH_LIMIT = 2

Using pipelines to persist the scraped data
Enable the pipeline in settings.py:

ITEM_PIPELINES = {
   'myscrapy.pipelines.MyscrapyPipeline': 300,
   # enable the pipeline so items are saved locally
}
Define the item fields in items.py:

class MyscrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    href = scrapy.Field()
    title = scrapy.Field()
Modify pipelines.py:

class MyscrapyPipeline:
    def process_item(self, item, spider):
        contents = f"{item['href']}\n{item['title']}\n"
        with open('news.json', 'a', encoding='utf-8') as f:
            f.write(contents)
        return item   # pass the item on, in case another pipeline follows
Note: use item['href'], not item.href.
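Item objects are indexed like dicts rather than accessed by attribute. The ItemAdapter that the generated pipelines.py imports (shown later) offers the same dict-style access for items, plain dicts, and dataclasses; an optional sketch:

from itemadapter import ItemAdapter

def process_item(self, item, spider):
    adapter = ItemAdapter(item)      # wraps Item, dict, dataclass, ...
    contents = f"{adapter['href']}\n{adapter['title']}\n"
    # ...write contents somewhere...
    return item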
chouti.py code:
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
# import Request
from ..items import MyscrapyItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']
    all_urls = set()

    def parse(self, response):
        a_list = Selector(response=response).xpath('//div[@class="link-detail"]')
        # select every div whose class is link-detail
        for a in a_list:
            href = a.xpath('./a/@href').extract_first()
            # the href attribute of the a tag under the div
            title = a.xpath('./a/text()').extract_first()
            # the text of the a tag under the div
            # print(href, title)
            if href not in self.all_urls:
                # not visited yet
                self.all_urls.add(href)
                print(href)
                print(title)
                item_obj = MyscrapyItem(href=href, title=title)
                # create the item object; href and title are declared in items.py:
                '''
                    href = scrapy.Field()
                    title = scrapy.Field()
                '''

                yield item_obj

                # Request(url=href) -> sent to the scheduler
                # yield Request(url=href, callback=self.parse)

    def md5(self,href):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(href,encoding = 'utf-8'))
        return obj.hexdigest()
Notes:
    extract_first() returns a single string
    extract() returns a list of strings
    remember to yield item_obj, otherwise the pipeline never receives it
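A minimal illustration of the difference between the two methods (the markup is made up for the example):

from scrapy.selector import Selector

sel = Selector(text='<div><a href="/a">first</a><a href="/b">second</a></div>')
print(sel.xpath('//a/text()').extract())         # ['first', 'second']  -> a list
print(sel.xpath('//a/text()').extract_first())   # 'first'              -> a single string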
    
Run results

Contents of news.json


Crawling multiple sites
allowed_domains = ['site1', 'site2']

allowed_domains = ['dig.chouti.com']
About recursive crawling
Import Request:
from scrapy.http import Request

Then yield the new request:
yield Request(url = href, callback = self.parse)

Note: href is the URL to request; callback is the standard hook, and it is best not to rename parse.

The response argument of parse
response.meta   # request metadata (e.g. the crawl depth)

response.url    # the URL that was requested

response.text   # the decoded page content (str)

response.body   # the raw page content (bytes)
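A small sketch of reading these attributes inside parse (purely illustrative):

def parse(self, response):
    print(response.url)                 # e.g. 'http://dig.chouti.com/'
    print(response.meta.get('depth'))   # crawl depth recorded by the depth middleware
    print(len(response.body))           # size of the raw response in bytes
    print(response.text[:100])          # first 100 characters of the decoded HTML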

Match a tag whose attribute starts with a given value:

//tag[starts-with(@attr, "value")]

starts-with is covered in teacher Wu's blog notes (Q2Day81):
https://www.cnblogs.com/wupeiqi/articles/6229292.html

Filter with a regular expression:
//tag[re:test(@attr, "pattern")]

Match a tag whose attribute contains a given value:
//tag[contains(@attr, "value")]
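A sketch of using these selectors on a Scrapy response (the tag and class names are made up):

# all a tags whose href starts with https
response.xpath('//a[starts-with(@href, "https")]')

# all div tags whose class matches a regular expression (the re: extension works in Scrapy selectors)
response.xpath(r'//div[re:test(@class, "link-\d+")]')

# all div tags whose class contains "detail"
response.xpath('//div[contains(@class, "detail")]')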

Scrapy removes duplicate URLs automatically, so you do not need to check them yourself.
The RFPDupeFilter class does the checking:

from scrapy.dupefilters import RFPDupeFilter

You can also create your own .py file with a custom duplicate filter,
but it has to be registered in settings:
DUPEFILTER_CLASS = "project_name.module_name.ClassName"
It could, for example, check whether the record already exists in a database.
from_settings uses cls to create the filter object.
The flow of a custom duplicate filter (a minimal sketch follows this list):
    1. from_settings creates the object
    2. __init__ initializes the object
    3. open is called when crawling starts
    4. request_seen checks whether the request has already been visited
    5. close is called when crawling finishes
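A minimal sketch of such a filter, assuming a file myscrapy/dupefilters.py (the module name and the in-memory set are only for illustration; a real filter could query a database instead):

# myscrapy/dupefilters.py
from scrapy.dupefilters import BaseDupeFilter


class MyDupeFilter(BaseDupeFilter):

    def __init__(self):
        self.visited = set()          # in-memory record; a database lookup could go here

    @classmethod
    def from_settings(cls, settings):
        # Scrapy calls this to build the filter; settings exposes the values in settings.py
        return cls()

    def open(self):
        # called when crawling starts
        print("dupefilter: open")

    def request_seen(self, request):
        # return True to treat the request as a duplicate and drop it
        if request.url in self.visited:
            return True
        self.visited.add(request.url)
        return False

    def close(self, reason):
        # called when crawling finishes
        print("dupefilter: close")

# and register it in settings.py:
# DUPEFILTER_CLASS = "myscrapy.dupefilters.MyDupeFilter"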

A fuller pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class MyscrapyPipeline:

    def open_spider(self, spider):
        print("start crawling")
        self.f = open('news.json', 'w', encoding='utf-8')

    def close_spider(self, spider):
        print("crawling finished")
        self.f.close()

    def process_item(self, item, spider):
        # runs every time the spider does `yield item_obj`
        # item_obj = MyscrapyItem(href=href, title=title)
        contents = f"{item['href']}\n{item['title']}\n"
        self.f.write(contents)

        # return item        # pass the item on to the next pipeline
        # raise DropItem()   # discard the item
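To see why the return item / DropItem options matter: with several pipelines registered, an item flows from the lower number to the higher one, and DropItem stops it. A sketch with a hypothetical second pipeline:

# pipelines.py
from scrapy.exceptions import DropItem

class FilterEmptyTitlePipeline:
    def process_item(self, item, spider):
        if not item.get('title'):
            raise DropItem("missing title")   # the item goes no further
        return item                           # hand the item to the next pipeline

# settings.py
ITEM_PIPELINES = {
   'myscrapy.pipelines.FilterEmptyTitlePipeline': 200,   # smaller number runs first
   'myscrapy.pipelines.MyscrapyPipeline': 300,
}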

Reading the DB (database) setting from settings.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
# (return item hands the item to the next pipeline object)

class MyscrapyPipeline:

    def __init__(self, conn_str):
        # initialize the object built in from_crawler
        self.conn_str = conn_str
        self.conn = None

    @classmethod
    def from_crawler(cls, crawler):
        # create the pipeline object; a real database connection could be set up here
        conn_str = crawler.settings.get('DB')
        return cls(conn_str)

    def open_spider(self, spider):
        print("start crawling")
        self.conn = open(self.conn_str, 'w', encoding='utf-8')

    def close_spider(self, spider):
        print("crawling finished")
        self.conn.close()

    def process_item(self, item, spider):
        # runs every time the spider does `yield item_obj`
        # item_obj = MyscrapyItem(href=href, title=title)
        contents = f"{item['href']}\n{item['title']}\n"
        self.conn.write(contents)

        # return item        # pass the item on to the next pipeline
        # raise DropItem()   # discard the item
Note: the setting has to be added in settings.py:
DB = 'choutidb'   # the "database" name (here simply used as a file name)


Basic use of Cookies

from scrapy.http.cookies import CookieJar
# import CookieJar
cookie_obj = CookieJar()
# create the cookie jar
cookie_obj.extract_cookies(response, response.request)
# pull the cookies from the response into cookie_obj
print(cookie_obj._cookies)
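This snippet runs inside a spider callback, because it needs both the response and the request that produced it; a minimal sketch of where it would live:

def parse(self, response):
    cookie_obj = CookieJar()
    cookie_obj.extract_cookies(response, response.request)
    # cookie_obj._cookies now holds the cookies the site set on this response
    print(cookie_obj._cookies)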

Attributes and methods of response (the output of dir(response)):
['_DEFAULT_ENCODING', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__',
 '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__',
 '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__',
 '__slots__', '__str__', '__subclasshook__', '__weakref__', '_auto_detect_fun', '_body',
 '_body_declared_encoding', '_body_inferred_encoding', '_cached_benc', '_cached_decoded_json',
 '_cached_selector', '_cached_ubody', '_declared_encoding', '_encoding', '_get_body', '_get_url',
 '_headers_encoding', '_set_body', '_set_url', '_url', 'body', 'body_as_unicode', 'cb_kwargs', 'certificate',
 'copy', 'css', 'encoding', 'flags', 'follow', 'follow_all', 'headers', 'ip_address', 'json', 'meta',
 'replace', 'request', 'selector', 'status', 'text', 'url', 'urljoin', 'xpath']

A page request that carries the cookies

yield Request(
    url = '<site url>',
    method = "POST",
    # body must be a string (e.g. a form-encoded "field=value&..."), not a dict
    body = '<user_field>=<username>&<password_field>=<password>',
    headers = {
        "<header name>": "<header value>"
    },
    cookies = cookie_obj._cookies,
    # the cookie contents
    callback = self.callback_func
    # the callback function (any method name works; parse is only the default entry point)
)

def callback_func(self, response):
    print(response.text)
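Request's body has to be an already-encoded string; Scrapy's FormRequest can build it from a dict instead. A minimal sketch (the field names are hypothetical and depend on the real login form):

from scrapy.http import FormRequest

yield FormRequest(
    url = '<login url>',
    formdata = {'username': '<user>', 'password': '<password>'},   # encoded into the POST body for you
    cookies = cookie_obj._cookies,
    callback = self.callback_func
)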

Scrapy extensions (EXTENSIONS)
EXTENSIONS in settings.py:

EXTENSIONS = {
   'scrapy.extensions.telnet.TelnetConsole': None,
   # custom extensions can be added here
   "myscrapy.extensions.MyExtend":300
}

Create an extensions.py file containing a class named MyExtend.
Contents of extensions.py:
from scrapy import signals

class MyExtend:
    def __init__(self,crawler):
        self.crawler = crawler
        crawler.signals.connect(self.start, signals.engine_started)
        crawler.signals.connect(self.close, signals.spider_closed)

    @classmethod
    def from_crawler(cls,crawler):
        return cls(crawler)

    def start(self):
        print("extensions: engine started")

    def close(self):
        print("extensions: spider closed")
Run results

A quick walkthrough of the settings.py configuration file
BOT_NAME = 'myscrapy'
# name of the crawler project (bot)

SPIDER_MODULES = ['myscrapy.spiders']
NEWSPIDER_MODULE = 'myscrapy.spiders'
# where the spider modules live

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'myscrapy (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
# the User-Agent (client identity) sent with requests

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# whether to obey the site's robots.txt rules

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16
# maximum number of concurrent requests (default 16)

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# delay between downloads from the same site, here 3 seconds

# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
# concurrency limit per domain
CONCURRENT_REQUESTS_PER_IP = 16
# concurrency limit per IP

# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# whether cookies are enabled (sent and received)
COOKIES_DEBUG = True
# whether to log cookies while crawling

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False
# whether to enable the Telnet console, which lets you connect to and inspect a running crawler

EXTENSIONS = {
   'scrapy.extensions.telnet.TelnetConsole': None,
   # custom extensions can be added here
   "myscrapy.extensions.MyExtend":300
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'myscrapy.pipelines.MyscrapyPipeline': 300,
   # enable the pipeline so items are saved locally
}

DEPTH_LIMIT = 2
# maximum crawl depth
DEPTH_PRIORITY = 0
# 0: depth-first, 1: breadth-first
DB = 'choutidb'
# custom setting read by the pipeline above (used as a file name here)

In ITEM_PIPELINES and EXTENSIONS, the smaller the number, the higher the priority (it runs earlier).

