Web scraping with Scrapy: Xigua Video
1.settings.py
# Scrapy settings for scrapy_ixigua project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_ixigua'

SPIDER_MODULES = ['scrapy_ixigua.spiders']
NEWSPIDER_MODULE = 'scrapy_ixigua.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# The UA can go here or in DEFAULT_REQUEST_HEADERS below, whichever you prefer
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# In short: set COOKIES_ENABLED = True to manage cookies yourself per request;
# set it to False to send the raw 'cookie' header from DEFAULT_REQUEST_HEADERS
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'cookie': 'ttcid=bcb6f1be446c4cd582bb5e23d160443d30; MONITOR_WEB_ID=aa2d700f-6664-4ab9-8a72-fa92af0e86d2; s_v_web_id=verify_l3lbglna_w7Mnw0NE_fqKL_4EkF_8q0i_zGaLOrhiCfb0; _tea_utm_cache_1300=undefined; ixigua-a-s=1; BD_REF=1; support_webp=true; support_avif=true; __ac_signature=_02B4Z6wo00f01j1kb-QAAIDCvWaVpRGIALo9QGtAAO3PkbI7CARXRmXsKOtUBp5FiKR-ArNnnFR4Ezbs2vWvO5pjf6IOKkDAboJ8FvY6BaIKH4b19oW9uJWNAdjll476BJginerDQG2.16UH1b; msToken=bn5b2TKGS6jRsq0K1lR8Pbrt4H29ghi0bSrvVtA0vq2jAFpFmo_SPzLJarFJSfseQPaQU6n-nZh2MVkE8SSMg8kq-yDOFhx6Ymkiazf7S8TzCmp6ujj1ctKptJ1w0G-Z; tt_scid=EpID885-K-4MtEOFevGSniewf1jbhDnQcY--yonCIQp.ou2Q4jqV9AyHq4rXyDtGf0f9; ttwid=1|EXTC5VvDMV8v4FXyn9V_k2BQi6I_NEtP38oaZVfrlN4|1654134611|625dc6664875869927cd8af09d64e49589b6108f556cde317de886b3c88e3633',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_ixigua.middlewares.ScrapyIxiguaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy_ixigua.middlewares.ScrapyIxiguaDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Database settings; the names must be all uppercase so get_project_settings() exposes them
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'your_db_user'
DB_PASSWORD = 'your_db_password'
DB_NAME = 'spider01'
# Write 'utf8' without the hyphen; 'utf-8' raises an error in pymysql
DB_CHARSET = 'utf8'

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_ixigua.pipelines.ScrapyIxiguaPipeline': 300,
    'scrapy_ixigua.pipelines.MysqlPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
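To make the COOKIES_ENABLED trade-off concrete, here is a minimal sketch of the per-request style (the True branch); the spider name and cookie value are placeholders, not the real session values above:

import scrapy

class CookieDemoSpider(scrapy.Spider):
    name = 'cookie_demo'
    start_urls = ['https://www.ixigua.com/']

    # With COOKIES_ENABLED = True, Scrapy's cookie middleware is active:
    # cookies passed per request (and Set-Cookie responses) are tracked.
    custom_settings = {'COOKIES_ENABLED': True}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, cookies={'ttwid': 'placeholder-value'})

    def parse(self, response):
        self.logger.info('fetched %s', response.url)

With COOKIES_ENABLED = False (as in this project), the middleware is off and the literal 'cookie' string in DEFAULT_REQUEST_HEADERS is sent unchanged on every request.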
2.pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import os
import urllib.request

# Import pymysql and the helper that loads the project settings
import pymysql
from scrapy.utils.project import get_project_settings


# Pipeline: download the video thumbnail
class ScrapyIxiguaPipeline:
    def process_item(self, item, spider):
        url = item.get('src')
        os.makedirs('./src/', exist_ok=True)  # make sure the target folder exists
        filename = './src/' + item.get('title') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item


# Pipeline: persist items to MySQL
class MysqlPipeline:
    def open_spider(self, spider):
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.database = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.database,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized insert: avoids SQL injection and breakage when a
        # title contains quotes (string-formatted SQL fails on both)
        sql = ('insert into ixigua(src,title,user,play,mv,userimg,duration) '
               'values(%s,%s,%s,%s,%s,%s,%s)')
        self.cursor.execute(sql, (item['src'], item['title'], item['user'],
                                  item['play'], item['mv'], item['img'],
                                  item['duration']))
        # Commit the transaction
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
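MysqlPipeline assumes an ixigua table already exists in spider01. A one-off sketch like the following can create it; the credentials mirror the settings above and the column widths are assumptions, so adjust them to your data:

import pymysql

# One-off helper: create the table that MysqlPipeline inserts into
conn = pymysql.connect(host='127.0.0.1', port=3306, user='your_db_user',
                       password='your_db_password', db='spider01', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            create table if not exists ixigua (
                id int auto_increment primary key,
                src varchar(512),      -- thumbnail URL
                title varchar(255),    -- video title
                user varchar(100),     -- author name
                play varchar(50),      -- play count, kept as text
                mv varchar(512),       -- video page URL
                userimg varchar(512),  -- author avatar URL
                duration varchar(20)   -- e.g. "03:15"
            ) default charset=utf8
        """)
    conn.commit()
finally:
    conn.close()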
3.items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyIxiguaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Thumbnail image URL
    src = scrapy.Field()
    # Video title
    title = scrapy.Field()
    # Author name
    user = scrapy.Field()
    # Play count
    play = scrapy.Field()
    # Video page URL
    mv = scrapy.Field()
    # Author avatar URL
    img = scrapy.Field()
    # Video duration
    duration = scrapy.Field()
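A quick sketch of what scrapy.Item buys over a plain dict: declared fields act as a schema, so typos are caught at assignment time. This snippet assumes only the class above:

from scrapy_ixigua.items import ScrapyIxiguaItem

item = ScrapyIxiguaItem(title='demo', duration='03:15')
print(dict(item))        # {'title': 'demo', 'duration': '03:15'}
print(item.get('play'))  # None -- unset fields read as None via .get()

try:
    item['views'] = 100  # 'views' was never declared with Field()
except KeyError as e:
    print('rejected:', e)  # Item refuses fields missing from the schema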
4.ixigua.py
import scrapy

from scrapy_ixigua.items import ScrapyIxiguaItem


class IxiguaSpider(scrapy.Spider):
    name = 'ixigua'
    allowed_domains = ['www.ixigua.com']
    # start_urls = ['https://www.ixigua.com/search/广场舞/?logTag=da15511cf98419ebe3b8&tab_name=search']
    start_urls = ['https://www.ixigua.com/']

    def parse(self, response):
        # print(response.request.headers['User-Agent'])
        # print(response.request.headers['cookie'])
        # print(response.text)
        ixigua_list = response.xpath('//div[@class="FeedContainer__items"]/div[@class="FeedContainer__itemWrapper"]')
        for xg in ixigua_list:
            # Video page URL
            mv = 'https://www.ixigua.com' + xg.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a/@href').extract_first()
            # Video title
            title = xg.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a/@title').extract_first()
            # Thumbnail image URL
            src = 'https:' + xg.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a//img/@src').extract_first()
            # Video duration
            duration = xg.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a//span/text()').extract_first()
            # Author avatar URL
            user_img = 'https:' + xg.xpath('.//div[@class="HorizontalFeedCard__contentWrapper"]//img/@src').extract_first()
            # Author name
            user = xg.xpath('.//div[@class="HorizontalFeedCard__contentWrapper"]//a[@class="user__name"]/text()').extract_first()
            # Play count
            play = xg.xpath('.//div[@class="HorizontalFeedCard__contentWrapper"]//span[@class="HorizontalFeedCard-accessories-bottomInfo__statistics"]/text()').extract_first()
            # Hand the item off to the pipelines
            ixigua = ScrapyIxiguaItem(mv=mv, title=title, src=src, duration=duration,
                                      img=user_img, user=user, play=play)
            yield ixigua
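Class names like HorizontalFeedCard__coverContainer come from a front-end build and can change whenever the site redeploys, so it pays to verify the XPaths before a full crawl. One way is scrapy shell 'https://www.ixigua.com/'; another is an offline check against a saved, fully rendered page. A minimal sketch of the latter, assuming a hypothetical ixigua.html saved from the browser:

from scrapy.selector import Selector

# Offline XPath check against a saved rendered page (hypothetical file)
with open('ixigua.html', encoding='utf-8') as f:
    sel = Selector(text=f.read())

cards = sel.xpath('//div[@class="FeedContainer__items"]/div[@class="FeedContainer__itemWrapper"]')
print('cards found:', len(cards))  # 0 means the class names have changed
for card in cards[:3]:
    print(card.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a/@title').get())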
5.Create a launcher script that runs the crawl on a schedule
import os
import time

if __name__ == '__main__':
    print('ixigua spider running')
    times = 0
    while True:
        os.system('scrapy crawl ixigua')
        times += 1
        print('Crawl successful, run [' + str(times) + ']')
        # Run once a day: 24*60*60 = 86400 s
        time.sleep(86400)
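One weakness of os.system here is that it discards the exit status, so a failed crawl looks the same as a successful one. A slightly more defensive sketch of the same loop using subprocess (the daily interval is carried over from the comment above):

import subprocess
import time

RUN_INTERVAL = 24 * 60 * 60  # once a day

if __name__ == '__main__':
    times = 0
    while True:
        # subprocess.run returns a CompletedProcess; check its exit code
        result = subprocess.run(['scrapy', 'crawl', 'ixigua'])
        times += 1
        if result.returncode == 0:
            print('Crawl successful, run [' + str(times) + ']')
        else:
            print('Crawl failed with exit code', result.returncode)
        time.sleep(RUN_INTERVAL)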
Run result: (screenshot omitted)
Code repository: https://gitee.com/heating-cloud/python_spider.git