Web scraping with Scrapy: Xigua Video
1.settings.py
# Scrapy settings for scrapy_ixigua project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_ixigua'

SPIDER_MODULES = ['scrapy_ixigua.spiders']
NEWSPIDER_MODULE = 'scrapy_ixigua.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# The UA can go here or in DEFAULT_REQUEST_HEADERS below, whichever you prefer
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# In short: set COOKIES_ENABLED = True to manage cookies yourself per request;
# set it to False to send the raw 'cookie' header from DEFAULT_REQUEST_HEADERS
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'cookie': 'ttcid=bcb6f1be446c4cd582bb5e23d160443d30; MONITOR_WEB_ID=aa2d700f-6664-4ab9-8a72-fa92af0e86d2; s_v_web_id=verify_l3lbglna_w7Mnw0NE_fqKL_4EkF_8q0i_zGaLOrhiCfb0; _tea_utm_cache_1300=undefined; ixigua-a-s=1; BD_REF=1; support_webp=true; support_avif=true; __ac_signature=_02B4Z6wo00f01j1kb-QAAIDCvWaVpRGIALo9QGtAAO3PkbI7CARXRmXsKOtUBp5FiKR-ArNnnFR4Ezbs2vWvO5pjf6IOKkDAboJ8FvY6BaIKH4b19oW9uJWNAdjll476BJginerDQG2.16UH1b; msToken=bn5b2TKGS6jRsq0K1lR8Pbrt4H29ghi0bSrvVtA0vq2jAFpFmo_SPzLJarFJSfseQPaQU6n-nZh2MVkE8SSMg8kq-yDOFhx6Ymkiazf7S8TzCmp6ujj1ctKptJ1w0G-Z; tt_scid=EpID885-K-4MtEOFevGSniewf1jbhDnQcY--yonCIQp.ou2Q4jqV9AyHq4rXyDtGf0f9; ttwid=1|EXTC5VvDMV8v4FXyn9V_k2BQi6I_NEtP38oaZVfrlN4|1654134611|625dc6664875869927cd8af09d64e49589b6108f556cde317de886b3c88e3633',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_ixigua.middlewares.ScrapyIxiguaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy_ixigua.middlewares.ScrapyIxiguaDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Database settings; the names must be all uppercase so get_project_settings() exposes them
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'your_db_user'
DB_PASSWORD = 'your_db_password'
DB_NAME = 'spider01'
# Write 'utf8' without the hyphen; 'utf-8' raises an error in pymysql
DB_CHARSET = 'utf8'

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_ixigua.pipelines.ScrapyIxiguaPipeline': 300,
    'scrapy_ixigua.pipelines.MysqlPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
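To make the COOKIES_ENABLED trade-off concrete, here is a minimal sketch of the per-request style (the True branch); the spider name and cookie value are placeholders, not the real session values above:

import scrapy

class CookieDemoSpider(scrapy.Spider):
    name = 'cookie_demo'
    start_urls = ['https://www.ixigua.com/']

    # With COOKIES_ENABLED = True, Scrapy's cookie middleware is active:
    # cookies passed per request (and Set-Cookie responses) are tracked.
    custom_settings = {'COOKIES_ENABLED': True}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, cookies={'ttwid': 'placeholder-value'})

    def parse(self, response):
        self.logger.info('fetched %s', response.url)

With COOKIES_ENABLED = False (as in this project), the middleware is off and the literal 'cookie' string in DEFAULT_REQUEST_HEADERS is sent unchanged on every request.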
2.pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import os
import urllib.request

# Import pymysql and the helper that loads the project settings
import pymysql
from scrapy.utils.project import get_project_settings


# Pipeline: download the video thumbnail
class ScrapyIxiguaPipeline:
    def process_item(self, item, spider):
        url = item.get('src')
        os.makedirs('./src/', exist_ok=True)  # make sure the target folder exists
        filename = './src/' + item.get('title') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item


# Pipeline: persist items to MySQL
class MysqlPipeline:
    def open_spider(self, spider):
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.database = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.database,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized insert: avoids SQL injection and breakage when a
        # title contains quotes (string-formatted SQL fails on both)
        sql = ('insert into ixigua(src,title,user,play,mv,userimg,duration) '
               'values(%s,%s,%s,%s,%s,%s,%s)')
        self.cursor.execute(sql, (item['src'], item['title'], item['user'],
                                  item['play'], item['mv'], item['img'],
                                  item['duration']))
        # Commit the transaction
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
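MysqlPipeline assumes an ixigua table already exists in spider01. A one-off sketch like the following can create it; the credentials mirror the settings above and the column widths are assumptions, so adjust them to your data:

import pymysql

# One-off helper: create the table that MysqlPipeline inserts into
conn = pymysql.connect(host='127.0.0.1', port=3306, user='your_db_user',
                       password='your_db_password', db='spider01', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            create table if not exists ixigua (
                id int auto_increment primary key,
                src varchar(512),      -- thumbnail URL
                title varchar(255),    -- video title
                user varchar(100),     -- author name
                play varchar(50),      -- play count, kept as text
                mv varchar(512),       -- video page URL
                userimg varchar(512),  -- author avatar URL
                duration varchar(20)   -- e.g. "03:15"
            ) default charset=utf8
        """)
    conn.commit()
finally:
    conn.close()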
3.items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyIxiguaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Thumbnail image URL
    src = scrapy.Field()
    # Video title
    title = scrapy.Field()
    # Author name
    user = scrapy.Field()
    # Play count
    play = scrapy.Field()
    # Video page URL
    mv = scrapy.Field()
    # Author avatar URL
    img = scrapy.Field()
    # Video duration
    duration = scrapy.Field()
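A quick sketch of what scrapy.Item buys over a plain dict: declared fields act as a schema, so typos are caught at assignment time. This snippet assumes only the class above:

from scrapy_ixigua.items import ScrapyIxiguaItem

item = ScrapyIxiguaItem(title='demo', duration='03:15')
print(dict(item))        # {'title': 'demo', 'duration': '03:15'}
print(item.get('play'))  # None -- unset fields read as None via .get()

try:
    item['views'] = 100  # 'views' was never declared with Field()
except KeyError as e:
    print('rejected:', e)  # Item refuses fields missing from the schema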
4.ixigua.py
import scrapy

from scrapy_ixigua.items import ScrapyIxiguaItem


class IxiguaSpider(scrapy.Spider):
    name = 'ixigua'
    allowed_domains = ['www.ixigua.com']
    # start_urls = ['https://www.ixigua.com/search/广场舞/?logTag=da15511cf98419ebe3b8&tab_name=search']
    start_urls = ['https://www.ixigua.com/']

    def parse(self, response):
        # print(response.request.headers['User-Agent'])
        # print(response.request.headers['cookie'])
        # print(response.text)
        ixigua_list = response.xpath('//div[@class="FeedContainer__items"]/div[@class="FeedContainer__itemWrapper"]')
        for xg in ixigua_list:
            # Video page URL
            mv = 'https://www.ixigua.com' + xg.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a/@href').extract_first()
            # Video title
            title = xg.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a/@title').extract_first()
            # Thumbnail image URL
            src = 'https:' + xg.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a//img/@src').extract_first()
            # Video duration
            duration = xg.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a//span/text()').extract_first()
            # Author avatar URL
            user_img = 'https:' + xg.xpath('.//div[@class="HorizontalFeedCard__contentWrapper"]//img/@src').extract_first()
            # Author name
            user = xg.xpath('.//div[@class="HorizontalFeedCard__contentWrapper"]//a[@class="user__name"]/text()').extract_first()
            # Play count
            play = xg.xpath('.//div[@class="HorizontalFeedCard__contentWrapper"]//span[@class="HorizontalFeedCard-accessories-bottomInfo__statistics"]/text()').extract_first()
            # Hand the item off to the pipelines
            ixigua = ScrapyIxiguaItem(mv=mv, title=title, src=src, duration=duration,
                                      img=user_img, user=user, play=play)
            yield ixigua
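Class names like HorizontalFeedCard__coverContainer come from a front-end build and can change whenever the site redeploys, so it pays to verify the XPaths before a full crawl. One way is scrapy shell 'https://www.ixigua.com/'; another is an offline check against a saved, fully rendered page. A minimal sketch of the latter, assuming a hypothetical ixigua.html saved from the browser:

from scrapy.selector import Selector

# Offline XPath check against a saved rendered page (hypothetical file)
with open('ixigua.html', encoding='utf-8') as f:
    sel = Selector(text=f.read())

cards = sel.xpath('//div[@class="FeedContainer__items"]/div[@class="FeedContainer__itemWrapper"]')
print('cards found:', len(cards))  # 0 means the class names have changed
for card in cards[:3]:
    print(card.xpath('.//div[@class="HorizontalFeedCard__coverContainer"]/a/@title').get())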
5.Create a launcher script that runs the crawl on a schedule
import os
import time

if __name__ == '__main__':
    print('ixigua spider running')
    times = 0
    while True:
        os.system('scrapy crawl ixigua')
        times += 1
        print('Crawl successful, run [' + str(times) + ']')
        # Run once a day: 24*60*60 = 86400 s
        time.sleep(86400)
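One weakness of os.system here is that it discards the exit status, so a failed crawl looks the same as a successful one. A slightly more defensive sketch of the same loop using subprocess (the daily interval is carried over from the comment above):

import subprocess
import time

RUN_INTERVAL = 24 * 60 * 60  # once a day

if __name__ == '__main__':
    times = 0
    while True:
        # subprocess.run returns a CompletedProcess; check its exit code
        result = subprocess.run(['scrapy', 'crawl', 'ixigua'])
        times += 1
        if result.returncode == 0:
            print('Crawl successful, run [' + str(times) + ']')
        else:
            print('Crawl failed with exit code', result.returncode)
        time.sleep(RUN_INTERVAL)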
Run result: (screenshot omitted)
Code repository: https://gitee.com/heating-cloud/python_spider.git