今日内容
0 爬取的数据,存到mysql中
class FirstscrapyMySqlPipeline:
    """Item pipeline that persists scraped articles into a local MySQL database.

    Connection lifecycle follows the spider: opened in open_spider,
    closed in close_spider, one INSERT + commit per item.
    """

    def open_spider(self, spider):
        # One connection per spider run; paired with close_spider below.
        print('我开了')
        self.conn = pymysql.connect(
            user='root',
            password="",
            host='127.0.0.1',
            database='cnblogs',
            port=3306)
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        print('我关了')
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        """Insert one article row; rollback on failure so the connection stays usable.

        NOTE(review): table name 'aritcle' looks misspelled — kept as-is since it
        must match the actual schema; confirm against the database.
        `desc` is a MySQL reserved word, hence the backticks.
        """
        sql = '''INSERT INTO aritcle (title,author_img,author_name,`desc`,url,content) VALUES(%s,%s,%s,%s,%s,%s);'''
        print(len(item['content']))
        try:
            self.cursor.execute(sql,
                                args=[item['title'], item['author_img'], item['author_name'],
                                      item['desc'], item['url'], item['content']])
            self.conn.commit()
        except Exception:
            # Fix: a failed INSERT previously left the transaction open/aborted;
            # roll back and re-raise so scrapy can report the error.
            self.conn.rollback()
            raise
        return item
1 爬虫和下载中间件
-进来的时候是个Request对象
-出去的时候是个Response对象
class FirstscrapySpiderMiddleware:
    """Pass-through spider middleware (the stock scrapy template defaults)."""

    def process_spider_input(self, response, spider):
        # Accept every response untouched.
        return None

    def process_spider_output(self, response, result, spider):
        # Forward each item/request the spider produced, unchanged.
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        # Nothing special: fall through to scrapy's default handling.
        pass

    def process_start_requests(self, start_requests, spider):
        # Seed requests are relayed as-is.
        for seed in start_requests:
            yield seed

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
class FirstscrapyDownloaderMiddleware:
    """Pass-through downloader middleware (the stock scrapy template defaults)."""

    def process_request(self, request, spider):
        # None -> continue down the middleware chain to the downloader.
        return None

    def process_response(self, request, response, spider):
        # Hand the response back unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # An exception raised in a middleware / the downloader lands here.
        pass
-修改请求头
-加cookie
-加代理
--------
-集成selenium
2.1 加代理
-在下载中间件写process_request方法
def get_proxy(self):
    """Ask the local proxy-pool service for one proxy and return it as a full URL."""
    import requests
    info = requests.get('http://127.0.0.1:5010/get/').json()
    # The pool flags https-capable proxies; pick the matching scheme.
    scheme = 'https://' if info.get('https') else 'http://'
    return scheme + info.get('proxy')
def process_request(self, request, spider):
    """Attach a freshly fetched proxy to every outgoing request."""
    request.meta['proxy'] = self.get_proxy()
    # None -> let the request continue through the downloader.
    return None
def process_exception(self, request, exception, spider):
print('-----',request.url)
return request
2.2 加cookie,修改请求头,随机生成UserAgent
def process_request(self, request, spider):
print(request.cookies)
request.cookies['name']='lqz'
return None
def process_request(self, request, spider):
print(request.headers)
request.headers['referer'] = 'http://www.lagou.com'
return None
def process_request(self, request, spider):
    """Demo: attach a randomly generated User-Agent to every outgoing request."""
    from fake_useragent import UserAgent
    request.headers['User-Agent'] = str(UserAgent().random)
    print(request.headers)
    return None
2.3 集成selenium
from selenium import webdriver
class CnblogsSpider(scrapy.Spider):
    # One Chrome instance shared by the whole spider; the download
    # middleware drives it through spider.bro.
    bro = webdriver.Chrome(executable_path='./chromedriver.exe')
    bro.implicitly_wait(10)

    def close(spider, reason):
        """Quit the shared browser together with the spider."""
        spider.bro.close()
def process_request(self, request, spider):
    """Render list pages ('sitehome/p' urls) with selenium; let everything else pass."""
    if 'sitehome/p' not in request.url:
        # None -> normal scrapy download path.
        return None
    spider.bro.get(request.url)
    from scrapy.http.response.html import HtmlResponse
    # Returning a Response short-circuits the downloader entirely.
    return HtmlResponse(url=request.url,
                        body=bytes(spider.bro.page_source, encoding='utf-8'))
3 去重规则源码分析(布隆过滤器)
from scrapy.core.scheduler import Scheduler
def enqueue_request(self, request: Request) -> bool:
    # Excerpt from scrapy's Scheduler: only requests the dupefilter has
    # NOT seen are allowed into the queue.
    return not self.df.request_seen(request)
-在配置文件中如果配置了:DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'表示,使用它作为去重类,按照它的规则做去重
-RFPDupeFilter的request_seen
def request_seen(self, request: Request) -> bool:
    """Return True if this request's fingerprint was already recorded; record it otherwise."""
    fingerprint = self.request_fingerprint(request)
    seen = fingerprint in self.fingerprints
    if not seen:
        self.fingerprints.add(fingerprint)
    return seen
-爬取的网址:https://www.cnblogs.com/teach/p/17238610.html?name=lqz&age=19
-和 https://www.cnblogs.com/teach/p/17238610.html?age=19&name=lqz(查询参数顺序不同)
-它俩是一样的,返回的数据都是一样的,就应该是一条url,就只会爬取一次
-所以 request_fingerprint 就是来把它们做成一样的(核心原理是把查询条件排序,再拼接到后面)
-生成指纹,指纹是什么? 生成的指纹放到集合中去重
-www.cnblogs.com?name=lqz&age=19
-www.cnblogs.com?age=19&name=lqz
-上面的两种地址生成的指纹是一样的
# Demo: two urls that differ only in query-parameter order yield the SAME
# fingerprint, so scrapy's dupefilter treats them as one request.
from scrapy.utils.request import RequestFingerprinter
from scrapy import Request

fingerprinter = RequestFingerprinter()
req_a = Request(url='http://www.cnblogs.com?name=lqz&age=20')
req_b = Request(url='http://www.cnblogs.com?age=20&name=lqz')
print(fingerprinter.fingerprint(req_a).hex())
print(fingerprinter.fingerprint(req_b).hex())
-根据配置的去重类RFPDupeFilter的request_seen方法,如果返回True,就不爬了,如果返回False就爬
-后期咱们可以使用自己定义的去重类,实现去重
-如果是集合:存的数据库越多,占内存空间越大,如果数据量特别大,可以使用布隆过滤器实现去重
# Demo: a bloom filter answers membership probabilistically in O(1) space
# per element — the big-data alternative to a plain set for dedup.
from pybloom_live import BloomFilter

bf = BloomFilter(capacity=10)
url = 'www.baidu.com'
for token in (url, 'aaaa', 'ggg', 'deww', 'aerqaaa', 'ae2rqaaa',
              'aerweqaaa', 'aerwewqaaa', 'aerereweqaaa', 'we'):
    bf.add(token)
print(url in bf)
print("wa" in bf)
-指定错误率
-指定大小
-编译redis---》把第三方扩展布隆过滤器编译进去,才有这个功能
-https://zhuanlan.zhihu.com/p/94668736
4 scrapy-redis实现分布式爬虫
-集群:一个项目,在多个机器上部署,每个机器完成完整的功能,称之为集群
-原来使用一台机器爬取cnblogs整站
-现在想使用3台机器爬取cnblogs整站
- 每台机器爬取数据是不一样的
- 最终组装成完整的数据
-1 去重集合,我们要使用同一个----》redis集合
-2 多台机器使用同一个调度器:Scheduler,排队爬取,使用同一个队列
第一步:安装scrapy-redis ---》pip3 install scrapy-redis
第二步:改造爬虫类
from scrapy_redis.spiders import RedisSpider


class CnblogSpider(RedisSpider):
    """Distributed cnblogs spider: start urls are pulled from redis, not hard-coded."""
    name = 'cnblog_redis'
    allowed_domains = ['cnblogs.com']
    # Redis list that seeds the crawl (lpush myspider:start_urls <url>).
    redis_key = 'myspider:start_urls'
第三步:配置文件配置
# scrapy-redis wiring: every worker shares one redis-backed dupefilter and
# one scheduler queue, and additionally pushes items into redis (priority 400).
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
ITEM_PIPELINES = {
    'cnblogs.pipelines.CnblogsFilePipeline': 300,
    'cnblogs.pipelines.CnblogsMysqlPipeline': 100,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
第四步:在多台机器上启动scrapy项目,在一台机器起了多个scrapy爬虫进程,就相当于多台机器
第五步:把起始爬取的地址放到redis的列表中
lpush mycrawler:start_urls http://www.cnblogs.com/
4.1 拓展
class RFPDupeFilter(BaseDupeFilter):
    """Single-process dupefilter: fingerprints are kept in a local python set."""

    def request_seen(self, request):
        fingerprint = self.request_fingerprint(request)
        seen = fingerprint in self.fingerprints
        if not seen:
            # First sighting: remember it for the rest of the crawl.
            self.fingerprints.add(fingerprint)
        return seen
class RFPDupeFilter(BaseDupeFilter):
    """scrapy-redis dupefilter: fingerprints live in a redis set shared by all workers."""

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        # SADD returns 0 when the member already existed -> duplicate request.
        return self.server.sadd(self.key, fp) == 0
class RedisPipeline(object):
    """Push every serialized item onto a redis list, off the reactor thread."""

    def process_item(self, item, spider):
        # deferToThread keeps the blocking rpush out of twisted's event loop.
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        self.server.rpush(self.item_key(item, spider), self.serialize(item))
        return item
面试题
https://zhuanlan.zhihu.com/p/91643259
https://www.zhihu.com/column/c_1175438244715651072
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 25岁的心里话
· 按钮权限的设计及实现