Installing the Scrapy Framework
# Installation
pip3 install scrapy
# macOS: usually installs directly without problems
# Windows: may raise errors (see below)
Error handling:
""" 报错: error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Micros oft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/ ---------------------------------------- ERROR: Command errored out with exit status 1: 'C:\ProgramData\Anaconda3\python. exe' -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\chun _xiong\\AppData\\Local\\Temp\\pip-install-t2sgma8_\\twisted\\setup.py'"'"'; __fi le__='"'"'C:\\Users\\chun_xiong\\AppData\\Local\\Temp\\pip-install-t2sgma8_\\twi sted\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.r ead().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file_ _, '"'"'exec'"'"'))' install --record 'C:\Users\chun_xiong\AppData\Local\Temp\pi p-record-tibwu8if\install-record.txt' --single-version-externally-managed --comp ile --install-headers 'C:\ProgramData\Anaconda3\Include\Twisted' Check the logs for full command output. """ # 处理方法 1.pip3 install wheel 2.http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted #根据解释器版本下载对应的依赖文件 3.利用pip报错确定上述下载文件的位置 C:\Users\用户名\ pip3 install Twisted-20.3.0-cp38-cp38-win_amd64.whl 4.pip3 install pywin32 5.pip3 install scrapy '''pip版本过低可能也会报错,根据报错命令更新pip即可''' # 验证是否安装成功 scrapy
Basic Scrapy Usage
# Create a project from the command line
scrapy startproject <project_name>        e.g. scrapy startproject papa
# Create a new spider
scrapy genspider <spider_name> <domain>   e.g. scrapy genspider cnblogs cnblogs.com

- Project layout
    - spiders          - all spider programs
    - items.py         - similar to Django model classes
    - middlewares.py   - middleware
    - pipelines.py     - persistence (data storage)
    - settings.py      - configuration (note the robots protocol setting; see the sketch below)
    - scrapy.cfg       - deployment
- Run a spider
    - scrapy crawl <spider_name> --nolog
- Extract data
    //*[@id="post_list"]/div[1]
    https://www.cnblogs.com/xiaoyuanqujing/protected/articles/11805810.html
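Since settings.py controls the robots protocol noted above, the setting most projects touch first is ROBOTSTXT_OBEY; a minimal settings.py excerpt as a sketch (the values shown are only illustrative choices):

# settings.py (excerpt)
ROBOTSTXT_OBEY = False   # stop honoring robots.txt so pages disallowed by it are not silently filtered out
LOG_LEVEL = 'ERROR'      # optional: quieter console output without resorting to --nolog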
Crawling Articles from the cnblogs Homepage
Code in cnblogs.py
import scrapy
from scrapy import Request


class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'                            # spider name, must be unique
    allowed_domains = ['cnblogs.com']           # allowed domains
    start_urls = ['https://www.cnblogs.com/']   # starting URLs
    # depth-first vs. breadth-first crawling
    # request deduplication: many possible strategies
    # the crawl entry point is start_requests

    def parse_detail(self, response):
        print(len(response.text))

    def parse(self, response):
        # print('--------4444-------', response)
        # print(response.text)
        # mixing in BeautifulSoup also works
        # from bs4 import BeautifulSoup
        # soup = BeautifulSoup(response.text, "lxml")
        # soup.find(name='div')

        # parsing
        div_list = response.css('.post_item')   # select all elements with class post_item
        # print(len(div_list))
        # //a[contains(@class,"li")]
        # div_list = response.xpath('//div[contains(@class,"post_item")]')
        # div_list = response.xpath('//*[@id="post_list"]/div')
        # print(len(div_list))
        for div in div_list:
            # extract_first() returns the first element of the list
            url = div.css('.post_item_body a::attr(href)').extract_first()
            print(url)
            yield Request(url, callback=self.parse_detail)
        next_url = response.css('.pager a:last-child::attr(href)').extract_first()
        print('https://www.cnblogs.com' + next_url)
        yield Request('https://www.cnblogs.com' + next_url)
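The spider above only prints what it finds; to feed the items.py/pipelines.py persistence layer described in the project layout, parse() could yield items instead. A hedged sketch, where the item class ArticleItem and its fields title/url are invented for illustration:

# items.py -- hypothetical item, field names are illustrative
import scrapy

class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()

# inside parse() in cnblogs.py, each post could then be yielded as an item:
#     item = ArticleItem()
#     item['title'] = div.css('.post_item_body a::text').extract_first()
#     item['url'] = url
#     yield item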
# Add a new main.py
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'cnblogs', '--nolog'])
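With main.py saved in the project root (the directory that contains scrapy.cfg), the spider can be started from an IDE or with python main.py; removing '--nolog' from the argument list restores the full crawl log for debugging.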