Python爬虫知识
一、爬虫
1、概述
网络爬虫,搜索引擎就是爬虫的应用者。
2、爬虫分类
(1)通用爬虫,常见就是搜索引擎,无差别的收集数据,存储,提取关键字,构建索引库,给用户提供搜索接口。
爬取一般流程:
初始化一批URL,将这些url放入到等待爬取队列。
从队列取出这些url,通过dns解析ip,对应ip站点下载HTML页面,保存到本地服务器中,爬取完的url放到已爬取队列。
分析这些网页内容,找出网页里面关心的url连接,继续执行第二步,直到爬取结束。
搜索引擎如何获取一个新网站的url。
新网站主动提交给搜索引擎。
通过其他网站页面中设置的外链。
搜索引擎和dns服务商合作,获取最新收录的网站。
(2)聚焦爬虫
有针对性的编写特定领域数据的爬取程序,针对某些类别数据的采集的爬虫,是面向主题的。
3、robots协议
指定一个robots.txt文件,告诉爬虫引擎什么可以爬取。
这个协议为了让搜索引擎更有效率搜索自己内容,提供了sitemap这样的文件。
这个文件禁止抓取的往往又是可能我们感兴趣的内容,反而泄露了这些地址。
4、http请求和响应处理
爬取网页就是通过HTTP协议访问网页,不过通过浏览器访问往往是人的行为,因此需要解决把程序变成人的行为的问题。
Urllib包
# Minimal urllib example: fetch a page and inspect the response object.
from urllib.request import urlopen

response = urlopen('http://www.bing.com')
print(response.closed)
with response:
    print(response.status)   # HTTP status code
    print(response._method)  # request method (private attribute, for demo only)
    print(response.read())   # raw body bytes
    print(response.closed)
    print(response.info())   # fix: info is a method — call it to get the headers
print(response.closed)       # True: the with-block closed the response
使用等,urllib包,使用查询等。
解决useragent问题:
# Set a browser User-Agent so the request is not rejected as a bot.
from urllib.request import urlopen, Request

url = 'http://www.bing.com'
# fix: the UA literal was split across two lines (invalid syntax); rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
req = Request(url, headers={'User-agent': ua})
response = urlopen(req, timeout=10)
print(response.closed)
with response:
    print(response.status)
    print(response._method)
    print(response.geturl())               # final URL after any redirects
    print(req.get_header('User-agent'))    # header actually sent
print(response.closed)
Chrome浏览器获取useragent
5、parse
# Round-trip demo: percent-encode a query dict, then decode it back.
from urllib import parse

params = {
    'id': 1,
    'name': 'tom',
    'url': 'http://www.magedu.com',
}
url = 'http://www.magedu.com'
u = parse.urlencode(params)   # encode the dict as a query string
print(u)
print(parse.unquote(u))       # decode the percent-escapes back
6、请求方法
# POST demo against httpbin: urlencode form data, decode the JSON reply.
from urllib import parse
from urllib.request import urlopen, Request
import json  # stdlib json replaces third-party simplejson (drop-in here)

url = 'http://httpbin.org/post'
# fix: UA literal was split across two lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
# urlencode escapes the special characters in the value
data = parse.urlencode({'name': '张三,@=/&*', 'age': '6'})
req = Request(url, headers={'User-agent': ua})
# passing data= makes urlopen issue a POST
with urlopen(req, data=data.encode()) as res:
    text = res.read()
    d = json.loads(text)
    print(d)
# NOTE: this snippet duplicates the POST demo directly above it in the notes.
from urllib import parse
from urllib.request import urlopen, Request
import json  # stdlib json replaces third-party simplejson

url = 'http://httpbin.org/post'
# fix: UA literal was split across two lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
data = parse.urlencode({'name': '张三,@=/&*', 'age': '6'})
req = Request(url, headers={'User-agent': ua})
with urlopen(req, data=data.encode()) as res:
    text = res.read()
    d = json.loads(text)
    print(d)
7、爬取豆瓣网
# Fetch douban's movie-subjects JSON API and decode the result.
from urllib.request import Request, urlopen
from urllib import parse
import json  # stdlib json replaces third-party simplejson

# fix: UA literal was split across two lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
jurl = 'https://movie.douban.com/j/search_subjects'
d = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 10,   # page size
    'page_start': 10,   # offset
}
req = Request('{}?{}'.format(jurl, parse.urlencode(d)),
              headers={'User-agent': ua})
with urlopen(req) as res:
    sub = json.loads(res.read())
    print(len(sub))
    print(sub)
8、解决https,ca证书的问题
忽略证书,ssl
# Skip certificate verification for sites with bad/self-signed certs.
from urllib.request import Request, urlopen
import ssl

# request = Request('http://www.12306.cn/mormhweb')
request = Request('http://www.baidu.com')
request.add_header(
    'User-agent',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
)
# unverified context: ignores CA validation — for testing only
context = ssl._create_unverified_context()
with urlopen(request, context=context) as res:
    print(res._method)
    print(res.read())
9、urllib3
pip install urllib3
# urllib3: pooled low-level HTTP client.
import urllib3

url = 'http://movie.douban.com'
# fix: UA literal was split across several lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
with urllib3.PoolManager() as http:  # connection-pool manager
    response = http.request('GET', url, headers={'User-agent': ua})
    print(1, response)
    print(2, type(response))
    print(3, response.status, response.reason)
    print(4, response.headers)
    print(5, response.data)   # raw body bytes
# urllib3 with a urlencoded query string.
import urllib3
from urllib.parse import urlencode

# fix: UA literal was split across several lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
jurl = 'https://movie.douban.com/j/search_subjects'
d = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 10,
    'page_start': 10,
}
with urllib3.PoolManager() as http:
    response = http.request('GET', '{}?{}'.format(jurl, urlencode(d)),
                            headers={'User-agent': ua})
    print(response)
    print(response.status)
    print(response.data)
10、requests库
Requests使用了urllib3.
pip install requests
# requests: high-level HTTP client built on urllib3.
from urllib.parse import urlencode
import requests

# fix: UA literal was split across several lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
jurl = 'https://movie.douban.com/j/search_subjects'
d = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 10,
    'page_start': 10,
}
url = '{}?{}'.format(jurl, urlencode(d))
response = requests.request('GET', url, headers={'User-agent': ua})
with response:
    print(response.text)          # decoded body
    print(response.status_code)
    print(response.url)
    print(response.headers)
    print(response.request)       # the PreparedRequest that was sent
带会话的方式 session。
会把请求头等信息自动管理。
# requests.Session keeps cookies and headers across successive requests.
import requests

# fix: UA literal was split across several lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
urls = ['https://www.baidu.com/s?wd=magedu',
        'https://www.baidu.com/s?wd=magedu']
session = requests.Session()
with session:
    for url in urls:
        response = session.get(url, headers={'User-agent': ua})
        with response:
            print(1, response.text)
            print(2, response.status_code)
            print(3, response.url)
            print(4, response.headers)
            print(5, response.request.headers)  # session-managed request headers
            print('--------')
            print(response.cookies)             # cookies the session carries
    print('--------------')
    print(response.cookies)
11、特别注意
个别网站登录的时候cookie,登录的时候要把原来的cookie带回去,然后登录成功后其给你返回一个新的,否则不能进行相关操作。有些时候只是带一些cookie相关的值即可。
反爬措施:对于用户发起的请求来检测上一次是否访问的是我的网站。
在network的referer里面显示上一次访问网站的哪个一页。
Files:上传的文件内容。
路由器的将用户名和密码加密放在请求头里面。
Cert证书。
Requests基本功能:
# Login flow that must send the anonymous cookie back (dig.chouti.com).
import requests

# fix: UA literal was split across several lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 '
      'Core/1.63.5514.400 QQBrowser/10.1.1660.400')
url = 'https://dig.chouti.com/login'
# SECURITY: hard-coded real credentials in the notes — move to env/config
data = {
    'phone': '8618804928235',
    'password': 'tana248654',
    'oneMonth': '1',
}
r1_urls = 'https://dig.chouti.com'
# 1) anonymous GET to receive the initial cookie
r1 = requests.get(url=r1_urls, headers={'User-Agent': ua})
r1_cookie = r1.cookies.get_dict()
print('r1', r1.cookies)
# 2) POST the login form WITH that initial cookie
response = requests.post(url, data, headers={'User-Agent': ua},
                         cookies=r1_cookie)
print(response.text)
print(response.cookies.get_dict())
# 3) an authorized action only needs the 'gpsd' cookie value
r3 = requests.post(url='https://dig.chouti.com/link/vote?linksId=21718341',
                   cookies={'gpsd': r1_cookie.get('gpsd')},
                   headers={'User-Agent': ua})
print(r3.text)
二、HTML解析
通过上面的库,可以拿到HTML内容。
1、Xpath
http://www.qutoric.com/xmlquire/
站点。
路径的遍历,查找到需要的内容。
2、lxml库
解析HTML的库。
安装:
pip install lxml
爬取豆瓣网top10
# Parse douban's billboard ranking with lxml + XPath.
import requests
from lxml import etree

# fix: UA literal was split across several lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
urls = ['https://movie.douban.com/']
session = requests.Session()
with session:
    for url in urls:
        response = session.get(url, headers={'User-agent': ua})
        with response:
            html = etree.HTML(response.text)
            # every ranking row inside the billboard box
            rows = html.xpath("//div[@class='billboard-bd']//tr")
            for row in rows:
                texts = row.xpath('.//text()')
                print(''.join(t.strip() for t in texts))
3、beautifulsoup4
4、可以导航的string(navigablestring)
深度优先遍历。
Soup.findall().
Soup.findall(id =’header’)
5、css选择器
Soup.select 正则表达式
Pip install jsonpath.
# Multi-threaded news crawler: URL queue -> HTML queue -> parsed-output queue.
from concurrent.futures import ThreadPoolExecutor
import threading
import time
from queue import Queue
import logging
import requests
from bs4 import BeautifulSoup

event = threading.Event()          # global stop flag for all workers
# NOTE(review): 'enblogs' looks like a typo for news.cnblogs.com — confirm
url = 'https://news.enblogs.com'
path = '/n/page/'
# fix: UA literal was split across two lines; rejoin it
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
urls = Queue()     # page URLs waiting to be fetched
htmls = Queue()    # fetched HTML documents
outps = Queue()    # (title, link) tuples ready to be saved
def create_urls(start, stop, step=1):
    """Build listing-page URLs for pages start..stop (inclusive) and queue them."""
    for i in range(start, stop + 1, step):
        url1 = '{}{}{}/'.format(url, path, i)
        urls.put(url1)
def crawler():
    """Worker: pull a URL from the queue, fetch it, queue the HTML body."""
    while not event.is_set():
        try:
            # block up to 1s so the loop re-checks the stop flag regularly
            url1 = urls.get(True, 1)
            # fix: the original fetched the base `url` instead of the queued `url1`
            response = requests.get(url1, headers={'User-agent': ua})
            with response:
                htmls.put(response.text)
        except Exception as e:
            print(1, e)
def parse():
    """Worker: parse queued HTML and extract (title, absolute link) pairs."""
    while not event.is_set():
        try:
            html = htmls.get(True, 1)
            soup = BeautifulSoup(html, 'lxml')
            news = soup.select('h2.news_entry a')
            for n in news:
                txt = n.text
                # join the relative href onto the site root
                url1 = url + n.attrs.get('href')
                outps.put((txt, url1))
        except Exception as e:
            print(e)
def save(path):
    """Worker: append (title, url) pairs from the output queue to a text file."""
    with open(path, 'a+', encoding='utf-8') as f:
        while not event.is_set():
            try:
                title, url1 = outps.get(True, 1)
                f.write('{}{}\n'.format(title, url1))
                f.flush()  # keep the file current even if we never exit cleanly
            except Exception as e:
                print(e)
# Wire up the pipeline: 1 URL producer, 7 fetchers, 1 parser, 1 saver.
executor = ThreadPoolExecutor(max_workers=10)
executor.submit(create_urls, 1, 10)
executor.submit(parse)
executor.submit(save, 'c:/new.txt')
for i in range(7):
    executor.submit(crawler)

# Console loop: typing 'q' signals every worker to stop and shuts down.
while True:
    cmd = input('>>>')
    if cmd.strip() == 'q':
        event.set()
        executor.shutdown()
        print('close')
        time.sleep(1)   # fix: time.sleep() requires an argument
        break
三、动态网页处理
很多网站采用的是ajax技术,spa技术。部分内容都是异步加载的,提高用户体验。
1、phantomjs无头浏览器
Xml http 与后端服务器建立的连接。
2、selenium
(1)自动化测试工具等,可以直接截图。模仿浏览器的行为等。
# PhantomJS: load a Bing results page and screenshot it once results render.
from selenium import webdriver
import datetime
import time
import random

driver = webdriver.PhantomJS('c:/assets/phantomjs-2.1.1-windows/bin/phantomjs.exe')
driver.set_window_size(1024, 1024)
url = 'https://cn.bing.com/search?q=%E9%A9%AC%E5%93%A5%E6%95%99%E8%82%B2'
driver.get(url)


def savedic():
    """Save a timestamped screenshot under C:/assets/."""
    try:
        base_dir = 'C:/assets/'
        filename = '{}{:%Y%m%d%H%M%S}{}.png'.format(
            base_dir, datetime.datetime.now(), random.randint(1, 100))
        driver.save_screenshot(filename)
    except Exception as e:
        print(1, e)


# Poll for the async-loaded results container instead of a blind sleep.
MAXRETRIES = 5
while MAXRETRIES:
    try:
        ele = driver.find_element_by_id('b_results')
        print(ele)
        print('===========')
        savedic()
        break
    except Exception as e:
        print(e)
        print(type(e))
        time.sleep(1)
        MAXRETRIES -= 1
查找数据等,异步的方式。
(2)下拉框子使用,使用Select。
3、模拟键盘输入
模仿浏览器登录,先找到登录框的id,然后,setkeys。
之后返回登录后的网页。
# PhantomJS login: type credentials, press ENTER, read the user-info element.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import random
import datetime

driver = webdriver.PhantomJS('c:/assets/phantomjs-2.1.1-windows/bin/phantomjs.exe')
driver.set_window_size(1024, 1024)
url = 'https://www.oschina.net/home/login?goto_page=https%3A%2F%2Fwww.oschina.net%2F'


def savedic():
    """Save a timestamped screenshot under C:/assets/."""
    try:
        base_dir = 'C:/assets/'
        filename = '{}{:%Y%m%d%H%M%S}{}.png'.format(
            base_dir, datetime.datetime.now(), random.randint(1, 100))
        driver.save_screenshot(filename)
    except Exception as e:
        print(1, e)


driver.get(url)
print(driver.current_url, 111111111111)
savedic()
# SECURITY: hard-coded real credentials in the notes — move to env/config
email = driver.find_element_by_id('userMail')
passwed = driver.find_element_by_id('userPassword')
email.send_keys('604603701@qq.com')
passwed.send_keys('tana248654')
savedic()
passwed.send_keys(Keys.ENTER)      # submit the login form
time.sleep(2)                      # wait for the post-login redirect
print(driver.current_url, 2222222222)
userinfo = driver.find_element_by_class_name('user-info')
print(userinfo.text)
time.sleep(2)
cookie = driver.get_cookies()      # session cookies, reusable outside selenium
print(cookie)
savedic()
4、页面等待
(1)time.sleep
数据js加载需要一定的时间内。
线程休眠。
设置尝试的次数等
(2)selenium里面的wait
显示等待
# Explicit wait: poll up to 10s for the login field to appear.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

try:
    email = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, 'userMail'))
    )
    savedic()
finally:
    driver.quit()   # always release the browser process
隐式等待
driver.implicitly_wait(10)
总结:
四、scrapy框架
1、安装
Pip install scrapy 可能报错,报错的原因是下载tw开头的文件.whl文件,然后pip安装。
2、使用
scrapy startproject scrapyapp 开启一个项目
scrapy genspider donz_spider dmoz.org 进入spider文件下创建一个新的模块,把要爬取的网站加到url列表中。
scrapy genspider -t basic dbbook douban.com 继承自basic模板。内容少。
scrapy genspider -t crawl book douban.com 继承自crawl模板,内容多。
-t 后面加的是模板。 然后名字和网站
scrapy crawl donz_spider 运行代码,运行时候报错的话pip install pypiwin32
from scrapy.http.response.html import HtmlResponse
response 继承于HTMLResponse。
在item设置中设置要爬取的信息的类例如标题。
在spiders下的文件里面写爬虫的xpath,爬取的队列及爬取内容的匹配。
Middlewares里面是中间件。
Pipelines里面处理函数。
五、scrapy-redis组件
1、scrapy-redis使用
Pip install scrapy_redis
使用redis作为队列需要的配置文件
Setting.py
# settings.py — scrapy-redis configuration
BOT_NAME = 'scrapyapp'
SPIDER_MODULES = ['scrapyapp.spiders']
NEWSPIDER_MODULE = 'scrapyapp.spiders'
# fix: the UA literal was split across two lines; rejoin it
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
COOKIES_ENABLED = False
# Use redis as the scheduler queue and the duplicate filter
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
ITEM_PIPELINES = {
    'scrapyapp.pipelines.ScrapyappPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 543,   # writes items into redis
}
# redis connection
REDIS_HOST = '192.168.118.130'
REDIS_PORT = 6379
# LOG_LEVEL = 'DEBUG'
Spiders 下面的爬虫文件.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from ..items import MovieItem


class MoviecommentSpider(RedisCrawlSpider):
    """Crawl douban movie short comments; start URLs are pulled from redis."""
    name = 'moviecomment'
    allowed_domains = ['douban.com']
    # start_urls = ['http://douban.com/']
    # start URLs are pushed into redis under this key instead
    redis_key = 'moviecomment1:start_urls'
    rules = (
        # follow paginated ?start=N links and parse every page
        Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        # each short review lives in a span.short inside a comment-item div
        comment = '//div[@class="comment-item"]//span[@class="short"]/text()'
        reviews = response.xpath(comment).extract()
        for review in reviews:
            item = MovieItem()
            item['comment'] = review.strip()
            yield item
Item.py
import scrapy


class MovieItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    comment = scrapy.Field()   # one short-comment string per item
redis数据中要设置一个key值和movecomment.py 中的redis_key = 'moviecomment1:start_urls' 设置value及初始的url值。
完成后数据库会存储响应的值
可以在redis-cli 后面加上 --raw
2、分析
(1)jieba分词
Pip install jieba
(2)stopword停用词
数据清洗:把脏数据洗掉,检测出并除去数据中无效或者无关的数据,例如空值,非法值的检测,重复数据检测等。
(3)词云
Pip install wordcloud
# Word-frequency analysis of crawled reviews stored in redis.
from redis import Redis
import json
import jieba

redis = Redis()
stopwords = set()
# TODO(review): the stop-word file path is empty in the original — fill it in
with open('', encoding='gbk') as f:
    for line in f:
        print(line.rstrip('\r\n').encode())
        stopwords.add(line.rstrip('\r\n'))
print(len(stopwords))
print(stopwords)

items = redis.lrange('dbreview:items', 0, -1)
print(type(items))
words = {}
for item in items:
    val = json.loads(item)['review']
    for word in jieba.cut(val):
        words[word] = words.get(word, 0) + 1   # count each segmented token
print(len(words))
print(sorted(words.items(), key=lambda x: x[1], reverse=True))
分词代码测试
六、scrapy项目
1、知识回顾
2、爬取技术网站
# Extract vote/bookmark counters; contains() matches one of several classes.
praise_nums = response.xpath(
    "//span[contains(@class, 'vote-post-up')]/text()").extract()
fav_nums = response.xpath(
    "//span[contains(@class, 'bookmark-btn')]/text()").extract()
# match_re = re.match(".*(\d+).*", fav_nums)  # NOTE: fav_nums is a list — take [0] first
class的值有多个的时候,使用container进行选取。
from scrapy.http import Request #找到的url传递给下一级
from urllib import parse
#提取下一页并交给scrapy下载
next_url =
response.xpath('//div[@class="navigation
margin-20"]/a[4]/@href').extract()
if next_url:
yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
(1)图片处理及存储:
pip install pillow
# settings: which item field holds image URLs, and where to store downloads
IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')
(2)写入到本地文件:
class JsonWithEncodingPipeline(object):
    """Write each item to article.json as one UTF-8 JSON object per line."""

    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese text readable in the output file
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item   # pass the item on to later pipelines

    def spider_closed(self, spider):
        self.file.close()
scrapy自带的JsonItemExporter
(3)导出功能,还有csv文件等
class JsonItemExporterPipeline(object):
    """Export items via scrapy's built-in JsonItemExporter."""

    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
(4)数据库插入操作
class MysqlPipeline(object):
    """Synchronous MySQL insert pipeline (blocks the crawl on every write)."""

    def __init__(self):
        self.conn = MySQLdb.connect('192.168.118.131', 'wang', 'wang',
                                    'scrapy_jobbole', charset='utf8',
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized query: the driver escapes the values
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql,
                            (item['title'], item['url'],
                             item['create_date'], item['fav_nums']))
        self.conn.commit()
(5)scrapy提供的异步方法
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
class MysqlTwistedPipeline(object):
    """Asynchronous MySQL pipeline using twisted's adbapi connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the connection pool from scrapy settings."""
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert on the pool; never blocks the crawl."""
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)

    def handle_error(self, failure):
        """Log insert failures."""
        print(failure)

    def do_insert(self, cursor, item):
        """Run the parameterized insert inside a pool transaction."""
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql,
                       (item['title'], item['url'],
                        item['create_date'], item['fav_nums']))
(5)将django的model集成到scrapy
Scrapy-djangoitem
(6)改变超多的xpath和css,使用itemloader
# Load the item through an ItemLoader instead of many separate xpath/css calls
item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
# item_loader.add_css(...)
item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
可以在item里面的field里面选择,
class ArticleItem(scrapy.Item):
    """Article item with a per-field input processor on each Field."""
    title = scrapy.Field(
        input_processor=MapCompose(add_jobbole)
    )
    create_date = scrapy.Field(
        input_processor=MapCompose(add_time)
    )
自定义输出:
class ArticleItemLoader(ItemLoader):
    """Custom loader: keep only the first extracted value for every field."""
    default_output_processor = TakeFirst()
pipeline后面的数值是优先级的问题
七、反爬虫策略
1、修改settings和middlewares文件
Setting里面设置一个user-agent-list的列表。
Middlewares里面设置
import random


class RandomUserAgentMiddlware(object):
    """Rotate the User-Agent per request from a list in settings."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.user_agent_list = crawler.settings.get("user_agent_list", [])

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        # fix: the original called bare random(), which is not a UA string —
        # pick a random entry from the configured list instead
        if self.user_agent_list:
            request.headers.setdefault(
                'User-Agent', random.choice(self.user_agent_list))
2、随意更换user-agent 的库
>pip install fake-useragent
from fake_useragent import UserAgent


class RandomUserAgentMiddlware(object):
    """Rotate the User-Agent using fake-useragent's UA pool."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        # .random returns a fresh random UA string on each access
        request.headers.setdefault('User-Agent', self.ua.random)
class RandomUserAgentMiddlware(object):
    """Rotate the User-Agent; the UA family is chosen by RANDOM_UA_TYPE."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        # settings entry: 'random', 'chrome', ... — any fake-useragent attribute
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            # look up the configured attribute on the UserAgent object
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', get_ua())
随机选取一个user-agent
3、代理ip
普通ip代理
request.meta['proxy'] = "http://61.135.217.7:80" #ip 代理
(1)直接设置普通ip
(2)首先爬取某代理网站的代理ip存入到数据库中,然后从数据库中找到数据,放到middlewares里面进行ip代理。
import requests
from scrapy.selector import Selector
import MySQLdb
import threading
from fake_useragent import UserAgent

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='centos',
                       db='test', charset='utf8')
cour = conn.cursor()
ua = UserAgent()


def crawl_ips():
    """Scrape free proxies from xicidaili and store them in MySQL."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.100 Safari/537.36',
    }
    for i in range(3):
        re = requests.get('http://www.xicidaili.com/wt/{0}'.format(i),
                          headers=headers)
        seletor = Selector(text=re.text)
        all_trs = seletor.css('#ip_list tr')
        ip_list = []
        for tr in all_trs:
            speed_strs = tr.css(".bar::attr(title)").extract()
            if speed_strs:
                speed_str = speed_strs[0]
            all_texts = tr.css('td::text').extract()
            if all_texts:
                ip = all_texts[0]
                port = all_texts[1]
                proxy_type = all_texts[5]
                ip_list.append((ip, port, proxy_type,
                                speed_str.split('秒')[0]))
        for ip_info in ip_list:
            # fix: parameterized insert instead of str.format (SQL injection)
            cour.execute(
                "insert xici_ip_list(ip, port, speed, proxy_type) "
                "VALUES(%s, %s, %s, %s)",
                (ip_info[0], ip_info[1], ip_info[3], ip_info[2]),
            )
            conn.commit()
    print('数据库写入完成')
# crawl_ips()


class GetIP(object):
    """Fetch a random working proxy from the xici_ip_list table."""

    def delete_ip(self, ip):
        """Remove a dead proxy from the table."""
        # fix: parameterized delete instead of str.format (SQL injection)
        cour.execute("delete from xici_ip_list where ip=%s", (ip,))
        conn.commit()
        return True

    def judge_ip(self, ip, port):
        """Return True if the proxy can reach baidu; delete it otherwise."""
        http_url = 'http://www.baidu.com'   # fix: was 'ww.baidu.com' (typo)
        proxy_url = 'http://{}:{}'.format(ip, port)
        try:
            proxy_dict = {
                'http': proxy_url
            }
            response = requests.get(http_url, proxies=proxy_dict)
        except Exception as e:
            print('invalid ip and port')
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if 200 <= code < 300:
                print('effective ip')       # fix: was misspelled 'eddective ip'
                return True
            else:
                print('invalid ip and port')
                self.delete_ip(ip)
                return False

    def get_random_ip(self):
        """Pick a random row from the DB, validate it, recurse until one works."""
        sql = """
            SELECT ip, port FROM xici_ip_list
            ORDER BY RAND()
            LIMIT 1
        """
        result = cour.execute(sql)
        for ip_info in cour.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            judge_ip = self.judge_ip(ip, port)
            if judge_ip:
                return "http://{0}:{1}".format(ip, port)
            else:
                return self.get_random_ip()


# t = threading.Thread(target=crawl_ips)
# t.start()
get_ip = GetIP()
get_ip.get_random_ip()
class RandomProxyMiddleware(object):
    """Set a random DB-backed proxy on every outgoing request."""

    def process_request(self, request, spider):
        get_ip = GetIP()
        request.meta['proxy'] = get_ip.get_random_ip()   # ip proxy
(3)插件化scrapy-proxies
https://github.com/aivarsk/scrapy-proxies/blob/master/scrapy_proxies
(4)scrapy-crawlera
收费版本
(5)tor洋葱网络
https://github.com/aivarsk/scrapy-proxies/blob/master/scrapy_proxies
稳定版本
八、验证码识别
1、验证码识别方法
编码实现tesseract-ocr
在线打码
人工打码