January 17 study notes: Scrapy framework supplement, pipelines and deduplication rules

Instructor's blog posts:

On high performance and the Scrapy framework

http://www.cnblogs.com/wupeiqi/articles/6229292.html

On the scrapy-redis component

http://www.cnblogs.com/wupeiqi/articles/6912807.html

 

I. Using Scrapy

1. pipeline
from scrapy.exceptions import DropItem


class FilePipeline(object):
    """
    Persist items to a file.
    """

    def __init__(self, path):
        self.file_path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called once at startup to create the pipeline object.
        :param crawler:
        :return:
        """
        file_path = crawler.settings.get('FILE_PATH')
        return cls(file_path)

    def process_item(self, item, spider):
        # Process the item and persist it
        self.f.write(item['url'] + '\n')
        self.f.flush()

        # Returning the item hands it on to the next pipeline
        return item

        # To drop the item so later pipelines never see it:
        # raise DropItem()

    def open_spider(self, spider):
        """
        Called when the spider starts.
        :param spider:
        :return:
        """
        self.f = open(self.file_path, 'w+', encoding='utf-8')

    def close_spider(self, spider):
        """
        Called when the spider closes.
        :param spider:
        :return:
        """
        self.f.close()


# settings.py: lower numbers run earlier in the pipeline chain
ITEM_PIPELINES = {
    'spnew.pipelines.DBPipeline': 300,
    'spnew.pipelines.CachePipeline': 400,
    'spnew.pipelines.FilePipeline': 500,
}
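
ITEM_PIPELINES registers three pipelines, but only FilePipeline is shown in these notes. A minimal sketch of what the other two entries might look like, assuming the module and class names above (the method bodies here are placeholders, not the actual implementations):

# spnew/pipelines.py (hypothetical bodies for the two pipelines not shown above)
class DBPipeline(object):
    def process_item(self, item, spider):
        # e.g. insert the item into a database here
        return item  # pass the item on to CachePipeline (priority 400)


class CachePipeline(object):
    def process_item(self, item, spider):
        # e.g. write the item to a cache
        return item  # pass the item on to FilePipeline (priority 500)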

import scrapy
from scrapy.selector import HtmlXPathSelector
from spnew.items import SpnewItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # Parse the returned HTML and pull out the parts we want.
        # Equivalent ways to build a selector:
        # response.xpath(...)
        # hxs = Selector(response=response)
        # hxs = HtmlXPathSelector(response)

        hxs = HtmlXPathSelector(response)
        news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
        for item in news_list:
            url = item.xpath('.//div[@class="part1"]/a/@href').extract_first()
            text = item.xpath('.//div[@class="part1"]/a/text()').extract_first()
            # Hand each url/text pair to the pipelines for persistence
            yield SpnewItem(url=url, text=text.strip())
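
The spider yields SpnewItem objects, whose definition is not included in these notes. A minimal sketch of what items.py would need to contain for the two fields used above (inferred from the yield call, not copied from the original project):

# spnew/items.py (sketch based on the fields used in parse above)
import scrapy


class SpnewItem(scrapy.Item):
    url = scrapy.Field()
    text = scrapy.Field()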

2. Deduplication rules
class MyDupeFilter(object):

    def __init__(self):
        self.visited = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        """
        :param request:
        :return: True means the request has already been seen (skip it); False means it is new.
        """
        # Option 1: store visited URLs in a database on one server.
        #   If request.url already exists in the database, it has been crawled:
        #       return True
        #   Otherwise:
        #       add it to the database
        #       return False

        # Option 2: store them in redis, i.e. in one server's memory (faster):
        # {
        #     http://www.baidu.com: ...
        #     http://www.baidu1.com: ...
        #     http://www.baidu2.com: ...
        #     http://www.baidu3.com: ...
        # }
        if request.url in self.visited:
            return True
        self.visited.add(request.url)
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass


# settings.py: register the custom dupe filter
DUPEFILTER_CLASS = 'spnew.filter.MyDupeFilter'
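
The comments in request_seen mention keeping the visited set in redis instead of in-process memory. A minimal sketch of that idea, assuming a redis server on localhost and the redis-py client (the key name and connection details are placeholders; the scrapy-redis component from the links above does this properly):

import redis
from scrapy.dupefilters import BaseDupeFilter


class RedisDupeFilter(BaseDupeFilter):

    def __init__(self):
        self.conn = redis.Redis(host='127.0.0.1', port=6379)
        self.key = 'visited_urls'  # placeholder key name

    def request_seen(self, request):
        # sadd returns 1 if the url was newly added, 0 if it was already in the set
        added = self.conn.sadd(self.key, request.url)
        return added == 0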

3. Extracting cookie information from a response

from scrapy.http import Request
from scrapy.http.cookies import CookieJar

cookie_dict = {}
# Pull the cookie information out of the response
cookie_jar = CookieJar()
cookie_jar.extract_cookies(response, response.request)
for k, v in cookie_jar._cookies.items():
    for i, j in v.items():
        for m, n in j.items():
            cookie_dict[m] = n.value

req = Request(
    url='http://dig.chouti.com/login',
    method='POST',
    headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
    body='phone=8615131255089&password=pppppppp&oneMonth=1',
    cookies=cookie_dict,
    callback=self.check_login,
)

Form data:
    headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
    body='phone=8615131255089&password=pppppppp&oneMonth=1',

Request payload:
    headers={'Content-Type': 'application/json; charset=UTF-8'},
    body=json.dumps({'phone': '8615131255089', 'password': 'pppppppp'}),
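
A minimal sketch of how the two body formats above can be built from the same credentials, using only the standard library (the field values are the ones from the login request above):

import json
from urllib.parse import urlencode

credentials = {'phone': '8615131255089', 'password': 'pppppppp', 'oneMonth': 1}

# Form data: urlencoded key=value pairs, sent with Content-Type application/x-www-form-urlencoded
form_body = urlencode(credentials)   # phone=8615131255089&password=pppppppp&oneMonth=1

# Request payload: the same dict serialized as JSON, sent with Content-Type application/json
json_body = json.dumps(credentials)  # {"phone": "8615131255089", ...}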

4. Start requests

def start_requests(self):
    url = 'http://dig.chouti.com/'
    # Either yield requests one at a time, or return an iterable of them
    # yield Request(url=url, callback=self.login)
    return [Request(url=url, callback=self.login), ]
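
Putting sections 3 and 4 together, a rough sketch of how the start request, cookie extraction and login request could sit inside one spider. The method names login and check_login follow the callbacks used above; the class name and overall structure are assumptions, not the original spider:

import scrapy
from scrapy.http import Request
from scrapy.http.cookies import CookieJar


class LoginSpider(scrapy.Spider):  # hypothetical spider assembling the fragments above
    name = 'chouti_login'

    def start_requests(self):
        # The first request just fetches the home page so we receive cookies
        return [Request(url='http://dig.chouti.com/', callback=self.login)]

    def login(self, response):
        # Extract the cookies handed out with the first response
        cookie_dict = {}
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    cookie_dict[m] = n.value
        self.cookie_dict = cookie_dict

        # Send the login form together with those cookies
        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8615131255089&password=pppppppp&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login,
        )

    def check_login(self, response):
        # Inspect the login response here
        print(response.text)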
