Scrapy crawlers

a. Configuration

#settings.py
DEPTH_LIMIT = 1    			#maximum "recursion" depth when following links
ROBOTSTXT_OBEY = False		#the target site's robots.txt says which URLs may be crawled; False means ignore it
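
The same options can also be overridden per spider through Scrapy's custom_settings class attribute; a minimal sketch (the spider name and URL here are placeholders):

import scrapy

class ExampleSpider(scrapy.Spider):      # hypothetical spider, for illustration only
    name = 'example'
    start_urls = ['http://example.com/']
    custom_settings = {
        'DEPTH_LIMIT': 1,         # follow links at most one level deep
        'ROBOTSTXT_OBEY': False,  # ignore the site's robots.txt
    }

    def parse(self, response):
        pass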

b. Selectors

.//  		#descendants of the current node
./    		#direct children
./div 		#div tags among the direct children
./div[@id='i1']		#div tags among the direct children with id='i1'
obj.extract()		#convert every object in the list to a string => []
obj.extract_first()	#convert and return only the first element of the list
//div/text()            #get the text of a tag
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse
html = """<!DOCTYPE html>
<html>
    <head lang="en">
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html">first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
    </body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)
 
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     print(v)
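
Scrapy selectors also accept CSS expressions through .css(); a rough equivalent of two of the XPath queries above, assuming the same response object:

# CSS equivalents of two of the XPath queries above
print(Selector(response=response).css('a#i1'))                             # //a[@id="i1"]
print(Selector(response=response).css('ul li a::attr(href)').extract())    # //ul/li/a/@href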

c. Structured processing (items & pipelines)

settings.py

ITEM_PIPELINES = {
   'day96.pipelines.Day96Pipeline': 300,
}

DB = "....."
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


from scrapy.exceptions import DropItem

class Day96Pipeline(object):

    def __init__(self, conn_str):
        self.conn_str = conn_str

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called once at startup to create the pipeline object
        :param crawler:
        :return:
        """
        conn_str = crawler.settings.get('DB')
        return cls(conn_str)

    def open_spider(self,spider):
        """
        Called when the spider starts
        :param spider:
        :return:
        """
        self.conn = open(self.conn_str, 'a')

    def close_spider(self,spider):
        """
        Called when the spider closes
        :param spider:
        :return:
        """
        self.conn.close()

    def process_item(self, item, spider):
        """
        Called every time an item needs to be persisted
        :param item:
        :param spider:
        :return:
        """
        # if spider.name == 'chouti':
        tpl = "%s\n%s\n\n" % (item['title'], item['href'])
        self.conn.write(tpl)

        # hand the item to the next pipeline
        return item

        # to drop the item so later pipelines never see it:
        # raise DropItem()
pipelines.py
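
The pipeline above reads item['title'] and item['href'], so the matching items.py needs those two fields; a minimal sketch (the class name Day96Item is an assumption, not taken from the original project):

# items.py (sketch)
import scrapy

class Day96Item(scrapy.Item):
    title = scrapy.Field()
    href = scrapy.Field()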

d. Common commands

scrapy startproject sp1
cd sp1
scrapy genspider baidu baidu.com      #create a spider (skeleton shown below)
scrapy crawl baidu
scrapy crawl baidu --nolog
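
scrapy genspider baidu baidu.com writes spiders/baidu.py; the generated skeleton looks roughly like this (the exact template text varies between Scrapy versions):

# -*- coding: utf-8 -*-
import scrapy

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        pass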

e. Project layout

sp1
	- scrapy.cfg		 #initial configuration file
	- sp1
		- spiders		 #spiders directory
		- items.py		 #item definitions (structuring)
		- pipelines.py	 #persistence
		- middlewares.py #middleware
		- settings.py    #settings

Examples

# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        hxs = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')
        for obj in hxs:
            a = obj.xpath('.//a[@class="show-content color-chag"]/text()').extract_first()
            if a:
                print(a.strip())
Grab the news titles from chouti.com
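
To persist the titles through the pipeline from section c instead of printing them, the spider would yield items; a sketch, assuming an item class with title and href fields (Day96Item and the day96 package name are assumptions):

# inside ChoutiSpider, yielding items instead of printing
from day96.items import Day96Item   # assumed project / item names

    def parse(self, response):
        hxs = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')
        for obj in hxs:
            link = obj.xpath('.//a[@class="show-content color-chag"]')
            title = link.xpath('./text()').extract_first()
            href = link.xpath('./@href').extract_first()
            if title:
                yield Day96Item(title=title.strip(), href=href)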
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    visited_urls = set()

    def parse(self, response):

        # collect the pagination URLs on the current page
        hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for url in hxs:
            md5_url = self.md5(url)
            if md5_url in self.visited_urls:
                print('already seen', url)
            else:
                self.visited_urls.add(md5_url)
                print(url)

    def md5(self,url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url,encoding='utf-8'))
        return obj.hexdigest()
Collect all page-number links on the current chouti.com page
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.http import Request
from scrapy.selector import Selector,HtmlXPathSelector

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    visited_urls = set()

    def parse(self, response):

        # collect the pagination URLs on the current page

        hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()

        for url in hxs:
            md5_url = self.md5(url)
            if md5_url in self.visited_urls:
                pass
            else:
                print(url)
                self.visited_urls.add(md5_url)
                url = "http://dig.chouti.com%s" %url
                # hand the new URL to the scheduler
                yield Request(url=url,callback=self.parse)




    def md5(self,url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url,encoding='utf-8'))
        return obj.hexdigest()
Follow every page-number link on chouti.com
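
Besides the manual visited_urls set, Scrapy's scheduler applies its own duplicate filter to every Request; passing dont_filter=True bypasses that check when a URL should be revisited on purpose:

# revisit a URL even if the scheduler has already seen it
yield Request(url=url, callback=self.parse, dont_filter=True)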

a. Avoiding duplicate URLs

settings.py

DUPEFILTER_CLASS = "day96.duplication.RepeatFilter"
class RepeatFilter(object):
    def __init__(self):
        self.visited_set = set()
    @classmethod
    def from_settings(cls, settings):
        print('...')
        return cls()

    def request_seen(self, request):
        if request.url in self.visited_set:
            return True
        self.visited_set.add(request.url)
        return False

    def open(self):  # can return deferred
        print('open')
        pass

    def close(self, reason):  # can return a deferred
        print('close')
        pass
    def log(self, request, spider):  # log that a request has been filtered
        # print('log....')
        pass
duplication.py
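
Storing raw URLs keeps every full URL string in memory; request_seen could keep a fixed-length md5 fingerprint instead, mirroring the md5 helper used in the spiders above (a sketch of a variant, not how Scrapy's built-in RFPDupeFilter fingerprints requests):

    import hashlib

    def request_seen(self, request):
        # keep a 32-character md5 digest instead of the full URL
        fp = hashlib.md5(request.url.encode('utf-8')).hexdigest()
        if fp in self.visited_set:
            return True
        self.visited_set.add(fp)
        return False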
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.http import Request
from scrapy.selector import Selector,HtmlXPathSelector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    # the default dedupe filter (scrapy.dupefilters.RFPDupeFilter in newer Scrapy versions)
    from scrapy.dupefilter import RFPDupeFilter

    def parse(self, response):
        print(response.url)

        # collect the pagination URLs on the current page

        hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()

        for url in hxs:

            url = "http://dig.chouti.com%s" %url
            # hand the new URL to the scheduler
            yield Request(url=url,callback=self.parse)




    def md5(self,url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url,encoding='utf-8'))
        return obj.hexdigest()
chouti.py

传智播客 (itcast.cn)

 

#crawl the teachers' names from itcast.cn

#scrapy startproject mySpider

#cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/items.py

    import scrapy
    class MyspiderItem(scrapy.Item):
        name = scrapy.Field()
        title = scrapy.Field()
        info = scrapy.Field()

#cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/spiders/itcastspider.py
    import scrapy
    from mySpider.items import MyspiderItem

    #define a spider class
    class ItcastSpider(scrapy.Spider):
        #spider name
        name = "itcast"
        #domains the spider is allowed to crawl
        allowed_domains = ["itcast.cn"]
        #start url
        start_urls = ["http://www.itcast.cn/channel/teacher.shtml#"]

        def parse(self, response):

            #use Scrapy's built-in XPath to select the root node of every teacher entry
            teacher_list = response.xpath('//div[@class="li_txt"]')

            teacherItem = []
            #iterate over the node set
            for each in teacher_list:
                item = MyspiderItem()
                name = each.xpath('./h3/text()').extract()
                title = each.xpath('./h4/text()').extract()
                info = each.xpath('./p/text()').extract()
                print("--------------",type(name))
                item['name'] = name[0]
                item['title'] = title[0]
                item['info'] = info[0]

                teacherItem.append(item)
            return teacherItem





#save to a JSON file
scrapy crawl itcast -o itcast.json
#save to a CSV file
scrapy crawl itcast -o itcast.csv
Crawl the teachers' names from itcast.cn
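
As an alternative to the -o feed export, a pipeline can write the items itself; a minimal sketch (ItcastJsonPipeline and the output filename are assumptions, and the class would still need to be registered in ITEM_PIPELINES):

import json

class ItcastJsonPipeline(object):
    def open_spider(self, spider):
        # one JSON object per line
        self.f = open('teacher.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()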
