The Scrapy crawler module

First, install the scrapy module.

There is a helpful reference here:

https://www.cnblogs.com/bobo-zhang/p/10068997.html

Create a Scrapy project

First, open a terminal and cd into a working directory.

Type:

scrapy startproject jy  (jy is the project name)
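This command generates a project skeleton. A rough sketch of the layout (jy is the project name chosen above; the exact files can vary slightly by Scrapy version):

jy/
├── scrapy.cfg          # deployment / project configuration
└── jy/
    ├── __init__.py
    ├── items.py        # data structures (Item definitions)
    ├── middlewares.py  # downloader / spider middlewares
    ├── pipelines.py    # item pipelines used for persistence
    ├── settings.py     # project settings
    └── spiders/        # spider files go here
        └── __init__.py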

Modify the settings.py configuration:

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

 

cd into the spiders directory and create a spider file from the terminal:

scrapy genspider myjy www.xxx.com  (myjy is the spider file name)
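genspider drops a template file myjy.py into the spiders folder. It looks roughly like this (the exact template differs slightly between Scrapy versions):

# -*- coding: utf-8 -*-
import scrapy


class MyjySpider(scrapy.Spider):
    name = 'myjy'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    def parse(self, response):
        pass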

 

Now let's write our first piece of code in that file.

 

# Parsing + persistent storage


# -*- coding: utf-8 -*-
import scrapy


class FirstSpider(scrapy.Spider):
    # name of the spider (used with `scrapy crawl <name>`)
    name = 'first'
    # allowed domains
    # allowed_domains = ['www.xxx.com']
    # list of start URLs
    start_urls = ['https://www.qiushibaike.com/text/']

    # Basic parsing only:
    # def parse(self, response):
    #     div_list = response.xpath('//div[@id="content-left"]/div')
    #     for div in div_list:
    #         # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
    #         # extract_first() can be used if the xpath is guaranteed to return a single-element list; otherwise use extract()
    #         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
    #         content = div.xpath('./a[1]/div/span//text()').extract()
    #         content = ''.join(content)
    #         print(author, content)

    # Parsing + persistent storage:
    # 1. Terminal-command-based persistence
    #    (can only persist the return value of parse() to a local file)
    # 2. Pipeline-based persistence

    # 1. Terminal-command-based persistence
    def parse(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        all_data = []
        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # extract_first() can be used if the xpath is guaranteed to return a single-element list; otherwise use extract()
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a[1]/div/span//text()').extract()
            content = ''.join(content)

            dic = {
                'author': author,
                'content': content
            }

            all_data.append(dic)

        return all_data

 

 

Finally, run the spider. The argument to scrapy crawl is the spider's name attribute (a spider generated as myjy keeps name = 'myjy'; the code above uses name = 'first'):

scrapy crawl myjy
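The terminal-command form of persistence mentioned in the comments works because parse() returns all_data: the return value can be dumped straight to a local file with the -o flag. A minimal sketch, assuming the spider's name attribute is 'first' as in the code above (the output file name is arbitrary; .json, .csv and .xml are supported):

scrapy crawl first -o qiubai.csv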

 

 

 

# Parsing + pipeline-based persistent storage

 

First, write the spider file (the parse logic):

# -*- coding: utf-8 -*-
import scrapy

from bossPro.items import BossproItem
class BossSpider(scrapy.Spider):
    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']

    url = 'https://www.zhipin.com/c101010100/?query=python爬虫&page=%d&ka=page-2'
    page = 1
    # Parsing + pipeline-based persistent storage
    def parse(self, response):
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            job_name = li.xpath('.//div[@class="info-primary"]/h3/a/div/text()').extract_first()
            salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
            company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()

            # instantiate an item object
            item = BossproItem()
            # pack all the parsed data into the item object
            item['job_name'] = job_name
            item['salary'] = salary
            item['company'] = company

            # submit the item to the pipeline
            yield item

        if self.page <= 3:
            print('if branch executed!!!')
            self.page += 1
            new_url = format(self.url%self.page)
            print(new_url)
            # manually send the next request
            yield scrapy.Request(url=new_url,callback=self.parse)

 

Configure the items.py file, which serves as the data structure:

import scrapy


class BossproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_name = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()

 

 

Write the following in pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import pymysql
from redis import Redis
class BossproPipeline(object):
    fp = None
    def open_spider(self, spider):
        print('Spider started......')
        self.fp = open('./boss.txt','w',encoding='utf-8')
    def close_spider(self, spider):
        print('Spider finished......')
        self.fp.close()
    # This method is called once each time the spider submits an item to the pipeline.
    # Parameter: item is the item object received by the pipeline

    def process_item(self, item, spider):
        #print(item)
        self.fp.write(item['job_name']+':'+item['salary']+':'+item['company']+'\n')
        return item  # pass the item on to the next pipeline class to be executed

class mysqlPipeline(object):
    conn = None
    cursor =None
    def open_spider(self,spider):
        self.conn = pymysql.Connect(host='127.0.0.1',port=3306,user='root',password='',db='scrapy',charset="utf8")
        print(self.conn)
    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        # print(item)
        #print('insert into boss values ("%s","%s","%s")'%(item['job_name'],item['salary'],item['company']))
        try:
            print('insert into boss values ("%s","%s","%s")'%(item['job_name'],item['salary'],item['company']))
            self.cursor.execute('insert into boss values ("%s","%s","%s")'%(item['job_name'],item['salary'],item['company']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
    def close_spider(self,spider):
        self.conn.close()
        self.cursor.close()

class redisPipeline(object):
    conn = None
    def open_spider(self,spider):
        self.conn = Redis(host='127.0.0.1',port=6379)
        print(self.conn)
    def process_item(self, item, spider):
        # print(item)
        dic = {
            'name':item['job_name'],
            'salary':item['salary'],
            'company':item['company']
        }
        # newer versions of redis-py only accept str/bytes/numbers as values, so serialize the dict as JSON
        self.conn.lpush('boss', json.dumps(dic))
        return item
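To check what the redis pipeline actually stored, here is a minimal read-back sketch (assuming the same local Redis instance and the 'boss' list key used above):

from redis import Redis
import json

conn = Redis(host='127.0.0.1', port=6379)
# read every JSON string that was lpush-ed onto the 'boss' list and decode it
for raw in conn.lrange('boss', 0, -1):
    print(json.loads(raw))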

 

 

Don't forget to enable the pipelines in settings.py (the value is the priority: lower numbers run first):

 

 

ITEM_PIPELINES = {
   # 'bossPro.pipelines.BossproPipeline': 300,
   'bossPro.pipelines.redisPipeline': 301,
   # 'bossPro.pipelines.mysqlPipeline': 302,
}

 
