python-scrapy environment setup

On Windows:

1. Install wheel first:    pip install wheel

2. Download a Twisted wheel from: https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted

3. Install Twisted (pick the .whl matching your Python version and architecture; cp38-win32 below means CPython 3.8, 32-bit):    pip install Twisted-20.3.0-cp38-cp38-win32.whl

4. Install pywin32:    pip install pywin32

5. Install Scrapy:    pip install scrapy

On Linux:

Install Scrapy directly:    pip install scrapy
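
Either way, a quick sanity check (not in the original notes) is to run scrapy version afterwards; it prints the installed Scrapy version if the toolchain is set up correctly.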

 

Creating the spider project MyProjectMovie

1. Create the project, using https://www.1905.com/dianyinghao/ as the example target:    scrapy startproject MyProjectMovie

2. Enter the project:    cd MyProjectMovie

3. Generate the spider file (it must live in the spiders directory):    scrapy genspider movie www.xxx.com

Project directory layout (the original post showed a screenshot here):
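For reference, scrapy startproject MyProjectMovie generates this standard layout:

MyProjectMovie/
├── scrapy.cfg              # deploy/config entry point
└── MyProjectMovie/
    ├── __init__.py
    ├── items.py            # item definitions
    ├── middlewares.py      # spider/downloader middlewares
    ├── pipelines.py        # item pipelines
    ├── settings.py         # project settings
    └── spiders/            # spider files must live here
        └── __init__.py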

4. Edit movie.py

import scrapy


class MovieSpider(scrapy.Spider):
    name = 'movie'  # unique name used by `scrapy crawl movie`
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.1905.com/dianyinghao/']

    def parse(self, response):
        # Scrapy calls parse() with the downloaded response for each start URL
        print(response.text)
        print(response)

5. Configure settings.py

# Send a browser-like User-Agent, ignore robots.txt, and only log errors
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

6. Run the spider

scrapy crawl movie
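
As a side note (not in the original), a crawl can also be launched from a plain Python script via Scrapy's CrawlerProcess API, which is convenient for debugging in an IDE. A minimal sketch, assuming it is run from the project root so the MyProjectMovie package is importable:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from MyProjectMovie.spiders.movie import MovieSpider

# Load settings.py, schedule the spider, and block until the crawl finishes
process = CrawlerProcess(get_project_settings())
process.crawl(MovieSpider)
process.start()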

 

Persistent storage

import scrapy


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # Each post sits in its own div under the page's content container
        div_list = response.xpath('//*[@id="content"]/div/div[2]/div')
        list_data = []
        for div in div_list:
            title = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            # The body may span several text nodes; join them into one string
            content = div.xpath('./a/div/span[1]//text()').extract()
            content = "".join(content)
            dic = {
                'title': title,
                'content': content,
            }
            list_data.append(dic)
        # Returning the list is what terminal export (-o) serializes
        return list_data

Method 1: terminal-command persistence. This can only store the return value of parse, and only in the file formats Scrapy supports for export (csv, xml, json, etc.); run it from the terminal:    scrapy crawl qiubai -o qiubai.csv
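
On newer Scrapy releases (2.1+), the same export can instead be configured once in settings.py through the FEEDS setting, so -o does not have to be passed on every run; a small sketch (the file name is just an example):

FEEDS = {
    'qiubai.csv': {'format': 'csv', 'encoding': 'utf8'},
}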

Method 2: pipeline-based persistence (recommended)

items.py

import scrapy


class MyprojectqiubaiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()

qiubai.py

import scrapy
from MyProjectQiubai.items import MyprojectqiubaiItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath('//*[@id="content"]/div/div[2]/div')

        for div in div_list:
            title = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a/div/span[1]//text()').extract()
            content = "".join(content)
            item = MyprojectqiubaiItem()
            item["title"] = title
            item["content"] = content
            # Yielding hands the item to every pipeline enabled in settings.py
            yield item

pipelines.py

import pymysql


class MyprojectqiubaiPipeline:
    fp = None

    def open_spider(self, spider):
        # Runs once when the spider opens: create the output file
        self.fp = open("qiubai.txt", mode="w", encoding="utf-8")

    def process_item(self, item, spider):
        title = item["title"]
        content = item["content"]
        self.fp.write(title + content)
        # Return the item so lower-priority pipelines also receive it
        return item

    def close_spider(self, spider):
        self.fp.close()


class MysqlPipeline:
    conn = None
    cur = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host="localhost",
            port=3306,
            user="root",
            password="root",
            database="student",
            charset="utf8")
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        # A parameterized query; pymysql handles quoting and escaping
        sql = 'insert into qiubai(title, content) values (%s, %s)'
        try:
            self.cur.execute(sql, (item["title"], item["content"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
settings.py

ITEM_PIPELINES = {
    # Lower numbers run first: MysqlPipeline (200) before the file pipeline (300)
    'MyProjectQiubai.pipelines.MyprojectqiubaiPipeline': 300,
    'MyProjectQiubai.pipelines.MysqlPipeline': 200,
}

Run the project:    scrapy crawl qiubai

Sending requests manually

yield scrapy.Request(url, callback=self.parse)    # sends a GET request
yield scrapy.FormRequest(url, formdata=data, callback=self.parse)    # sends a POST request (FormRequest only defaults to POST when formdata is supplied)

Example: crawling subsequent pages by manually yielding GET requests (a POST sketch follows the example):
import scrapy
from MyProjectQiubai.items import MyprojectqiubaiItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    # URL template for the paginated listing, starting from page 2
    url_new = 'https://www.qiushibaike.com/text/page/%d/'
    page = 2

    def parse(self, response):

        div_list = response.xpath('//*[@id="content"]/div/div[2]/div')
        for div in div_list:
            title = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a/div/span[1]//text()').extract()
            content = "".join(content)
            item = MyprojectqiubaiItem()
            item["title"] = title
            item["content"] = content
            yield item

        if self.page < 10:
            # Build the next page's URL and request it with the same callback
            url_new = self.url_new % self.page
            self.page += 1
            yield scrapy.Request(url_new, callback=self.parse)
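
For completeness, a minimal POST sketch (not from the original notes; the URL and form field names below are hypothetical placeholders):

import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login'
    start_urls = ['https://www.example.com/login']  # hypothetical login page

    def parse(self, response):
        # FormRequest only defaults to POST when formdata is supplied
        yield scrapy.FormRequest(
            url='https://www.example.com/login',           # hypothetical URL
            formdata={'username': 'me', 'password': 'x'},  # hypothetical fields
            callback=self.after_login,
        )

    def after_login(self, response):
        # Check whether the login request succeeded
        print(response.status)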