Scrapy Framework Crawling Examples
Example: Crawling Douban Books and Their Detail Pages with Scrapy
douban.py
import scrapy
import time
from bs4 import BeautifulSoup
from scrapy import Request
from Scripts.ScrapyProject.items import bookItem


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["book.douban.com"]
    start_urls = ["http://book.douban.com/"]

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        Booklists = soup.find_all(class_="list-col list-col5 list-express slide-item")
        for Bl in Booklists:
            booklists = Bl.find_all("li")
            for book in booklists:
                time.sleep(3)
                item = bookItem()
                name = book.find(class_="title").a.text
                item['name'] = name
                author = book.find(class_="author").text
                item['author'] = author
                url = book.find(class_="title").a.attrs.get('href')
                # With the book's detail-page URL in hand, keep crawling the detail page;
                # if callback is omitted, the response is routed back to parse() by default;
                # meta carries the item into our own callback method
                yield Request(url, callback=self.detailparse, meta={'item': item})

    def detailparse(self, response):
        # Retrieve the item handed over from the default parse() method
        item = response.meta.get('item')
        rate = response.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract_first()
        item['rate'] = rate
        soup = BeautifulSoup(response.text, 'lxml')
        # .string would return None here, so use .text
        content = soup.find(class_="intro").text
        item['content'] = content
        yield item
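The comments in parse() above describe handing the item to the detail callback through Request's meta. For comparison, the sketch below does the same hand-off with cb_kwargs (available since Scrapy 1.7), so the item arrives as an ordinary keyword argument; the spider name and selectors here are illustrative, not part of the original project.

# A minimal sketch (not the original spider): passing data to a callback
# via cb_kwargs instead of response.meta.
import scrapy
from scrapy import Request


class DetailPassSpider(scrapy.Spider):
    name = "detail_pass_demo"                 # hypothetical name
    start_urls = ["http://book.douban.com/"]

    def parse(self, response):
        for a in response.css(".title a"):
            item = {"name": a.css("::text").get()}
            url = a.attrib.get("href")
            if url:
                # cb_kwargs entries become keyword arguments of the callback
                yield Request(url, callback=self.parse_detail, cb_kwargs={"item": item})

    def parse_detail(self, response, item):
        # item arrives directly; no response.meta lookup needed
        item["rate"] = response.xpath(
            '//*[@id="interest_sectl"]/div/div[2]/strong/text()').get()
        yield item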
items.py
import scrapy


class bookItem(scrapy.Item):
    name = scrapy.Field()
    author = scrapy.Field()
    rate = scrapy.Field()
    content = scrapy.Field()
pipelines.py
from itemadapter import ItemAdapter
import csv
import pymongo


class BookMongoPipeline:
    db_url = 'mongodb://localhost:27017'
    db_name = 'Scrapy'

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.db_url)
        self.db = self.client[self.db_name]

    def process_item(self, item, spider):
        collection = self.db[spider.name]
        data = {'title': item['name'], 'author': item['author'],
                'rating': item['rate'], 'intro': item['content']}
        collection.insert_one(data)
        print('%s saved successfully' % (item['name']))
        # Return the item so any later pipelines still receive it
        return item

    def close_spider(self, spider):
        self.client.close()
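For the pipeline to run at all, it has to be registered in the project's settings.py. A minimal sketch follows, assuming the project package is named ScrapyProject as in the import at the top of douban.py:

# settings.py (sketch): enable the MongoDB pipeline; the number is its priority.
# The dotted path assumes the project package is called ScrapyProject.
ITEM_PIPELINES = {
    "ScrapyProject.pipelines.BookMongoPipeline": 300,
}

# Optional: let Scrapy throttle requests instead of calling time.sleep(3)
# inside parse(); 3 seconds matches the sleep used in the spiders above.
DOWNLOAD_DELAY = 3

With DOWNLOAD_DELAY set, the time.sleep(3) calls in the spiders can be dropped, since a blocking sleep stalls Scrapy's entire event loop rather than just one request.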
A Simple Example: Following the Next Page on cnblogs with Scrapy
A mistake I made: I grabbed the page-2 URL straight from the browser after clicking it and spliced it together by hand, so the spider never actually reached the next page.
What it should be instead: take each pager href from the page itself and resolve it with response.urljoin(), as in blog.py below (see the urljoin sketch after the code):
blog.py
import scrapy
from scrapy import Request
from bs4 import BeautifulSoup
import time


class BlogSpider(scrapy.Spider):
    name = "blog"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["http://www.cnblogs.com/"]

    def parse(self, response):
        # Check that the next-page link is being followed correctly
        print(response)
        soup = BeautifulSoup(response.text, 'lxml')
        atrlists = soup.find_all(class_="post-item")
        for article in atrlists:
            title = article.find(class_="post-item-title").text
            link = article.find(class_="post-item-title").attrs.get('href')
            time.sleep(3)
        page = soup.find(class_="pager")
        next_list = page.find_all('a')
        for i in next_list:
            next_page = i.attrs.get('href')
            if next_page != '/':
                Next_url = response.urljoin(next_page)
                yield Request(Next_url)
        # Print the last title/link of each page as a sanity check
        print('Title: %s; Link: %s' % (title, link))
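To see why hand-splicing the address the browser shows fails while response.urljoin() works, here is a small sketch; the example hrefs are illustrative, not claimed to be the real cnblogs paths.

# Sketch: resolving a pager href against the current page URL, which is what
# response.urljoin() does under the hood via urllib.parse.urljoin.
from urllib.parse import urljoin

current = "http://www.cnblogs.com/"
print(urljoin(current, "/sitehome/p/2"))  # absolute path -> http://www.cnblogs.com/sitehome/p/2
print(urljoin(current, "p/3"))            # relative path -> http://www.cnblogs.com/p/3
print(urljoin(current, "#p2"))            # fragment only -> http://www.cnblogs.com/#p2
                                          # (fetching this returns the same page again,
                                          #  which is how a hand-copied browser link can
                                          #  leave the spider stuck on page 1)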