
Scrapy crawling examples

Example: crawling Douban books and their detail pages with Scrapy

douban.py

import scrapy

import time

from bs4 import BeautifulSoup
from scrapy import Request
from Scripts.ScrapyProject.items import bookItem

class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["book.douban.com"]
    start_urls = ["http://book.douban.com/"]

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        Booklists = soup.find_all(class_="list-col list-col5 list-express slide-item")
        for Bl in Booklists:
            booklists = Bl.find_all("li")
            for book in booklists:
                ### note: time.sleep blocks the whole crawler; DOWNLOAD_DELAY in settings.py is the usual way to throttle
                time.sleep(3)
                item = bookItem()
                name = book.find(class_="title").a.text
                item['name'] = name
                author = book.find(class_="author").text
                item['author'] = author
                url = book.find(class_="title").a.attrs.get('href')
                ### after getting the book's detail URL, keep crawling its detail page;
                ### if callback is omitted, the response is sent back to parse() by default;
                ### meta passes the item on to the custom callback
                yield Request(url, callback=self.detailparse, meta={'item': item})


    def detailparse(self, response):
        ### retrieve the item handed over from parse() via meta
        item = response.meta.get('item')
        rate = response.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract_first()
        item['rate'] = rate
        soup = BeautifulSoup(response.text, 'lxml')
        ### .string would return None here (the intro div has several children), so use .text
        content = soup.find(class_="intro").text
        item['content'] = content
        yield item
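As a side note, passing data through meta works, but Scrapy (1.7+) also offers cb_kwargs, which hands the item to the callback as a named argument. This is only a sketch of the lines that would change, not part of the original project:

                ### alternative hand-off with cb_kwargs (Scrapy 1.7+): item arrives as a keyword argument
                yield Request(url, callback=self.detailparse, cb_kwargs={'item': item})

    def detailparse(self, response, item):
        item['rate'] = response.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract_first()
        yield item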

items.py

import scrapy

class bookItem(scrapy.Item):

    name = scrapy.Field()
    author = scrapy.Field()
    rate = scrapy.Field()
    content = scrapy.Field()
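For reference, a scrapy.Item behaves like a dict but only accepts the fields declared above. A quick illustration (the values are made up):

from Scripts.ScrapyProject.items import bookItem

item = bookItem()
item['name'] = 'Some book'      # declared field, fine
# item['price'] = 10            # undeclared field, would raise KeyError
print(dict(item))               # {'name': 'Some book'}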

pipelines.py

from itemadapter import ItemAdapter
import csv
import pymongo


class BookMongoPipeline:
    db_url ='mongodb://localhost:27017'
    db_name = 'Scrapy'

    def open_spider(self,spider):
        self.client = pymongo.MongoClient(self.db_url)
        self.db = self.client[self.db_name]

    def process_item(self, item, spider):
        collection = self.db[spider.name]
        data = {'title': item['name'], 'author': item['author'], 'rating': item['rate'], 'intro': item['content']}
        collection.insert_one(data)
        print('%s saved successfully' % item['name'])
        return item

    def close_spider(self, spider):
        self.client.close()
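The pipeline only takes effect once it is registered in settings.py. A minimal sketch, assuming the project package is called ScrapyProject as in the imports above (300 is just an ordinary priority value):

ITEM_PIPELINES = {
    'ScrapyProject.pipelines.BookMongoPipeline': 300,
}

After that the spider can be started with scrapy crawl douban.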

Simple example: crawling cnblogs page by page with Scrapy

I made a mistake at first: I took the second-page link that appears when clicking through in the browser and concatenated it by hand, so the spider never reached the next page.

What it should actually be:

blog.py

import scrapy
from scrapy import Request

from bs4 import BeautifulSoup
import time

class BlogSpider(scrapy.Spider):
    name = "blog"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["http://www.cnblogs.com/"]

    def parse(self, response):
        ### test whether the next-page link can be fetched correctly
        print(response)
        soup = BeautifulSoup(response.text, 'lxml')
        atrlists = soup.find_all(class_="post-item")
        for article in atrlists:
            title = article.find(class_="post-item-title").text
            link = article.find(class_="post-item-title").attrs.get('href')

        time.sleep(3)
        page = soup.find(class_="pager")
        next_list = page.find_all('a')
        for i in next_list:
            next_page = i.attrs.get('href')
            ### '/' is just the link back to the first page, so skip it
            if next_page != '/':
                Next_url = response.urljoin(next_page)
                yield Request(Next_url)
        ### print the last title and link on each page as a sanity check
        print('Title: %s;   Link: %s' % (title, link))
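The fix that matters here is response.urljoin(), which resolves the pager's relative href against the URL of the page actually being crawled, instead of a hand-pasted link. A tiny standalone illustration (the href value is hypothetical):

from urllib.parse import urljoin

### response.urljoin(href) is roughly urljoin(response.url, href)
print(urljoin('https://www.cnblogs.com/', '/sitehome/p/2'))
### -> https://www.cnblogs.com/sitehome/p/2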
