Assignment 4

I. Assignment Content

Assignment ①: Requirements:

Become proficient in serializing and outputting Item and Pipeline data in scrapy;

Crawl book data from the Dangdang website using the Scrapy + XPath + MySQL storage technical route.

Candidate website: http://www.dangdang.com/   Keyword: chosen freely by the student.   Output: the MySQL output is structured as follows

sql

create table books(
    btitle varchar(512) primary key,
    bauthor varchar(256),
    bpublisher varchar(256),
    bprice varchar(16),
    bdate varchar(32),
    bdetail text
);

items.py

import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    author = scrapy.Field()
    publisher = scrapy.Field()
    date = scrapy.Field()
    price = scrapy.Field()
    detail = scrapy.Field()
    

pipelines.py

import pymysql

class DangdangPipeline(object):
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="", db="book",
                                       charset="utf8")  # connect to MySQL with account and password
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from books")  # clear any previously stored rows
            self.opened = True
            self.count = 0  # counter for the number of books scraped
        except Exception as error:
            print(error)
            self.opened = False
            
            
    def close_spider(self, spider):  # commit the data and close the database connection
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            print("closed")
            print("Scraped", self.count, "books in total")
   
    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            if self.opened:
                self.cursor.execute(
                    "insert into books(btitle,bauthor,bpublisher,bdate,bprice,bdetail) values(%s,%s,%s,%s,%s,%s)",
                    (item["title"], item["author"], item["publisher"], item["date"], item["price"], item["detail"]))
                self.count +=1
        except Exception as error:
            print(error)
        return item

settings.py

BOT_NAME = 'dangdang'

SPIDER_MODULES = ['dangdang.spiders']
NEWSPIDER_MODULE = 'dangdang.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dangdang (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False




ITEM_PIPELINES = {
  'dangdang.pipelines.DangdangPipeline': 300,
}

mydangdang.py

import scrapy
from ..items import DangdangItem
from bs4 import UnicodeDammit
class BookSpider(scrapy.Spider):
    name = 'mydangdang'
    key = "python"
    start_url = 'http://search.dangdang.com/'
    def start_requests(self):
        url = BookSpider.start_url + "?key=" + BookSpider.key + "&act=input"
        yield scrapy.Request(url=url, callback=self.parse)
    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data =dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher =li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                item = DangdangItem()
                item["title"]=title.strip()if title else""
                item["author"]=author.strip()if author else""
                item["publisher"]=publisher.strip()if publisher else""
                item["date"]=date.strip()[1:]if date else""
                item["price"]=price.strip()if price else""
                item["detail"]=detail.strip()if detail else""
                yield item
            # the "next page" link is None on the last page; follow it to crawl successive pages
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as error:
            print(error)
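
To run the spider, either execute scrapy crawl mydangdang from the project root, or use a small helper script (my own addition for convenience, not part of the original assignment):

# run.py - convenience launcher for the dangdang spider (my addition)
from scrapy import cmdline

cmdline.execute("scrapy crawl mydangdang".split())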

 

Summary: this reproduces the code from the textbook and strengthened my understanding of scrapy. Since MySQL has not been installed successfully yet, only the crawl results are shown.

Assignment ②

  • Requirements: become proficient in serializing and outputting Item and Pipeline data in scrapy; crawl stock information using the Scrapy + XPath + MySQL storage technical route.

  • Candidate websites: Eastmoney: https://www.eastmoney.com/

    Sina Finance stocks: http://finance.sina.com.cn/stock/

  • Output: MySQL database storage with the output format below; the column headers should be named in English (e.g. serial number: id, stock code: bStockNo, ...) and are designed by each student, for example the table sketch shown right after this list:
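
The post leaves the table design to the student; one possible definition that mirrors the StockItem fields below (my own sketch, table and column names are assumptions) is:

create table stocks(
    id int primary key auto_increment,
    code varchar(16),
    name varchar(64),
    newp varchar(16),
    zdf varchar(16),
    zde varchar(16),
    cjl varchar(32),
    cje varchar(32),
    zf varchar(16),
    `max` varchar(16),
    `min` varchar(16),
    today varchar(16),
    yesterday varchar(16)
);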

items.py

import scrapy


class StockItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    code = scrapy.Field()        # stock code, f12
    name = scrapy.Field()        # stock name, f14
    newp = scrapy.Field()        # latest price, f2
    zdf = scrapy.Field()         # change percentage, f3
    zde = scrapy.Field()         # change amount, f4
    cjl = scrapy.Field()         # trading volume, f5
    cje = scrapy.Field()         # turnover, f6
    zf = scrapy.Field()          # amplitude, f7
    max = scrapy.Field()         # daily high, f15
    min = scrapy.Field()         # daily low, f16
    today = scrapy.Field()       # today's open, f17
    yesterday = scrapy.Field()   # previous close, f18

 

stock.py

import json
import scrapy
from  stock.items import StockItem


class StocksSpider(scrapy.Spider):
    name = 'stocks'
    start_urls = [
        'http://34.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112406554985928274808_1604410757799&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1604410757861']

    def parse(self, response):
        jsons = response.text[41:][:-2]
        text_json = json.loads(jsons)
        for data in text_json['data']['diff']:
            item = StockItem()
            item["code"] = data['f12']
            item["name"] = data['f14']
            item["nepw"] = data['f2']
            item["zdf"] = data['f3']
            item["zde"] = data['f4']
            item["cjl"] = data['f5']
            item["cje"] = data['f6']
            item["zf"] = data['f7']
            item["max"] = data['15']
            item["min"] = data['f16']
            item["today"] = data['f17']
            item["yesterday"] = data['f18']
            yield item
        print("完成")

        # also crawl the following pages (2 through 10)
        for i in range(2, 11):
            new_url = 'http://34.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240918880626239239_1602070531441&pn=' + str(
                i) + '&pz=20&po=1&np=3&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,' \
                     'm:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,' \
                     'f22,f11,f62,f128,f136,f115,f152&_=1604410757861'
            yield scrapy.Request(new_url, callback=self.parse)
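
The hard-coded slice response.text[41:][:-2] depends on the exact length of the jQuery callback name in the URL; a slightly more defensive variant (my own sketch, not part of the original code) extracts the JSON body with a regular expression:

import re

def strip_jsonp(text):
    # return the JSON payload inside a JSONP response such as jQuery123(...);
    match = re.search(r'\((.*)\)\s*;?\s*$', text, re.S)
    return match.group(1) if match else text

# usage inside parse(): text_json = json.loads(strip_jsonp(response.text))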

settings.py

BOT_NAME = 'stock'

SPIDER_MODULES = ['stock.spiders']
NEWSPIDER_MODULE = 'stock.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stock (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'stock.pipelines.StockPipeline': 300,
}
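
settings.py enables stock.pipelines.StockPipeline, but the pipeline itself is not shown in this post. A minimal sketch along the lines of assignment ①'s DangdangPipeline (the database name stock, table name stocks and column names are my assumptions) could look like:

import pymysql


class StockPipeline(object):
    def open_spider(self, spider):
        # connection parameters follow assignment ①; adjust to the local MySQL setup
        self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="",
                                   db="stock", charset="utf8")
        self.cursor = self.con.cursor(pymysql.cursors.DictCursor)

    def close_spider(self, spider):
        self.con.commit()
        self.con.close()

    def process_item(self, item, spider):
        # column names mirror the StockItem fields
        self.cursor.execute(
            "insert into stocks(code,name,newp,zdf,zde,cjl,cje,zf,`max`,`min`,today,yesterday) "
            "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (item["code"], item["name"], item["newp"], item["zdf"], item["zde"], item["cjl"],
             item["cje"], item["zf"], item["max"], item["min"], item["today"], item["yesterday"]))
        return item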

Assignment ③
(1) Requirements: become proficient in serializing and outputting Item and Pipeline data in scrapy; crawl foreign-exchange data from a bank website using the scrapy framework + XPath + MySQL storage technical route.
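
The post does not give the SQL for this assignment; a table definition consistent with the insert statement in the pipeline below (my own sketch) would be:

create table currency(
    id int primary key,
    currency varchar(64),
    tsp varchar(16),
    csp varchar(16),
    tbp varchar(16),
    cbp varchar(16),
    time varchar(32)
);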

items.py

import scrapy


class CurrencyItem(scrapy.Item):
    currency = scrapy.Field()
    tsp = scrapy.Field()
    csp = scrapy.Field()
    tbp = scrapy.Field()
    cbp = scrapy.Field()
    time = scrapy.Field()

settings.py

BOT_NAME = 'currency'

SPIDER_MODULES = ['currency.spiders']
NEWSPIDER_MODULE = 'currency.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'currency (+http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
   'currency.pipelines.CurrencyPipeline': 300,
}

pipelines.py

import pymysql
class CurrencyPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="host", passwd="123456", db="mydata", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from currency")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False
    def process_item(self, item, spider):
        try:
            print(self.count)
            print(item["currency"])
            print(item["tsp"])
            print(item["csp"])
            print(item["tsp"])
            print(item["cbp"])
            print(item["time"])
            print()
            if self.opened:
                self.cursor.execute(
                    "insert into currency (id,currency,tsp,csp,tbp,cbp,time) values(%s,%s,%s,%s,%s,%s,%s)",
                    (self.count, item["currency"], item["tsp"], item["csp"], item["tbp"], item["cbp"], item["time"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item
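
Note: this pipeline never commits the transaction, so the inserted rows would not actually be persisted. A close_spider in the same style as assignment ① (my own addition) fixes that:

    # add to CurrencyPipeline
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            print("closed")
            print("Scraped", self.count, "exchange-rate rows in total")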

spider

import scrapy
from bs4 import UnicodeDammit
from ..items import CurrencyItem


class CurrencySpider(scrapy.Spider):
    name = 'currency'
    #allowed_domains = ['fx.cmbchina.com']
    start_urls = ["http://fx.cmbchina.com/hq/"]

    def parse(self,response):
        dammit = UnicodeDammit(response.body,["utf-8","gbk"])
        data = dammit.unicode_markup
        selector=scrapy.Selector(text=data)
        lis=selector.xpath("//div[@id='realRateInfo']/table/tr")
        for li in lis[1:]:
            item = CurrencyItem()
            currency=li.xpath("./td[position()=1][@class='fontbold']/text()").extract_first()
            tsp = li.xpath("./td[position()=4][@class='numberright']/text()").extract_first()
            csp = li.xpath("./td[position()=5][@class='numberright']/text()").extract_first()
            tbp=li.xpath("./td[position()=6][@class='numberright']/text()").extract_first()
            cbp=li.xpath("./td[position()=7][@class='numberright']/text()").extract_first()
            time=li.xpath("./td[position()=8][@align='center']/text()").extract_first()
            item["currency"]=str(currency.strip())
            item["tsp"]=str(tsp.strip())
            item["csp"]=str(csp.strip())
            item["tbp"]=str(tbp.strip())
            item["cbp"]=str(cbp.strip())
            item["time"]=str(time.strip())
            yield item

 
