Assignment 4

  • Assignment ①:

    • Requirement: become proficient with serializing and outputting Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage route to crawl book data from Dangdang.

    • Candidate site: http://www.dangdang.com/

    • Keyword: chosen freely by each student

    • Output: the MySQL output is shown below

Implementation code:

btable:

CREATE TABLE `xy669`.`btable` (
  `bId` VARCHAR(8) NOT NULL,
  `bTitle` VARCHAR(512) NULL,
  `bAuthor` VARCHAR(256) NULL,
  `bPublisher` VARCHAR(256) NULL,
  `bDate` VARCHAR(32) NULL,
  `bPrice` VARCHAR(16) NULL,
  `bDetail` TEXT NULL,
  PRIMARY KEY (`bId`));

 

MySpider.py

import scrapy
from ..items import BookItem
from bs4 import UnicodeDammit

class MySpider(scrapy.Spider):
    name = "mySpider"
    key = '目送'
    source_url = 'http://search.dangdang.com/'
    def start_requests(self):   # called when the spider starts
        url = MySpider.source_url + "?key=" + MySpider.key + "&act=input"
        yield scrapy.Request(url=url, callback=self.parse)  # hand the response to self.parse
    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")    # every <li> carrying a ddt-pit attribute, i.e. one book entry each
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                # detail is sometimes missing, in which case the result is None
                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if title else ""
                item["date"] = date.strip()[1:] if title else ""  #注意到日期前有一个符号/,所以从1开始取值
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item  # push this record to pipelines.py, where process_item handles it
            # on the last page the next-page link is None
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
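
items.py is imported by the spider above but not listed in the original post. A minimal sketch of BookItem, assuming exactly the six field names assigned in parse():

import scrapy

class BookItem(scrapy.Item):
    # field names assumed from the spider above
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    publisher = scrapy.Field()
    price = scrapy.Field()
    detail = scrapy.Field()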

pipelines.py

import pymysql
 
class BookPipeline(object):
    # executed once when the spider is opened
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the database
            self.con = pymysql.connect(host="127.0.0.1",port=3306, user="root",passwd="123456",db='xy669', charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute('delete from btable')  # clear rows left over from previous runs
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False
            self.count = 0
 
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "本书籍")
 
    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                self.count += 1     # used as bId, starting from 1
                # insert one row into the table
                self.cursor.execute("insert into btable(bID,bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values(%s,%s,%s,%s,%s,%s,%s)",(str(self.count), item["title"], item["author"], item["publisher"], item["date"], item["price"],item["detail"]))

        except Exception as err:
            print("wrong error:"+str(err))
        return item
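
For BookPipeline to actually receive the items yielded by the spider, it has to be enabled in the project's settings.py. A minimal sketch, assuming the Scrapy project is named demo (the real project name may differ):

# settings.py (fragment)
ITEM_PIPELINES = {
    'demo.pipelines.BookPipeline': 300,
}
ROBOTSTXT_OBEY = False  # the site's robots.txt may otherwise block the crawl

The spider is then started with "scrapy crawl mySpider" from the project directory.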

Experimental results:

Reflections:

Although this was mostly a matter of reproducing existing code, quite a few problems came up along the way, such as connecting to the database and handling the insertion of empty fields.

  • Assignment ②

    • Requirement: become proficient with serializing and outputting Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage route to crawl stock information.

    • Candidate sites: Eastmoney: https://www.eastmoney.com/

      Sina Finance: http://finance.sina.com.cn/stock/

    • Output: MySQL storage and output format as below. Column names should be in English, e.g. id for the serial number, bStockNo for the stock code, and so on; the schema is designed by each student:

      No.  Code    Name   Latest  Change%  Change  Volume    Turnover  Amplitude  High  Low    Open  Prev. close
      1    688093  N世华   28.47   62.22%   10.92   26.13万   7.6亿     22.34      32.0  28.08  30.2  17.55
      2    ......

Implementation code:

gptable

CREATE TABLE `xy669`.`gptable` (
  `count` VARCHAR(8) NOT NULL,
  `code` VARCHAR(64) NULL,
  `name` VARCHAR(64) NULL,
  `new_pr` VARCHAR(64) NULL,
  `rd_ran` VARCHAR(64) NULL,
  `rd_pr` VARCHAR(64) NULL,
  `deal_n` VARCHAR(64) NULL,
  `deal_pr` VARCHAR(64) NULL,
  `rpd` VARCHAR(64) NULL,
  PRIMARY KEY (`count`));

MySpider.py

import scrapy
import re
from ..items import GPItem

class MySpider(scrapy.Spider):
    name = 'mySpider'
    start_urls=["http://73.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,m:1+t:23&fields=f2,f3,f4,f5,f6,f7,f12,f13,f14,f15,f16,f17,f18&_=1602901412583%20Request%20Method:%20GET"]
    def start_requests(self):   #程序开始时会调用
        url = MySpider.start_urls
        yield scrapy.Request(url=url, callback=self.parse)  # 调用parse函数

#{"f2":14.11,"f3":38.74,"f4":3.94,"f5":1503670,"f6":2115845584.0,"f7":23.99,"f12":"601568","f13":1,"f14":"N北元","f15":14.64,"f16":12.2,"f17":12.2,"f18":10.17}
    def parse(self,response):
        data = response.text
        pat = '"diff":\[\{(.*?)\}\]'
        data_t = re.compile(pat, re.S).findall(data)
        datas = data_t[0].strip("{").strip("}").split('},{')
        for i in range(len(datas)):
            item = GPItem()
            datab = datas[i].replace('"', "").split(',')  # split the i-th record into its key:value pairs
            # item["count"] = str(i)
            item['code'] = datab[6].split(":")[1]
            item['name'] = datab[8].split(":")[1]
            item['new_pr'] = datab[0].split(":")[1]
            item['rd_ran'] = datab[1].split(":")[1]
            item['rd_pr'] = datab[2].split(":")[1]
            item['deal_n'] = datab[3].split(":")[1]
            item['deal_pr'] = datab[4].split(":")[1]
            item['rdp'] = datab[5].split(":")[1]
            item['new_hpr'] = datab[9].split(":")[1]
            item['new_lpr'] = datab[10].split(":")[1]
            item['to_op'] = datab[11].split(":")[1]
            item['yes_op'] = datab[12].split(":")[1]
            yield item
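
As with the first task, items.py is not listed. A minimal sketch of GPItem, assuming the field names assigned in parse() above (the f-field meanings are inferred from the sample record and the assignment's output columns):

import scrapy

class GPItem(scrapy.Item):
    # field names assumed from the spider above
    code = scrapy.Field()      # stock code (f12)
    name = scrapy.Field()      # stock name (f14)
    new_pr = scrapy.Field()    # latest price (f2)
    rd_ran = scrapy.Field()    # change percentage (f3)
    rd_pr = scrapy.Field()     # change amount (f4)
    deal_n = scrapy.Field()    # volume (f5)
    deal_pr = scrapy.Field()   # turnover (f6)
    rdp = scrapy.Field()       # amplitude (f7)
    new_hpr = scrapy.Field()   # daily high (f15)
    new_lpr = scrapy.Field()   # daily low (f16)
    to_op = scrapy.Field()     # opening price (f17)
    yes_op = scrapy.Field()    # previous close (f18)

Since the API returns JSON, the regex-and-split parsing in parse() could also be replaced with json.loads on the response text, which would be less brittle; the string-splitting version is kept here to match the original code.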

pipelines.py

import pymysql
 
class GPPipeline(object):
    # executed once when the spider is opened
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the database
            self.con = pymysql.connect(host="127.0.0.1",port=3306, user="root",passwd="123456",db='xy669', charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute('delete from gptable')  # clear rows left over from previous runs
            self.opened = True  # connection is open
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False
 
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "行股票信息")
 
    def process_item(self, item, spider):
        try:
            print(str(self.count) + '\t' + item['code']+ '\t' + item['name'] + '\t' + item['new_pr']+ '\t' + 
            item['rd_ran']+ '\t' + item['rd_pr']+ '\t' + item['deal_n']+ '\t' + item['deal_pr']+ '\t' + 
            item['rdp'])
            print()
            if self.opened:
                self.count += 1     # used as the count (primary key), starting from 1
                # insert one row into the table
                self.cursor.execute("insert into gptable(count,code,name,new_pr,rd_ran,rd_pr,deal_n,deal_pr,rpd) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (str(self.count), item['code'], item['name'], item['new_pr'], item['rd_ran'], item['rd_pr'], item['deal_n'], item['deal_pr'], item['rdp']))
        except Exception as err:
            print("wrong error:"+str(err))

        return item

Experimental results

Reflections:

The second task is essentially a modification of the output of the previous assignment, so once the first task was understood this one went relatively smoothly.

  • Assignment ③:

    • Requirement: become proficient with serializing and outputting Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage route to crawl foreign-exchange data.

    • Candidate site: China Merchants Bank: http://fx.cmbchina.com/hq/

    • Output: MySQL storage and output format

      Id  Currency  TSP    CSP    TBP    CBP    Time
      1   港币       86.60  86.60  86.26  85.65  15:36:30
      2   ......

Implementation code

mtable

CREATE TABLE `xy669`.`mtable` (
  `Id` VARCHAR(8) NOT NULL,
  `Currency` VARCHAR(64) NULL,
  `TSP` VARCHAR(64) NULL,
  `CSP` VARCHAR(64) NULL,
  `TBP` VARCHAR(64) NULL,
  `CBP` VARCHAR(64) NULL,
  `Time` VARCHAR(64) NULL,
  PRIMARY KEY (`Id`));

MySpider.py

import scrapy
from ..items import BCItem
from bs4 import UnicodeDammit


class MySpider(scrapy.Spider):
    name = 'mySpider'
    source_url = 'http://fx.cmbchina.com/hq/'

    def start_requests(self):   # called when the spider starts
        url = MySpider.source_url
        yield scrapy.Request(url=url, callback=self.parse)  # hand the response to self.parse

    def parse(self, response):
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        # locate the table rows that hold the exchange rates
        trs = selector.xpath("//div[@id='realRateInfo']/table/tr")
        # count = 1
        for tr in trs[1:]:  # skip the first row, which is the table header
            item = BCItem()
            # item['Id'] = str(count)
            # count +=1
            item['Currency'] = tr.xpath("./td[position()=1]/text()").extract_first().strip()
            item['TSP'] = tr.xpath("./td[position()=4]/text()").extract_first().strip()
            item['CSP'] = tr.xpath("./td[position()=5]/text()").extract_first().strip()
            item['TBP'] = tr.xpath("./td[position()=6]/text()").extract_first().strip()
            item['CBP'] = tr.xpath("./td[position()=7]/text()").extract_first().strip()
            item['Time'] = tr.xpath("./td[position()=8]/text()").extract_first().strip()
            yield item
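
Once more, items.py is omitted from the post. A minimal sketch of BCItem, assuming the field names used in parse() above:

import scrapy

class BCItem(scrapy.Item):
    # field names assumed from the spider above
    Currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    Time = scrapy.Field()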

pipelines.py

import pymysql
 
class BCPipeline(object):
    # executed once when the spider is opened
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the database
            self.con = pymysql.connect(host="127.0.0.1",port=3306, user="root",passwd="123456",db='xy669', charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute('delete from mtable')  # clear rows left over from previous runs
            self.opened = True  # connection is open
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False
 
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "种不同的汇币信息")
 
    def process_item(self, item, spider):
        try:
            print()
            if self.opened:
                self.count += 1     # used as Id, starting from 1
                # insert one row into the table
                self.cursor.execute("insert into mtable(Id,Currency,TSP,CSP,TBP,CBP,Time) values(%s,%s,%s,%s,%s,%s,%s)"
                ,(str(self.count),item['Currency'],item['TSP'],item['CSP'],item['TBP'],item['CBP'],item['Time']))
        except Exception as err:
            print("wrong error:"+str(err))

        return item

Experimental results

 

Reflections:

This task is another application of the techniques from the first one, the same approach applied to a different site, and it further deepened my understanding and use of XPath.
