Assignment 4
-
Task ①:
-
Requirement: master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage route to crawl book data from the Dangdang website.
-
Search keyword: chosen freely by each student
-
Output: the MySQL output is shown below.
-
Implementation code:
btable:
```sql
CREATE TABLE `xy669`.`btable` (
    `bId` VARCHAR(8) NOT NULL,
    `bTitle` VARCHAR(512) NULL,
    `bAuthor` VARCHAR(256) NULL,
    `bPublisher` VARCHAR(256) NULL,
    `bDate` VARCHAR(32) NULL,
    `bPrice` VARCHAR(16) NULL,
    `bDetail` TEXT NULL,
    PRIMARY KEY (`bId`)
);
```
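The spider below imports `BookItem` from `items.py`, which is not shown in this report. A minimal sketch of what that item class is assumed to look like, with one field per database column, is:

```python
# items.py -- assumed definition of BookItem (not shown in the original report)
import scrapy


class BookItem(scrapy.Item):
    title = scrapy.Field()      # book title
    author = scrapy.Field()     # author name
    date = scrapy.Field()       # publication date
    publisher = scrapy.Field()  # publisher
    price = scrapy.Field()      # current price
    detail = scrapy.Field()     # short description (sometimes missing on the page)
```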
MySpider.py
```python
import scrapy
# import sys
# sys.path.append('..')
from ..items import BookItem
from bs4 import UnicodeDammit
from bs4 import BeautifulSoup


class MySpider(scrapy.Spider):
    name = "mySpider"
    key = '目送'
    source_url = 'http://search.dangdang.com/'

    def start_requests(self):  # called once when the crawl starts
        url = MySpider.source_url + "?key=" + MySpider.key + "&act=input"
        yield scrapy.Request(url=url, callback=self.parse)  # hand the response to parse()

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # select every <li> on the page that carries the ddt-pit attribute, i.e. one per book
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()  # detail is sometimes absent, so it may be None

                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                item["date"] = date.strip()[1:] if date else ""  # the date starts with a '/', so slice from index 1
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item  # push the record to pipelines.py, where process_item handles it

            # on the last page the next-page link is None
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
```
pipelines.py
```python
import pymysql


class BookPipeline(object):
    # called once when the spider opens
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the database
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="123456", db='xy669', charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute('delete from btable')  # clear out the old records
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("Crawled", self.count, "books in total")

    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                self.count += 1  # count doubles as bId, starting from 1
                # insert the record into the table
                self.cursor.execute(
                    "insert into btable(bId,bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) "
                    "values(%s,%s,%s,%s,%s,%s,%s)",
                    (str(self.count), item["title"], item["author"], item["publisher"],
                     item["date"], item["price"], item["detail"]))
        except Exception as err:
            print("wrong error:" + str(err))
        return item
```
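The report does not show how the pipeline is enabled. Assuming a standard Scrapy project layout, it would be registered in settings.py; the module path `demo.pipelines` below is an assumption standing in for the actual project name:

```python
# settings.py -- sketch only; "demo" is a placeholder for the real project package name
ITEM_PIPELINES = {
    "demo.pipelines.BookPipeline": 300,
}
```

With the pipeline registered, running `scrapy crawl mySpider` from the project root starts the crawl and produces the results below.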
Experiment results:
Reflections:
Although this was largely a matter of reproducing existing code, plenty of problems came up along the way, such as connecting to the database and inserting empty fields.
-
Task ②
-
Requirement: master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage route to crawl stock information.
-
Candidate site: Eastmoney: https://www.eastmoney.com/
-
Output: the MySQL storage and output format is shown below. Column names should be in English (e.g. id for the serial number, bStockNo for the stock code, and so on) and are designed by each student:
| No. | Code | Name | Latest price | Change % | Change | Volume | Turnover | Amplitude | High | Low | Open | Prev. close |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | 688093 | N世华 | 28.47 | 62.22% | 10.92 | 26.13万 | 7.6亿 | 22.34 | 32.0 | 28.08 | 30.2 | 17.55 |
| 2 | ...... | | | | | | | | | | | |
-
Implementation code:
gptable
```sql
CREATE TABLE `xy669`.`gptable` (
    `count` VARCHAR(8) NOT NULL,
    `code` VARCHAR(64) NULL,
    `name` VARCHAR(64) NULL,
    `new_pr` VARCHAR(64) NULL,
    `rd_ran` VARCHAR(64) NULL,
    `rd_pr` VARCHAR(64) NULL,
    `deal_n` VARCHAR(64) NULL,
    `deal_pr` VARCHAR(64) NULL,
    `rpd` VARCHAR(64) NULL,
    PRIMARY KEY (`count`)
);
```
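As before, items.py is not included in the report. A plausible GPItem, with one field per value the spider below extracts, might look like this; the mapping of fields to the API's f-keys in the comments is inferred from the sample record quoted in the spider and is an assumption:

```python
# items.py -- assumed definition of GPItem, matching the fields used in MySpider.py
import scrapy


class GPItem(scrapy.Item):
    code = scrapy.Field()      # stock code (f12)
    name = scrapy.Field()      # stock name (f14)
    new_pr = scrapy.Field()    # latest price (f2)
    rd_ran = scrapy.Field()    # change percentage (f3)
    rd_pr = scrapy.Field()     # change amount (f4)
    deal_n = scrapy.Field()    # volume (f5)
    deal_pr = scrapy.Field()   # turnover (f6)
    rdp = scrapy.Field()       # amplitude (f7)
    new_hpr = scrapy.Field()   # daily high (f15)
    new_lpr = scrapy.Field()   # daily low (f16)
    to_op = scrapy.Field()     # today's open (f17)
    yes_op = scrapy.Field()    # previous close (f18)
```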
MySpider.py
```python
import re

import scrapy
from ..items import GPItem


class MySpider(scrapy.Spider):
    name = 'mySpider'
    # Eastmoney list API: pn is the page number, pz the page size, fields selects the columns
    start_urls = ["http://73.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=20&po=1&np=1"
                  "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,m:1+t:23"
                  "&fields=f2,f3,f4,f5,f6,f7,f12,f13,f14,f15,f16,f17,f18&_=1602901412583"]

    def start_requests(self):  # called once when the crawl starts
        url = MySpider.start_urls[0]
        yield scrapy.Request(url=url, callback=self.parse)  # hand the response to parse()

    # one record in the response looks like:
    # {"f2":14.11,"f3":38.74,"f4":3.94,"f5":1503670,"f6":2115845584.0,"f7":23.99,
    #  "f12":"601568","f13":1,"f14":"N北元","f15":14.64,"f16":12.2,"f17":12.2,"f18":10.17}
    def parse(self, response):
        data = response.text
        pat = r'"diff":\[\{(.*?)\}\]'
        data_t = re.compile(pat, re.S).findall(data)
        datas = data_t[0].strip("{").strip("}").split('},{')
        for i in range(len(datas)):
            item = GPItem()
            datab = datas[i].replace('"', "").split(',')  # the individual key:value pairs of record i
            # item["count"] = str(i)
            item['code'] = datab[6].split(":")[1]
            item['name'] = datab[8].split(":")[1]
            item['new_pr'] = datab[0].split(":")[1]
            item['rd_ran'] = datab[1].split(":")[1]
            item['rd_pr'] = datab[2].split(":")[1]
            item['deal_n'] = datab[3].split(":")[1]
            item['deal_pr'] = datab[4].split(":")[1]
            item['rdp'] = datab[5].split(":")[1]
            item['new_hpr'] = datab[9].split(":")[1]
            item['new_lpr'] = datab[10].split(":")[1]
            item['to_op'] = datab[11].split(":")[1]
            item['yes_op'] = datab[12].split(":")[1]
            yield item
```
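The parse method above picks fields out by position after splitting the raw string, which breaks if the API reorders its keys. An alternative, shown here only as a sketch and not what the report actually uses, is to let the json module parse the response; it assumes the response keeps the usual `{"data": {"diff": [...]}}` shape that the regex above also relies on. Inside parse() one would call this helper with response.text and copy each dict into a GPItem.

```python
# Sketch: parsing the same response with the json module instead of regex/split.
import json


def parse_diff(text):
    """Return one dict of readable fields per stock record in the API response."""
    payload = json.loads(text)
    rows = []
    for rec in payload["data"]["diff"]:
        rows.append({
            "code": str(rec["f12"]),    # stock code
            "name": rec["f14"],         # stock name
            "new_pr": str(rec["f2"]),   # latest price
            "rd_ran": str(rec["f3"]),   # change percentage
            "rd_pr": str(rec["f4"]),    # change amount
            "deal_n": str(rec["f5"]),   # volume
            "deal_pr": str(rec["f6"]),  # turnover
            "rdp": str(rec["f7"]),      # amplitude
        })
    return rows
```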
pipelines.py
```python
import pymysql


class GPPipeline(object):
    # called once when the spider opens
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the database
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="123456", db='xy669', charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute('delete from gptable')  # clear out the old records
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("Crawled", self.count, "rows of stock data in total")

    def process_item(self, item, spider):
        try:
            print(str(self.count) + '\t' + item['code'] + '\t' + item['name'] + '\t'
                  + item['new_pr'] + '\t' + item['rd_ran'] + '\t' + item['rd_pr'] + '\t'
                  + item['deal_n'] + '\t' + item['deal_pr'] + '\t' + item['rdp'])
            print()
            if self.opened:
                self.count += 1  # count doubles as the primary key, starting from 1
                # insert the record into the table; column names match the gptable DDL above
                self.cursor.execute(
                    "insert into gptable(count,code,name,new_pr,rd_ran,rd_pr,deal_n,deal_pr,rpd) "
                    "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (str(self.count), item['code'], item['name'], item['new_pr'], item['rd_ran'],
                     item['rd_pr'], item['deal_n'], item['deal_pr'], item['rdp']))
        except Exception as err:
            print("wrong error:" + str(err))
        return item
```
Experiment results
Reflections:
The second task reworks the output of the previous crawler; after working through the first task, this one went much more smoothly.
-
Task ③:
-
Requirement: master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage route to crawl foreign exchange data.
-
Candidate site: China Merchants Bank FX quotes: http://fx.cmbchina.com/hq/
-
Output: MySQL storage and output format
| Id | Currency | TSP | CSP | TBP | CBP | Time |
| --- | --- | --- | --- | --- | --- | --- |
| 1 | 港币 | 86.60 | 86.60 | 86.26 | 85.65 | 15:36:30 |
| 2 | ...... | | | | | |
-
Implementation code
mtable
```sql
CREATE TABLE `xy669`.`mtable` (
    `Id` VARCHAR(8) NOT NULL,
    `Currency` VARCHAR(64) NULL,
    `TSP` VARCHAR(64) NULL,
    `CSP` VARCHAR(64) NULL,
    `TBP` VARCHAR(64) NULL,
    `CBP` VARCHAR(64) NULL,
    `Time` VARCHAR(64) NULL,
    PRIMARY KEY (`Id`)
);
```
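The items.py for this task is also not shown; a minimal assumed BCItem covering the fields the spider fills in could be:

```python
# items.py -- assumed definition of BCItem (not shown in the original report)
import scrapy


class BCItem(scrapy.Item):
    Currency = scrapy.Field()  # currency name
    TSP = scrapy.Field()       # selling price (spot)
    CSP = scrapy.Field()       # selling price (cash)
    TBP = scrapy.Field()       # buying price (spot)
    CBP = scrapy.Field()       # buying price (cash)
    Time = scrapy.Field()      # quote time
```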
MySpider.py
```python
import scrapy
from ..items import BCItem
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit


class MySpider(scrapy.Spider):
    name = 'mySpider'
    start_urls = 'http://fx.cmbchina.com/hq/'

    def start_requests(self):  # called once when the crawl starts
        url = MySpider.start_urls
        yield scrapy.Request(url=url, callback=self.parse)  # hand the response to parse()

    def parse(self, response):
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        # locate the rows of the exchange-rate table
        trs = selector.xpath("//div[@id='realRateInfo']/table/tr")
        # count = 1
        for tr in trs[1:]:  # the first row is the header
            item = BCItem()
            # item['Id'] = str(count)
            # count += 1
            item['Currency'] = tr.xpath("./td[position()=1]/text()").extract_first().strip()
            item['TSP'] = tr.xpath("./td[position()=4]/text()").extract_first().strip()
            item['CSP'] = tr.xpath("./td[position()=5]/text()").extract_first().strip()
            item['TBP'] = tr.xpath("./td[position()=6]/text()").extract_first().strip()
            item['CBP'] = tr.xpath("./td[position()=7]/text()").extract_first().strip()
            item['Time'] = tr.xpath("./td[position()=8]/text()").extract_first().strip()
            yield item
```
pipelines.py
```python
import pymysql


class BCPipeline(object):
    # called once when the spider opens
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the database
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="123456", db='xy669', charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute('delete from mtable')  # clear out the old records
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("Crawled", self.count, "currencies in total")

    def process_item(self, item, spider):
        try:
            print()
            if self.opened:
                self.count += 1  # count doubles as Id, starting from 1
                # insert the record into mtable
                self.cursor.execute(
                    "insert into mtable(Id,Currency,TSP,CSP,TBP,CBP,Time) "
                    "values(%s,%s,%s,%s,%s,%s,%s)",
                    (str(self.count), item['Currency'], item['TSP'], item['CSP'],
                     item['TBP'], item['CBP'], item['Time']))
        except Exception as err:
            print("wrong error:" + str(err))
        return item
```
Experiment results
Reflections:
This task reuses the approach from the first one, the same recipe in a different wrapper, and it deepened my understanding and use of XPath.