Assignment 4
-
Task ①:
-
Requirement: master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage route to crawl book data from the Dangdang website.
-
Search keyword: chosen freely by each student
-
Output: the MySQL output is shown below.
-
Implementation code:
btable:
```sql
CREATE TABLE `xy669`.`btable` (
    `bId` VARCHAR(8) NOT NULL,
    `bTitle` VARCHAR(512) NULL,
    `bAuthor` VARCHAR(256) NULL,
    `bPublisher` VARCHAR(256) NULL,
    `bDate` VARCHAR(32) NULL,
    `bPrice` VARCHAR(16) NULL,
    `bDetail` TEXT NULL,
    PRIMARY KEY (`bId`)
);
```
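The spider below imports `BookItem` from `items.py`, which is not shown in this report. A minimal sketch of what that item class is assumed to look like, with one field per database column, is:

```python
# items.py -- assumed definition of BookItem (not shown in the original report)
import scrapy


class BookItem(scrapy.Item):
    title = scrapy.Field()      # book title
    author = scrapy.Field()     # author name
    date = scrapy.Field()       # publication date
    publisher = scrapy.Field()  # publisher
    price = scrapy.Field()      # current price
    detail = scrapy.Field()     # short description (sometimes missing on the page)
```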
MySpider.py
```python
import scrapy
# import sys
# sys.path.append('..')
from ..items import BookItem
from bs4 import UnicodeDammit
from bs4 import BeautifulSoup


class MySpider(scrapy.Spider):
    name = "mySpider"
    key = '目送'
    source_url = 'http://search.dangdang.com/'

    def start_requests(self):  # called once when the crawl starts
        url = MySpider.source_url + "?key=" + MySpider.key + "&act=input"
        yield scrapy.Request(url=url, callback=self.parse)  # hand the response to parse()

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # select every <li> on the page that carries the ddt-pit attribute, i.e. one per book
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()  # detail is sometimes absent, so it may be None

                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                item["date"] = date.strip()[1:] if date else ""  # the date starts with a '/', so slice from index 1
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item  # push the record to pipelines.py, where process_item handles it

            # on the last page the next-page link is None
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
```
pipelines.py
```python
import pymysql


class BookPipeline(object):
    # called once when the spider opens
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the database
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="123456", db='xy669', charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute('delete from btable')  # clear out the old records
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("Crawled", self.count, "books in total")

    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                self.count += 1  # count doubles as bId, starting from 1
                # insert the record into the table
                self.cursor.execute(
                    "insert into btable(bId,bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) "
                    "values(%s,%s,%s,%s,%s,%s,%s)",
                    (str(self.count), item["title"], item["author"], item["publisher"],
                     item["date"], item["price"], item["detail"]))
        except Exception as err:
            print("wrong error:" + str(err))
        return item
```
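The report does not show how the pipeline is enabled. Assuming a standard Scrapy project layout, it would be registered in settings.py; the module path `demo.pipelines` below is an assumption standing in for the actual project name:

```python
# settings.py -- sketch only; "demo" is a placeholder for the real project package name
ITEM_PIPELINES = {
    "demo.pipelines.BookPipeline": 300,
}
```

With the pipeline registered, running `scrapy crawl mySpider` from the project root starts the crawl and produces the results below.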
Experiment results:
Reflections:
Although this was largely a matter of reproducing existing code, plenty of problems came up along the way, such as connecting to the database and inserting empty fields.
-
Task ②
-
Requirement: master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage route to crawl stock information.
-
Candidate site: Eastmoney: https://www.eastmoney.com/
-
Output: the MySQL storage and output format is shown below. Column names should be in English (e.g. id for the serial number, bStockNo for the stock code, and so on) and are designed by each student:
| No. | Code | Name | Latest price | Change % | Change | Volume | Turnover | Amplitude | High | Low | Open | Prev. close |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | 688093 | N世华 | 28.47 | 62.22% | 10.92 | 26.13万 | 7.6亿 | 22.34 | 32.0 | 28.08 | 30.2 | 17.55 |
| 2 | ...... | | | | | | | | | | | |
-
Implementation code:
gptable
```sql
CREATE TABLE `xy669`.`gptable` (
    `count` VARCHAR(8) NOT NULL,
    `code` VARCHAR(64) NULL,
    `name` VARCHAR(64) NULL,
    `new_pr` VARCHAR(64) NULL,
    `rd_ran` VARCHAR(64) NULL,
    `rd_pr` VARCHAR(64) NULL,
    `deal_n` VARCHAR(64) NULL,
    `deal_pr` VARCHAR(64) NULL,
    `rpd` VARCHAR(64) NULL,
    PRIMARY KEY (`count`)
);
```
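As before, items.py is not included in the report. A plausible GPItem, with one field per value the spider below extracts, might look like this; the mapping of fields to the API's f-keys in the comments is inferred from the sample record quoted in the spider and is an assumption:

```python
# items.py -- assumed definition of GPItem, matching the fields used in MySpider.py
import scrapy


class GPItem(scrapy.Item):
    code = scrapy.Field()      # stock code (f12)
    name = scrapy.Field()      # stock name (f14)
    new_pr = scrapy.Field()    # latest price (f2)
    rd_ran = scrapy.Field()    # change percentage (f3)
    rd_pr = scrapy.Field()     # change amount (f4)
    deal_n = scrapy.Field()    # volume (f5)
    deal_pr = scrapy.Field()   # turnover (f6)
    rdp = scrapy.Field()       # amplitude (f7)
    new_hpr = scrapy.Field()   # daily high (f15)
    new_lpr = scrapy.Field()   # daily low (f16)
    to_op = scrapy.Field()     # today's open (f17)
    yes_op = scrapy.Field()    # previous close (f18)
```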
MySpider.py
```python
import re

import scrapy
from ..items import GPItem


class MySpider(scrapy.Spider):
    name = 'mySpider'
    # Eastmoney list API: pn is the page number, pz the page size, fields selects the columns
    start_urls = ["http://73.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=20&po=1&np=1"
                  "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,m:1+t:23"
                  "&fields=f2,f3,f4,f5,f6,f7,f12,f13,f14,f15,f16,f17,f18&_=1602901412583"]

    def start_requests(self):  # called once when the crawl starts
        url = MySpider.start_urls[0]
        yield scrapy.Request(url=url, callback=self.parse)  # hand the response to parse()

    # one record in the response looks like:
    # {"f2":14.11,"f3":38.74,"f4":3.94,"f5":1503670,"f6":2115845584.0,"f7":23.99,
    #  "f12":"601568","f13":1,"f14":"N北元","f15":14.64,"f16":12.2,"f17":12.2,"f18":10.17}
    def parse(self, response):
        data = response.text
        pat = r'"diff":\[\{(.*?)\}\]'
        data_t = re.compile(pat, re.S).findall(data)
        datas = data_t[0].strip("{").strip("}").split('},{')
        for i in range(len(datas)):
            item = GPItem()
            datab = datas[i].replace('"', "").split(',')  # the individual key:value pairs of record i
            # item["count"] = str(i)
            item['code'] = datab[6].split(":")[1]
            item['name'] = datab[8].split(":")[1]
            item['new_pr'] = datab[0].split(":")[1]
            item['rd_ran'] = datab[1].split(":")[1]
            item['rd_pr'] = datab[2].split(":")[1]
            item['deal_n'] = datab[3].split(":")[1]
            item['deal_pr'] = datab[4].split(":")[1]
            item['rdp'] = datab[5].split(":")[1]
            item['new_hpr'] = datab[9].split(":")[1]
            item['new_lpr'] = datab[10].split(":")[1]
            item['to_op'] = datab[11].split(":")[1]
            item['yes_op'] = datab[12].split(":")[1]
            yield item
```
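The parse method above picks fields out by position after splitting the raw string, which breaks if the API reorders its keys. An alternative, shown here only as a sketch and not what the report actually uses, is to let the json module parse the response; it assumes the response keeps the usual `{"data": {"diff": [...]}}` shape that the regex above also relies on. Inside parse() one would call this helper with response.text and copy each dict into a GPItem.

```python
# Sketch: parsing the same response with the json module instead of regex/split.
import json


def parse_diff(text):
    """Return one dict of readable fields per stock record in the API response."""
    payload = json.loads(text)
    rows = []
    for rec in payload["data"]["diff"]:
        rows.append({
            "code": str(rec["f12"]),    # stock code
            "name": rec["f14"],         # stock name
            "new_pr": str(rec["f2"]),   # latest price
            "rd_ran": str(rec["f3"]),   # change percentage
            "rd_pr": str(rec["f4"]),    # change amount
            "deal_n": str(rec["f5"]),   # volume
            "deal_pr": str(rec["f6"]),  # turnover
            "rdp": str(rec["f7"]),      # amplitude
        })
    return rows
```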
pipelines.py
```python
import pymysql


class GPPipeline(object):
    # called once when the spider opens
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the database
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="123456", db='xy669', charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute('delete from gptable')  # clear out the old records
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("Crawled", self.count, "rows of stock data in total")

    def process_item(self, item, spider):
        try:
            print(str(self.count) + '\t' + item['code'] + '\t' + item['name'] + '\t'
                  + item['new_pr'] + '\t' + item['rd_ran'] + '\t' + item['rd_pr'] + '\t'
                  + item['deal_n'] + '\t' + item['deal_pr'] + '\t' + item['rdp'])
            print()
            if self.opened:
                self.count += 1  # count doubles as the primary key, starting from 1
                # insert the record into the table; column names match the gptable DDL above
                self.cursor.execute(
                    "insert into gptable(count,code,name,new_pr,rd_ran,rd_pr,deal_n,deal_pr,rpd) "
                    "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (str(self.count), item['code'], item['name'], item['new_pr'], item['rd_ran'],
                     item['rd_pr'], item['deal_n'], item['deal_pr'], item['rdp']))
        except Exception as err:
            print("wrong error:" + str(err))
        return item
```
Experiment results
Reflections:
The second task reworks the output of the previous crawler; after working through the first task, this one went much more smoothly.
-
Task ③:
-
Requirement: master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage route to crawl foreign exchange data.
-
Candidate site: China Merchants Bank FX quotes: http://fx.cmbchina.com/hq/
-
Output: MySQL storage and output format
| Id | Currency | TSP | CSP | TBP | CBP | Time |
| --- | --- | --- | --- | --- | --- | --- |
| 1 | 港币 | 86.60 | 86.60 | 86.26 | 85.65 | 15:36:30 |
| 2 | ...... | | | | | |
-
Implementation code
mtable
```sql
CREATE TABLE `xy669`.`mtable` (
    `Id` VARCHAR(8) NOT NULL,
    `Currency` VARCHAR(64) NULL,
    `TSP` VARCHAR(64) NULL,
    `CSP` VARCHAR(64) NULL,
    `TBP` VARCHAR(64) NULL,
    `CBP` VARCHAR(64) NULL,
    `Time` VARCHAR(64) NULL,
    PRIMARY KEY (`Id`)
);
```
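The items.py for this task is also not shown; a minimal assumed BCItem covering the fields the spider fills in could be:

```python
# items.py -- assumed definition of BCItem (not shown in the original report)
import scrapy


class BCItem(scrapy.Item):
    Currency = scrapy.Field()  # currency name
    TSP = scrapy.Field()       # selling price (spot)
    CSP = scrapy.Field()       # selling price (cash)
    TBP = scrapy.Field()       # buying price (spot)
    CBP = scrapy.Field()       # buying price (cash)
    Time = scrapy.Field()      # quote time
```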
MySpider.py
```python
import scrapy
from ..items import BCItem
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit


class MySpider(scrapy.Spider):
    name = 'mySpider'
    start_urls = 'http://fx.cmbchina.com/hq/'

    def start_requests(self):  # called once when the crawl starts
        url = MySpider.start_urls
        yield scrapy.Request(url=url, callback=self.parse)  # hand the response to parse()

    def parse(self, response):
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        # locate the rows of the exchange-rate table
        trs = selector.xpath("//div[@id='realRateInfo']/table/tr")
        # count = 1
        for tr in trs[1:]:  # the first row is the header
            item = BCItem()
            # item['Id'] = str(count)
            # count += 1
            item['Currency'] = tr.xpath("./td[position()=1]/text()").extract_first().strip()
            item['TSP'] = tr.xpath("./td[position()=4]/text()").extract_first().strip()
            item['CSP'] = tr.xpath("./td[position()=5]/text()").extract_first().strip()
            item['TBP'] = tr.xpath("./td[position()=6]/text()").extract_first().strip()
            item['CBP'] = tr.xpath("./td[position()=7]/text()").extract_first().strip()
            item['Time'] = tr.xpath("./td[position()=8]/text()").extract_first().strip()
            yield item
```
pipelines.py
```python
import pymysql


class BCPipeline(object):
    # called once when the spider opens
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the database
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="123456", db='xy669', charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute('delete from mtable')  # clear out the old records
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("Crawled", self.count, "currencies in total")

    def process_item(self, item, spider):
        try:
            print()
            if self.opened:
                self.count += 1  # count doubles as Id, starting from 1
                # insert the record into mtable
                self.cursor.execute(
                    "insert into mtable(Id,Currency,TSP,CSP,TBP,CBP,Time) "
                    "values(%s,%s,%s,%s,%s,%s,%s)",
                    (str(self.count), item['Currency'], item['TSP'], item['CSP'],
                     item['TBP'], item['CBP'], item['Time']))
        except Exception as err:
            print("wrong error:" + str(err))
        return item
```
Experiment results
Reflections:
This task reuses the approach from the first one, the same recipe in a different wrapper, and it deepened my understanding and use of XPath.