第四次作业

作业①

1)、Scrapy+Xpath+MySQL数据库存储技术路线爬取当当网站图书数据实验

主函数:
import scrapy
from ..items import BookItem
from bs4 import UnicodeDammit
class MySpider(scrapy.Spider):
    """Spider that crawls book listings from dangdang.com search results.

    Requests the search page for the class-level keyword, yields one
    BookItem per book on each result page, and follows the "next page"
    link until it disappears.
    """
    name = "mySpider"
    key = 'python'                              # search keyword
    source_url = 'http://search.dangdang.com/'  # search endpoint

    def start_requests(self):
        # Build the first search-results URL from the class-level keyword.
        url = MySpider.source_url + "?key=" + MySpider.key
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract book fields from one result page and queue the next page."""
        try:
            # The site may serve UTF-8 or GBK; let UnicodeDammit pick the right one.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # BUGFIX: the original predicate ['@ddt-pit'] was a quoted string
            # literal, which is always true; [@ddt-pit] actually tests that
            # the li element carries the ddt-pit attribute.
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()- 1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title ").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                # The date text starts with a leading separator character; drop it.
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
            # Follow pagination until the "next" link is absent.
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            # Best-effort scraping: report and continue rather than abort the crawl.
            print(err)
pipelines:
from itemadapter import ItemAdapter
import pymysql
class BookPipeline(object):
    """Item pipeline that stores crawled books into the MySQL table `books`.

    One connection is opened per spider run, the table is emptied, each
    item is inserted with a sequential id, and the transaction is
    committed once in close_spider.
    """

    def open_spider(self, spider):
        print("opened")
        # BUGFIX: initialise state before the try block so close_spider
        # never hits AttributeError on self.count when the connection fails.
        self.opened = False
        self.count = 0
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="123456", db="mydb", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # Start from an empty table on every run (assumes `books` exists).
            self.cursor.execute("delete from books")
            self.opened = True
        except Exception as err:
            print(err)

    def close_spider(self, spider):
        if self.opened:
            # Single commit for the whole run keeps inserts atomic per crawl.
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "本书籍")

    def process_item(self, item, spider):
        try:
            if self.opened:
                self.count += 1
                # The row id is the 1-based position of the item in this run.
                row_id = str(self.count)
                # BUGFIX: placeholders were written as "% s"; use the standard
                # pymysql placeholder "%s" for parameterised inserts.
                self.cursor.execute(
                    "insert into books (id,bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) "
                    "values(%s,%s,%s,%s,%s,%s,%s)",
                    (row_id, item["title"], item["author"], item["publisher"],
                     item["date"], item["price"], item["detail"]))
        except Exception as err:
            print(err)
        return item
items:
import scrapy
class BookItem(scrapy.Item):
    """Container for one crawled book record from dangdang.com."""

    title = scrapy.Field()      # book title
    author = scrapy.Field()     # first listed author
    date = scrapy.Field()       # publication date
    publisher = scrapy.Field()  # publishing house
    detail = scrapy.Field()     # short description blurb
    price = scrapy.Field()      # current displayed price
settings:
# Register the MySQL storage pipeline (priority 300; lower numbers run first).
ITEM_PIPELINES = {
    'demo.pipelines.BookPipeline': 300,
}

2)、心得体会:

这次实验除了安装和配置mysql遇到点问题,花了很多时间,其他都挺顺利。代码照着书本上打的,主要是用来理解Scrapy+Xpath+MySQL数据库存储的方法。

作业②

1)、Scrapy+Xpath+MySQL数据库存储技术路线爬取股票相关信息实验

主函数:
import scrapy
from selenium import webdriver
from ..items import StockItem
class MySpider(scrapy.Spider):
    """Spider that scrapes the A-share board table from Eastmoney.

    The listing is rendered by JavaScript, so parse() re-opens the page
    in a Selenium-driven Chrome browser and reads the table rows from
    the live DOM rather than from the Scrapy response body.
    """
    name = 'stock'

    def start_requests(self):
        url = 'http://quote.eastmoney.com/center/gridlist.html#hs_a_board'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # NOTE(review): find_elements_by_xpath was removed in Selenium 4;
        # this code assumes a Selenium 3 installation — confirm the version.
        driver = webdriver.Chrome()
        try:
            driver.get("http://quote.eastmoney.com/center/gridlist.html#hs_a_board")
            # Avoid shadowing the builtins `list` and `id` used by the original.
            rows = driver.find_elements_by_xpath("//table[@class='table_wrapper-table']/tbody/tr")
            for row in rows:
                # Column positions follow the on-page table layout.
                item = StockItem()
                item["id"] = row.find_elements_by_xpath("./td[position()=1]")[0].text
                item["Ticker_symbol"] = row.find_elements_by_xpath("./td[position()=2]/a")[0].text
                item["stock_name"] = row.find_elements_by_xpath("./td[position()=3]/a")[0].text
                item["Latest_offer"] = row.find_elements_by_xpath("./td[position()=5]/span")[0].text
                item["ChangeRate"] = row.find_elements_by_xpath("./td[position()=6]/span")[0].text
                item["ChangePrice"] = row.find_elements_by_xpath("./td[position()=7]/span")[0].text
                item["Volume"] = row.find_elements_by_xpath("./td[position()=8]")[0].text
                item["Turnover"] = row.find_elements_by_xpath("./td[position()=9]")[0].text
                item["Amplitude"] = row.find_elements_by_xpath("./td[position()=10]")[0].text
                item["Highest"] = row.find_elements_by_xpath("./td[position()=11]/span")[0].text
                item["Lowest"] = row.find_elements_by_xpath("./td[position()=12]/span")[0].text
                item["Open_today"] = row.find_elements_by_xpath("./td[position()=13]/span")[0].text
                item["Yesterday"] = row.find_elements_by_xpath("./td[position()=14]")[0].text
                yield item
        except Exception as err:
            print(err)
        finally:
            # BUGFIX: the browser was never closed, leaking one Chrome
            # process per parsed page.
            driver.quit()
pipelines:
import pymysql
class StockPipeline:
    """Pipeline that persists stock items into the MySQL table `stock`.

    The table is dropped and re-created on every run; all inserts are
    committed once when the spider closes.
    """

    def open_spider(self, spider):
        print("opened")
        # Initialise state before the try block so the failure path leaves
        # the pipeline in a safe, fully-defined state.
        self.opened = False
        self.count = 0
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="mydb",
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # Rebuild the table from scratch each run.
            self.cursor.execute("DROP TABLE IF EXISTS stock")
            self.cursor.execute("CREATE TABLE IF NOT EXISTS stock("
                                "Sid int PRIMARY KEY,"
                                "Ssymbol VARCHAR(32),"
                                "Sname VARCHAR(32),"
                                "Soffer VARCHAR(32),"
                                "SchangeRate VARCHAR(32),"
                                "SchangePrice VARCHAR(32),"
                                "Svolume VARCHAR(32),"
                                "Sturnover VARCHAR(32),"
                                "Samplitude VARCHAR(32),"
                                "Shighest VARCHAR(32),"
                                "Slowest VARCHAR(32),"
                                "Stoday VARCHAR(32),"
                                "Syesterday VARCHAR(32))")
            self.opened = True
        except Exception as err:
            print(err)

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        # Consistency fix: the other pipelines report their row totals;
        # this one initialised a counter but never used it.
        print("总共爬取", self.count, "条信息")

    def process_item(self, item, spider):
        try:
            if self.opened:
                # FIX: self.count was initialised but never incremented.
                self.count += 1
                self.cursor.execute(
                    "insert into stock(Sid,Ssymbol,Sname,Soffer,SchangeRate,SchangePrice,Svolume,Sturnover,Samplitude,Shighest,Slowest,Stoday,Syesterday) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (item["id"], item["Ticker_symbol"], item["stock_name"], item["Latest_offer"], item["ChangeRate"], item["ChangePrice"],
                     item["Volume"], item["Turnover"], item["Amplitude"], item["Highest"], item["Lowest"], item["Open_today"], item["Yesterday"]))
        except Exception as err:
            print(err)
        return item
items:
import scrapy
class StockItem(scrapy.Item):
    """Container for one row of the Eastmoney A-share board table."""

    id = scrapy.Field()             # sequential row number on the page
    Ticker_symbol = scrapy.Field()  # stock code
    stock_name = scrapy.Field()     # stock name
    Latest_offer = scrapy.Field()   # latest price
    ChangeRate = scrapy.Field()     # change percentage
    ChangePrice = scrapy.Field()    # change amount
    Volume = scrapy.Field()         # trading volume
    Turnover = scrapy.Field()       # turnover value
    Amplitude = scrapy.Field()      # price amplitude
    Highest = scrapy.Field()        # intraday high
    Lowest = scrapy.Field()         # intraday low
    Open_today = scrapy.Field()     # today's opening price
    Yesterday = scrapy.Field()      # yesterday's closing price
settings:
# Register the MySQL storage pipeline (priority 300; lower numbers run first).
ITEM_PIPELINES = {
    'stock.pipelines.StockPipeline': 300,
}

2)、心得体会:

本来以为安装chrome驱动程序会比较繁琐,没想到根据ppt和网上的教程,弄了几分钟就一次性成功了。这次实验还是爬取股票,已经算是挺熟悉的了,不过上次实验因为时间紧,没有用xpath,所以这次多花了些时间换成使用xpath进行爬取。刚开始觉得scrapy框架很麻烦,但随着使用次数的增加,我觉得自己差不多习惯使用它,也越发熟练了。

作业③

1)、使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据实验

主函数:
import scrapy
from bs4 import UnicodeDammit
from ..items import BankItem
class mySider(scrapy.spiders.Spider):
    """Spider that scrapes foreign-exchange quotes from China Merchants Bank."""

    name = "bank"
    source_url = 'http://fx.cmbchina.com/hq/'

    def start_requests(self):
        # Single entry page; the quote table has no pagination.
        yield scrapy.Request(url=self.source_url, callback=self.parse)

    def parse(self, response):
        """Yield one BankItem per data row of the real-time rate table."""
        try:
            # The site may serve UTF-8 or GBK; detect and decode accordingly.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            selector = scrapy.Selector(text=dammit.unicode_markup)
            table = selector.xpath("//div[@class='box hq']/div[@id='realRateInfo']/table[@class='data']")
            # Map each item field to the XPath of its table column.
            columns = {
                "Currency": "./td[position()=1]/text()",
                "TSP": "./td[position()=4]/text()",
                "CSP": "./td[position()=5]/text()",
                "TBP": "./td[position()=6]/text()",
                "CBP": "./td[position()=7]/text()",
                "Time": "./td[position()=8]/text()",
            }
            # Skip the first row: it is the table header.
            for row in table.xpath("./tr")[1:]:
                item = BankItem()
                for field, path in columns.items():
                    value = row.xpath(path).extract_first()
                    item[field] = value.strip() if value else ""
                yield item
        except Exception as err:
            print(err)
pipelines:
import pymysql
from itemadapter import ItemAdapter
class BankPipeline:
    """Pipeline that stores China Merchants Bank FX quotes into MySQL.

    The `bank` table is dropped and re-created for every run; rows are
    numbered by the pipeline's own counter and committed once at close.
    """
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="mydb",charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # Rebuild the table from scratch on every crawl.
            self.cursor.execute("DROP TABLE IF EXISTS bank")
            self.cursor.execute("CREATE TABLE IF NOT EXISTS bank("
                                "id int PRIMARY KEY,"
                                "Currency VARCHAR(32),"
                                "TSP VARCHAR(32),"
                                "CSP VARCHAR(32),"
                                "TBP VARCHAR(32),"
                                "CBP VARCHAR(32),"
                                "TIME VARCHAR(32))")
            # opened gates all later DB work; count numbers the inserted rows.
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False
    def close_spider(self, spider):
        if self.opened:
            # One commit for the whole run; nothing is persisted on failure.
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "条信息")
    def process_item(self, item, spider):
        try:
            print(item)
            if self.opened:
                # The running count doubles as the primary-key id of the row.
                self.count += 1
                self.cursor.execute(
                    "insert into bank(id,Currency,TSP,CSP,TBP,CBP,Time) values(%s,%s,%s,%s,%s,%s,%s)",
                    (self.count, item["Currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["Time"]))
        except Exception as err:
            print(err)
        return item
items:
import scrapy
class BankItem(scrapy.Item):
    """Container for one foreign-exchange quote row."""

    Currency = scrapy.Field()  # currency name
    TSP = scrapy.Field()       # telegraphic selling price
    CSP = scrapy.Field()       # cash selling price
    TBP = scrapy.Field()       # telegraphic buying price
    CBP = scrapy.Field()       # cash buying price
    Time = scrapy.Field()      # quote timestamp
settings:
# Register the MySQL storage pipeline (priority 300; lower numbers run first).
ITEM_PIPELINES = {
    'bank.pipelines.BankPipeline': 300,
}

2)、心得体会:

这次实验挺容易的,在相应网站上用浏览器的开发者工具(F12)找到所要爬取的信息对应的元素,套入上面两题的模板就成功了。

posted @ 2020-11-03 21:32  二末三初  阅读(76)  评论(0编辑  收藏  举报