第四次大作业

第四次作业

作业①
要求：熟练掌握 scrapy 中Item、Pipeline 数据的序列化输出方法,使用Scrapy+Xpath+MySQL数据库存储技术路线爬取当当网站图书数据
候选网站：http://www.dangdang.com/
关键词：学生自由选择
输出信息：

id	title	author	publisher	date	price	detail
1	Python算法图解	何韬	清华大学出版社	2021-04-01	¥34.50	用到算法。数据结构是算法的基础，数组、字典、堆、栈、链表...
..	..	..	..	..	..	..

码云地址：https://gitee.com/eat-watermelon-bu/crawl_project/tree/master/%E7%AC%AC%E5%9B%9B%E6%AC%A1%E5%A4%A7%E4%BD%9C%E4%B8%9A/%E4%BD%9C%E4%B8%9A%E4%B8%80

思路

直接复现教参代码，学习其代码风格，这里就不重复造轮子（reinvent the wheel）了。

核心代码

# spider 文件
class BookSpider(scrapy.Spider):
    """Crawl Dangdang search results for ``key`` and yield one DdItem per book.

    The crawl stops after ``max_items`` items have been yielded or after
    ``max_pages`` result pages have been parsed, whichever comes first.
    """
    name = 'book'
    allowed_domains = ['dangdang.com']
    # Search keyword; it is appended to the search URL as the ``key`` parameter.
    key = 'python'
    source_url = 'http://search.dangdang.com/'
    # Number of the result page currently being parsed (starts at 1).
    page = 1
    # Ordinal (starting at 1) of the next item to be yielded.
    count = 1
    # Crawl limits: at most this many result pages / yielded items.
    max_pages = 4
    max_items = 108

    def start_requests(self):
        """Issue the first search request for ``key``."""
        url = self.source_url + "?key=" + self.key
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield a DdItem per result row, then follow the "next page" link.

        Missing fields are stored as empty strings. No further page is
        requested once ``max_items`` items or ``max_pages`` pages are reached.
        """
        # Let UnicodeDammit choose between the two candidate encodings.
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        selector = scrapy.Selector(text=dammit.unicode_markup)
        # [@ddt-pit] tests for the attribute. The previously quoted form
        # ['@ddt-pit'] is a non-empty string literal, which is always true
        # and therefore filtered nothing.
        lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")

        def clean(value):
            # extract_first() returns None when the node is absent.
            return value.strip() if value else ""

        for li in lis:
            if self.count > self.max_items:
                break

            # Book title
            title = li.xpath("./a[position()=1]/@title").extract_first()
            # Price
            price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
            # Author
            author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
            # Publication date
            date = li.xpath("./p[@class='search_book_author']/span[position()=last()- 1]/text()").extract_first()
            # Publisher
            publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title ").extract_first()
            # Short description
            detail = li.xpath("./p[@class='detail']/text()").extract_first()

            item = DdItem()
            item["title"] = clean(title)
            item["author"] = clean(author)
            # The first character of the stripped date text is dropped.
            item["date"] = date.strip()[1:] if date else ""
            item["publisher"] = clean(publisher)
            item["price"] = clean(price)
            item["detail"] = clean(detail)

            self.count += 1
            yield item

        if self.count > self.max_items:
            # Limit reached: report once and do not schedule further pages.
            print('结束爬取，共爬取{}项'.format(self.max_items))
            return

        if self.page >= self.max_pages:
            return

        link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
        if link:
            # Count the page before requesting it.
            self.page += 1
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse)


# items文件

class DdItem(scrapy.Item):
    """One book record scraped from a Dangdang search-result row.

    BookSpider.parse stores every field as a stripped string, or as an empty
    string when the corresponding node is missing from the page.
    """
    # Book title (taken from the first link's ``title`` attribute).
    title = scrapy.Field()
    # Author name.
    author = scrapy.Field()
    # Publisher name.
    publisher = scrapy.Field()
    # Publication date text with its first character removed by the spider.
    date = scrapy.Field()
    # Current price text as shown on the page.
    price = scrapy.Field()
    # Short description text.
    detail = scrapy.Field()

'''
pipeline文件
'''
# 数据库类，用以存取数据进数据库
class bookDB:
    """Small helper around the SQLite database that stores scraped books."""

    def openDB(self, path="book.db"):
        """Connect to the SQLite database at *path* and open a cursor.

        SQLite creates the file when it does not exist yet. *path* defaults
        to the original hard-coded ``book.db``.
        """
        self.con = sqlite3.connect(path)
        self.cursor = self.con.cursor()

    def createTB(self):
        """Make sure an empty ``books`` table exists.

        A missing table is created; an existing one has all its rows removed.
        Any other database error propagates instead of being mistaken for
        "table already exists".
        """
        self.cursor.execute(
            "create table if not exists books(title varchar(100),author varchar(100),date varchar(100),publisher varchar(100),price varchar(10),detail varchar(300))")
        self.cursor.execute("delete from books")

    def closeDB(self):
        """Commit pending changes and close the connection."""
        try:
            self.con.commit()
        finally:
            # Close even when the commit fails so the handle is not leaked.
            self.con.close()

    def insert(self, title, author, date, publisher, price, detail):
        """Insert one book row; a failure is printed and otherwise ignored."""
        try:
            self.cursor.execute(
                "insert into books(title,author,date,publisher,price,detail) values (?,?,?,?,?,?)",
                (title, author, date, publisher, price, detail))
        except Exception as err:
            # Best effort: one bad row must not abort the whole crawl.
            print(err)










class DdPipeline:
    """Scrapy item pipeline that stores every book item in the SQLite table.

    The ``books`` table is (re)initialised once, when the first item arrives.
    """
    # if_start marks whether the table has been initialised; 0 means not yet.
    if_start = 0

    def process_item(self, item, spider):
        """Insert *item* into the database, echo it, and pass it on.

        Returns the item so that any later pipeline stage still receives it.
        """
        title = item['title']
        author = item['author']
        date = item['date']
        publisher = item['publisher']
        price = item['price']
        detail = item['detail']

        # Open a connection for this item.
        book_db = bookDB()
        book_db.openDB()
        try:
            if self.if_start == 0:
                self.if_start = 1
                book_db.createTB()
                print('Books Table Established')
                print()
            book_db.insert(title, author, date, publisher, price, detail)
        finally:
            # Commit and close even if table creation or the insert failed.
            book_db.closeDB()

        print(title)
        print(author)
        print(date)
        print(publisher)
        print(price)
        print(detail)
        print()
        return item

截图

心得

进一步熟悉了scrapy框架的使用，通过学习范例代码，学会了数据项可能为空时的操作，例如

item["title"] = title.strip() if title else ""

作业②

要求：熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法；使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。
候选网站：招商银行网：http://fx.cmbchina.com/hq/
输出信息：MYSQL数据库存储和输出格式

id	currency	tsp	csp	tbp	cbp	time
1	港币	82.23	82.23	81.91	81.33	10:22:05
..	..	..	..	..	..	..

码云地址 https://gitee.com/eat-watermelon-bu/crawl_project/tree/master/%E7%AC%AC%E5%9B%9B%E6%AC%A1%E5%A4%A7%E4%BD%9C%E4%B8%9A/%E4%BD%9C%E4%B8%9A%E4%BA%8C

思路

单页爬取，直接用xpath定位元素

核心代码

class CurrencySpider(scrapy.Spider):
    """Scrape the exchange-rate table of fx.cmbchina.com into BankItem objects."""
    name = 'currency'
    # allowed_domains takes bare domain names, not URLs.
    allowed_domains = ['fx.cmbchina.com']
    # The single page that is crawled.
    url = 'http://fx.cmbchina.com/Hq'

    def start_requests(self):
        """Request the fixed start page."""
        yield scrapy.Request(self.url)

    def parse(self, response):
        """Yield one BankItem per table row below the first row.

        Cells are read relative to their own row, so a row with a missing
        cell yields an empty string for that field instead of shifting the
        values of every following row or raising IndexError.
        """
        # Skip the first <tr>, matching the original position()>1 filter.
        rows = response.xpath('//div[@id="realRateInfo"]//tr[position()>1]')

        # Field name -> XPath relative to one row.
        cell_paths = (
            ('currency', './td[1]/text()'),
            ('tsp', './td[@class="numberright"][1]/text()'),
            ('csp', './td[@class="numberright"][2]/text()'),
            ('tbp', './td[@class="numberright"][3]/text()'),
            ('cbp', './td[@class="numberright"][4]/text()'),
            ('time', './td[@align="center"][3]/text()'),
        )

        for row in rows:
            item = BankItem()
            for field, path in cell_paths:
                text = row.xpath(path).extract_first()
                item[field] = text.strip() if text else ''
            yield item

        print('All data collected successfully')

'''
pipelines 文件
'''
class currencyDB:
    """Small helper around the SQLite database that stores exchange rates."""

    def openDB(self, path="currency.db"):
        """Connect to the SQLite database at *path* and open a cursor.

        SQLite creates the file when it does not exist yet. *path* defaults
        to the original hard-coded ``currency.db``.
        """
        self.con = sqlite3.connect(path)
        self.cursor = self.con.cursor()

    def createTB(self):
        """Make sure an empty ``currencies`` table exists.

        A missing table is created; an existing one has all its rows removed.
        Any other database error propagates instead of being mistaken for
        "table already exists".
        """
        self.cursor.execute(
            "create table if not exists currencies(currency varchar(100),tsp varchar(100),csp varchar(100),tbp varchar(100),cbp varchar(100),date varchar(100))")
        self.cursor.execute("delete from currencies")

    def closeDB(self):
        """Commit pending changes and close the connection."""
        try:
            self.con.commit()
        finally:
            # Close even when the commit fails so the handle is not leaked.
            self.con.close()

    def insert(self, currency, tsp, csp, tbp, cbp, date):
        """Insert one rate row; a failure is printed and otherwise ignored."""
        try:
            self.cursor.execute(
                "insert into currencies(currency,tsp,csp,tbp,cbp,date) values (?,?,?,?,?,?)",
                (currency, tsp, csp, tbp, cbp, date))
        except Exception as err:
            # Best effort: one bad row must not abort the whole crawl.
            print(err)




class BankPipeline:
    """Scrapy item pipeline that stores every rate item in the SQLite table.

    The ``currencies`` table is (re)initialised once, when the first item
    arrives.
    """
    # if_start marks whether the table has been initialised; 0 means not yet.
    if_start = 0

    def process_item(self, item, spider):
        """Insert *item* into the database, echo it, and pass it on.

        Returns the item so that any later pipeline stage still receives it.
        """
        currency = item['currency']
        tsp = item['tsp']
        csp = item['csp']
        tbp = item['tbp']
        cbp = item['cbp']
        tm = item['time']

        # Open a connection for this item.
        currency_db = currencyDB()
        currency_db.openDB()
        try:
            if self.if_start == 0:
                self.if_start = 1
                currency_db.createTB()
                print('Currency Table Established')
            currency_db.insert(currency, tsp, csp, tbp, cbp, tm)
        finally:
            # Commit and close even if table creation or the insert failed.
            currency_db.closeDB()

        print(currency)
        print(tsp)
        print(csp)
        print(tbp)
        print(cbp)
        print(tm)
        print()
        return item

'''
items文件
'''
class BankItem(scrapy.Item):
    """One row of the exchange-rate table scraped by CurrencySpider.

    All values are stored as stripped strings.
    """
    # Currency name, read from the first cell of the row.
    currency = scrapy.Field()
    # tsp/csp/tbp/cbp hold the 1st..4th "numberright" cells of the row.
    tsp = scrapy.Field()
    csp = scrapy.Field()
    tbp = scrapy.Field()
    cbp = scrapy.Field()
    # Text of the row's third centre-aligned cell.
    time = scrapy.Field()

截图

心得

此次实验进一步加深了对xpath定位的理解，此网页目标数据全在tr标签下，但是全部提取tr不妥当，

如图

应该从第二个tr标签开始提取，用到

`//tr[position()>1]` 的用法

同样，目标数据存在 `td[@class="numberright"]` 下，要通过类似二维数组的方法精确定位每个数据项，例如用 `td[@class="numberright"][1]` 定位到 tsp

作业③

要求：熟练掌握 Selenium查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容；使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。
候选网站：东方财富网
输出信息：MySQL数据库存储和输出格式如下

序号	股票代码	股票名称	最新报价	涨跌幅	涨跌额	成交量	成交额	振幅	最高	最低	今开	昨收
1	688093	N世华	28.47	62.22%	10.92	26.13万	7.6亿	22.3%	32.0	28.08	30.2	17.55
2	...	...	...	...

码云地址 https://gitee.com/eat-watermelon-bu/crawl_project/tree/master/%E7%AC%AC%E5%9B%9B%E6%AC%A1%E5%A4%A7%E4%BD%9C%E4%B8%9A/%E4%BD%9C%E4%B8%9A%E4%B8%89

思路

通过selenium模拟浏览器，不需要找javascript下具体数据。通过模拟点击翻页按钮实现翻页

核心代码

'''@031904108fty
'''
import sqlite3
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions
import json
import csv
import time



class StocksSpider():
    """Scrape stock quote tables with Selenium and store the rows in SQLite.

    Typical use: call ``crawl(url)`` once per board page, then ``closeDB()``.
    Each ``crawl`` starts its own browser and quits it before returning; the
    database connection is opened on first use and shared by later crawls.
    """
    # tbCreated records whether the stocks table was ensured; 0 = not yet.
    tbCreated = 0
    # isLast becomes True when no clickable "next page" button can be found.
    isLast = False
    # Handles are None while closed so open/close can be called safely.
    con = None
    cursor = None
    browser = None
    wait = None
    # Number of table pages scraped per crawl() call.
    max_pages = 2
    # Upper bound for retries after a TimeoutException (previously unbounded
    # recursion).
    max_retries = 3

    # Column names are kept as-is for compatibility with existing databases;
    # only the misspelled type names were corrected to varchar, so that every
    # column stores the scraped text unchanged.
    _CREATE_SQL = (
        "create table stocks "
        "(num varchar(32),"
        "st_code varchar(32),"
        "st_name varchar(32),"
        "st_price varchar(32),"
        "st_varition_rate varchar(32),"
        "st_varition_value varchar(32),"
        "st_turnover varchar(32),"
        "st_trading_volumn varchar(32),"
        "st_wave varchar(32),"
        "st_max varchar(32),"
        "st_min varchar(32),"
        "st_td varchar(32),"
        "st_yest varchar(32)"
        ")"
    )
    _INSERT_SQL = (
        "insert into stocks (num,st_code,st_name,st_price,st_varition_rate,"
        "st_varition_value,st_turnover,st_trading_volumn,st_wave,st_max,"
        "st_min,st_td,st_yest) values (?,?,?,?,?,?,?,?,?,?,?,?,?)"
    )
    # One XPath per output column, in insertDB()/show() argument order.
    _COLUMN_XPATHS = (
        '//tbody/tr/td[1]',
        '//tbody/tr/td[2]/a',
        '//td[@class="mywidth"]/a',
        '//tbody/tr/td[@class="mywidth2"][1]/span',
        '//tbody/tr/td[@class="mywidth2"][2]/span',
        '//tbody/tr/td[7]/span',
        '//tbody/tr/td[8]',
        '//tbody/tr/td[9]',
        '//tbody/tr/td[10]',
        '//tbody/tr/td[11]/span',
        '//tbody/tr/td[12]/span',
        '//tbody/tr/td[13]/span',
        '//tbody/tr/td[14]',
    )

    def openDB(self):
        """Create or connect to ``stocks.db`` and open a cursor."""
        self.con = sqlite3.connect("stocks.db")
        self.cursor = self.con.cursor()
        print('Connect to stocks.db successfully!')

    def createTB(self):
        """Create the ``stocks`` table unless it already exists.

        Existing rows are kept. Database errors other than "already exists"
        propagate to the caller.
        """
        self.cursor.execute(
            "select 1 from sqlite_master where type='table' and name='stocks'")
        if self.cursor.fetchone() is not None:
            print('Table exists')
            return
        self.cursor.execute(self._CREATE_SQL)

    def commitDB(self):
        """Commit the current transaction."""
        self.con.commit()

    def closeDB(self):
        """Commit and close the connection; does nothing if it is not open."""
        if self.con is None:
            return
        try:
            self.con.commit()
        finally:
            self.con.close()
            self.con = None
            self.cursor = None

    def insertDB(self, nums,
                 st_code,
                 st_name,
                 st_price,
                 st_varition_rate,
                 st_varition_value,
                 st_turnover,
                 st_trading_volumn,
                 st_wave,
                 st_max,
                 st_min,
                 st_td,
                 st_yest):
        """Insert one row per position of the given parallel column lists.

        Rows are formed up to the length of the shortest list. A database
        error is printed and otherwise ignored.
        """
        rows = list(zip(nums, st_code, st_name, st_price, st_varition_rate,
                        st_varition_value, st_turnover, st_trading_volumn,
                        st_wave, st_max, st_min, st_td, st_yest))
        try:
            self.cursor.executemany(self._INSERT_SQL, rows)
        except sqlite3.Error as err:
            print(err)

    def open_browser(self):
        """Start Chrome with a 10 s implicit wait and a 10 s explicit wait."""
        if self.browser is not None:
            # Never leave an earlier browser instance running.
            self.close_browser()
        self.browser = webdriver.Chrome()
        self.browser.implicitly_wait(10)
        self.wait = WebDriverWait(self.browser, 10)

    def show(self,
             nums, codes, names, latest_prices, change_pencent,
             change_amount, turnover_hand, turnover_amount,
             wave, maximum, minimum, today, yesterday
             ):
        """Print a header line followed by one formatted line per table row."""
        # 13 left-aligned, 16-wide fields separated by tabs.
        fmt = "\t".join("{%d:<16}" % index for index in range(13))
        print(fmt.format('序号', '股票代码', '股票名称', '最新报价', '涨跌幅', '涨跌额',
                         '成交量', '成交额', '振幅', '最高', '最低', '今开', '昨收'))
        rows = zip(nums, codes, names, latest_prices, change_pencent,
                   change_amount, turnover_hand, turnover_amount,
                   wave, maximum, minimum, today, yesterday)
        for row in rows:
            print(fmt.format(*row))

    def _column_texts(self, xpath):
        """Wait for all elements matching *xpath* and return their texts."""
        elements = self.wait.until(
            EC.presence_of_all_elements_located((By.XPATH, xpath)))
        return [element.text for element in elements]

    def parse_page(self):
        """Scrape the table of the current page, store it and print it.

        A TimeoutException is retried at most ``max_retries`` times. On a
        StaleElementReferenceException the page is refreshed and the page is
        skipped.
        """
        for _ in range(self.max_retries):
            try:
                # Each column is read on its own, so "today" (td[13]) and
                # "yesterday" (td[14]) stay distinct.
                columns = [self._column_texts(xpath)
                           for xpath in self._COLUMN_XPATHS]
            except selenium.common.exceptions.TimeoutException:
                print('parse_page: TimeoutException')
                continue
            except selenium.common.exceptions.StaleElementReferenceException:
                print('parse_page: StaleElementReferenceException')
                self.browser.refresh()
                return

            # Store the rows, commit right away, then echo them.
            self.insertDB(*columns)
            self.commitDB()
            self.show(*columns)
            return

    def turn_page(self):
        """Click the "next page" button and scroll to the page bottom.

        Sets ``isLast`` when no clickable button is found within
        ``max_retries`` attempts.
        """
        for _ in range(self.max_retries):
            try:
                self.wait.until(EC.element_to_be_clickable(
                    (By.XPATH, '//a[@class="next paginate_button"]'))).click()
            except selenium.common.exceptions.NoSuchElementException:
                self.isLast = True
                return
            except selenium.common.exceptions.TimeoutException:
                print('turn_page: TimeoutException')
                continue
            except selenium.common.exceptions.StaleElementReferenceException:
                print('turn_page: StaleElementReferenceException')
                self.browser.refresh()
                return

            time.sleep(1)
            # Scroll to the bottom, then give the page time to settle.
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
            return

        # The wait reports a missing button as a timeout, so running out of
        # retries is treated as having reached the last page.
        self.isLast = True

    def close_browser(self):
        """Quit the browser; does nothing if it is not running."""
        if self.browser is None:
            return
        try:
            self.browser.quit()
        finally:
            self.browser = None
            self.wait = None

    def crawl(self, url):
        """Scrape up to ``max_pages`` table pages starting at *url*.

        The browser started for this crawl is always quit before returning.
        The database connection stays open until ``closeDB()`` is called.
        """
        self.open_browser()
        try:
            print('开始爬取: ', url)
            self.browser.get(url)
            time.sleep(1)
            # Scroll to the bottom, then give the page time to settle.
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)

            # Reuse an open connection instead of opening one per crawl.
            if self.con is None:
                self.openDB()
            if self.tbCreated == 0:
                self.createTB()
                self.tbCreated = 1

            self.isLast = False
            for page_no in range(1, self.max_pages + 1):
                print('正在爬取第 ' + str(page_no) + ' 页......')
                self.parse_page()
                if page_no == self.max_pages:
                    break
                self.turn_page()
                if self.isLast:
                    break
            print('结束爬取')
        finally:
            self.close_browser()

if __name__ == '__main__':
    # URL template; the placeholder selects the board to open.
    urlfmt = 'http://quote.eastmoney.com/center/gridlist.html#{}_a_board'
    # Board pages in crawl order: hs, sz, sh (Shanghai-Shenzhen, Shenzhen,
    # Shanghai).
    url_list = [urlfmt.format(board) for board in ('hs', 'sz', 'sh')]

    spider = StocksSpider()
    for url in url_list:
        spider.crawl(url)
    # Commit and close the database once every board has been crawled.
    spider.closeDB()

截图

心得

对selenium的理解进一步加深，将代码封装成类具有挑战性，但也为日后复用代码提供了便捷

posted on 2021-11-13 11:16 吃西瓜不吐葡萄皮阅读(15) 评论(0) 编辑收藏举报

刷新页面返回顶部

吃西瓜不吐葡萄皮

导航

公告

第四次大作业

第四次作业

作业①

思路

截图

心得

作业②

思路

截图

心得

作业③

思路

截图

心得