2024 Data Collection and Fusion Technology Practice: Assignment 3

Course this assignment belongs to: <首页 - 2024数据采集与融合技术实践 - 福州大学 - 班级博客 - 博客园 (cnblogs.com)>
Where the assignment requirements are: <作业3 - 作业 - 2024数据采集与融合技术实践 - 班级博客 - 博客园 (cnblogs.com)>
Student ID: <102202126>

I. Assignment Content

Assignment ①

  • Requirement: pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Implement the crawl with the Scrapy framework in both single-threaded and multi-threaded modes. Be sure to limit the crawl by capping the total number of pages (last two digits of the student ID) and the total number of downloaded images (last three digits of the ID).
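  • Note: the page and image caps required above do not need to be hard-coded in the spider; Scrapy's built-in CloseSpider extension can enforce them from the settings. A minimal settings.py sketch, assuming caps of 26 pages and 126 images taken from the trailing digits of the student ID 102202126:

    # settings.py (sketch): stop the crawl once the assignment limits are reached
    CLOSESPIDER_PAGECOUNT = 26    # last two digits of the student ID: max pages fetched
    CLOSESPIDER_ITEMCOUNT = 126   # last three digits: max image items scraped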

  • Code:

    work.py
    import scrapy
    from Practical_work3.items import work1_Item

    class Work1Spider(scrapy.Spider):
        name = 'work1'
        # allowed_domains = ['www.weather.com.cn']
        start_urls = ['http://www.weather.com.cn/']

        def parse(self, response):
            # grab the src attribute of every <img> that sits inside an <a> tag
            img_datas = response.xpath('//a/img/@src')
            for img_data in img_datas:
                item = work1_Item()
                # urljoin turns relative / protocol-relative paths into absolute URLs
                item['img_url'] = response.urljoin(img_data.extract())
                yield item
    
    
    items.py
    
    import scrapy
    class work1_Item(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        img_url = scrapy.Field()
        
    pipelines.py

    import threading
    import urllib.request
    import os
    import pathlib
    from Practical_work3.items import work1_Item

    class work1_Pipeline:
        count = 0
        count_lock = threading.Lock()  # protects count when several download threads run at once
        desktopDir = str(pathlib.Path.home()).replace('\\','\\\\') + '\\Desktop'
        threads = []

        def open_spider(self,spider):
            picture_path=self.desktopDir+'\\images'
            if os.path.exists(picture_path):  # clear any images left over from a previous run
                for root, dirs, files in os.walk(picture_path, topdown=False):
                    for name in files:
                        os.remove(os.path.join(root, name))  # delete the file
                    for name in dirs:
                        os.rmdir(os.path.join(root, name))  # delete the sub-directory
                os.rmdir(picture_path)  # delete the directory itself
            os.mkdir(picture_path)  # recreate an empty directory

        # # Single-threaded version
        # def process_item(self, item, spider):
        #     url = item['img_url']
        #     print(url)
        #     img_data = urllib.request.urlopen(url=url).read()
        #     img_path = self.desktopDir + '\\images\\' + str(self.count)+'.jpg'
        #     with open(img_path, 'wb') as fp:
        #         fp.write(img_data)
        #     self.count = self.count + 1
        #     return item

        # Multi-threaded version
        def process_item(self, item, spider):
            if isinstance(item,work1_Item):
                url = item['img_url']
                print(url)
                T=threading.Thread(target=self.download_img,args=(url,))
                T.daemon = False  # setDaemon() is deprecated; non-daemon threads can finish their downloads
                T.start()
                self.threads.append(T)
            return item

        def download_img(self,url):
            img_data = urllib.request.urlopen(url=url).read()
            with self.count_lock:  # serialise access to count so every image gets a unique file name
                img_path = self.desktopDir + '\\images\\' + str(self.count)+'.jpg'
                self.count = self.count + 1
            with open(img_path, 'wb') as fp:
                fp.write(img_data)

        def close_spider(self,spider):
            for t in self.threads:
                t.join()  # wait for all download threads before the spider closes
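
  • Note: a pipeline only runs once it is enabled in the project settings; a sketch of the settings.py entry, assuming the project name Practical_work3 used in the imports above:

    # settings.py (sketch): without this entry process_item() is never called
    ITEM_PIPELINES = {
        'Practical_work3.pipelines.work1_Pipeline': 300,
        # the work2 / work3 pipelines of the later assignments are added here in the same way
    }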
    
  • Output:

  • Gitee folder link: 陈家凯第三次实践作业

  • Reflections:

    • Single-threaded crawling: simple and easy to follow, and a good fit for beginners. It let me work through the basic logic of a crawler step by step and keep the request rate low enough not to put too much pressure on the target site.
    • Multi-threaded crawling: noticeably faster, but thread safety has to be handled, and the request rate to the target site still needs to be throttled so the IP does not get banned (a settings sketch follows).
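    • The request-rate control mentioned above can largely be left to Scrapy itself; a minimal settings.py sketch (the values are illustrative, not tuned):

      # settings.py (sketch): throttle requests so the target site is not hammered
      DOWNLOAD_DELAY = 1                  # wait about one second between requests
      AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay to server latency
      CONCURRENT_REQUESTS_PER_DOMAIN = 4  # cap parallel requests per domain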

Assignment ②

  • Requirement: become familiar with the serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL storage technical route.
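  • Note: besides the MySQL pipeline, the serialized output mentioned in the requirement can also be produced with Scrapy's feed exports (assuming a reasonably recent Scrapy version); the file name stocks.json below is an arbitrary choice:

    # settings.py (sketch): additionally dump every scraped item to a JSON file
    FEEDS = {
        'stocks.json': {'format': 'json', 'encoding': 'utf8'},
    }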

  • Code:

    work.py
    import scrapy
    import re
    import json
    from Practical_work3.items import work2_Item

    class Work2Spider(scrapy.Spider):
        name = 'work2'
        # allowed_domains = ['25.push2.eastmoney.com']

        start_urls = ['http://25.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124021313927342030325_1696658971596&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18&_=1696658971636']

        # The f-codes returned by the API map to: f2 latest price, f3 change %, f4 change amount,
        # f5 volume, f6 turnover, f7 amplitude %, f12 code, f14 name, f15 high, f16 low,
        # f17 open, f18 previous close
        def parse(self, response):
            data = response.body.decode()
            # the response is JSONP, so pull the "diff" array (one JSON object per stock) out of it
            diff = re.compile(r'"diff":\[(.*?)\]', re.S).findall(data)
            for one_data in re.compile(r'\{(.*?)\}', re.S).findall(diff[0]):
                data_dic = json.loads('{' + one_data + '}')
                item = work2_Item()  # create a fresh item for every stock record
                for k, v in data_dic.items():
                    item[k] = v
                yield item
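
    # Note (not part of work.py above): the Eastmoney endpoint returns JSONP, i.e. JSON wrapped
    # in the jQuery callback named by the "cb" query parameter, which is why parse() digs the
    # "diff" array out with regular expressions. A hedged alternative is to strip the wrapper
    # and decode the whole payload with json; the data -> diff key layout is assumed from the
    # regex above.
    import json
    import re

    def parse_jsonp(text):
        # keep everything between the outermost parentheses of callback( ... ) and decode it
        payload = re.search(r'\((.*)\)\s*;?\s*$', text, re.S).group(1)
        return json.loads(payload)

    # hypothetical usage inside parse():
    #     rows = parse_jsonp(response.text)['data']['diff']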
    
    
    
    
    items.py
    
    import scrapy
    class work2_Item(scrapy.Item):
        f2 = scrapy.Field()
        f3 = scrapy.Field()
        f4 = scrapy.Field()
        f5 = scrapy.Field()
        f6 = scrapy.Field()
        f7 = scrapy.Field()
        f12 = scrapy.Field()
        f14 = scrapy.Field()
        f15 = scrapy.Field()
        f16 = scrapy.Field()
        f17 = scrapy.Field()
        f18 = scrapy.Field()
        
    pipelines.py
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

    import pymysql
    from Practical_work3.items import work2_Item


    class work2_Pipeline:
        def open_spider(self, spider):
            try:
                self.db = pymysql.connect(host='127.0.0.1', user='root', passwd='Cjkmysql.', port=3306, charset='utf8',
                                          database='chenoojkk')
                self.cursor = self.db.cursor()
                # rebuild the table on every run so old rows do not accumulate
                self.cursor.execute('DROP TABLE IF EXISTS stock')
                sql = """CREATE TABLE stock(Latest_quotation Double,Chg Double,up_down_amount Double,turnover Double,transaction_volume Double,
                amplitude Double,id varchar(12) PRIMARY KEY,name varchar(32),highest Double, lowest Double,today Double,yesterday Double)"""
                self.cursor.execute(sql)
            except Exception as e:
                print(e)

        def process_item(self, item, spider):
            if isinstance(item, work2_Item):
                # parameterised query: let pymysql quote the values instead of Python %-formatting
                sql = 'INSERT INTO stock VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
                self.cursor.execute(sql, (item['f2'], item['f3'], item['f4'], item['f5'], item['f6'],
                                          item['f7'], item['f12'], item['f14'], item['f15'], item['f16'],
                                          item['f17'], item['f18']))
                self.db.commit()
            return item

        def close_spider(self, spider):
            self.cursor.close()
            self.db.close()
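
  • Note: a quick way to confirm that the pipeline really wrote the rows is a small standalone query script; this sketch reuses the connection parameters and column names from the pipeline above:

    # verify_stock.py (sketch): read back a few rows written by work2_Pipeline
    import pymysql

    db = pymysql.connect(host='127.0.0.1', user='root', passwd='Cjkmysql.', port=3306,
                         charset='utf8', database='chenoojkk')
    with db.cursor() as cursor:
        cursor.execute('SELECT id, name, Latest_quotation FROM stock LIMIT 5')
        for row in cursor.fetchall():
            print(row)
    db.close()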
    
    
  • Output:

  • Gitee folder link: 陈家凯第三次实践作业

  • Reflections:

    • While storing the data I learned how to connect to MySQL from Python and run insert and update statements. A sensible database design (table structure and indexes) noticeably improves how fast the data can be written and read.
    • During the crawl I hit problems such as failed network requests and data whose format did not match expectations. Exception handling and some debugging let me locate and fix these issues quickly, which made the crawler more stable (a request-level errback sketch follows).
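    • For request-level failures, every Scrapy Request can carry an errback next to its callback; a minimal sketch of how the stock spider could log failed requests (the class and method names here are my own, not part of the code above):

      import scrapy

      class Work2ErrbackDemo(scrapy.Spider):
          name = 'work2_errback_demo'
          start_urls = ['http://25.push2.eastmoney.com/api/qt/clist/get']  # same endpoint as above, query string omitted

          def start_requests(self):
              for url in self.start_urls:
                  # errback fires on DNS failures, timeouts, non-2xx responses, ...
                  yield scrapy.Request(url, callback=self.parse, errback=self.on_error)

          def parse(self, response):
              ...  # the parsing logic from work.py would go here unchanged

          def on_error(self, failure):
              self.logger.error('Request failed: %r', failure)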

Assignment ③

  • Requirement: become familiar with the serialized output of Item and Pipeline data in Scrapy; crawl foreign-exchange data from a bank website using the Scrapy framework + XPath + MySQL storage technical route.

  • Code (Scrapy framework + XPath + MySQL database):

    work.py
    import scrapy
    from Practical_work3.items import work3_Item

    class Work3Spider(scrapy.Spider):
        name = 'work3'
        # allowed_domains = ['www.boc.cn']
        start_urls = ['https://www.boc.cn/sourcedb/whpj/']

        def parse(self, response):
            # every exchange-rate record is one <tr> of the left-aligned table
            data_lists = response.xpath('//table[@align="left"]/tr')
            keys = ['name', 'price1', 'price2', 'price3', 'price4', 'price5', 'date']
            for data_list in data_lists:
                datas = data_list.xpath('.//td')
                if datas:  # skip rows that carry no <td> cells
                    item = work3_Item()
                    # string(.) yields the plain text of each cell, so no HTML tags need stripping
                    str_lists = [td.xpath('string(.)').get(default='').strip() for td in datas]
                    for i in range(min(len(keys), len(str_lists))):
                        item[keys[i]] = str_lists[i]
                    yield item
                    
    items.py
    
    
    import scrapy
    class work3_Item(scrapy.Item):
        name = scrapy.Field()
        price1 = scrapy.Field()
        price2 = scrapy.Field()
        price3 = scrapy.Field()
        price4 = scrapy.Field()
        price5 = scrapy.Field()
        date = scrapy.Field()
        
        
    pipelines.py

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

    import pymysql
    from Practical_work3.items import work3_Item


    class work3_Pipeline:

        def open_spider(self, spider):
            try:
                self.db = pymysql.connect(host='127.0.0.1', user='root', passwd='Cjkmysql.', port=3306, charset='utf8', database='chenoojkk')
                self.cursor = self.db.cursor()
                # rebuild the table on every run so old rows do not accumulate
                self.cursor.execute('DROP TABLE IF EXISTS bank')
                sql = """CREATE TABLE bank(Currency varchar(32),p1 varchar(17),p2 varchar(17),p3 varchar(17),p4 varchar(17),p5 varchar(17),Time varchar(32))"""
                self.cursor.execute(sql)
            except Exception as e:
                print(e)

        def process_item(self, item, spider):
            if isinstance(item, work3_Item):
                # parameterised query: let pymysql quote the values instead of Python %-formatting
                sql = 'INSERT INTO bank VALUES (%s,%s,%s,%s,%s,%s,%s)'
                self.cursor.execute(sql, (item['name'], item['price1'], item['price2'],
                                          item['price3'], item['price4'], item['price5'], item['date']))
                self.db.commit()
            return item

        def close_spider(self, spider):
            self.cursor.close()
            self.db.close()
    
  • Output:

  • Gitee folder link: 陈家凯第三次实践作业

  • Reflections:

    • Precise data extraction: XPath is a very flexible selector language that can pull the required information out of an HTML document efficiently. For the foreign-exchange site I wrote XPath expressions that accurately capture the exchange rates and related fields; a quick scrapy shell check is sketched after this list.

    • Connection and storage: I used Python's MySQL library to open the database connection and run the SQL statements. A suitable table structure and indexes speed up data access, especially as the data volume grows.
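    • A practical way to develop such XPath expressions is to test them interactively in scrapy shell before putting them into the spider; a short sketch of such a session against the BOC page (indexing rows[1] assumes row 0 is the table header):

      $ scrapy shell https://www.boc.cn/sourcedb/whpj/
      >>> rows = response.xpath('//table[@align="left"]/tr')
      >>> rows[1].xpath('.//td').xpath('string(.)').getall()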
