Data Collection and Fusion Technology: Assignment 2

Student ID and name: 102202103 王文豪
Gitee repository: https://gitee.com/wwhpower/project_wwh.git

Assignment ①:

(1) Crawl the 7-day weather forecast for a given set of cities from the China Weather Network (http://www.weather.com.cn) and store it in a database.
The code is as follows:

from bs4 import BeautifulSoup
import urllib.request
import sqlite3

class WeatherDB:
    def openDB(self):
        # Connect to the local SQLite database and make sure the weathers table exists
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS weathers (wCity varchar(16), wDate varchar(16), wWeather varchar(64), wTemp varchar(32), PRIMARY KEY (wCity, wDate))")
            # Clear rows left over from a previous run so re-runs do not hit (wCity, wDate) primary-key conflicts
            self.cursor.execute("DELETE FROM weathers")
        except Exception as err:
            print(f"Database initialization error: {err}")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("INSERT INTO weathers (wCity, wDate, wWeather, wTemp) VALUES (?, ?, ?, ?)", (city, date, weather, temp))
        except Exception as err:
            print(f"Error inserting data: {err}")

    def show(self):
        self.cursor.execute("SELECT * FROM weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
        }
        self.cityCode = {"北京": "101010100"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(f"{city} code cannot be found")
            return

        url = f"http://www.weather.com.cn/weather/{self.cityCode[city]}.shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            with urllib.request.urlopen(req) as response:
                data = response.read()
                soup = BeautifulSoup(data, "lxml")
                # Each <li> under ul.t.clearfix corresponds to one day of the 7-day forecast
                lis = soup.select("ul[class='t clearfix'] li")
                for li in lis:
                    try:
                        date = li.select('h1')[0].text
                        weather = li.select('p[class="wea"]')[0].text
                        # Daytime temperature is in <span>, nighttime temperature in <i>
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                        print(city, date, weather, temp)
                        self.db.insert(city, date, weather, temp)
                    except Exception as err:
                        print(f"Error parsing data: {err}")
        except Exception as err:
            print(f"Error fetching data for {city}: {err}")

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        self.db.show()
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京"])
print("completed")

Screenshot of the crawl results:

(2) Reflections
In this experiment, I used the BeautifulSoup library to parse HTML documents and analyzed the page structure, which strengthened my ability to locate and extract elements and tags. Creating a database and using it to store the results also improved my ability to manage persisted data.
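To make the element-selection step above concrete, here is a minimal sketch run against a made-up HTML fragment (not taken from weather.com.cn) shaped like the forecast list that forecastCity parses:

from bs4 import BeautifulSoup

# Hypothetical HTML fragment shaped like the 7-day forecast list
html = """
<ul class="t clearfix">
  <li>
    <h1>22日(今天)</h1>
    <p class="wea">多云</p>
    <p class="tem"><span>25</span>/<i>18℃</i></p>
  </li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
for li in soup.select("ul[class='t clearfix'] li"):
    date = li.select_one("h1").text
    weather = li.select_one("p.wea").text
    temp = li.select_one("p.tem span").text + "/" + li.select_one("p.tem i").text
    print(date, weather, temp)  # 22日(今天) 多云 25/18℃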

Assignment ②:

(1) Use the requests and BeautifulSoup libraries to crawl stock information from a designated source and store it in a database.
Packet capture:

The code is as follows:

import requests
import json
import sqlite3

class StockDB:
    """数据库操作类"""
    def __init__(self, db_name='stock.db'):
        self.db_name = db_name

    def openDB(self):
        """打开数据库"""
        self.conn = sqlite3.connect(self.db_name)
        self.cursor = self.conn.cursor()
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS stocks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                code TEXT NOT NULL,
                name TEXT NOT NULL,
                current_price REAL NOT NULL,
                open_price REAL,
                high_price REAL,
                low_price REAL,
                pre_close_price REAL,
                change REAL,
                change_rate REAL,
                volume REAL,
                amount REAL,
                流通股本 REAL,
                流通市值 REAL,
                总股本 REAL,
                总市值 REAL
            )
        ''')

    def insert(self, stock_data):
        """插入一条股票数据"""
        self.cursor.execute('''
            INSERT INTO stocks (code, name, current_price, open_price, high_price, low_price, pre_close_price, change, change_rate, volume, amount, 流通股本, 流通市值, 总股本, 总市值)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            stock_data['f12'],  # stock code
            stock_data['f14'],  # stock name
            stock_data['f2'],   # current price
            stock_data['f3'],   # opening price
            stock_data['f4'],   # high price
            stock_data['f5'],   # low price
            stock_data['f6'],   # previous close
            stock_data['f7'],   # price change (yuan)
            stock_data['f8'],   # price change rate (%)
            stock_data['f15'],  # trading volume (lots)
            stock_data['f16'],  # turnover (10k yuan)
            stock_data['f17'],  # circulating share capital (100M shares)
            stock_data['f18'],  # circulating market value (100M yuan)
            stock_data['f10'],  # total share capital (100M shares)
            stock_data['f9']    # total market value (100M yuan)
        ))
        self.conn.commit()

    def show(self):
        """展示所有股票数据"""
        self.cursor.execute('SELECT * FROM stocks')
        stocks = self.cursor.fetchall()
        for stock in stocks:
            print(stock)

    def closeDB(self):
        """关闭数据库"""
        self.conn.close()


class StockCrawler:
    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
            'cookie': 'qgqp_b_id=676cc891cb84acdefca65212ca97d62c; st_si=15061432798499; st_pvi=12986909889368; st_sp=2024-10-15%2017%3A24%3A20; st_inirUrl=; st_sn=1; st_psi=20241021201829620-113200301321-6882775284; st_asi=delete'
        }

    def crawl_stocks(self):
        searchlist = [1]  # only crawl the first page of results
        db = StockDB()
        db.openDB()
        for page in searchlist:
            response = requests.get(url='https://62.push2.eastmoney.com/api/qt/clist/get', params={
                'cb': 'jQuery1124041053610502581095_1729513121384',
                'pn': page,
                'pz': 20,
                'po': 1,
                'np': 1,
                'ut': 'bd1d9ddb04089700cf9c27f6f7426281',
                'fltt': 2,
                'invt': 2,
                'wbp2u': '|0|0|0|web',
                'fid': 'f3',
                'fs': 'm:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048',
                'fields': 'f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152',
                '_': '1729513121384'
            }, headers=self.headers)
            # Strip the jQuery JSONP callback wrapper to get the raw JSON payload
            data = response.text[response.text.find('(') + 1:response.text.rfind(')')]
            data = json.loads(data)
            for stock_data in data['data']['diff']:
                db.insert(stock_data)
        db.show()
        db.closeDB()

    def process(self):
        self.crawl_stocks()

# Usage
crawler = StockCrawler()
crawler.process()

Screenshot of the crawl results:

(2) Reflections
Through packet capture I could observe the complete request and response cycle as the page loads its data, including the request URL, the method (GET, POST, etc.), the headers, the query parameters, and the server's response body. I could also inspect the API's parameters (such as f1, f2, etc.) and the format of the returned data (usually JSON or XML), which helped me understand how the API works and how its data is structured.
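To illustrate the response handling described above, here is a minimal sketch that strips the JSONP callback wrapper from a made-up response string (not a real Eastmoney payload) and reads a few f-fields:

import json

# Hypothetical JSONP text shaped like the response handled in crawl_stocks
text = 'jQuery1124_cb({"data": {"diff": [{"f12": "600000", "f14": "浦发银行", "f2": 7.85}]}});'

# Everything between the first '(' and the last ')' is the JSON payload
payload = text[text.find('(') + 1:text.rfind(')')]
data = json.loads(payload)

for stock in data['data']['diff']:
    print(stock['f12'], stock['f14'], stock['f2'])  # 600000 浦发银行 7.85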

Assignment ③:

(1) Crawl the information of all universities on the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021), store it in a database, and record the browser F12 debugging and analysis process as a GIF for the blog post.
Debugging GIF:

The code is as follows:

import requests
import re
import sqlite3

# Create the SQLite database and the uni table
def create_database():
    conn = sqlite3.connect('university.db')  # connect to the SQLite database
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS uni (
            排名 TEXT PRIMARY KEY,  -- rank, used as the primary key
            学校名称 TEXT,           -- university name
            省市 TEXT,               -- province/city
            学校类型 TEXT,           -- university category
            总分 TEXT                -- total score
        )
    ''')
    conn.commit()  # commit the table creation
    return conn, cursor

# Fetch the ranking data from the given URL
def fetch_university_data(url):
    try:
        response = requests.get(url)  # send a GET request
        response.raise_for_status()   # raise an error if the request failed
        return response.text          # return the response body as text
    except requests.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

# Parse the university records out of the payload.js text
def parse_university_data(text):
    universities = []  # list that will hold one tuple per university

    # Extract the raw fields with regular expressions
    names = re.findall(r',univNameCn:"(.*?)",', text)
    scores = re.findall(r',score:(.*?),', text)
    categories = re.findall(r',univCategory:(.*?),', text)
    provinces = re.findall(r',province:(.*?),', text)

    # province and category are stored as short variable codes; recover the code list
    # from the parameter list of the payload function
    code_segments = re.findall(r'function(.*?){', text)
    if code_segments:
        code = code_segments[0]
        start_index = code.find('a')
        end_index = code.find('pE')
        code_names = code[start_index:end_index].split(',')

        # the corresponding literal values are the arguments of the call found after "mutations:"
        value_segments = re.findall(r'mutations:(.*?);', text)
        if value_segments:
            values = value_segments[0]
            start_value = values.find('(') + 1
            end_value = values.find(')')
            value_names = values[start_value:end_value].split(",")

            # Build the list of university records: (rank, name, province, category, score)
            for i in range(len(names)):
                province_name = value_names[code_names.index(provinces[i])][1:-1]
                category_name = value_names[code_names.index(categories[i])][1:-1]
                universities.append((i + 1, names[i], province_name, category_name, scores[i]))

    return universities

# Insert the university records into the database
def insert_universities(cursor, universities):
    for university in universities:
        cursor.execute(
            "INSERT INTO uni(排名, 学校名称, 省市, 学校类型, 总分) VALUES (?, ?, ?, ?, ?)", 
            (university[0], university[1], university[2], university[3], university[4])
        )

# Main entry point
def main():
    # Create the database and table
    conn, cursor = create_database()

    url = "https://www.shanghairanking.cn/_nuxt/static/1728872418/rankings/bcur/2021/payload.js"  # payload.js URL that serves the ranking data

    # Fetch the data
    data_text = fetch_university_data(url)
    if data_text:
        # Parse the university records
        universities = parse_university_data(data_text)

        # Insert the records into the database
        insert_universities(cursor, universities)

        # Commit the changes
        conn.commit()
    conn.close()

if __name__ == "__main__":
    main()

Screenshot of the crawl results:

(2) Reflections
This crawling project showed me both the complexity and the fun of data scraping. In practice I not only improved my programming skills but also deepened my understanding of web page structure. The F12 developer tools were indispensable during the crawl: by watching the Network and Elements tabs I could monitor requests and responses in real time and verify that the extracted data was correct.
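As a small sketch of the regular-expression extraction used above, run against a made-up snippet shaped like the payload.js content (not the real file):

import re

# Hypothetical snippet shaped like the ranking payload.js data
text = ',univNameCn:"清华大学",score:969.2,province:q,univCategory:f,univNameCn:"北京大学",score:855.3,province:q,univCategory:f,'

names = re.findall(r',univNameCn:"(.*?)",', text)
scores = re.findall(r',score:(.*?),', text)
print(names)   # ['清华大学', '北京大学']
print(scores)  # ['969.2', '855.3']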
