Data Collection and Fusion: Assignment 2

Assignment ①:

Results:

Code:

import sqlite3
import requests
from bs4 import BeautifulSoup

# Connect to the SQLite database
conn = sqlite3.connect('weather.db')
c = conn.cursor()

# Create the forecast table; (city, date) is unique so re-runs do not duplicate rows
c.execute('''
CREATE TABLE IF NOT EXISTS forecast (
    id INTEGER PRIMARY KEY,
    city TEXT NOT NULL,
    date TEXT NOT NULL,
    weather TEXT NOT NULL,
    temperature TEXT NOT NULL,
    UNIQUE(city, date) ON CONFLICT IGNORE
)
''')

# Commit the schema change
conn.commit()

def fetch_weather(city_id, city_name):
    url = f'http://www.weather.com.cn/weather/{city_id}.shtml'

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise if the request failed
        soup = BeautifulSoup(response.content, 'html.parser')

        # Each <li class="sky"> holds one day of the 7-day forecast
        days = soup.find_all('li', class_='sky')
        if not days:
            print(f"No forecast entries found for {city_name}")
            return

        for day in days:
            date = day.find('h1').text.strip()
            weather = day.find('p', class_='wea').text.strip()
            temp = day.find('p', class_='tem').text.strip().replace('\n', '/').replace(' ', '')

            # Insert the row; duplicates on (city, date) are ignored
            c.execute('''
            INSERT OR IGNORE INTO forecast (city, date, weather, temperature)
            VALUES (?, ?, ?, ?)
            ''', (city_name, date, weather, temp))
        
        conn.commit()
    except requests.RequestException as e:
        print(f"Request error: {e}")
    except Exception as e:
        print(f"Parsing error: {e}")

# Sample cities (weather.com.cn city codes)
cities = [
    {'id': '101010100', 'name': '北京'},
    {'id': '101020100', 'name': '上海'},
    {'id': '101280601', 'name': '福州'}
]

# Clear existing rows so each run starts fresh
c.execute('DELETE FROM forecast')
conn.commit()

# Fetch and store the forecast for each city
for city in cities:
    fetch_weather(city['id'], city['name'])

# Expected row count: one 7-day forecast per city
total_rows = len(cities) * 7

# Read the rows back and print them
c.execute(f'SELECT * FROM forecast LIMIT {total_rows}')
rows = c.fetchall()

if not rows:
    print("No data in the database")
else:
    # Print the header row
    print(f"{'No.':<5} {'Date':<10} {'City':<10} {'Weather':<10} {'Temp':<10}")
    print('-' * 50)

    # Print each row (columns: id, city, date, weather, temperature)
    for idx, row in enumerate(rows, start=1):
        print(f"{idx:<5} {row[2]:<10} {row[1]:<10} {row[3]:<10} {row[4]:<10}")

# Close the database connection
conn.close()

Gitee folder link

Output:

Personal reflections:

While scraping the page I ran into problems such as complex page structure and dynamically loaded data. Through continued study and practice, I picked up techniques for extracting data with regular expressions, XPath, and similar tools; a sketch of the XPath approach follows.
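As an illustration (a sketch, not part of the submitted code), the same forecast fields could be pulled with lxml's XPath instead of BeautifulSoup, assuming the page structure the code above already targets (the 'sky', 'wea', and 'tem' class names are taken from that code):

import re
import requests
from lxml import etree

# Same Beijing forecast page as in the code above
html = requests.get('http://www.weather.com.cn/weather/101010100.shtml', timeout=10).content
tree = etree.HTML(html)
for li in tree.xpath("//li[contains(@class, 'sky')]"):
    date = li.xpath('./h1/text()')[0].strip()
    weather = li.xpath(".//p[@class='wea']/text()")[0].strip()
    # Join all text under the temperature node, then keep only digits, '/', '-' and the degree sign
    temp = re.sub(r'[^\d/℃-]', '', ''.join(li.xpath(".//p[@class='tem']//text()")))
    print(date, weather, temp)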

Assignment ②:

Results:

Code:

import requests
import pandas as pd
import json
import sqlite3

# Constants
BASE_URL = "http://44.push2.eastmoney.com/api/qt/clist/get"
API_PARAMS = {
    'cb': 'jQuery112406854618710877052_1696660618066',  # JSONP callback name
    'pz': 20,  # page size
    'po': 1,
    'np': 1,
    'ut': 'bd1d9ddb04089700cf9c27f6f7426281',
    'fltt': 2,
    'invt': 2,
    'wbp2u': '|0|0|0|web',
    'fid': 'f3',  # sort field (f3 = change percent)
    'fs': 'm:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048',  # market filter
    'fields': 'f2,f3,f4,f5,f6,f7,f12,f14',  # requested columns, mapped in fetch_stock_info
    '_': '1696660618067'
}
DB_PATH = 'stock_info.db'
COLUMNS = ['序号', '代码', '名称', '最新价', '涨跌幅', '涨跌额', '成交量', '成交额', '振幅']

def fetch_stock_info(page_number):
    """
    Fetch one page of stock quotes and yield one row per stock.
    """
    params = API_PARAMS.copy()
    params['pn'] = page_number  # page number
    response = requests.get(BASE_URL, params=params)
    response.raise_for_status()
    content = response.text
    # The response is JSONP; strip the jQuery callback wrapper to get raw JSON
    json_str = content[content.find('(') + 1: content.rfind(')')]
    data_json = json.loads(json_str)
    if not data_json.get('data'):
        return  # no data for this page
    stock_items = data_json['data']['diff']
    # Number rows globally across pages, not per page, so 序号 does not restart at 1
    offset = (page_number - 1) * API_PARAMS['pz']
    for idx, item in enumerate(stock_items):
        yield [
            offset + idx + 1,  # 序号
            item['f12'],       # 代码
            item['f14'],       # 名称
            item['f2'],        # 最新价
            item['f3'],        # 涨跌幅
            item['f4'],        # 涨跌额
            item['f5'],        # 成交量
            item['f6'],        # 成交额
            item['f7']         # 振幅
        ]

def save_to_database(stock_data):
    """
    将股票数据保存到SQLite数据库
    """
    with sqlite3.connect(DB_PATH) as conn:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS stock_data (
                序号 INTEGER,
                代码 TEXT,
                名称 TEXT,
                最新价 REAL,
                涨跌幅 REAL,
                涨跌额 REAL,
                成交量 INTEGER,
                成交额 REAL,
                振幅 REAL
            )
        ''')
        cursor.executemany('INSERT INTO stock_data VALUES (?,?,?,?,?,?,?,?,?)', stock_data)
        conn.commit()

def get_user_input():
    """
    获取用户输入的页数
    """
    while True:
        try:
            page_input = int(input("请输入要爬取的页数:"))
            if page_input <= 0:
                raise ValueError("页数必须为正整数")
            return page_input
        except ValueError as e:
            print(f"输入错误: {e}")

def main():
    """
    主函数,执行数据获取和存储操作
    """
    page_input = get_user_input()
    total_stock_data = []
    for page in range(1, page_input + 1):
        total_stock_data.extend(fetch_stock_info(page))

    if total_stock_data:
        save_to_database(total_stock_data)
        df = pd.DataFrame(total_stock_data, columns=COLUMNS)
        print(df)

if __name__ == "__main__":
    main()

Gitee folder link

Output:

Personal reflections:

During the crawl I ran into anti-scraping checks and dynamically loaded data. Through repeated trial and adjustment, I worked around the anti-scraping checks by spoofing request headers and spacing out requests, and by reading the page's JavaScript I located the API endpoint that serves the dynamically loaded data. A sketch of the two countermeasures follows.
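A minimal sketch of those two countermeasures, header spoofing plus a fixed delay between requests (the header values here are illustrative assumptions, not something the site specifically requires):

import time
import requests

# Illustrative browser-like headers; the exact values are an assumption
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'http://quote.eastmoney.com/',
}

def polite_get(url, params=None, delay=1.0):
    """GET with spoofed headers and a pause afterwards to space out requests."""
    response = requests.get(url, params=params, headers=HEADERS, timeout=10)
    response.raise_for_status()
    time.sleep(delay)  # simple rate limiting between consecutive requests
    return response

Swapping the plain requests.get call in fetch_stock_info for a helper like this applies both measures without touching the parsing logic.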

Assignment ③:

Results:

Code:

import requests
from bs4 import BeautifulSoup
import sqlite3
import re

# Constants
DATABASE_NAME = 'university_rankings.db'
TABLE_NAME = 'UniversityRanking'
URL = "https://www.shanghairanking.cn/rankings/bcur/2021"
LABELS_TO_REMOVE = ['双一流', '985工程', '211工程', '985', '211']
COLUMNS = ["排名", "学校", "省份", "类型", "总分"]
TEMPLATE = "{0:^6}\t{1:{5}<20}\t{2:^6}\t{3:^8}\t{4:^6}"  # slot {5} takes a fullwidth space as the fill character so CJK names align

def fetch_html_content(url):
    """
    获取指定URL的HTML内容
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        print(f"请求错误: {e}")
        return None

def sanitize_university_name(name, labels_to_remove):
    """
    清理大学名称中的标签和非中文字符
    """
    for label in labels_to_remove:
        name = name.replace(label, '')
    # Keep only CJK characters (this also drops Latin letters and digits)
    name = re.sub(r'[^\u4e00-\u9fa5]', '', name)
    return name

def extract_university_data(html_content, labels_to_remove):
    """
    解析HTML内容,提取大学信息列表
    """
    soup = BeautifulSoup(html_content, "html.parser")
    university_list = []
    tbody = soup.find('tbody')
    if not tbody:
        print("未找到表格数据")
        return university_list

    rows = tbody.find_all('tr')
    for row in rows:
        columns = row.find_all('td')
        if len(columns) < 5:
            continue

        rank = columns[0].text.strip()
        name_tag = columns[1].find('a')
        name = sanitize_university_name(name_tag.text.strip() if name_tag else columns[1].text.strip(), labels_to_remove)
        province = columns[2].text.strip()
        category = columns[3].text.strip()
        score = columns[4].text.strip()

        university_list.append([rank, name, province, category, score])
    return university_list

def store_university_data(data_list, database_name, table_name):
    """
    将大学信息存储到SQLite数据库中
    """
    with sqlite3.connect(database_name) as conn:
        cursor = conn.cursor()
        cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {table_name} (
                Rank TEXT,
                Name TEXT,
                Province TEXT,
                Category TEXT,
                Score TEXT
            )
        ''')
        cursor.executemany(f'INSERT INTO {table_name} VALUES (?,?,?,?,?)', data_list)
        conn.commit()

def display_university_data(universities, count, template, columns):
    """
    打印大学信息列表
    """
    print(template.format(*columns, chr(12288)))
    for i in range(min(count, len(universities))):
        uni = universities[i]
        print(template.format(*uni, chr(12288)))

def execute_university_ranking(url, database_name, table_name, labels_to_remove, columns, template, count):
    """
    主函数,执行数据获取、解析、存储和显示操作
    """
    html_content = fetch_html_content(url)
    if not html_content:
        return

    university_data = extract_university_data(html_content, labels_to_remove)
    if not university_data:
        print("未提取到大学信息")
        return

    store_university_data(university_data, database_name, table_name)
    display_university_data(university_data, count, template, columns)

if __name__ == '__main__':
    execute_university_ranking(URL, DATABASE_NAME, TABLE_NAME, LABELS_TO_REMOVE, COLUMNS, TEMPLATE, 10)

Gitee folder link

Output:

Personal reflections:

While analyzing the page structure, I ran into dynamically loaded data. Digging deeper, I found that the data is fetched from the server via Ajax requests, so I used the requests library to replicate those Ajax calls and successfully retrieved the data I needed, following the pattern sketched below.
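The general pattern looks like this sketch; the endpoint URL and JSON key are placeholders for whatever actually shows up in the browser's Network panel, not the real ShanghaiRanking API:

import requests

# Hypothetical endpoint discovered via the browser's DevTools Network panel;
# substitute the URL actually observed for the target page.
API_URL = 'https://example.com/api/rankings?year=2021'

response = requests.get(API_URL, timeout=10)
response.raise_for_status()
data = response.json()
for item in data.get('rankings', []):  # key name is an assumption
    print(item)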
