Data Collection Assignment 2

I. Assignment ①
  (Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather site (http://www.weather.com.cn) and save the results to a database.)

  1. Code and Run Results
    (1) Code:

import sqlite3
from bs4 import BeautifulSoup
import urllib.request

class WTFC:
    def __init__(self, db_name='weather.db'):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}
        self.conn = sqlite3.connect(db_name)
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        """创建存储天气数据的表"""
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS weather (
                id INTEGER PRIMARY KEY,
                city TEXT,
                date TEXT,
                weather TEXT,
                temperature TEXT
            )
        ''')
        self.conn.commit()

    def fC(self, city):
        """Fetch the 7-day forecast page for one city, parse each day's entry, and store it"""
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req).read()
            soup = BeautifulSoup(data, "lxml")
            eles = soup.select("ul[class='t clearfix'] li")

            for li in eles:
                try:
                    # Pull out the date, weather description, and high/low temperature for one day
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temperature = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temperature)
                    self.save_to_db(city, date, weather, temperature)
                except Exception as err:
                    print("Error parsing a forecast entry:", err)
        except Exception as err:
            print("Error fetching data for", city, "-", err)

    def save_to_db(self, city, date, weather, temperature):
        """将天气数据保存到数据库"""
        self.cursor.execute('''
            INSERT INTO weather (city, date, weather, temperature) VALUES (?, ?, ?, ?)
        ''', (city, date, weather, temperature))
        self.conn.commit()

    def dealing(self, cities):
        for city in cities:
            self.fC(city)

    def close(self):
        """关闭数据库连接"""
        self.conn.close()

# Instantiate WTFC (which opens the database), crawl and parse the data, store it, then close the database
a = WTFC()
a.dealing(["北京", "上海", "广州", "深圳"])
a.close()
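
To double-check that the forecasts were actually written, here is a minimal read-back sketch (assuming the weather.db file produced by the code above):

import sqlite3

# Open the database created by WTFC and print every stored forecast row
conn = sqlite3.connect('weather.db')
for row in conn.execute('SELECT city, date, weather, temperature FROM weather'):
    print(row)
conn.close()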

    (2) Run results:

2. Reflections:
1. This assignment left me with a much firmer grasp of how to work with a database.
Gitee link: https://gitee.com/jia1666372886/data-collection-work/issues

II. Assignment ②
  (Requirement: use the requests and BeautifulSoup libraries to scrape stock information from a target site and store it in a database.)
  1. Code and Run Results
    (1) Code:

import requests
import re
import json
import sqlite3
import pandas as pd

def fetch_data(api_url, headers):
    """发送请求并获取数据"""
    response = requests.get(api_url, headers=headers)
    return response.text

def extract_json(jsonp_text):
    """从JSONP文本中提取JSON数据"""
    match = re.search(r'\((.*)\)', jsonp_text)
    if match:
        return json.loads(match.group(1))
    return None

def create_or_connect_db(db_name):
    """连接或创建数据库"""
    return sqlite3.connect(db_name)

def setup_table(cursor):
    """设置数据库表"""
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS stocks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            code TEXT,
            name TEXT,
            price REAL,
            change_percent REAL,
            volume INTEGER,
            amount REAL
        )
    ''')

def insert_stock_data(cursor, stock):
    """插入单条股票数据"""
    cursor.execute('''
        INSERT INTO stocks (code, name, price, change_percent, volume, amount)
        VALUES (?, ?, ?, ?, ?, ?)
    ''', (stock['f12'], stock['f14'], stock['f2'], stock['f3'], stock['f5'], stock['f6']))

def store_data(data, db_connection):
    """存储数据到数据库"""
    cursor = db_connection.cursor()
    setup_table(cursor)
    for stock in data['data']['diff']:
        insert_stock_data(cursor, stock)
    db_connection.commit()

def query_and_display_data(db_connection):
    """Query the database and return the results as a DataFrame"""
    query = 'SELECT * FROM stocks'
    df = pd.read_sql_query(query, db_connection)
    return df

def main():
    api_url = 'https://71.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124071710431543095_1728983729650&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&dect=1&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1728983729651'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
    }

    # Fetch the data
    jsonp_text = fetch_data(api_url, headers)
    data = extract_json(jsonp_text)

    # Store the data in the database
    if data:
        db_connection = create_or_connect_db('stocks.db')
        store_data(data, db_connection)
        db_connection.close()
    else:
        print("No data retrieved.")

    # Query and display the data
    db_connection = create_or_connect_db('stocks.db')
    df = query_and_display_data(db_connection)
    db_connection.close()

    print(df)

if __name__ == "__main__":
    main()
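
A note on the JSONP handling above: because the request URL carries a cb= callback parameter, the server wraps the JSON payload in a function call, which is why extract_json has to strip the outer parentheses before calling json.loads. A minimal demonstration using extract_json from the listing above (the callback name and payload here are invented for illustration):

# A JSONP response wraps the JSON payload in a callback invocation
sample = 'jQuery1124_1728983729650({"data": {"diff": [{"f12": "000001", "f14": "demo"}]}})'

# The greedy regex r'\((.*)\)' captures everything between the outermost parentheses
print(extract_json(sample))  # {'data': {'diff': [{'f12': '000001', 'f14': 'demo'}]}}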

    (2) Run results:
Packet-capture process:

Program output:

Gitee link: https://gitee.com/jia1666372886/data-collection-work/issues

2. Reflections:
1. I learned the packet-capture workflow and how to analyze an API's parameters and return values.

III. Assignment ③
  (Requirement: scrape all university entries from the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021), store them in a database, and record the browser F12 debugging/analysis process as a GIF for the blog post.)

  1. Code and Run Results
    (1) Code:

import urllib.request
from bs4 import BeautifulSoup
import sqlite3


def fetch_html(url, headers):
    """使用urllib请求网页内容,并返回HTML内容"""
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')


def parse_html(html_content):
    """使用BeautifulSoup解析HTML,并返回包含排名信息的表格"""
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.find('table', {'class': 'rk-table'})


def create_db_table(cursor):
    """在数据库中创建表格"""
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS rankings (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            rank TEXT,
            name TEXT,
            province TEXT,
            type TEXT,
            score TEXT
        )
    ''')


def extract_data(table):
    """从表格中提取排名信息,并返回数据列表"""
    data = []
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        data.append({
            'rank': cols[0].get_text(strip=True),
            'name': cols[1].get_text(strip=True),
            'province': cols[2].get_text(strip=True),
            'type': cols[3].get_text(strip=True),
            'score': cols[4].get_text(strip=True)
        })
    return data


def insert_data(cursor, data):
    """将数据插入数据库"""
    cursor.executemany('''
        INSERT INTO rankings (rank, name, province, type, score)
        VALUES (?, ?, ?, ?, ?)
    ''', [(item['rank'], item['name'], item['province'], item['type'], item['score']) for item in data])


def main():
    url = 'http://www.shanghairanking.cn/rankings/bcur/2021'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    # Fetch the page content
    html_content = fetch_html(url, headers)

    # Parse the page
    table = parse_html(html_content)

    # Connect to (or create) the database
    conn = sqlite3.connect('university_rankings.db')
    cursor = conn.cursor()

    # Create the table
    create_db_table(cursor)

    # Extract the data and insert it into the database
    data = extract_data(table)
    insert_data(cursor, data)

    # Commit the transaction
    conn.commit()

    # Close the database connection
    conn.close()

    print("数据成功保存到数据库。")


if __name__ == "__main__":
    main()
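
As a quick sanity check on the stored rankings, a small read-back sketch (assuming the university_rankings.db file created above; pandas is used only for tabular display):

import sqlite3
import pandas as pd

# Load the first ten stored ranking rows for inspection
conn = sqlite3.connect('university_rankings.db')
df = pd.read_sql_query('SELECT * FROM rankings LIMIT 10', conn)
conn.close()
print(df)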

    (2) Run results:

Gitee link: https://gitee.com/jia1666372886/data-collection-work/issues

2. Reflections:
This assignment deepened my understanding of capture-based scraping and reinforced my API-analysis techniques.
