Data Collection Assignment 2

| Student ID & Name | Gitee Repository |
| --- | --- |
| 102202128 林子豪 | https://gitee.com/102202128林子豪 |

1. Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather site (http://www.weather.com.cn) and save the results to a database.
The code is as follows:

```python
import logging
import sqlite3
from typing import List

import requests
from bs4 import BeautifulSoup, UnicodeDammit

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


# Weather database class: a context manager around a SQLite connection
class WeatherDB:
    def __init__(self, db_name: str = "weathers.db"):
        self.db_name = db_name

    def __enter__(self):
        self.con = sqlite3.connect(self.db_name)
        self.cursor = self.con.cursor()
        self.create_table()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type:
            logging.error(f"An error occurred: {exc_val}")
            self.con.rollback()
        else:
            self.con.commit()
        self.con.close()

    def create_table(self):
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS weathers (
            wCity TEXT NOT NULL,
            wDate TEXT NOT NULL,
            wWeather TEXT,
            wHighTemp TEXT,
            wLowTemp TEXT,
            PRIMARY KEY (wCity, wDate)
        )
        """
        self.cursor.execute(create_table_sql)
        logging.info("Ensured the database table exists")

    def insert(self, city: str, date: str, weather: str, high_temp: str, low_temp: str):
        try:
            self.cursor.execute("""
                INSERT INTO weathers (wCity, wDate, wWeather, wHighTemp, wLowTemp)
                VALUES (?, ?, ?, ?, ?)
            """, (city, date, weather, high_temp, low_temp))
            logging.info(f"Inserted: {city}, {date}, {weather}, {high_temp}, {low_temp}")
        except sqlite3.IntegrityError:
            logging.warning(f"Row already exists, skipping: {city} on {date}")
        except Exception as e:
            logging.error(f"Error inserting data: {e}")

    def show(self):
        try:
            self.cursor.execute("SELECT * FROM weathers")
            rows = self.cursor.fetchall()
            print(f"{'City':<16}{'Date':<16}{'Weather':<32}{'High Temp':<12}{'Low Temp':<12}")
            for row in rows:
                print(f"{row[0]:<16}{row[1]:<16}{row[2]:<32}{row[3]:<12}{row[4]:<12}")
        except Exception as e:
            logging.error(f"Error querying data: {e}")


# Weather forecast scraper
class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/90.0.4430.93 Safari/537.36"
        }
        self.city_code = {
            "北京": "101010100",
            "上海": "101020100",
            "广州": "101280101",
            "深圳": "101280601"
        }

    def forecast_city(self, city: str, db: WeatherDB):
        if city not in self.city_code:
            logging.error(f"No city code found for {city}")
            return

        url = f"http://www.weather.com.cn/weather/{self.city_code[city]}.shtml"
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            # Let UnicodeDammit pick the right decoding for the page
            dammit = UnicodeDammit(response.content, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            # One <li> per day in the 7-day forecast list
            lis = soup.select("ul.t.clearfix li")

            for li in lis:
                try:
                    date = li.select_one('h1').get_text(strip=True)
                    weather = li.select_one('p.wea').get_text(strip=True)
                    high_temp_tag = li.select_one('p.tem span')  # may be absent
                    low_temp_tag = li.select_one('p.tem i')
                    high_temp = high_temp_tag.get_text(strip=True) if high_temp_tag else "N/A"
                    low_temp = low_temp_tag.get_text(strip=True).replace('℃', '') if low_temp_tag else "N/A"
                    logging.info(f"{city} {date} {weather} high: {high_temp} low: {low_temp}")
                    db.insert(city, date, weather, high_temp, low_temp)
                except Exception as parse_err:
                    logging.error(f"Error parsing data: {parse_err}")

        except requests.RequestException as req_err:
            logging.error(f"Request error: {req_err}")

    def process(self, cities: List[str]):
        with WeatherDB() as db:
            for city in cities:
                self.forecast_city(city, db)
            db.show()


def main():
    cities = ["北京", "上海", "广州", "深圳"]
    wf = WeatherForecast()
    wf.process(cities)
    logging.info("Done")


if __name__ == "__main__":
    main()
```

Run results: ![](https://img2024.cnblogs.com/blog/3513976/202410/3513976-20241017212033837-1965025557.png) ![](https://img2024.cnblogs.com/blog/3513976/202410/3513976-20241017212101061-282094706.png)

Reflections
Code organization: the program is divided cleanly into classes and methods, so each piece of functionality is modular and easy to understand and maintain.
Exception handling: likely failure points, such as failed network requests and parsing errors, are handled explicitly, which makes the program more robust.
Logging: every step of the run is logged, which makes problems easy to trace and performance easy to monitor.
Data storage: SQLite keeps the scraped data in a single file, convenient for later querying and analysis (a verification sketch follows this list).
Code reuse: the WeatherDB and WeatherForecast classes can be reused to scrape and store weather data for other cities.
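As a quick way to check what the scraper stored, here is a minimal sketch (not part of the assignment code) that reads one city's forecast back out of weathers.db, using the table schema created by WeatherDB above:

```python
import sqlite3

# Read back one city's forecast from weathers.db (the default database
# name used by WeatherDB). Assumes the scraper above has already run.
with sqlite3.connect("weathers.db") as con:
    cur = con.cursor()
    cur.execute(
        "SELECT wDate, wWeather, wHighTemp, wLowTemp FROM weathers "
        "WHERE wCity = ? ORDER BY wDate",
        ("北京",),
    )
    for w_date, weather, high, low in cur.fetchall():
        print(f"{w_date}: {weather}, {high}/{low}")
```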
2. Requirement: use the requests and BeautifulSoup library methods to scrape stock information from a designated site and store it in a database.
The code is as follows:

```python
import json
import sqlite3

import pandas as pd
import requests


def gethtml(page):
    # Eastmoney list API; the response is JSONP (JSON wrapped in a jQuery
    # callback). pn is the page number, pz the page size.
    url = ("https://18.push2.eastmoney.com/api/qt/clist/get"
           "?cb=jQuery112408425405289872392_1728984110482"
           f"&pn={page}&pz=20&po=1&np=1"
           "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&dect=1"
           "&wbp2u=|0|0|0|web&fid=f3"
           "&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048"
           "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,"
           "f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"
           "&_=1728984110483")
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Strip the JSONP callback wrapper, keeping only the {...} payload
        json_data = response.text[response.text.find("{"):response.text.rfind("}") + 1]
        return json.loads(json_data)
    except requests.RequestException as e:
        print(f"Error fetching data: {e}")
        return None


def getonepagestock(page):
    data = gethtml(page)
    if not data or 'data' not in data or 'diff' not in data['data']:
        return []
    return data['data']['diff']


def saveToDatabase(stock_list):
    conn = sqlite3.connect('stocks.db')
    c = conn.cursor()
    try:
        c.execute('''CREATE TABLE IF NOT EXISTS stocks
                     (code TEXT PRIMARY KEY, name TEXT, price REAL, change REAL,
                      percent_change REAL, volume INTEGER, amount REAL)''')
        # API field codes: f12 code, f14 name, f2 price, f4 change,
        # f3 percent change, f5 volume, f6 amount
        for stock in stock_list:
            c.execute("INSERT OR IGNORE INTO stocks VALUES (?, ?, ?, ?, ?, ?, ?)",
                      (stock.get('f12'), stock.get('f14'), stock.get('f2'),
                       stock.get('f4'), stock.get('f3'), stock.get('f5'),
                       stock.get('f6')))
        conn.commit()
        # Read everything back into a DataFrame for a quick look
        c.execute("SELECT * FROM stocks")
        rows = c.fetchall()
        df = pd.DataFrame(rows, columns=['Code', 'Name', 'Price', 'Change',
                                         'Percent Change', 'Volume', 'Amount'])
        print(df)
    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        conn.close()


def main():
    page = 1
    stock_list = getonepagestock(page)
    if stock_list:
        print("Scraped stock data:")
        for stock in stock_list:
            print(stock)
        saveToDatabase(stock_list)
        print("Saved to database.")
    else:
        print("Failed to fetch data")


if __name__ == "__main__":
    main()
```

Run results: ![](https://img2024.cnblogs.com/blog/3513976/202410/3513976-20241017212124720-725319828.png) ![](https://img2024.cnblogs.com/blog/3513976/202410/3513976-20241017212134908-413183336.png)

Reflections
Modular design: the code is split into functions that each handle one specific task, which keeps it easy to understand and maintain.
Exception handling: gethtml reports a clear error message when the network request fails, which helps debugging and makes the program more robust.
Database operations: SQLite is a simple, lightweight storage solution that needs no separate database server, a good fit for personal projects and small applications.
Data display: saveToDatabase converts the stored rows into a pandas DataFrame and prints them, which makes the data quick to inspect and analyze (an alternative query style is sketched below).
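As a design note on the data-display point: pandas can also query SQLite directly, instead of fetchall() followed by manual DataFrame construction. A minimal sketch, assuming the stocks.db file and column names created by the code above:

```python
import sqlite3
import pandas as pd

# Query stocks.db directly into a DataFrame; the ORDER BY returns the
# ten rows with the largest percent change among the stored stocks.
with sqlite3.connect("stocks.db") as conn:
    df = pd.read_sql_query(
        "SELECT code, name, price, percent_change FROM stocks "
        "ORDER BY percent_change DESC LIMIT 10",
        conn,
    )
print(df)
```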

3. Requirement: scrape the 2024 Chinese university rankings from Shanghai Ranking (https://www.shanghairanking.cn) and save each school's name and total score to a text file.
The code is as follows:

```python
import re

import requests

if __name__ == "__main__":
    # payload.js bundles the ranking data as JavaScript source
    url = 'https://www.shanghairanking.cn/_nuxt/static/1728872418/rankings/bcur/2024/payload.js'  # assumed URL
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3947.100 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers).text
    # Non-greedy captures: each match stops at the first closing quote / comma
    name = re.findall(r'univNameCn:"(.*?)"', response, re.S)
    score = re.findall(r'score:(.*?),', response, re.S)

    # The raw matches may include items that are not university names; keep
    # only entries made of Chinese characters and whitespace, pairing each
    # name with its score so the two lists cannot drift out of step
    pairs = [(n, s) for n, s in zip(name, score)
             if re.match(r'^[\u4e00-\u9fa5\s]+$', n)]

    print("排名 学校 总分")
    with open("./university_rankings.txt", "w", encoding='utf-8') as fp:
        fp.write("排名 学校 总分\n")
        for i, (n, s) in enumerate(pairs, start=1):
            line = f"{i} {n} {s}"
            print(line)
            fp.write(line + "\n")
    print("Done")
```
Run results: ![](https://img2024.cnblogs.com/blog/3513976/202410/3513976-20241017212213551-417244264.png)

Reflections
Clear structure: the code proceeds in distinct, easy-to-follow steps.
Regular expressions: the non-greedy patterns are the key to extracting exactly the names and scores from the text (see the sketch below).
Performance: loading a very large page in a single request can cost significant memory and time.
Security: network requests should be made with security in mind, for example guarding against potential injection attacks.
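To make the regular-expression point concrete, here is a small sketch of why the non-greedy (.*?) patterns matter; the sample string is a hypothetical fragment in the payload.js style, not the site's actual data:

```python
import re

# With greedy ".*" the first match would run to the LAST quote/comma on the
# line; non-greedy ".*?" stops at the first one, so each pair is captured.
sample = 'univNameCn:"清华大学",score:999.4,univNameCn:"北京大学",score:912.4,'
names = re.findall(r'univNameCn:"(.*?)"', sample)
scores = re.findall(r'score:(.*?),', sample)
print(list(zip(names, scores)))
# [('清华大学', '999.4'), ('北京大学', '912.4')]
```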

posted @ 2024-10-17 21:22 淋祁