Python数据处理训练

(一)、中国大学排名数据分析与可视化(写到实验报告中)

【源代码程序】

import requests

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt

 

# URL template for the ranking pages; fetch_rankings fills "{}" with the year.

URL_TEMPLATE = "https://www.shanghairanking.cn/rankings/bcur/{}"

 

 

# 爬取数据函数

def fetch_rankings(year):
    """Scrape up to ten ranking rows for one year.

    Args:
        year: Value substituted into ``URL_TEMPLATE`` to build the page URL.

    Returns:
        A list of ``(rank, university, score)`` string tuples built from the
        first three cells of each kept row. An empty list is returned (after
        printing a message) when the request fails, the HTTP status is not
        200, or no ``<table class="rk-table">`` is found.
    """
    url = URL_TEMPLATE.format(year)

    # A timeout keeps the script from hanging on a stalled connection;
    # network failures are reported the same way as a bad status code.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for year {year}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve data for year {year}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", {"class": "rk-table"})
    if not table:
        print(f"Failed to find the ranking table for year {year}.")
        return []

    rankings = []
    # Skip the first <tr> (presumably the header row) and keep the next ten.
    for row in table.find_all("tr")[1:11]:
        cols = row.find_all("td")
        # Rows with fewer than three cells cannot supply all three fields.
        if len(cols) < 3:
            continue
        rank, university, score = (col.text.strip() for col in cols[:3])
        rankings.append((rank, university, score))

    return rankings

 

 

# 打印排名信息

def print_rankings(rankings, year):
    """Print one year's ranking rows as a fixed-width text table.

    Args:
        rankings: Sequence of ``(rank, university, score)`` tuples.
        year: Year shown in the heading and in the "no data" notice.

    When ``rankings`` is empty only a "no data" notice is printed.
    """
    if rankings:
        header = f"{'排名':<5} {'大学':<20} {'得分':<10}"
        print(f"\n{year} 年前 10 名大学排名:")
        print(header)
        print("-" * 40)
        for entry in rankings:
            rank, university, score = entry
            print(f"{rank:<5} {university:<20} {score:<10}")
    else:
        print(f"No data available for year {year}.")

 

 

# 可视化函数

def plot_rankings(rankings_dict):
    """Plot one rank-versus-year line for every university in the data.

    Args:
        rankings_dict: Mapping of year -> list of ``(rank, university,
            score)`` tuples; each matched rank is converted with ``int()``.

    A university missing from a year contributes ``None`` for that year.
    Only universities whose rank in the last year is present and at most 10
    receive a legend label. The y axis is inverted so rank 1 is at the top.
    """
    def rank_in_year(year, name):
        # The first matching row wins; None means "not listed that year".
        for rank, uni, _ in rankings_dict[year]:
            if uni == name:
                return int(rank)
        return None

    # Use the SimHei font and disable the Unicode minus sign, which fixes
    # how minus signs are displayed.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    years = list(rankings_dict)
    universities = {university for year in years for _, university, _ in rankings_dict[year]}

    plt.figure(figsize=(10, 6))

    for university in universities:
        ranks = [rank_in_year(year, university) for year in years]
        latest = ranks[-1]
        if latest and latest <= 10:
            legend_label = university
        else:
            legend_label = ""
        plt.plot(years, ranks, marker='o', label=legend_label)

    axes = plt.gca()
    axes.invert_yaxis()
    plt.xticks(years)
    plt.xlabel('年份')
    plt.ylabel('排名')
    plt.title('2015-2019年前10大学排名变化')
    plt.legend()
    plt.show()

 

 

# 查询排名信息

def query_ranking(rankings_dict):
    """Interactively look up a university's rank for a chosen year.

    Repeatedly prompts for a university name and a year, prints the rank of
    the first matching row (or a "not found" message), and asks whether to
    continue. The loop ends when the answer is anything other than "y"
    (case-insensitive).

    Args:
        rankings_dict: Mapping of integer year -> list of ``(rank,
            university, score)`` tuples.
    """
    while True:
        # Surrounding whitespace is stripped so that stray spaces do not
        # defeat the exact-name comparison or the year parsing.
        university = input("请输入大学名称:").strip()
        year_text = input("请输入年份(2015-2019):").strip()

        # str.isdigit() accepts characters such as "²" that int() rejects,
        # so a failed conversion is treated as invalid input, not a crash.
        try:
            year = int(year_text) if year_text.isdigit() else None
        except ValueError:
            year = None

        if year is None or year not in rankings_dict:
            print("年份输入有误,请重新输入。")
            continue

        rank_info = next((rank for rank, uni, _ in rankings_dict[year] if uni == university), None)

        if rank_info:
            print(f"{year} 年 {university} 排名:{rank_info}")
        else:
            print(f"{year} 年没有找到 {university} 的排名信息。")

        cont = input("是否继续查询?(y/n): ")
        if cont.strip().lower() != 'y':
            break

 

 

if __name__ == "__main__":
    # Scrape and print each year from 2015 to 2019, then plot the collected
    # data and start the interactive query loop.
    rankings_dict = {}
    for year in range(2015, 2020):
        yearly_rows = fetch_rankings(year)
        rankings_dict[year] = yearly_rows
        print_rankings(yearly_rows, year)

    plot_rankings(rankings_dict)
    query_ranking(rankings_dict)

 

 

(二)、豆瓣图书评论数据分析与可视化(写到实验报告中)

【源代码程序】

import requests

from bs4 import BeautifulSoup

import jieba

from wordcloud import WordCloud

import matplotlib.pyplot as plt

 

 

# 爬取短评数据函数

def fetch_comments(book_id, start=0, limit=20, status='P', sort_by='time'):
    """Fetch and parse one page of Douban short comments for a book.

    Args:
        book_id: Douban subject id inserted into the URL path.
        start: Value of the ``start`` query parameter.
        limit: Value of the ``limit`` query parameter.
        status: Value of the ``status`` query parameter.
        sort_by: Value of the ``sort`` query parameter.

    Returns:
        A list of dicts with keys ``username``, ``content``, ``date``,
        ``rating`` and ``votes`` (int), one per ``<div class="comment">``
        that contains a ``<span class="short">``. An empty list is returned
        (after printing a message) when the request fails or the HTTP status
        is not 200.
    """
    url = f"https://book.douban.com/subject/{book_id}/comments/?start={start}&limit={limit}&status={status}&sort={sort_by}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }

    # A timeout prevents an indefinite hang; failures yield an empty page
    # so that the caller's multi-page loop can keep going.
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve comments for book {book_id}. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve comments for book {book_id}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    comment_data = []
    for comment in soup.find_all("div", class_="comment"):
        content_tag = comment.find("span", class_="short")
        if content_tag is None:
            # Without the comment text there is nothing to report or analyse.
            continue

        info_tag = comment.find("span", class_="comment-info")
        user_tag = info_tag.find("a") if info_tag is not None else None
        date_tag = comment.find("a", class_="comment-time")
        votes_tag = comment.find("span", class_="vote-count")
        rating_tag = comment.find("span", class_="rating")

        # The original sliced characters [7:8] of the FIRST class name, which
        # is only the star digit when that class is "allstarN..." (7-letter
        # prefix). Search every class for that prefix instead.
        # NOTE(review): assumes the rating class is named "allstar<digits>";
        # confirm against the live markup.
        rating = "无评分"
        if rating_tag is not None:
            rating = next(
                (name[7:8] for name in rating_tag.get('class', []) if name.startswith("allstar")),
                "无评分",
            )

        try:
            votes = int(votes_tag.text) if votes_tag is not None else 0
        except ValueError:
            votes = 0

        comment_data.append({
            "username": user_tag.text if user_tag is not None else "",
            "content": content_tag.text,
            "date": date_tag.text.strip() if date_tag is not None else "",
            "rating": rating,
            "votes": votes,
        })

    return comment_data

 

 

# 爬取多页数据

def fetch_multiple_pages(book_id, start=0, limit=20, status='P',sort_by='score', num_pages=3):
    """Fetch several consecutive comment pages and concatenate them.

    Page ``k`` (counting from 0) is requested with offset
    ``start + k * limit``; the other arguments are passed through to
    ``fetch_comments`` unchanged.

    Returns:
        One flat list holding the comments of all pages in request order.
    """
    offsets = (start + index * limit for index in range(num_pages))
    collected = []
    for offset in offsets:
        collected += fetch_comments(book_id, start=offset, limit=limit, status=status, sort_by=sort_by)
    return collected

 

 

# 输出前10条短评信息

def print_top_comments(comments, top_n=10):
    """Print the first ``top_n`` comments, numbered from 1.

    Args:
        comments: Sequence of dicts with keys ``username``, ``date``,
            ``rating``, ``votes`` and ``content``.
        top_n: Maximum number of comments to print.

    Each comment takes two lines: a summary line and the comment text.
    """
    position = 0
    for entry in comments[:top_n]:
        position += 1
        summary = (
            f"{position}. 用户名: {entry['username']}, 评论时间: {entry['date']}, "
            f"评分: {entry['rating']}, 点赞数: {entry['votes']}"
        )
        print(summary)
        print(f"   短评: {entry['content']}")

 

 

# 按照点赞数排序并输出前10条短评信息

def print_top_comments_by_votes(comments, top_n=10):
    """Print the ``top_n`` comments with the highest ``votes`` value.

    The input is not modified; a sorted copy (descending by ``votes``) is
    handed to ``print_top_comments``.
    """
    def vote_count(entry):
        return entry['votes']

    ranked = list(comments)
    ranked.sort(key=vote_count, reverse=True)
    print_top_comments(ranked, top_n)

 

 

# 文本分析与词云生成

def generate_wordcloud(comments):
    """Show a word cloud of the comment texts and print the top-10 words.

    Args:
        comments: Iterable of dicts that each carry a ``content`` string.

    The joined text is segmented with jieba, rendered as a word cloud with
    matplotlib, and the ten most frequent tokens longer than one character
    are printed. If segmentation yields only whitespace, a notice is printed
    and nothing else happens.
    """
    from collections import Counter

    text = " ".join(comment['content'] for comment in comments)

    # Segment once and reuse the tokens for both the word cloud input and
    # the frequency table (the text was previously segmented twice).
    tokens = jieba.lcut(text)
    words = " ".join(tokens)

    if not words.strip():
        print("没有足够的评论内容生成词云。")
        return

    # NOTE(review): 'msyh.ttc' is a bare font file name; it presumably has to
    # be resolvable on the machine running this script — confirm.
    wordcloud = WordCloud(font_path='msyh.ttc', width=800, height=400, background_color='white').generate(words)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    # Frequency statistics; single-character tokens are ignored.
    word_freq = Counter(token for token in tokens if len(token) > 1)
    print("前10位词频统计结果:")
    for word, freq in word_freq.most_common(10):
        print(f"{word}: {freq}")

 

def total(book_id,sort_by):
    """Run the full comment report for one book and one sort order.

    Fetches three pages of comments for each reading status, prints the
    book title and the first ten comments per status, then prints the ten
    most-voted comments across all statuses and generates the word cloud
    and word-frequency output.

    Args:
        book_id: Douban subject id as a string.
        sort_by: ``'time'`` selects the "latest" heading; any other value
            selects the "popular" heading. It is also passed to the fetcher.
    """
    from itertools import chain

    start = 0  # starting offset
    limit = 20  # comments per page
    # Reading status codes: P = read, N = reading, F = want to read.
    status_labels = {'P': "读过", 'N': "在读", 'F': "想读"}

    # Fetch and store the comments of every status, in P/N/F order.
    all_comments = {
        status: fetch_multiple_pages(book_id, start=start, limit=limit, status=status, sort_by=sort_by, num_pages=3)
        for status in status_labels
    }

    # The title check used to compare against '36781566' only, which matches
    # neither id the script's entry point passes in ("36701566" and
    # "36721763"), so both books were printed with the same title.
    # NOTE(review): which literal holds the typo cannot be determined from
    # this file, so both spellings map to the first title.
    titles = {'36701566': "《叙事本能》", '36781566': "《叙事本能》"}
    print(titles.get(book_id, "《暗处的女儿》"))

    if sort_by == 'time':
        print("最新排序前10位短评信息:")
    else:
        print("热门排序前10位短评信息:")

    # Print the first ten comments of each status.
    for status, comments in all_comments.items():
        print(f"读书状态:{status_labels[status]}")
        print_top_comments(comments)
        print()

    # chain.from_iterable flattens in linear time; sum(lists, []) copies the
    # growing list on every addition.
    all_comments_merged = list(chain.from_iterable(all_comments.values()))

    # This heading used to be printed with no listing after it; the
    # vote-sorted comments are now actually printed.
    print("\n按点赞数排序前10位短评信息:")
    print_top_comments_by_votes(all_comments_merged)

    # Text analysis and word cloud generation.
    generate_wordcloud(all_comments_merged)

 

 

 

if __name__ == "__main__":
    books_id = ["36701566", "36721763"]
    sorts = ['time', 'score']  # 'time' = latest, 'score' = popular

    # Run the report for every (book, sort order) combination, books first.
    for book_id, sort in ((b, s) for b in books_id for s in sorts):
        total(book_id, sort)

    # NOTE(review): this heading is printed after all reports and nothing
    # follows it; kept as-is to preserve the script's output.
    print("\n热门排序前10位短评信息:")

 

(三)、函数图形1绘制(写到实验报告中)

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

 

# Sample [0, 10) with step 0.0001 and evaluate x^2, cos(2x) and their product.
x = np.arange(0, 10, 0.0001)
y1 = x ** 2
y2 = np.cos(x * 2)
y3 = y1 * y2

# Draw the three curves on one axes, each with its own line style, then
# save the figure before showing it.
for curve, dash in ((y1, '-.'), (y2, ':'), (y3, '--')):
    plt.plot(x, curve, linestyle=dash)
plt.savefig("3-1.png")
plt.show()

 

 

 

import matplotlib.pyplot as plt

import numpy as np

# x, y1, y2 and y3 are not defined in this snippet; it relies on them
# already existing from the code that ran before it.
fig, subs = plt.subplots(2, 2)

# Fill the 2x2 grid row by row; the fourth (bottom-right) panel stays empty.
for panel, curve in zip(subs.flat, (y1, y2, y3)):
    panel.plot(x, curve)
plt.savefig("3-2.png")
plt.show()

 

(四)、函数图形2绘制(写到实验报告中)

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

 

# Sample [-2, 2) with step 0.0001.
x = np.arange(-2, 2, 0.0001)

# Upper boundary: sqrt(2*sqrt(x^2) - x^2).
x_squared = x ** 2
y1 = np.sqrt(2 * np.sqrt(x_squared) - x_squared)

# Lower boundary: -2.14 * sqrt(sqrt(2) - sqrt(|x|)).
y2 = (-2.14) * np.sqrt(np.sqrt(2) - np.sqrt(np.abs(x)))

# Outline both boundaries in red, fill the region between them in pink,
# then save the figure before showing it.
plt.plot(x, y1, 'r')
plt.plot(x, y2, 'r')
plt.fill_between(x, y1, y2, facecolor='pink')
plt.savefig("heart.png")
plt.show()

posted @   不会JAVA的小袁  阅读(24)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· 记一次.NET内存居高不下排查解决与启示
历史上的今天:
2023-05-26 回文数
2023-05-26 自守数
2023-05-26 亲密数
2023-05-26 完数
2023-05-26 马克思手稿中的数学题
点击右上角即可分享
微信分享提示