5.21

Python数据处理训练

 

班级: 信2205-1 学号: 20224074 姓名:王晨宇

实验目的

l  使学生熟练安装扩展库numpy、requests、bs4、pandas、seaborn、matplotlib等;

l  使学生熟悉使用标准库csv操作文件;

l  使学生熟悉使用pandas进行数据分析的基本操作;

l  使学生了解使用seaborn绘制热力图的方法;

l  使学生熟练使用matplotlib进行数据可视化;

l  使学生熟练使用numpy进行科学计算;

l  使学生熟练运用requests库和bs4库进行基本的数据爬取

实验环境及实验准备

l  所需硬件环境为微机;

l  所需软件环境为Python 3.X等;

l  掌握Python下numpy、requests、bs4、pandas、seaborn、matplotlib、csv等的使用;

实验内容

(一)、中国大学排名数据分析与可视化;

【源代码程序】

import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

# 获取网页内容
def get_page_content(year):
    """Fetch the ranking page for the given year and return it as UTF-8 text.

    Returns None (after printing the error) when the request fails.
    """
    # f-string already substitutes {year}; the original chained a redundant
    # .format(year=year) onto it, which was a no-op.
    url = f"https://www.shanghairanking.cn/rankings/bcur/{year}.html"
    try:
        # timeout keeps the script from hanging forever on a dead connection
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # decode explicitly as UTF-8 instead of trusting the declared charset
        return response.content.decode('utf-8')
    except requests.RequestException as e:
        print(f"请求错误: {year}年 - {e}")
        return None


# 解析网页内容
def parse_content(content):
    """Parse a ranking page's HTML and return (university names, ranks).

    Both lists are empty when the content is missing or the expected
    table cannot be located; a non-numeric rank becomes None.
    """
    if content is None:
        print("未获取到网页内容,无法解析")
        return [], []

    soup = BeautifulSoup(content, 'html.parser')
    table = soup.find('table', class_='rk-table')
    if table is None:
        print("未找到预期的表格结构")
        return [], []

    names = []
    rank_numbers = []
    for data_row in table.find_all("tr")[1:]:  # first <tr> is the header
        cells = data_row.find_all("td")
        if len(cells) < 2:
            continue
        try:
            rank_text = cells[0].text.strip()
            # the school name is assumed to sit inside an <a class="name-cn">
            school = cells[1].find('a', class_='name-cn').text.strip()
            names.append(school)
            rank_numbers.append(int(rank_text) if rank_text.isdigit() else None)
        except (AttributeError, ValueError):
            print("解析错误,跳过当前行")
            continue

    return names, rank_numbers

# 可视化展示
def plot_data(years, universities):
    """Plot each year's university names (y-axis) against their rank (x-axis).

    `universities` holds one list of names per entry in `years`, in rank order.
    """
    assert len(years) == len(universities), "年数不匹配"
    max_count = 0  # widest list seen, for the x-axis ticks
    for year, uni_list in zip(years, universities):
        # BUG FIX: the original passed uni_list as the *step* argument of
        # range(), which raises TypeError; the names belong on the y-axis
        # as the second argument to plot().
        plt.plot(range(1, len(uni_list) + 1), uni_list, label=year)
        max_count = max(max_count, len(uni_list))

    plt.xlabel('排名')
    plt.ylabel('Universities')
    # BUG FIX: xticks previously relied on uni_list leaking out of the loop
    plt.xticks(range(1, max_count + 1))
    plt.legend()
    plt.tight_layout()
    plt.show()

years = ['2015', '2016', '2017', '2018', '2019']
all_universities = []

# Scrape each year's ranking and echo the Top 10 names.
for year in years:
    page = get_page_content(year)
    names, _ = parse_content(page)  # ranks are not needed here
    if not names:
        print(f"未找到{year}年排名信息")
        continue
    all_universities.append(names)
    print(f"----- {year}年软科最好大学排名 Top 10 -----")
    for position, school in enumerate(names, 1):
        if position <= 10:
            print(f"{position}. {school}")

if all_universities:
    # NOTE(review): the function definitions and the __main__ guard below are
    # indented inside this `if` block, so they only exist when at least one
    # year's ranking was scraped — presumably an indentation slip; confirm
    # before dedenting.
    plot_data(years, all_universities)


    def get_rank_by_university_and_year(university, year):
        url = f"https://www.shanghairanking.cn/rankings/bcur/{year}.html"
        try:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
            table = soup.find('table', class_='rk-table')

            if table is None:
                return None

            rows = table.find_all("tr")[1:]  # Skip header row
            for row in rows:
                cols = row.find_all("td")
                if len(cols) >= 2:
                    rank_text = cols[0].text.strip()
                    name = cols[1].find('a', class_='name-cn').text.strip()
                    if name == university:
                        return int(rank_text) if rank_text.isdigit() else None
        except requests.RequestException as e:
            print(f"请求错误: {year}年 - {e}")
            return None


    def main():
        while True:
            university = input("请输入大学名称(输入'q'退出):")
            if university.lower() == 'q':
                print("查询结束。")
                break

            year_str = input("请输入年份(例如:2023):")
            if not year_str.isdigit() or int(year_str) < 2015:
                print("年份输入无效,请输入2015年及以后的整数年份。")
                continue

            ranking = get_rank_by_university_and_year(university, year_str)
            if ranking is not None:
                print(f"{university}在{year_str}年的排名是:{ranking}")
            else:
                print(f"未能找到{university}在{year_str}年的排名信息。")

            # 提供重新查询或结束的选择
            choice = input("是否重新查询?(y/n): ")
            if choice.lower() != 'y':
                print("查询结束。")
                break


    if __name__ == "__main__":
        main()

【运行测试】

 

(二)、豆瓣图书评论数据分析与可视化;

【源代码程序】

import re
from collections import Counter

import requests
from lxml import etree
import pandas as pd
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

headers = {
    # Browser-like User-Agent so douban.com does not reject the scraper.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"
}

# Accumulators filled by getComments(): parsed comment rows and cleaned tokens.
comments = []
words = []


def regex_change(line):
    """Strip user-name prefixes, URLs, date words, digit runs, whitespace and
    common punctuation/stop characters from one comment line."""
    # Characters deleted outright: newline, curly quotes, punctuation
    # (ASCII and full-width) and a few high-frequency Chinese stop words.
    stop_chars = "[\n”“|,,;;''/?! 。的了是]"
    line = re.sub(stop_chars, "", line)

    # Applied in this order: "<digits>::" username prefix, URLs, date words,
    # digit runs, remaining whitespace.  The URL pattern uses [a-zA-Z0-9]
    # instead of \w so Chinese characters are not swallowed.
    cleaners = (
        re.compile(r"^\d+::"),
        re.compile(r"""
        (https?://)?
        ([a-zA-Z0-9]+)
        (\.[a-zA-Z0-9]+)
        (\.[a-zA-Z0-9]+)*
        (/[a-zA-Z0-9]+)*
    """, re.VERBOSE | re.IGNORECASE),
        re.compile(u"""        #utf-8编码
        年 |
        月 |
        日 |
        (周一) |
        (周二) |
        (周三) |
        (周四) |
        (周五) |
        (周六)
    """, re.VERBOSE),
        re.compile(r"[^a-zA-Z]\d+"),
        re.compile(r"\s+"),
    )
    for cleaner in cleaners:
        line = cleaner.sub("", line)
    return line


def getComments(url):
    """Scrape one page of Douban short comments.

    Appends [name, content, time, score, votes] rows to the global `comments`
    list and cleaned tokens (length >= 2) to the global `words` list.
    """
    # rating title -> numeric score; anything else counts as 0 (unrated)
    score_map = {"力荐": 5, "推荐": 4, "还行": 3, "较差": 2, "很差": 1}
    resp = requests.get(url, headers=headers).text
    html = etree.HTML(resp)
    for comment in html.xpath(".//div[@class='comment']"):
        info = comment.xpath(".//span[@class='comment-info']/a/text()")
        name = info[0]  # reviewer user name
        time = info[1]  # comment timestamp
        content = comment.xpath(
            ".//p[@class='comment-content']/span[@class='short']/text()")[0]
        content = str(content).strip()
        mark = comment.xpath(".//span[@class='comment-info']/span/@title")
        # BUG FIX: `score` was previously initialised once outside the loop,
        # so an unrecognised (or missing) rating title silently reused the
        # previous comment's score; default to 0 per comment instead.
        status = str(mark[-1]) if mark else ""
        score = score_map.get(status, 0)
        good = comment.xpath(
            ".//span[@class='comment-vote']/span[@class='vote-count']/text()")[0]
        comments.append([str(name), content, str(time), score, int(good)])
        for token in jieba.cut(content, cut_all=False, HMM=False):
            cleaned = regex_change(token)  # clean once, not twice as before
            if len(cleaned) >= 2:
                words.append(cleaned)


def getWordCloud(words):
    """Print the 10 most frequent words and render/save a word cloud."""
    counts = Counter(words)
    print("热词前10位:")
    # BUG FIX: the original indexed the sorted list blindly for i in 0..9 and
    # raised IndexError when fewer than 10 distinct words were collected;
    # most_common() also replaces the manual dict/sort round-trip.
    for pair in counts.most_common(10):
        print(pair)

    text = ' '.join(words)
    w = WordCloud(background_color='white',
                  width=1000,
                  height=700,
                  font_path='simhei.ttf',
                  margin=10).generate(text)
    # BUG FIX: plt.show() used to run *before* plt.imshow(w), popping up an
    # empty figure and never displaying the cloud.
    plt.imshow(w)
    plt.show()
    w.to_file('wordcloud.png')


print("请选择以下选项:")
print("   1.热门评论")
print("   2.最新评论")
info = int(input())
print("前10位短评信息:")
title = ['用户名', '短评内容', '评论时间', '评分', '点赞数']

# The two original branches were identical except for the sort parameter;
# map the menu choice to it instead of duplicating the scraping code.
sort_by = {1: "new_score", 2: "time"}
if info in sort_by:
    comments = []
    words = []
    for start in range(0, 60, 20):  # first 3 pages, 20 comments each
        url = ("https://book.douban.com/subject/10517238/comments/"
               "?start={}&limit=20&status=P&sort={}").format(start, sort_by[info])
        getComments(url)
    df = pd.DataFrame(comments, columns=title)
    print(df.head(10))
    print("点赞数前10位的短评信息:")
    df = df.sort_values(by='点赞数', ascending=False)
    print(df.head(10))
    getWordCloud(words)

【运行测试】

 

(三)、函数图形1绘制;

【源代码程序】

import matplotlib.pyplot as plt
import numpy as np

# Three curves on one axes, then the same three spread over a 2x2 grid.
x = np.arange(0, 10, 0.0001)
y1 = x ** 2
y2 = np.cos(x * 2)
y3 = y1 * y2

for curve, style in ((y1, '-.'), (y2, ':'), (y3, '--')):
    plt.plot(x, curve, linestyle=style)
plt.savefig("3-1.png")
plt.show()

fig, subs = plt.subplots(2, 2)
subs[0][0].plot(x, y1)
subs[0][1].plot(x, y2)
subs[1][0].plot(x, y3)  # bottom-right subplot intentionally left empty
plt.savefig("3-2.png")
plt.show()

【运行测试】

 

(四)、函数图形2绘制;

【源代码程序】

import matplotlib.pyplot as plt
import numpy as np

# Heart shape: upper and lower halves drawn in red, filled with orange.
x = np.arange(-2, 2, 0.0001)
y_top = np.sqrt(2 * np.sqrt(x ** 2) - x ** 2)
y_bottom = (-2.14) * np.sqrt(np.sqrt(2) - np.sqrt(np.abs(x)))
plt.plot(x, y_top, 'r')
plt.plot(x, y_bottom, 'r')
plt.fill_between(x, y_top, y_bottom, facecolor='orange')
plt.savefig("heart.png")
plt.show()

【运行测试】

 

 

 


Python数据处理训练

 

班级: 信2205-1 学号: 20224074 姓名:王晨宇

实验自评

实验内容

自评结果(在对应格内打√)

不熟练

一般

比较熟练

熟练

Python下数据爬取及应用

 

 

√

 

Python下科学计算及数据分析

 

 

√

 

Python下可视化展示

 

 

√

 

实验体会

通过本次训练,我深刻体会到了Python在数据处理,可视化,网络爬取数据方面的高效便捷,但是,在此次实验中我也遇到了许多问题,比如HTML网页的分析不正确,和编码问题 特别是UTF-8 ,但是通过网络上的知识都一一解决了。希望我在Python日后的学习中能够更加努力。

posted @ 2024-06-05 09:20  晨观夕  阅读(3)  评论(0编辑  收藏  举报