Large-Scale Database Technology Course Project (Part 2)

Main tasks completed today:

1) Data collection: periodically and automatically crawl hot words related to the information technology field from the web;

2) Data cleaning: clean the hot word data and use automatic classification to generate a catalog of IT-field hot words;

3) Hot word explanation: automatically attach a Chinese explanation to each hot word (based on Baidu Baike or Wikipedia).

import requests
import re
import pandas as pd
import jieba
import logging
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager
import wikipediaapi  # for Wikipedia explanations
from newsapi import NewsApiClient  # for news references
from fake_useragent import UserAgent  # for random User-Agent headers
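
# Third-party dependencies (pip package names assumed: wikipedia-api for wikipediaapi,
# newsapi-python for NewsApiClient):
#   pip install requests pandas jieba numpy scikit-learn gensim wordcloud matplotlib \
#       seaborn wikipedia-api newsapi-python fake-useragent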

# Configure logging and a random User-Agent generator
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
ua = UserAgent()


class HotwordAnalyzer:
    def __init__(self):
        with open('stopwords.txt', encoding='utf-8') as f:
            self.stop_words = set(f.read().splitlines())
        # note: recent Wikipedia-API releases may also expect a user_agent argument
        self.wiki = wikipediaapi.Wikipedia('zh')
        self.newsapi = NewsApiClient(api_key='your_newsapi_key')  # replace with an actual API key
        self.font_path = "C:/Windows/Fonts/simhei.ttf"

    # ----------------- Data collection module -----------------
    def fetch_data(self, url):
        try:
            headers = {"user-agent": ua.random}
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                logging.info(f"成功获取网页: {url}")
                return response.text
            logging.warning(f"获取失败: {url} 状态码: {response.status_code}")
        except Exception as e:
            logging.error(f"请求异常: {e}")
        return None

    def fetch_all_data(self, urls):
        with ThreadPoolExecutor(max_workers=5) as executor:
            return [result for result in executor.map(self.fetch_data, urls) if result]

    # ----------------- Data processing module -----------------
    def clean_text(self, text):
        try:
            text = re.sub(r'<.*?>', '', text)
            text = re.sub(r'[^\w\s]', '', text)
            words = [word for word in jieba.cut(text) if word not in self.stop_words]
            return ' '.join(words).strip()
        except Exception as e:
            logging.error(f"清洗失败: {e}")
            return text

    def tokenize(self, text):
        return list(jieba.cut(text))

    # ----------------- Feature engineering module -----------------
    def extract_keywords(self, texts, max_features=100):
        try:
            vectorizer = TfidfVectorizer(max_features=max_features)
            tfidf_matrix = vectorizer.fit_transform(texts)
            feature_names = vectorizer.get_feature_names_out()
            # average TF-IDF weight of each term across all documents
            tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
            return tfidf_matrix, dict(zip(feature_names, tfidf_scores))
        except Exception as e:
            logging.error(f"关键词提取失败: {e}")
            return None, {}

    def train_word2vec(self, sentences):
        try:
            return Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
        except Exception as e:
            logging.error(f"Word2Vec训练失败: {e}")
            return None

    # ----------------- Classification & clustering module -----------------
    def classify_text(self, text, categories, model):
        try:
            # score each category by summing the cosine similarity between every
            # token's word vector and the category name's word vector
            score = {k: 0 for k in categories}
            for word in text:
                if word in model.wv:
                    for cat in categories:
                        if cat in model.wv:
                            score[cat] += cosine_similarity([model.wv[word]], [model.wv[cat]])[0][0]
            return max(score, key=score.get) if any(score.values()) else "其他"
        except Exception as e:
            logging.error(f"分类失败: {e}")
            return "其他"

    def cluster_texts(self, tfidf_matrix, n_clusters=5):
        try:
            return KMeans(n_clusters=n_clusters, random_state=42).fit_predict(tfidf_matrix)
        except Exception as e:
            logging.error(f"聚类失败: {e}")
            return []

    # ----------------- Hot word explanation module -----------------
    def get_wiki_explanation(self, keyword):
        try:
            page = self.wiki.page(keyword)
            return page.summary[:200] + "..." if page.exists() else "暂无百科解释"
        except Exception as e:
            logging.error(f"获取解释失败: {e}")
            return "解释获取异常"

    # ----------------- News reference module -----------------
    def get_news_links(self, keyword, num=3):
        try:
            articles = self.newsapi.get_everything(q=keyword, language='zh', sort_by='relevancy')['articles']
            return [(a['title'], a['url']) for a in articles[:num]]
        except Exception as e:
            logging.error(f"获取新闻失败: {e}")
            return []

    # ----------------- Visualization module -----------------
    def generate_wordcloud(self, text):
        try:
            wordcloud = WordCloud(font_path=self.font_path,
                                  width=800, height=400,
                                  background_color='white').generate(text)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.show()
        except Exception as e:
            logging.error(f"生成词云失败: {e}")

    def visualize_results(self, df):
        try:
            prop = font_manager.FontProperties(fname=self.font_path)

            plt.figure(figsize=(12, 6))
            sns.countplot(x='分类', data=df, palette="viridis")
            plt.title("热词分类分布", fontproperties=prop)
            plt.xticks(rotation=45, fontproperties=prop)
            plt.show()

            self.generate_wordcloud(' '.join(df['热词']))
        except Exception as e:
            logging.error(f"可视化失败: {e}")

    # ----------------- Main pipeline -----------------
    def run(self, urls):
        # Data collection
        raw_data = []
        for page_content in self.fetch_all_data(urls):
            items = re.findall(r'<h2 class="news_entry">.*?<a href=".*?" target="_blank">(.*?)</a>',
                               page_content, re.S)
            raw_data.extend(items)

        # Data preprocessing
        df = pd.DataFrame(raw_data, columns=["原始文本"])
        df["清洗文本"] = df["原始文本"].apply(self.clean_text)
        df["分词结果"] = df["清洗文本"].apply(self.tokenize)

        # Feature engineering
        tfidf_matrix, keywords = self.extract_keywords(df["清洗文本"])
        model = self.train_word2vec(df["分词结果"])

        # Classification and clustering
        categories = ["人工智能", "区块链", "物联网", "大数据", "云计算", "其他"]
        df["分类"] = df["分词结果"].apply(lambda x: self.classify_text(x, categories, model))
        df["聚类"] = self.cluster_texts(tfidf_matrix) if tfidf_matrix is not None else 0

        # Hot word extraction: take the longest token in the cleaned text as the hot word
        df["热词"] = df["清洗文本"].apply(lambda x: max(re.findall(r'\w{2,}', x), key=len, default=''))
        df["解释"] = df["热词"].apply(self.get_wiki_explanation)
        df["相关新闻"] = df["热词"].apply(lambda x: self.get_news_links(x))

        # Save and visualize results
        self.visualize_results(df)
        df.to_excel("热词分析结果.xlsx", index=False)
        logging.info("分析完成,结果已保存")


if __name__ == "__main__":
    urls = [f'https://news.cnblogs.com/n/recommend?page={i}' for i in range(1, 6)]
    analyzer = HotwordAnalyzer()
    analyzer.run(urls)
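
Task 1 also asks for the crawl to run periodically and automatically, which the script above does not yet do. A minimal scheduling sketch, assuming the third-party `schedule` package is used (any cron-like mechanism would work just as well):

import schedule
import time

def scheduled_job():
    analyzer = HotwordAnalyzer()
    analyzer.run([f'https://news.cnblogs.com/n/recommend?page={i}' for i in range(1, 6)])

schedule.every(6).hours.do(scheduled_job)  # re-crawl every 6 hours

while True:
    schedule.run_pending()
    time.sleep(60)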
I am currently stuck on fetching data from newsapi; I will keep improving this part later.
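
Because the exception handler in get_news_links swallows the error and just returns an empty list, it is hard to see why the call fails. A minimal debugging sketch that queries the NewsAPI "everything" endpoint directly with requests and prints the raw status and error message (the API key below is still the placeholder and must be replaced):

import requests

resp = requests.get(
    "https://newsapi.org/v2/everything",
    params={
        "q": "人工智能",
        "language": "zh",
        "sortBy": "relevancy",
        "pageSize": 3,
        "apiKey": "your_newsapi_key",  # replace with the real key
    },
    timeout=10,
)
data = resp.json()
# NewsAPI returns status "error" plus code/message fields when something is wrong
print(data.get("status"), data.get("code"), data.get("message"))
for article in data.get("articles", []):
    print(article["title"], article["url"])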