一. 选题背景
二. 主题式网络爬虫设计方案
- 该网络爬虫为联合早报新闻数据爬虫
- 旨在爬取新闻的文本数据及图片数据进行持久化处理,并对文本数据进行普通词云及LDA模型词云生成,获取较为精确的热点信息。
- 设计方案:
对数据爬取及清洗,持久化处理,词云生成等功能进行模块化处理,以单独的函数进行单一功能模块设计。
三. 主题页面的结构特征分析
- 联合早报新闻包括中国主题及世界主题。
中国主题url: https://www.zaobao.com/news/china
世界主题url: https://www.zaobao.com/news/world
且其每一个子标签<a>内的信息,即为一条新闻的所有数据,href属性即为具体新闻页面的url链接变量,通过获取该主页的所有<a>标签数据,就获取到了对应新闻的链接关键字,对具体新闻页面数据进行爬取时,进行链接拼接。再逐一对新闻具体链接进行爬取。
3.通过BeautifulSoup对页面进行解析,并使用findAll()方法即可获取主页中各个链接信息。再进行拼接:url = f"https://www.zaobao.com/{i.attrs['href']}" 即为完整的新闻页面链接。
四. 网络爬虫程序设计
def start(url, data_path):
    """Crawl one news index page and download every article it links to.

    Args:
        url: URL of the index page to scan for article links.
        data_path: Directory for the downloaded data; created if missing.
    """
    # Create the data folder on first use.
    os.makedirs(data_path, exist_ok=True)
    r = requests.get(url, headers=headers)
    html = r.text
    soup = BeautifulSoup(html)
    a_all = soup.findAll('a')
    for i in a_all:
        # Anchors without a class attribute cannot match the class list below.
        if 'class' not in i.attrs.keys():
            continue
        elif i.attrs['class'] == ['col-lg-4', 'col-12', 'list-block', 'no-gutters', 'row']:
            # Only anchors carrying exactly this class list are followed;
            # their href is appended to the site root to form the page URL.
            url = f"https://www.zaobao.com/{i.attrs['href']}"
            get_info(url)
def get_info(url):
    """Download a single article page and store its text and images.

    Everything is written below ``data_path/<title>/``: images matched by the
    carousel selector are saved as ``0.jpg``, ``1.jpg``, ... and the text is
    written through ``save_text``.

    Args:
        url: Absolute URL of the article page.
    """
    r = requests.get(url=url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text)
    article = soup.find('article')
    title = soup.find('h1', class_='article-title')
    if title is None or article is None:
        # The page lacks the expected elements; skip it instead of crashing
        # with AttributeError on ``None.text``.
        print('页面缺少标题或正文,已跳过:', url)
        return

    # select() returns an empty list when nothing matches (it does not
    # raise), so the "no images" message has to be driven by the result.
    imgs = soup.select('#carousel-article > div > a > img')
    photo_src_lst = [img.get('src') for img in imgs if img.get('src')]
    if not photo_src_lst:
        print('这篇文章没有图片')

    title = title.text
    article = article.text
    print(f'《{title.strip()}》', '爬取成功')

    # Create the article folder. The title is filtered with the same
    # character class save_text() applies to the file name, because those
    # characters are not usable in a directory name either.
    folder_name = re.sub(r'[\\/:*?"<>|!:?!;\n]', '_', title.strip())
    info_path = data_path + '/' + folder_name
    os.makedirs(info_path, exist_ok=True)

    # Save the images.
    for i, src in enumerate(photo_src_lst):
        pic_path = info_path + os.sep + '%d.jpg' % i
        try:
            req = requests.get(src)
        except requests.exceptions.MissingSchema as e:
            # Scheme-less URL such as "//host/x.jpg": retry with "http:".
            print('图片URL出错,尝试补全URL')
            print(e)
            req = requests.get('http:' + src)
        # Write only after a response was obtained; a ``finally`` clause here
        # would run with ``req`` unbound (or left over from the previous
        # image) whenever the download itself failed.
        with open(pic_path, "wb") as f:
            f.write(req.content)

    save_text(info_path, title, article)
# Segment the text
with open('merged_txt.txt', 'r', encoding='utf-8') as f:
    seg_list = jieba.cut(f.read())
    seg_space = ' '.join(seg_list)

# This yields the segmented word list; passing it to WordCloud draws the
# word cloud. The complete function is as follows:


def keyword_cloud_main():
    """Build, display and save a word cloud from the merged news text.

    Returns:
        The segmented text (tokens joined by single spaces) that was fed to
        WordCloud, so callers can reuse it for further statistics.
    """
    # Merge all article files into merged_txt.txt.
    merge_txt_file('zaobao_news')

    # Read the stop-word list.
    with open(stop_word_path, encoding='utf-8') as fh:
        stopword_list = fh.readlines()
    news_text = getnewstext(newsTextdir)
    print(type(news_text))

    # Build the stop-word set.
    stop_words = add_stop_words(stopword_list)
    print('停用词共:', len(stop_words))

    # Load the custom dictionary.
    jieba.load_userdict(my_word_path)

    # Segment the text.
    with open('merged_txt.txt', 'r', encoding='utf-8') as f:
        seg_space = ' '.join(jieba.cut(f.read()))

    # Raw strings keep the backslashes of the Windows paths literal instead
    # of relying on unrecognised escape sequences being passed through.
    wc = WordCloud(font_path=r'C:\Windows\Fonts\simfang.ttf',
                   max_words=40,
                   random_state=42,
                   background_color='black',
                   stopwords=stop_words,
                   max_font_size=100,
                   scale=5,
                   collocations=False).generate(seg_space)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()

    # Save the result.
    wc.to_file(r'.\wordcloud_res.jpg')
    return seg_space
def histogram(seg, top_n=10):
    """Plot a bar chart of the most frequent words in *seg*.

    Args:
        seg: Text to analyse; it is segmented with ``jieba.cut`` and tokens
            shorter than two characters are ignored.
        top_n: Number of most frequent words to plot (default 10).
    """
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese labels
    mpl.rcParams['axes.unicode_minus'] = False  # draw minus signs as ASCII '-' so they render with this font
    wods = [x for x in jieba.cut(seg) if len(x) >= 2]
    # Rank once and split into labels / counts.
    top_words = Counter(wods).most_common(top_n)
    x = [word for word, _ in top_words]
    y = [count for _, count in top_words]
    plt.figure()
    plt.grid(False)
    plt.bar(x, y, color='lightskyblue')
    plt.xlabel('keywords')
    plt.ylabel('count')
    plt.title('高频词频统计图')
    plt.show()
def LDA_main():
    """Train an LDA model on the merged news text and save one word cloud per topic.

    Each line of ``merged_txt.txt`` is treated as one document. The number of
    topics is read from the module-level ``num_topics``. One image
    ``topic_<id>.png`` is written per topic.
    """
    merge_txt_file('zaobao_news')
    with open('merged_txt.txt', 'r', encoding="utf-8") as fh:
        alllines = fh.readlines()

    # Tokenise the documents and drop stop words.
    with open('stopword.txt', 'r', encoding="utf-8") as fh:
        stoplist = set(w.strip() for w in fh)
    segtexts = [
        [w for w in jieba.cut(line, cut_all=True)
         if len(w) > 1 and w not in stoplist]
        for line in alllines
    ]

    dictionary = Dictionary(segtexts)
    # Drop tokens found in fewer than 2 documents; keep at most 1000 tokens.
    dictionary.filter_extremes(2, 1.0, keep_n=1000)
    corpus = [dictionary.doc2bow(text) for text in segtexts]
    # Passing id2word lets the model report words instead of their ids.
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

    # Visualisation.
    font = r'C:\Windows\Fonts\simfang.ttf'
    wc = WordCloud(collocations=False, font_path=font, width=2800, height=2800, max_words=20, margin=2)
    for topicid in range(0, num_topics):
        # (word id, weight) pairs of the topic, p(w|z).
        tlist = lda.get_topic_terms(topicid, topn=1000)
        wdict = {dictionary[word_id]: weight for word_id, weight in tlist}
        wordcloud = wc.generate_from_frequencies(wdict)
        wordcloud.to_file('topic_' + str(topicid) + '.png')  # save the image
        print('topic_' + str(topicid) + '.png', '保存成功')
def save_text(file_dir, title, article):
    """Write one article to ``<file_dir>/<sanitised title>.txt`` as UTF-8.

    Args:
        file_dir: Existing directory that receives the file.
        title: Article title; written verbatim into the file and, with
            unsafe characters replaced by ``_``, used as the file name.
        article: Article body text.
    """
    res = ('标题' + title + '\r\n' +
           '新闻正文:' + article + '\r\n')
    # Some characters are not allowed in file names and must be filtered.
    title = re.sub(r'[\\/:*?"<>|!:?!;\n]', '_', title)
    file_path = file_dir + os.sep + title + '.txt'
    # The context manager closes the handle even if the write fails.
    with open(file_path, "wb") as f:
        f.write(res.encode("utf-8"))
# Save the rendered word cloud of the current topic as topic_<id>.png.
topic_png = 'topic_' + str(topicid) + '.png'
wordcloud.to_file(topic_png)
五. 总结
- 对该天的新闻文本分析发现,该新闻栏目无论是国际板块还是中国板块,主要内容均与“中国”、“政策”、“经济”等关键词相关,可以得出结论:该新闻栏目的新闻内容以时事政治为主。通过LDA模型,多方面的分析也可以验证以上结论。基本达到预期效果。
- 在此设计过程中发现爬虫程序的强大与便捷,但是仅仅掌握了数据爬取能力还不够,还需要学习数据分析的相关知识,从大量的数据中获取有效、有价值的数据。改进:可以选取主流的新闻平台进行新闻内容的获取,提高实际作用。
"""Zaobao news crawler with word-cloud, word-frequency and LDA topic analysis.

Pipeline: download the China and World news listings, merge all article texts
into ``merged_txt.txt``, draw a word cloud and a top-word histogram, then
train an LDA model and save one word cloud per topic.
"""
import os
import re
from collections import Counter

import chardet
import jieba
import matplotlib
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from imageio import imread
from matplotlib.font_manager import FontProperties
from wordcloud import WordCloud, ImageColorGenerator

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
}
url_china = 'https://www.zaobao.com/news/china'
url_world = 'https://www.zaobao.com/news/world'
data_path = 'zaobao_news'

# Merged news text and word-list locations.
newsTextdir = r'merged_txt.txt'
stop_word_path = r'.\stopword.txt'
my_word_path = r'.\myword.txt'

# Font used for the word clouds; a raw string keeps the backslashes literal.
FONT_PATH = r'C:\Windows\Fonts\simfang.ttf'

# Number of LDA topics. Defined at module level so LDA_main() also works
# when this file is imported instead of being run as a script.
num_topics = 5


def save_text(file_dir, title, article):
    """Write one article to ``<file_dir>/<sanitised title>.txt`` as UTF-8.

    Args:
        file_dir: Existing directory that receives the file.
        title: Article title; written verbatim into the file and, with
            unsafe characters replaced by ``_``, used as the file name.
        article: Article body text.
    """
    res = ('标题' + title + '\r\n' +
           '新闻正文:' + article + '\r\n')
    # Some characters are not allowed in file names and must be filtered.
    title = re.sub(r'[\\/:*?"<>|!:?!;\n]', '_', title)
    file_path = file_dir + os.sep + title + '.txt'
    # The context manager closes the handle even if the write fails.
    with open(file_path, "wb") as f:
        f.write(res.encode("utf-8"))


def get_info(url):
    """Download a single article page and store its text and images.

    Everything is written below ``data_path/<title>/``: images matched by the
    carousel selector are saved as ``0.jpg``, ``1.jpg``, ... and the text is
    written through ``save_text``.

    Args:
        url: Absolute URL of the article page.
    """
    r = requests.get(url=url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text)
    article = soup.find('article')
    title = soup.find('h1', class_='article-title')
    if title is None or article is None:
        # The page lacks the expected elements; skip it instead of crashing
        # with AttributeError on ``None.text``.
        print('页面缺少标题或正文,已跳过:', url)
        return

    # select() returns an empty list when nothing matches (it does not
    # raise), so the "no images" message has to be driven by the result.
    imgs = soup.select('#carousel-article > div > a > img')
    photo_src_lst = [img.get('src') for img in imgs if img.get('src')]
    if not photo_src_lst:
        print('这篇文章没有图片')

    title = title.text
    article = article.text
    print(f'《{title.strip()}》', '爬取成功')

    # Create the article folder. The title is filtered with the same
    # character class save_text() applies to the file name, because those
    # characters are not usable in a directory name either.
    folder_name = re.sub(r'[\\/:*?"<>|!:?!;\n]', '_', title.strip())
    info_path = data_path + '/' + folder_name
    os.makedirs(info_path, exist_ok=True)

    # Save the images.
    for i, src in enumerate(photo_src_lst):
        pic_path = info_path + os.sep + '%d.jpg' % i
        try:
            req = requests.get(src)
        except requests.exceptions.MissingSchema as e:
            # Scheme-less URL such as "//host/x.jpg": retry with "http:".
            print('图片URL出错,尝试补全URL')
            print(e)
            req = requests.get('http:' + src)
        # Write only after a response was obtained; a ``finally`` clause here
        # would run with ``req`` unbound (or left over from the previous
        # image) whenever the download itself failed.
        with open(pic_path, "wb") as f:
            f.write(req.content)

    save_text(info_path, title, article)


def start(url, data_path):
    """Crawl one news index page and download every article it links to.

    Args:
        url: URL of the index page to scan for article links.
        data_path: Directory for the downloaded data; created if missing.
    """
    # Create the data folder on first use.
    os.makedirs(data_path, exist_ok=True)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text)
    for anchor in soup.findAll('a'):
        # Only anchors carrying exactly this class list are followed.
        if 'class' not in anchor.attrs:
            continue
        if anchor.attrs['class'] == ['col-lg-4', 'col-12', 'list-block', 'no-gutters', 'row']:
            get_info(f"https://www.zaobao.com/{anchor.attrs['href']}")


def download_data_main():
    """Download the China and World news listings into ``data_path``."""
    start(url_china, data_path)
    start(url_world, data_path)
    print('------' * 3)
    print('爬取全部完成')
    print('------' * 3)


def merge_txt_file(txt_dir):
    """Concatenate every ``.txt`` file below *txt_dir* into ``merged_txt.txt``.

    Args:
        txt_dir: Root directory that is walked recursively.
    """
    with open('merged_txt.txt', 'w', encoding='utf-8') as fa:
        for root, _dirs, files in os.walk(txt_dir):
            for name in files:
                if not name.endswith('.txt'):
                    continue
                # os.path.join picks the right separator on every platform.
                with open(os.path.join(root, name), 'r', encoding='utf-8') as fs:
                    fa.write(fs.read())


def add_stop_words(list):
    """Return the set of stripped entries of the given stop-word lines."""
    return {item.strip() for item in list}


def getnewstext(newsdir):
    """Read every ``*txt`` file below *newsdir* and return their contents.

    A file is first opened with the platform default encoding; if decoding
    fails, the encoding is detected with ``chardet`` and the file is re-read.

    Args:
        newsdir: Root directory that is walked recursively.

    Returns:
        A list with one string per file.
    """
    docs = []
    for d, _s, fns in os.walk(newsdir):
        for fn in fns:
            if fn[-3:] != 'txt':
                continue
            file = d + os.sep + fn
            try:
                with open(file) as f:
                    lines = f.readlines()
            except UnicodeDecodeError:
                with open(file, "rb") as ft:
                    cs = chardet.detect(ft.read())
                with open(file, encoding=cs['encoding']) as f:
                    lines = f.readlines()
            docs.append('\n'.join(lines))
    return docs


def keyword_cloud_main():
    """Build, display and save a word cloud from the merged news text.

    Returns:
        The segmented text (tokens joined by single spaces) that was fed to
        WordCloud, so callers can reuse it for further statistics.
    """
    # Merge all article files into merged_txt.txt.
    merge_txt_file('zaobao_news')

    # Read the stop-word list.
    with open(stop_word_path, encoding='utf-8') as fh:
        stopword_list = fh.readlines()
    news_text = getnewstext(newsTextdir)
    print(type(news_text))

    # Build the stop-word set.
    stop_words = add_stop_words(stopword_list)
    print('停用词共:', len(stop_words))

    # Load the custom dictionary.
    jieba.load_userdict(my_word_path)

    # Segment the text.
    with open('merged_txt.txt', 'r', encoding='utf-8') as f:
        seg_space = ' '.join(jieba.cut(f.read()))

    wc = WordCloud(font_path=FONT_PATH,
                   max_words=40,
                   random_state=42,
                   background_color='black',
                   stopwords=stop_words,
                   max_font_size=100,
                   scale=5,
                   collocations=False).generate(seg_space)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()

    # Save the result.
    wc.to_file(r'.\wordcloud_res.jpg')
    return seg_space


def LDA_main():
    """Train an LDA model on the merged news text and save one word cloud per topic.

    Each line of ``merged_txt.txt`` is treated as one document. The number of
    topics is read from the module-level ``num_topics``. One image
    ``topic_<id>.png`` is written per topic.
    """
    merge_txt_file('zaobao_news')
    with open('merged_txt.txt', 'r', encoding="utf-8") as fh:
        alllines = fh.readlines()

    # Tokenise the documents and drop stop words.
    with open('stopword.txt', 'r', encoding="utf-8") as fh:
        stoplist = set(w.strip() for w in fh)
    segtexts = [
        [w for w in jieba.cut(line, cut_all=True)
         if len(w) > 1 and w not in stoplist]
        for line in alllines
    ]

    dictionary = Dictionary(segtexts)
    # Drop tokens found in fewer than 2 documents; keep at most 1000 tokens.
    dictionary.filter_extremes(2, 1.0, keep_n=1000)
    corpus = [dictionary.doc2bow(text) for text in segtexts]
    # Passing id2word lets the model report words instead of their ids.
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

    # Visualisation.
    wc = WordCloud(collocations=False, font_path=FONT_PATH, width=2800, height=2800, max_words=20, margin=2)
    for topicid in range(0, num_topics):
        # (word id, weight) pairs of the topic, p(w|z).
        tlist = lda.get_topic_terms(topicid, topn=1000)
        wdict = {dictionary[word_id]: weight for word_id, weight in tlist}
        wordcloud = wc.generate_from_frequencies(wdict)
        wordcloud.to_file('topic_' + str(topicid) + '.png')  # save the image
        print('topic_' + str(topicid) + '.png', '保存成功')


def histogram(seg, top_n=10):
    """Plot and save a bar chart of the most frequent words in *seg*.

    The chart is written to ``his_res.png`` and then shown.

    Args:
        seg: Text to analyse; it is segmented with ``jieba.cut`` and tokens
            shorter than two characters are ignored.
        top_n: Number of most frequent words to plot (default 10).
    """
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese labels
    matplotlib.rcParams['axes.unicode_minus'] = False  # draw minus signs as ASCII '-' so they render with this font
    wods = [x for x in jieba.cut(seg) if len(x) >= 2]
    # Rank once and split into labels / counts.
    top_words = Counter(wods).most_common(top_n)
    x = [word for word, _ in top_words]
    y = [count for _, count in top_words]
    plt.figure()
    plt.grid(False)
    plt.bar(x, y, color='lightskyblue')
    plt.xlabel('keywords')
    plt.ylabel('count')
    plt.title('高频词频统计图')
    plt.savefig('his_res.png')
    plt.show()


def main():
    """Run the whole pipeline: crawl, merge, word cloud, histogram, LDA."""
    download_data_main()
    merge_txt_file('zaobao_news')
    seg = keyword_cloud_main()
    histogram(seg)
    LDA_main()


if __name__ == '__main__':
    main()
