爬取网站出现高频关键词
import requests from bs4 import BeautifulSoup import jieba #爬取页面代码并解析 def get_html(url): try: response=requests.get(url) response.raise_for_status response.encoding=response.apparent_encoding html=BeautifulSoup(response.text,'html.parser') return html except: print('爬取出错') #计算关键词出现次数 def count_word(txt): counts={} words=jieba.cut(txt) for word in words: if len(word)==1: continue else: counts[word]=counts.get(word,0)+1 return counts def main(): url='http://www.c114.com.cn/' html=get_html(url) print('get html') t=html.get_text('+',strip=True) txt = "".join(i for i in t if ord(i) >= 256) #txt中除去英文 print('get txt') counts=count_word(txt) items=list(counts.items()) items.sort(key=lambda x:x[1],reverse=True) for i in range(15): word,count=items[i] print('{:<15}{:>5}'.format(word,count)) main()
分别以 c11通信网[http://www.c114.com.cn/] & 通信人家园[http://www.txrjy.com/forum.php] 这两个网站为例: