爬取网站出现高频关键词

import requests
from bs4 import BeautifulSoup
import jieba
    
    
#爬取页面代码并解析
def get_html(url):
    """Download ``url`` and return its content parsed with BeautifulSoup.

    The response is decoded with the encoding detected from the body
    (``apparent_encoding``) before being handed to the ``html.parser``
    backend.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (connection error, timeout, or an HTTP error status). An error
        message is printed in that case.
    """
    try:
        # A timeout keeps the script from hanging forever on a dead host.
        response = requests.get(url, timeout=10)
        # raise_for_status is a method: without the call, HTTP 4xx/5xx
        # responses would be treated as successful pages.
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are handled here; KeyboardInterrupt and
        # programming errors are no longer silently swallowed.
        print('爬取出错')
        return None
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


#计算关键词出现次数
def count_word(txt):
    """Count how often each multi-character token occurs in ``txt``.

    The text is segmented with ``jieba.cut``; tokens whose length is exactly
    one are ignored.

    Args:
        txt: Text to segment and count.

    Returns:
        A dict mapping each kept token to its number of occurrences, in order
        of first appearance.
    """
    frequency = {}
    for token in jieba.cut(txt):
        # Single-character tokens are skipped.
        if len(token) == 1:
            continue
        if token in frequency:
            frequency[token] += 1
        else:
            frequency[token] = 1
    return frequency


def main():
    """Print the 15 most frequent multi-character words on the target page.

    Fetches the page, extracts its visible text, drops every character whose
    code point is below 256, counts the segmented words and prints up to 15
    of them, most frequent first.
    """
    url = 'http://www.c114.com.cn/'
    html = get_html(url)
    if html is None:
        # get_html has already reported the failure; nothing to analyse.
        return
    print('get html')
    page_text = html.get_text('+', strip=True)
    # Keep only characters with code point >= 256, which removes English
    # letters, digits, ASCII punctuation and the '+' separator used above.
    txt = "".join(ch for ch in page_text if ord(ch) >= 256)
    print('get txt')
    counts = count_word(txt)
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Slicing instead of indexing 0..14 avoids an IndexError when the page
    # yields fewer than 15 distinct words.
    for word, count in ranked[:15]:
        print('{:<15}{:>5}'.format(word, count))


# Run the scraper only when executed as a script, so importing this module
# does not trigger a network request.
if __name__ == '__main__':
    main()

 分别以 C114通信网[http://www.c114.com.cn/] 和 通信人家园[http://www.txrjy.com/forum.php] 这两个网站为例:

 

 

 

 

 

posted on 2019-12-10 20:31  ifran  阅读(848)  评论(0)  编辑  收藏  举报

导航