爬取网站出现高频关键词

import requests
from bs4 import BeautifulSoup
import jieba
    
    
# Fetch a page and parse it into a BeautifulSoup tree.
def get_html(url, timeout=10):
    """Download *url* and return its parsed HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend,
        or ``None`` if the request failed (connection error, timeout, or
        an HTTP 4xx/5xx status). An error message is printed on failure.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Must be *called*: the bare attribute reference is a no-op and
        # would let HTTP error pages through as if they were content.
        response.raise_for_status()
    except requests.RequestException:
        print('爬取出错')
        return None
    # Let requests guess the charset from the body; the declared header
    # encoding is not relied upon here.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


# Count how often each segmented word occurs.
def count_word(txt):
    """Segment *txt* with jieba and tally the resulting words.

    Tokens that are exactly one character long are ignored.

    Args:
        txt: The text to segment.

    Returns:
        A dict mapping each counted token to its number of occurrences.
    """
    tally = {}
    for token in jieba.cut(txt):
        if len(token) != 1:
            tally[token] = tally.get(token, 0) + 1
    return tally


def main():
    """Scrape the configured site and print its most frequent words.

    Prints up to 15 ``word  count`` rows, most frequent first. Returns
    early without printing results if the page could not be fetched.
    """
    url = 'http://www.c114.com.cn/'
    html = get_html(url)
    if html is None:
        # get_html has already reported the failure; nothing to analyse.
        return
    print('get html')
    t = html.get_text('+', strip=True)
    # Keep only characters with code point >= 256. This drops ASCII/Latin-1
    # text (English letters, digits, punctuation) and also the '+'
    # separators inserted above, so text from adjacent elements ends up
    # concatenated before segmentation.
    txt = "".join(ch for ch in t if ord(ch) >= 256)
    print('get txt')
    counts = count_word(txt)
    # sorted() is stable, so ties keep their first-seen order.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    # Slice instead of indexing range(15): pages with fewer than 15
    # distinct words must not raise IndexError.
    for word, count in items[:15]:
        print('{:<15}{:>5}'.format(word, count))


if __name__ == '__main__':
    main()

分别以 C114通信网[http://www.c114.com.cn/] 和 通信人家园[http://www.txrjy.com/forum.php] 这两个网站为例：

posted on 2019-12-10 20:31 ifran 阅读(848) 评论(0) 编辑收藏举报

刷新页面返回顶部

爬取网站出现高频关键词

导航

公告