10 行代码使用 Python 统计词频

# -*- coding: utf-8 -*-
#!/usr/bin/env python
import re

from collections import Counter

# Text file whose words are counted when this file is run as a script.
INPUT_PATH = "C:\\Users\\陶敏\\Documents\\Pyscript\\test.txt"

# A word boundary is any single comma, space, semicolon, period, newline or tab.
_DELIMITERS = re.compile(r'[, ;.\n\t]')


def split_words(text):
    """Split *text* into words, discarding empty fragments.

    Consecutive delimiters (for example ", " or a blank line) make
    ``re.split`` emit empty strings. They are filtered out in a single pass
    here; removing items from a list while iterating over it skips elements
    and would leave some empty strings behind.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of non-empty, case-sensitive word strings in document order.
    """
    return [piece for piece in _DELIMITERS.split(text) if piece]


def top_words(text, limit=3):
    """Return the most frequent words of *text*.

    Args:
        text: The raw text to analyse.
        limit: Maximum number of (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent. Words with equal counts keep the order in which they first
        appear. Fewer than ``limit`` pairs are returned when the text has
        fewer distinct words, so short or empty input does not raise.
    """
    return Counter(split_words(text)).most_common(limit)


def main():
    """Read INPUT_PATH and print its three most frequent words."""
    # The context manager closes the file even if reading fails. The explicit
    # encoding keeps results independent of the platform default, and
    # undecodable bytes are replaced rather than aborting the whole count.
    with open(INPUT_PATH, encoding="utf-8", errors="replace") as source:
        text = source.read()

    for word, count in top_words(text):
        print(word, count)


if __name__ == "__main__":
    main()

使用哈利波特文档作为测试文件,下载地址:https://pan.baidu.com/share/link?shareid=424773&uk=3744444146

测试结果如下

C:\python\python36\python3.exe C:/Users/陶敏/PycharmProjects/day1/.idea/cipin.py
the 3305
to 1841
and 1797

 

posted @ 2018-01-04 15:30  taomin  阅读(870)  评论(0)  编辑  收藏  举报