英文处理

# 1. Prepare a UTF-8 encoded text file.
# 2. Read the whole file into one string. The `with` block closes the file
#    handle as soon as the read finishes, even if reading raises.
# NOTE: the name `str` shadows the builtin; it is kept because the later
# steps of this script read the text through that name.
with open("songs.text", "r", encoding="utf-8") as f:
    str = f.read()
print(str)
# 3. Preprocess the text: fold everything to lower case.
strLower = str.lower()
print(strLower)
结果如下:


# 4. Split the text into a list of words.
# The lower-cased text from step 3 is split (not the raw text) so that
# "The" and "the" are the same word and can match the lower-case stop
# words used in step 7. split() with no argument breaks on any run of
# whitespace and never yields empty strings.
strList = strLower.split()
print(len(strList), strList)


# 5.单词计数字典 set , dict
# Count words using a set: each distinct word is looked up in the full
# word list. list.count() rescans the list for every word, which is fine
# for a short text but quadratic in general.
strSet = set(strList)
for word in strSet:
    print(word, strList.count(word))
# Count words using a dictionary: one pass over the word list.
# Iterating the word list (not the raw string) is essential here: looping
# over a string yields single characters, which would produce character
# counts instead of word counts.
# NOTE: the name `dict` shadows the builtin; it is kept because the
# sorting step below reads the counts through that name.
dict = {}
for word in strList:
    dict[word] = dict.get(word, 0) + 1
print(dict)


# 6. Sort the (word, count) pairs by count, most frequent first.
csList = list(dict.items())
csList.sort(key=lambda x: x[1], reverse=True)
print(csList)
# 7. Exclude grammatical words (pronouns, articles, conjunctions, ...) that
#    carry no meaning of their own.
exclude = {'a', 'the', 'and', 'i', 'you', 'in'}
strSet = strSet - exclude
print(len(strSet), strSet)
# Apply the same exclusion to the ranked list; filtering only the set
# would leave the stop words at the top of the ranking.
ranked = [pair for pair in csList if pair[0] not in exclude]
# 8. Print the TOP 20. Slicing (rather than indexing 0..19) keeps this
#    from raising IndexError when fewer than 20 distinct words remain.
for word, count in ranked[:20]:
    print(word, count)



posted @ 2018-09-28 20:15  傻猪一号  阅读(190)  评论(0)  编辑  收藏  举报