Chinese text processing

# 1. Prepare a UTF-8 encoded text file
import jieba
f = open('novels.text', 'r', encoding='utf-8')

# 2. Read the file into a string
str = f.read()

# 3. Preprocess the text: segment it with jieba
# (default accurate mode; cut_all=True would give full mode instead)
words = jieba.lcut(str)
print(words)

# 4. Words can also be extracted with the search-engine mode
strList = jieba.cut_for_search(str)

# 5. Count the words with a dictionary
dict = {}
for i in words:
    if i in dict:
        dict[i] = dict[i] + 1
    else:
        dict[i] = 1
print(dict)

# 6. Sort by word frequency with list.sort(key=)
csList = list(dict.items())
csList.sort(key=lambda x: x[1], reverse=True)
print(csList)

# 7. Output the TOP 20
for i in range(20):
    print(csList[i])
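
As a reference for steps 3 and 4, jieba provides three segmentation modes (accurate, full, and search-engine); a minimal sketch on a short made-up sentence:

import jieba

sample = '我来到北京清华大学'
print('/'.join(jieba.cut(sample)))                 # accurate mode (default)
print('/'.join(jieba.cut(sample, cut_all=True)))   # full mode
print('/'.join(jieba.cut_for_search(sample)))      # search-engine mode

Steps 5 through 7 can also be written with collections.Counter from the standard library; a minimal sketch, assuming the same novels.text file as above:

import jieba
from collections import Counter

with open('novels.text', 'r', encoding='utf-8') as f:
    counts = Counter(jieba.lcut(f.read()))

# most_common(20) returns the 20 (word, count) pairs with the highest counts
for word, freq in counts.most_common(20):
    print(word, freq)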

 

English text processing

# 1. Prepare a UTF-8 encoded text file
f = open("songs.text", "r", encoding="utf-8")
# 2. Read the file into a string
str = f.read()
print(str)
# 3. Preprocess the text: convert it to lower case
strLower = str.lower()
print(strLower)
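
Besides lower(), preprocessing usually also strips punctuation, otherwise 'you' and 'you,' are counted as different words; a small self-contained sketch using the standard string module (the sample line is made up):

import string

sample = "We will, we will rock you!"
# str.maketrans builds a table that deletes every ASCII punctuation character
cleaned = sample.lower().translate(str.maketrans('', '', string.punctuation))
print(cleaned.split())   # ['we', 'will', 'we', 'will', 'rock', 'you']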
 
# 4. Split the string into a list of words
strList = strLower.split()
print(len(strList), strList)
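
For reference, list() and split() do different things here: list() breaks the string into single characters, while split() breaks it into whitespace-separated words. A tiny sketch with a made-up string:

s = "we will rock you"
print(list(s)[:6])   # ['w', 'e', ' ', 'w', 'i', 'l']  -- characters
print(s.split())     # ['we', 'will', 'rock', 'you']  -- words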

 


# 5. Count the words, with a set and with a dict

# Counting with a set
strSet = set(strList)
for word in strSet:
    print(word, strList.count(word))

# Counting with a dictionary
dict = {}
for word in strList:
    if word in dict:
        dict[word] = dict[word] + 1
    else:
        dict[word] = 1
print(dict)
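
The if/else branch in the dict version can be shortened with dict.get(), a common counting idiom; a minimal sketch with a made-up word list:

words = ['we', 'will', 'we', 'will', 'rock', 'you']
counts = {}
for word in words:
    # get() returns 0 the first time a word is seen
    counts[word] = counts.get(word, 0) + 1
print(counts)   # {'we': 2, 'will': 2, 'rock': 1, 'you': 1}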

 


# 6. Sort by word frequency
csList = list(dict.items())
csList.sort(key=lambda x: x[1], reverse=True)
print(csList)

# 7. Exclude grammatical words with no real meaning (pronouns, articles, conjunctions, etc.)
exclude = {'a', 'the', 'and', 'i', 'you', 'in'}
strSet = strSet - exclude
print(len(strSet), strSet)

# 8. Output the TOP 20
for i in range(20):
    print(csList[i])
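
Note that subtracting exclude from strSet does not change csList, so the TOP 20 printed above can still contain words such as 'the'. If the goal is a top 20 without the grammatical words, one option (a sketch reusing the names defined above) is to filter csList first:

filtered = [(word, freq) for word, freq in csList if word not in exclude]
for word, freq in filtered[:20]:
    print(word, freq)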

 

