Word Frequency Statistics
The script below prompts for an input file (.txt or .docx), an output file, a set of characters to exclude, and how many of the most frequent characters to report (leave blank for all); it then counts how often each character appears and writes the ranked result to the output file.

```python
import codecs
import docx  # python-docx, required by docx.Document below

word_lst = []
word_dict = {}

infile = input("Enter the file to analyze: ")
outfile = input("Enter the output file name: ")
exclude_str = input("Enter characters to exclude: ")
n = input("How many top results to show (blank for all): ")

def convertUTF8ToANSI(infile):
    """Convert the UTF-8 text file at `infile` to ANSI in place (Windows-only 'mbcs' codec)."""
    # read the UTF-8 text file
    f = codecs.open(infile, 'r', 'utf8')
    utfstr = f.read()
    f.close()
    # re-encode the UTF-8 string as ANSI
    outansestr = utfstr.encode('mbcs')
    # save the converted text in binary mode
    f = open(infile, 'wb')
    f.write(outansestr)
    f.close()

def ReadWord():
    """Read a .docx document character by character."""
    fword = docx.Document(infile)      # argument is the file path
    for para in fword.paragraphs:      # iterate over every paragraph
        for char in para.text:         # para.text is the paragraph's text
            word_lst.append(char)
    print(word_lst)

def ReadTxt():
    """Read a .txt document character by character."""
    try:
        convertUTF8ToANSI(infile)
    except Exception:
        print("Encoding conversion failed")
    # append every character to the list
    for line in fileIn:
        for char in line:
            word_lst.append(char)

with open(infile, "r") as fileIn, open(outfile, 'w') as fileOut:
    fileName = infile.split('.')
    if fileName[-1] == "docx":
        ReadWord()
    if fileName[-1] == "txt":
        ReadTxt()

    # count how often each character occurs, using a dictionary
    word_lst = [x.strip() for x in word_lst if x.strip() != '']
    for char in word_lst:
        if char not in exclude_str:
            word_dict[char] = word_dict.get(char, 0) + 1

    # sort: x[1] sorts by frequency, x[0] would sort by character
    lstWords = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)

    # print the result (all entries, or only the top n)
    print('Char\tFreq')
    print('=============')
    i = 1
    if n == '':
        for e in lstWords:
            print(str(i) + '\t' + '%s\t%d' % e)
            i += 1
            fileOut.write('%s, %d\n' % e)
    else:
        n = int(n)
        for e in lstWords[:n]:
            print(str(i) + '\t' + '%s\t%d' % e)
            i += 1
            fileOut.write('%s, %d\n' % e)
```
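The original listing also imports chardet without ever calling it, presumably with the intention of checking the source encoding before converting. Below is a minimal sketch of that idea, assuming the chardet package is installed; the helper name `detect_encoding` is hypothetical, and `infile` / `convertUTF8ToANSI` refer to the names in the listing above.

```python
import chardet

def detect_encoding(path):
    """Guess a file's encoding from its raw bytes using chardet."""
    with open(path, 'rb') as f:
        raw = f.read()
    return chardet.detect(raw)['encoding']   # e.g. 'utf-8', 'GB2312', or None

# Only attempt the UTF-8 -> ANSI conversion when the file actually looks like UTF-8,
# instead of relying on a bare try/except around the conversion.
encoding = detect_encoding(infile)
if encoding and encoding.lower().startswith('utf-8'):
    convertUTF8ToANSI(infile)
```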
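The counting-and-sorting core (the `word_dict` loop plus the `sorted(...)` call) can also be written with `collections.Counter` from the standard library. This is only an illustrative alternative, not what the script does; the sample `word_lst` and `exclude_str` values here are made up for the demonstration.

```python
from collections import Counter

# hypothetical sample data, standing in for the list built by ReadWord/ReadTxt
word_lst = ['天', '地', '人', '天', '人', '人']
exclude_str = ''

# Counter replaces the manual dict-building loop...
counts = Counter(c for c in word_lst if c.strip() and c not in exclude_str)

# ...and most_common() replaces the sorted(...) call: it already returns
# (char, count) pairs in descending order of frequency.
for rank, (char, freq) in enumerate(counts.most_common(10), start=1):
    print('%d\t%s\t%d' % (rank, char, freq))
```

Calling `most_common()` with no argument returns every entry, which matches the "blank input means print everything" branch in the script above.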