Processing Chinese text the way English text is processed (file loading & jieba segmentation)
After word segmentation, Chinese documents become space-separated token streams and can be handled like English text. Code:
import os
import jieba

paths = r"路径"  # placeholder directory; source files are named 0.txt, 1.txt, ...
posts = [open(os.path.join(paths, f)).read() for f in os.listdir(paths)]  # file reading method 1

# Chinese word segmentation / write the segmented text to new files
pathsCut = r"C:\Users\Administrator\workspace\eclipse\mlTest\src\kmeans\cut"
postAll = []
for n in range(5):
    postTmp = []
    tmp = jieba.cut(posts[n], cut_all=False)  # cut() returns a generator
    fileName = pathsCut + "/" + str(n) + ".txt"
    f = open(fileName, "a")
    for s in tmp:
        # f.write(s.join(" "))  # original attempt to put spaces between tokens; join is used the wrong way round
        f.write(s)
        f.write(" ")  # space between tokens
        postTmp.append(s)
    postAll.append(postTmp)
    f.close()
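The commented-out s.join(" ") line above was meant to separate tokens with spaces, but string join works the other way round: the separator string calls join on the token sequence. A minimal sketch of that idiom, assuming the posts list and pathsCut directory from the snippet above:

import os
import jieba

# Assumption: posts (raw documents) and pathsCut (output directory) are defined as above.
for n, text in enumerate(posts):
    tokens = jieba.cut(text, cut_all=False)   # generator of tokens
    segmented = " ".join(tokens)              # join the whole token stream with single spaces
    with open(os.path.join(pathsCut, str(n) + ".txt"), "w", encoding="utf-8") as out:
        out.write(segmented)

Opening the output file with "w" instead of "a" also avoids appending duplicate text when the script is re-run.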
File reading method 2:
for line in open("file.txt"):
    for word in line.split():
        print(word)
Overview of jieba segmentation:
import jieba as jb
import jieba.posseg as pos

jb.load_userdict(r"C:\et_corpus.txt")  # load a user-defined dictionary (raw string keeps the backslash literal)
tmp = jb.cut(posts[n], cut_all=False)  # returns a generator; continues from the code above
# cut_all: True = full mode; False = precise mode (default); jb.cut_for_search() = search-engine mode
cutPos = pos.cut("text")  # part-of-speech tagging
for w in cutPos:
    print(w.word, w.flag)
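To see how the three modes differ on a concrete sentence, the sketch below runs the same string through each of them; the sample sentence is arbitrary and the exact tokens depend on jieba's dictionary:

import jieba

sentence = "我来到北京清华大学"  # arbitrary sample sentence

print("/".join(jieba.cut(sentence, cut_all=True)))   # full mode: every word the dictionary can find
print("/".join(jieba.cut(sentence, cut_all=False)))  # precise mode (default): one best segmentation
print("/".join(jieba.cut_for_search(sentence)))      # search-engine mode: precise mode plus extra short words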
Keyword extraction with jieba (TF-IDF):
import os
import math
import jieba.analyse

# These three functions are methods of the author's preprocessing class (hence the self parameter).

def tfidf1(self, word, file, files):
    # manual per-word TF-IDF
    tf = float(file.count(word)) / sum(file.count(w) for w in set(file))  # term frequency in this document
    idf = math.log(float(len(files)) / len([doc for doc in files if word in doc]))  # natural log; stands in for the unimported sp.log
    return tf * idf

def tfidf2(self, paths, n):
    posts = [open(os.path.join(paths, f)).readlines() for f in os.listdir(paths)]
    # jieba ships its own keyword extraction, based on the same TF-IDF idea
    keyWords = []
    for f in posts:
        keyWord = jieba.analyse.extract_tags(f[0], n)  # top-n keywords of the first line
        keyWords.append(str(keyWord).replace("'", ""))
    return keyWords

def tfidf3(self, paths, n):
    from sklearn.feature_extraction.text import TfidfTransformer as tf
    from sklearn.feature_extraction.text import CountVectorizer as cv
    ins = PreProcessing()            # the author's helper class for loading the corpus
    posts = ins.loadData(paths)
    vectorizer = cv()
    transformer = tf()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(posts))
    word = vectorizer.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0
    print(word)
    weight = tfidf.toarray()
    for i in range(len(weight)):       # one row per document
        for j in range(len(word)):     # one column per vocabulary term
            print(word[j], weight[i][j])  # TF-IDF weight of term j in document i
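To make the formula in tfidf1 concrete, here is a small self-contained check on a toy corpus of token lists; the token lists are made up purely for illustration:

import math

def tfidf(word, doc, docs):
    # term frequency: occurrences of `word` divided by the document's total token count
    tf = float(doc.count(word)) / sum(doc.count(w) for w in set(doc))
    # inverse document frequency: log(#documents / #documents containing `word`)
    idf = math.log(float(len(docs)) / len([d for d in docs if word in d]))
    return tf * idf

docs = [["a", "b", "b"], ["a", "c"], ["c", "c", "d"]]
print(tfidf("b", docs[0], docs))  # "b" occurs only in the first document -> relatively high weight
print(tfidf("a", docs[0], docs))  # "a" occurs in two of three documents -> lower weight

Note that a word appearing in every document gets idf = log(1) = 0, so its TF-IDF weight vanishes no matter how often it occurs.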