jieba分词--python


import codecs
import os
import os.path

import jieba
import numpy
import pandas

# Build the corpus: one row per file found under the sample directory,
# holding the file's path and its whitespace-stripped text.
# Rows are collected in a plain list and turned into a DataFrame once,
# because growing a DataFrame row by row re-allocates on every insert.
corpus_rows = []
for root, dirs, files in os.walk(
    "D:\\PDM\\2.1\\SogouC.mini\\Sample\\C000007\\"
):
    for name in files:
        # os.path.join avoids the doubled separator that plain string
        # concatenation produces when `root` already ends with one.
        filePath = os.path.join(root, name)
        # `with` guarantees the handle is closed even if read() raises
        # (for example on a file that is not valid UTF-8).
        with codecs.open(filePath, 'r', 'utf-8') as f:
            content = f.read()
        corpus_rows.append([filePath, content.strip()])

# The index starts at 1 to match the numbering used when rows were
# inserted one at a time with .loc[len(frame) + 1].
corpos = pandas.DataFrame(
    corpus_rows,
    columns=['filePath', 'content'],
    index=range(1, len(corpus_rows) + 1),
)

# Perform word segmentation: one row per token produced by jieba.
# Each document's own path is iterated alongside its text so every
# token is attributed to the file it came from, rather than to whichever
# path the directory walk happened to visit last.
segment_rows = []
for filePath, content in zip(corpos['filePath'], corpos['content']):
    segment_rows.extend([filePath, seg] for seg in jieba.cut(content))

segments = pandas.DataFrame(
    segment_rows,
    columns=["filePath", 'segment'],
    index=range(1, len(segment_rows) + 1),
)

 

posted @ 2016-06-03 15:55  草莓干123456  阅读(907)  评论(0)  编辑  收藏  举报