jieba分词--python

复制代码

import os
import os.path
import codecs

import numpy
import pandas
import jieba

# Script purpose: read every file under the sample directory into a corpus
# table (`corpos`: filePath, content), then tokenize each document with jieba
# into a long-format table (`segments`: filePath, segment), one row per token.

# Build the corpus.
# Rows are collected in a plain list and turned into a DataFrame once;
# growing a DataFrame row by row with `.loc` copies data on every insert.
corpus_rows = []
for root, dirs, files in os.walk(
    "D:\\PDM\\2.1\\SogouC.mini\\Sample\\C000007\\"
):
    for name in files:
        # os.path.join avoids the doubled separator that plain string
        # concatenation produced when `root` already ends with a backslash.
        filePath = os.path.join(root, name)
        # The context manager closes the file even if reading/decoding fails.
        with codecs.open(filePath, 'r', 'utf-8') as f:
            content = f.read()
        corpus_rows.append([filePath, content.strip()])

corpos = pandas.DataFrame(corpus_rows, columns=['filePath', 'content'])
# Keep the 1-based row labels the original `.loc[len(corpos)+1]` inserts gave.
corpos.index = pandas.RangeIndex(1, len(corpos) + 1)

# Segment every document.
# Each content is paired with its OWN file path; previously the loop reused
# the stale `filePath` left over from the walk above, so every token was
# attributed to the last file read.
segment_rows = []
for filePath, content in zip(corpos['filePath'], corpos['content']):
    for seg in jieba.cut(content):
        segment_rows.append([filePath, seg])

segments = pandas.DataFrame(segment_rows, columns=['filePath', 'segment'])
# Same 1-based labelling as the corpus table.
segments.index = pandas.RangeIndex(1, len(segments) + 1)
复制代码

 

posted @   草莓干123456  阅读(908)  评论(0)  编辑  收藏  举报
努力加载评论中...
点击右上角即可分享
微信分享提示