jieba分词--python
# Build a small text corpus from a directory of UTF-8 files and split every
# document into words with jieba.
#
# Results (module-level names):
#   corpos   -- DataFrame with columns ['filePath', 'content'], one row per file
#   segments -- DataFrame with columns ['filePath', 'segment'], one row per word
# Both frames use 1-based integer row labels.
import os
import os.path
import codecs

import numpy
import pandas

import jieba

# Build the corpus: read every file found under the sample directory.
corpus_rows = []
for root, dirs, files in os.walk(
    "D:\\PDM\\2.1\\SogouC.mini\\Sample\\C000007\\"
):
    for name in files:
        # Plain concatenation is kept so the stored path strings do not change.
        filePath = root + '\\' + name
        # `with` guarantees the handle is closed even if read() raises.
        with codecs.open(filePath, 'r', 'utf-8') as f:
            content = f.read()
        corpus_rows.append([filePath, content.strip()])

# Build the frame once instead of growing it row by row (each .loc insert
# reallocates the frame, which is quadratic in the number of rows).
corpos = pandas.DataFrame(corpus_rows, columns=['filePath', 'content'])
corpos.index = pandas.RangeIndex(start=1, stop=len(corpos) + 1)

# Perform word segmentation. Each document is paired with its own path so
# that every segment is attributed to the file it came from, rather than to
# whatever path was left over from the last iteration of the loop above.
segment_rows = []
for filePath, content in zip(corpos['filePath'], corpos['content']):
    segment_rows.extend([filePath, seg] for seg in jieba.cut(content))

segments = pandas.DataFrame(segment_rows, columns=["filePath", 'segment'])
segments.index = pandas.RangeIndex(start=1, stop=len(segments) + 1)
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步