# python tokenizer

# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
import codecs
import sys
import os

def analyze(path):
    """Tokenize the UTF-8 text file at *path* with janome and write the
    set of unique base-form words to ``path + "x"``, one per line (CRLF).

    Tokens whose janome description contains "記号" (symbol/punctuation)
    or "人名" (person name) are excluded.

    :param path: path of the UTF-8 text file to analyze.
    """
    # Read the whole input up front, then tokenize line by line.
    with codecs.open(path, encoding="UTF-8") as fi:
        lines = fi.readlines()

    tokenizer = Tokenizer()
    word_set = set()
    for line in lines:
        for token in tokenizer.tokenize(line):
            token_str = str(token)
            # Skip symbols ("記号") and person names ("人名").
            if "記号" not in token_str and "人名" not in token_str:
                # Field 6 of janome's comma-separated token string is the
                # base (dictionary) form of the word.
                word_set.add(token_str.split(",")[6])

    # BUG FIX: the output file was previously opened without an encoding,
    # which fails when writing non-ASCII words under a non-UTF-8 default
    # codec, and it leaked on exceptions. Use a context manager + UTF-8.
    with codecs.open(path + "x", "w", encoding="UTF-8") as fo:
        for word in word_set:
            fo.write(word + "\r\n")

# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    # NOTE(review): hard-coded, machine-specific path — consider taking the
    # input file from sys.argv instead.
    analyze("C:\\Users\\70485528\\mymail.txt")

# posted @ 2017-03-23 13:51  天生弱智难自弃  阅读(1617)  评论(0编辑  收藏  举报