西游记关键字提取和语句分词

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Scratch notes from earlier string/regex experiments, kept as an inert string
# literal (it is never executed). The literal is raw so that the `\!` inside
# the regex below is not treated as an invalid escape sequence of the
# enclosing string.
r"""
str1 = 'as,gh,rt,ujrk'
str2 = ','
str1 = str1[str1.find(str2)+1:]
print(str1)
s='as,gh,rt,ujrk'
print(s.split(','))
import re
pattern = re.compile(r'hello.*\!')
match = pattern.match('hello,aklhgslhgfhg!gfdh')
if match:
    print(match.group())
"""
import jieba
import time
import sys
import jieba.analyse as analyse

# Print the top 20 keywords of the novel using jieba.analyse.extract_tags
# (jieba's TF-IDF based extractor, per the jieba documentation).
# The text file is decoded as GB18030; the `with` block guarantees the file
# handle is closed as soon as the content has been read.
with open(u'西游记.txt', encoding='gb18030') as novel_file:
    lines = novel_file.read()

# withWeight=False -> only the keywords are returned (no weights);
# allowPOS=() -> no part-of-speech filtering is applied.
keywords = analyse.extract_tags(lines, topK=20, withWeight=False, allowPOS=())
print(' '.join(keywords))
"""
jieba.enable_parallel(4)  #并行模式只支持POSIX系统
content = open(u'西游记.txt',"r").read()
t1 = time.time()
words = "/".join(jieba.cut(content))
t2 = time.time()
tm_cost = t2-t1
print('并行速度为:%s bytes/second'% (len(content)/tm_cost))
"""
# Measure segmentation throughput with jieba's parallel mode switched off.
jieba.disable_parallel()

# Undecodable byte sequences are dropped (errors='ignore') rather than
# aborting the run; the `with` block closes the file handle after reading.
with open(u'西游记.txt', "r", encoding='gb18030', errors='ignore') as novel_file:
    content = novel_file.read()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
words = "/".join(jieba.cut(content))  # joining forces the lazy cut to run fully
t2 = time.perf_counter()
tm_cost = t2 - t1

# NOTE(review): `content` is a decoded str, so len(content) counts characters,
# not bytes, although the message says "bytes/second". The message text is
# left unchanged to preserve the script's output format.
print('非并行速度为:%s bytes/second'% (len(content)/tm_cost))

# Segment a sample sentence with cut_all=False.
# The result is named `seg_list` so the builtin `list` is not shadowed.
seg_list = jieba.cut('我在学习自然语言处理',cut_all=False)
# Prints the object returned by jieba.cut itself (not the words); the words
# only appear once the result is iterated, as in the join below.
print(seg_list)
print('/'.join(seg_list))

# Segment the same sentence before and after calling suggest_freq with
# ('中', '将') and tune=True, with HMM new-word discovery disabled, so the two
# printed segmentations can be compared.
print("/".join(jieba.cut('如果放到旧字典中将出错',HMM=False)))
jieba.suggest_freq(('中','将'),True)
print('/'.join(jieba.cut('如果放到旧字典中将出错',HMM=False)))

# Print the top 20 keywords of the novel using jieba.analyse.textrank,
# restricted to the part-of-speech tags 'ns', 'n', 'v' and 'vn'.
# The `with` block closes the file handle as soon as the text has been read.
with open('西游记.txt', encoding='gb18030') as novel_file:
    line = novel_file.read()

ranked_keywords = analyse.textrank(line, topK=20, withWeight=False,
                                   allowPOS=('ns', 'n', 'v', 'vn'))
print(" ".join(ranked_keywords))

  

posted on 2018-11-27 19:31  李凤五  阅读(331)  评论(0)  编辑  收藏  举报

导航