Natural Language Processing
1. Processing Frameworks
LTP: Chinese word segmentation, part-of-speech tagging, unknown (out-of-vocabulary) word recognition, syntactic parsing, semantic role labeling
Stanford NLP: Chinese word segmentation, part-of-speech tagging, unknown word recognition, syntactic parsing
FudanNLP: Chinese word segmentation, syntactic parsing
HanLP: Chinese word segmentation, syntactic parsing, and a wide range of other algorithms
ICTCLAS: a landmark Chinese word segmentation system
Ansj: a medium-scale Chinese word segmentation system
jieba: a lightweight segmenter, well suited to small-scale Chinese segmentation
2. Word Segmentation
(1). LTP segmentation
from pyltp import Segmentor

# Load the LTP segmentation model ("model path" is a placeholder
# for the actual model file on disk).
seg = Segmentor()
seg.load("model path")

words = seg.segment('大家好我是一个例子')
print('|'.join(words))

seg.release()  # free the model
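If the model over-splits domain terms, pyltp's Segmentor can also load a plain-text user lexicon (one word per line) alongside the model. A minimal sketch, assuming a pyltp version that provides load_with_lexicon; both paths are placeholders:

from pyltp import Segmentor

seg = Segmentor()
# Load the segmentation model together with a user lexicon;
# words in the lexicon are kept intact during segmentation.
seg.load_with_lexicon('model path', 'lexicon path')
words = seg.segment('大家好我是一个例子')
print('|'.join(words))
seg.release()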
(2). jieba segmentation
import jieba

# Full mode: emit every word the dictionary can form from the text
wordlist = jieba.cut("大家好我是一个例子", cut_all=True)
print("|".join(wordlist))

# Precise mode (the default): the most likely segmentation
wordlist = jieba.cut("大家好我是一个例子")
print("|".join(wordlist))

# Search-engine mode: re-splits long words to improve recall
wordlist = jieba.cut_for_search("大家好我是一个例子")
print("|".join(wordlist))
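Note that jieba.cut returns a generator, which is why each snippet joins it immediately; jieba.lcut returns a plain list instead. jieba can also be taught new words at runtime via its documented add_word API. A short sketch (the added word here is only an illustration):

import jieba

# lcut returns a list rather than a generator
print(jieba.lcut("大家好我是一个例子"))

# Register a custom word so the segmenter keeps it intact;
# '一个例子' is just an illustrative entry.
jieba.add_word('一个例子')
print(jieba.lcut("大家好我是一个例子"))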
3. Part-of-Speech Tagging
from pyltp import Postagger

words = ['我', '是', '一个', '人']

# Load the LTP POS-tagging model ('model path' is a placeholder).
pst = Postagger()
pst.load('model path')

tags = pst.postag(words)
for word, tag in zip(words, tags):
    print(word + '-' + tag)

pst.release()
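In practice the tagger is usually fed by the segmenter rather than a hand-written word list; the printed tags come from LTP's 863 tagset (n for nouns, v for verbs, and so on). A minimal sketch chaining the two pyltp components, converting the segmenter output to a list first since postag expects a sequence of strings (paths are placeholders):

from pyltp import Segmentor, Postagger

seg = Segmentor()
seg.load('model path')
pst = Postagger()
pst.load('model path')

# Segment first, then tag the resulting words.
words = list(seg.segment('大家好我是一个例子'))
tags = pst.postag(words)
print('|'.join(w + '-' + t for w, t in zip(words, tags)))

pst.release()
seg.release()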
4. Named Entity Recognition
from pyltp import NamedEntityRecognizer

# Load the LTP NER model; 'words' and 'tags' come from the
# segmentation and POS-tagging steps above.
ner = NamedEntityRecognizer()
ner.load('model path')

ntags = ner.recognize(words, tags)
print('\t'.join(ntags))

ner.release()
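The recognizer emits one tag per word in a position-prefixed scheme: O for non-entities, S- for a single-word entity, and B-/I-/E- marking the beginning, inside, and end of a multi-word entity, with the entity type (Nh person, Ni organization, Ns place) after the dash. A small helper to collapse those tags back into entity spans; this is a sketch written for illustration, not part of pyltp:

def collect_entities(words, ntags):
    """Group B-/I-/E-/S- tags back into (entity_text, type) pairs."""
    entities, buf = [], []
    for word, tag in zip(words, ntags):
        if tag == 'O':
            continue
        pos, etype = tag.split('-')
        if pos == 'S':
            entities.append((word, etype))
        elif pos == 'B':
            buf = [word]
        elif pos == 'I':
            buf.append(word)
        elif pos == 'E':
            buf.append(word)
            entities.append((''.join(buf), etype))
    return entities

print(collect_entities(words, ntags))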
5. Dependency Parsing
from pyltp import Parser
from nltk.parse import DependencyGraph

# Load the LTP dependency parser; 'words' and 'tags' come from
# the earlier steps.
parser = Parser()
parser.load('model path')
arcs = parser.parse(words, tags)

# Build a CoNLL-style string (word, tag, head index, relation)
# that nltk's DependencyGraph can read. A head of 0 marks the root.
conll = ''
for i in range(len(arcs)):
    relation = 'ROOT' if arcs[i].head == 0 else arcs[i].relation
    conll += words[i] + '\t' + tags[i] + '\t' + str(arcs[i].head) + '\t' + relation + '\n'
print(conll)

tree = DependencyGraph(conll)
tree.tree().draw()

parser.release()
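The nltk detour is only needed for drawing; the arcs are easy to inspect directly. Each arc's head index is 1-based, with 0 meaning the word attaches to the virtual root, so head n points at words[n - 1]. A short sketch:

# Print each word with its dependency relation and head word.
for word, arc in zip(words, arcs):
    head = 'ROOT' if arc.head == 0 else words[arc.head - 1]
    print(word + ' <-' + arc.relation + '- ' + head)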