NLTK (Part 3)
1.tokenize
Mainly used for splitting text into tokens (words).
The main classes include MWETokenizer (multi-word expression tokenizer), RegexpTokenizer (regular-expression tokenizer), SpaceTokenizer (splits on spaces), TabTokenizer (splits on tabs), StanfordSegmenter (the Stanford segmenter), TreebankWordTokenizer, and TreebankWordDetokenizer; a short sketch of the first two follows the examples below.
from nltk.tokenize.util import string_span_tokenize
s = '''Good muffins cost $3.88\nin New York.  Please buy me two of them.\n\nThanks.'''
list(string_span_tokenize(s, " "))
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
...     (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
...     (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
...     (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
>>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
...     'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
...     'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True
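The list above also mentions MWETokenizer and RegexpTokenizer; a minimal usage sketch (the sample sentences here are made up for illustration):

from nltk.tokenize import MWETokenizer, RegexpTokenizer

# MWETokenizer re-merges predefined multi-word expressions into single tokens.
mwe = MWETokenizer([('New', 'York')], separator='_')
print(mwe.tokenize('Good muffins cost $ 3.88 in New York .'.split()))

# RegexpTokenizer splits text with a regular expression; here, runs of word characters.
regexp = RegexpTokenizer(r'\w+')
print(regexp.tokenize('Good muffins cost $3.88 in New York.'))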
2.ccg
Used for CCG (combinatory categorial grammar) parsing with a user-defined lexicon of word categories.
The main classes are CCG (the category representations, with concrete behaviour provided by subclasses), CCGChartParser (the sentence parser), and CCGLexicon (the lexicon of word categories).
from nltk.ccg.chart import CCGChartParser, DefaultRuleSet, printCCGDerivation
from nltk.ccg.lexicon import fromstring

lex = fromstring(
    '''
    :- S, NP, N, VP    # Primitive categories, S is the target primitive

    Det :: NP/N         # Family of words
    Pro :: NP
    TV :: VP/NP
    Modal :: (S\\NP)/VP # Backslashes need to be escaped

    I => Pro            # Word -> Category mapping
    you => Pro

    the => Det

    # Variables have the special keyword 'var'
    # '.' prevents permutation
    # ',' prevents composition
    and => var\\.,var/.,var

    which => (N\\N)/(S/NP)

    will => Modal       # Categories can be either explicit, or families.
    might => Modal

    cook => TV
    eat => TV

    mushrooms => N
    parsnips => N
    bacon => N
    '''
)


def demo():
    parser = CCGChartParser(lex, DefaultRuleSet)
    for parse in parser.parse("I might cook and eat the bacon".split()):
        printCCGDerivation(parse)


if __name__ == '__main__':
    demo()
3.cluster (clustering) and classify (classification)
The implemented clustering algorithms are kmeans, gaac (group-average agglomerative clustering), and em.
The classification algorithms include maxent, naivebayes, DecisionTree, and others.
Classification and clustering each have a unified interface (a short usage sketch follows the interface code below):
from abc import ABCMeta, abstractmethod
from six import add_metaclass
from nltk.internals import overridden
from nltk.probability import DictionaryProbDist


class ClassifierI(object):
    """
    A processing interface for labeling tokens with a single category
    label (or "class").  Labels are typically strs or ints, but can be
    any immutable type.  The set of labels that the classifier chooses
    from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate label for the given featureset.
        :rtype: label
        """
        if overridden(self.classify_many):
            return self.classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over labels for the given
            featureset.
        :rtype: ProbDistI
        """
        if overridden(self.prob_classify_many):
            return self.prob_classify_many([featureset])[0]
        else:
            raise NotImplementedError()

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``.  I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(label)
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``.  I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]


@add_metaclass(ABCMeta)
class ClusterI(object):
    """
    Interface covering basic clustering functionality.
    """

    @abstractmethod
    def cluster(self, vectors, assign_clusters=False):
        """
        Assigns the vectors to clusters, learning the clustering parameters
        from the data. Returns a cluster identifier for each vector.
        """

    @abstractmethod
    def classify(self, token):
        """
        Classifies the token into a cluster, setting the token's CLUSTER
        parameter to that cluster identifier.
        """

    def likelihood(self, vector, label):
        """
        Returns the likelihood (a float) of the token having the
        corresponding cluster.
        """
        if self.classify(vector) == label:
            return 1.0
        else:
            return 0.0

    def classification_probdist(self, vector):
        """
        Classifies the token into a cluster, returning
        a probability distribution over the cluster identifiers.
        """
        likelihoods = {}
        sum = 0.0
        for cluster in self.cluster_names():
            likelihoods[cluster] = self.likelihood(vector, cluster)
            sum += likelihoods[cluster]
        for cluster in self.cluster_names():
            likelihoods[cluster] /= sum
        return DictionaryProbDist(likelihoods)

    @abstractmethod
    def num_clusters(self):
        """
        Returns the number of clusters.
        """

    def cluster_names(self):
        """
        Returns the names of the clusters.
        :rtype: list
        """
        return list(range(self.num_clusters()))

    def cluster_name(self, index):
        """
        Returns the names of the cluster at index.
        """
        return index
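A small usage sketch of these interfaces with NaiveBayesClassifier and KMeansClusterer (the toy feature sets and vectors below are made up for illustration):

import numpy
from nltk.classify import NaiveBayesClassifier
from nltk.cluster import KMeansClusterer, euclidean_distance

# ClassifierI: train a naive Bayes classifier on (featureset, label) pairs.
train = [({'last_letter': 'a'}, 'female'), ({'last_letter': 'k'}, 'male'),
         ({'last_letter': 'e'}, 'female'), ({'last_letter': 'o'}, 'male')]
classifier = NaiveBayesClassifier.train(train)
print(classifier.classify({'last_letter': 'a'}))

# ClusterI: cluster vectors with k-means, then classify a new vector.
vectors = [numpy.array(v) for v in [[0, 0], [0, 1], [3, 3], [3, 4]]]
clusterer = KMeansClusterer(2, euclidean_distance)
print(clusterer.cluster(vectors, assign_clusters=True))
print(clusterer.classify(numpy.array([3, 3])))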
4.corpus
Every corpus reader exposes the same set of access methods:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
from nltk.corpus import brown
print(", ".join(brown.words()))
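A quick sketch of a few of the other accessors on the same corpus (assuming the Brown corpus has been downloaded with nltk.download('brown')):

from nltk.corpus import brown

print(brown.sents()[0])           # list of (list of str)
print(brown.tagged_words()[:5])   # list of (str, str) tuples
print(brown.tagged_sents()[0])    # list of (list of (str, str))
print(brown.raw()[:100])          # unprocessed corpus contents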
5.draw
Drawing module, including cfg (context-free grammar editor/demos), tree (tree drawing), and dispersion (dispersion plots); a shorter sketch follows the demo below.
from nltk.tree import Tree
from nltk.draw.util import BoxWidget, CanvasFrame, OvalWidget, ParenWidget, TextWidget
from nltk.draw.tree import TreeWidget, tree_to_treesegment


def demo():
    import random

    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0, 999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    t = Tree.fromstring(
        '''
        (S (NP the very big cat)
           (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))'''
    )

    tc = TreeWidget(
        cf.canvas(),
        t,
        draggable=1,
        node_font=('helvetica', -14, 'bold'),
        leaf_font=('helvetica', -12, 'italic'),
        roof_fill='white',
        roof_color='black',
        leaf_color='green4',
        node_color='blue2',
    )
    cf.add_widget(tc, 10, 10)

    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill='green')

    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text), fill='cyan')

    treetok = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    def color(node):
        node['color'] = '#%04d00' % random.randint(0, 9999)

    def color2(treeseg):
        treeseg.label()['fill'] = '#%06d' % random.randint(0, 9999)
        treeseg.label().child()['color'] = 'white'

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2] + 10, 10)

    tree3 = Tree.fromstring(
        '''
        (S (NP this tree) (AUX was)
           (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))'''
    )
    tc3 = tree_to_treesegment(
        cf.canvas(), tree3, tree_color='green4', tree_xspace=2, tree_width=2
    )
    tc3['draggable'] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3] + 10)

    def orientswitch(treewidget):
        if treewidget['orientation'] == 'horizontal':
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
            treewidget['orientation'] = 'vertical'
        else:
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
            treewidget['orientation'] = 'horizontal'

    text = """
Try clicking, right clicking, and dragging
different elements of each of the trees.
The top-left tree is a TreeWidget built from
a Tree.  The top-right is a TreeWidget built
from a Tree, using non-default widget
constructors for the nodes & leaves (BoxWidget
and OvalWidget).  The bottom-left tree is
built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)

    tree4 = Tree.fromstring('(S (NP this tree) (VP (V is) (Adj horizontal)))')
    tc4 = TreeWidget(
        cf.canvas(),
        tree4,
        draggable=1,
        line_color='brown2',
        roof_color='brown2',
        node_font=('helvetica', -12, 'bold'),
        node_color='brown4',
        orientation='horizontal',
    )
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()


if __name__ == '__main__':
    demo()
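For a quicker look without the full widget demo, a minimal sketch using Tree.draw() and dispersion_plot (dispersion_plot needs matplotlib; the word list here is an arbitrary choice):

from nltk.tree import Tree
from nltk.corpus import brown
from nltk.draw.dispersion import dispersion_plot

# Draw a single parse tree in a Tk window.
Tree.fromstring('(S (NP the cat) (VP (V sat) (PP (P on) (NP the mat))))').draw()

# Plot where selected words occur across the Brown corpus.
dispersion_plot(brown.words(), ['the', 'government', 'money'])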
6.chunk
Chunking: used to extract chunks (non-overlapping phrase groups, such as base noun phrases) from a sentence.
The unified interface:
from nltk.parse import ParserI
from nltk.chunk.util import ChunkScore


class ChunkParserI(ParserI):
    """
    A processing interface for identifying non-overlapping groups in
    unrestricted text.  Typically, chunk parsers are used to find base
    syntactic constituents, such as base noun phrases.  Unlike
    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
    will always generate a parse.
    """

    def parse(self, tokens):
        """
        Return the best chunk structure for the given tokens
        and return a tree.

        :param tokens: The list of (word, tag) tokens to be chunked.
        :type tokens: list(tuple)
        :rtype: Tree
        """
        raise NotImplementedError()

    def evaluate(self, gold):
        """
        Score the accuracy of the chunker against the gold standard.
        Remove the chunking from the gold standard text, rechunk it using
        the chunker, and return a ``ChunkScore`` object reflecting the
        performance of this chunk parser.

        :type gold: list(Tree)
        :param gold: The list of chunked sentences to score the chunker on.
        :rtype: ChunkScore
        """
        chunkscore = ChunkScore()
        for correct in gold:
            chunkscore.score(correct, self.parse(correct.leaves()))
        return chunkscore
There are two implementing submodules, ne and regexp: the first does (named-entity) chunking with a model trained on a corpus, and the second defines chunks with regular-expression grammars.
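A minimal sketch of the named-entity (ne) chunker; the sentence is made up, and the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words resources are assumed to be downloaded:

import nltk

sentence = "Mark works at Google in New York."
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)
tree = nltk.ne_chunk(tagged)   # named-entity chunks appear as subtrees, e.g. (PERSON Mark/NNP)
print(tree)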
def demo_eval(chunkparser, text):
    """
    Demonstration code for evaluating a chunk parser, using a
    ``ChunkScore``.  This function assumes that ``text`` contains one
    sentence per line, and that each sentence has the form expected by
    ``tree.chunk``.  It runs the given chunk parser on each sentence in
    the text, and scores the result.  It prints the final score
    (precision, recall, and f-measure); and reports the set of chunks
    that were missed and the set of chunks that were incorrect.  (At
    most 10 missing chunks and 10 incorrect chunks are reported).

    :param chunkparser: The chunkparser to be tested
    :type chunkparser: ChunkParserI
    :param text: The chunked tagged text that should be used for
        evaluation.
    :type text: str
    """
    from nltk import chunk
    from nltk.tree import Tree

    # Evaluate our chunk parser.
    chunkscore = chunk.ChunkScore()

    for sentence in text.split('\n'):
        print(sentence)
        sentence = sentence.strip()
        if not sentence:
            continue
        gold = chunk.tagstr2tree(sentence)
        tokens = gold.leaves()
        test = chunkparser.parse(Tree('S', tokens), trace=1)
        chunkscore.score(gold, test)
        print()

    print('/' + ('=' * 75) + '\\')
    print('Scoring', chunkparser)
    print(('-' * 77))
    print('Precision: %5.1f%%' % (chunkscore.precision() * 100), ' ' * 4, end=' ')
    print('Recall: %5.1f%%' % (chunkscore.recall() * 100), ' ' * 6, end=' ')
    print('F-Measure: %5.1f%%' % (chunkscore.f_measure() * 100))

    # Missed chunks.
    if chunkscore.missed():
        print('Missed:')
        missed = chunkscore.missed()
        for chunk in missed[:10]:
            print('  ', ' '.join(map(str, chunk)))
        if len(chunkscore.missed()) > 10:
            print('  ...')

    # Incorrect chunks.
    if chunkscore.incorrect():
        print('Incorrect:')
        incorrect = chunkscore.incorrect()
        for chunk in incorrect[:10]:
            print('  ', ' '.join(map(str, chunk)))
        if len(chunkscore.incorrect()) > 10:
            print('  ...')

    print('\\' + ('=' * 75) + '/')
    print()


def demo():
    """
    A demonstration for the ``RegexpChunkParser`` class.  A single text is
    parsed with four different chunk parsers, using a variety of rules
    and strategies.
    """
    from nltk import chunk, Tree

    text = """\
    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
    """

    print('*' * 75)
    print('Evaluation text:')
    print(text)
    print('*' * 75)
    print()

    grammar = r"""
    NP:                   # NP stage
      {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
      {<NNP>+}            # chunk proper nouns
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    grammar = r"""
    NP: {<DT>?<JJ>*<NN>}  # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}     # VP = verb words
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    grammar = r"""
    NP: {<.*>*}           # start by chunking everything
        }<[\.VI].*>+{     # chink any verbs, prepositions or periods
        <.*>}{<DT>        # separate on determiners
    PP: {<IN><NP>}        # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}  # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    demo_eval(cp, text)

    # Evaluation
    from nltk.corpus import conll2000

    print()
    print("Demonstration of empty grammar:")
    cp = chunk.RegexpParser("")
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt', chunk_types=('NP',))))

    print()
    print("Demonstration of accuracy evaluation using CoNLL tags:")
    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))

    print()
    print("Demonstration of tagged token input")
    grammar = r"""
    NP: {<.*>*}           # start by chunking everything
        }<[\.VI].*>+{     # chink any verbs, prepositions or periods
        <.*>}{<DT>        # separate on determiners
    PP: {<IN><NP>}        # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}  # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    print(
        cp.parse(
            [
                ("the", "DT"),
                ("little", "JJ"),
                ("cat", "NN"),
                ("sat", "VBD"),
                ("on", "IN"),
                ("the", "DT"),
                ("mat", "NN"),
                (".", "."),
            ]
        )
    )


if __name__ == '__main__':
    demo()