nltk (Part 4)
1.nltk.lm language models
All language models share a unified interface, defined by the abstract base class LanguageModel in nltk.lm.api:
# Excerpt from nltk/lm/api.py. The module-level helpers used below
# (log_base2, _mean, _random_generator, _weighted_choice) are defined
# or imported alongside this class in the NLTK source.
from abc import abstractmethod

from nltk.lm.counter import NgramCounter
from nltk.lm.vocabulary import Vocabulary


class LanguageModel(object):
    """ABC for Language Models.

    Cannot be directly instantiated itself.
    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text
            are turned into ngram sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text
            are padded.
        :type pad_fn: function or None
        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.
        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to create it from."
                )
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the
        `unmasked_score` method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word), self.vocab.lookup(context) if context else None
        )

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
            If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float
        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.
        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.

        :type context: tuple(str) or None
        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float
        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments
        are the same.
        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`.
            If provided, makes the random sampling part of generation
            reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'
        """
        text_seed = [] if text_seed is None else list(text_seed)
        random_generator = _random_generator(random_seed)
        # Base recursion case: sample a single word.
        if num_words == 1:
            context = (
                text_seed[-self.order + 1 :]
                if len(text_seed) >= self.order
                else text_seed
            )
            samples = self.context_counts(self.vocab.lookup(context))
            # Back off to shorter contexts until some continuation is found.
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # Sorting achieves two things:
            # - reproducible randomness when sampling
            # - turning Mapping into Sequence, which _weighted_choice expects
            samples = sorted(samples)
            return _weighted_choice(
                samples,
                tuple(self.score(w, context) for w in samples),
                random_generator,
            )
        # Build up text one word at a time.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
Two concrete subclasses are MLE (maximum likelihood estimation) and Laplace (add-one smoothing).
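As a quick illustration, here is a minimal sketch, following the examples in the nltk.lm documentation, that trains a bigram MLE model with the padded_everygram_pipeline helper and then swaps in Laplace (the toy corpus and the random seed are made up for illustration):

from nltk.lm import MLE, Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline

sents = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

# Pad each sentence with <s>/</s> markers and build everygrams,
# plus the flattened word stream used to create the vocabulary.
train_data, padded_vocab = padded_everygram_pipeline(2, sents)

lm = MLE(2)  # order-2 (bigram) maximum likelihood model
lm.fit(train_data, padded_vocab)

lm.score('a')                   # count('a') / total padded tokens = 2/13
lm.logscore('a')                # base-2 log of the score above
lm.generate(4, random_seed=42)  # sample 4 words from the model

# Laplace is a drop-in replacement that applies add-one smoothing.
# The pipeline returns generators, so rebuild it before refitting.
train_data, padded_vocab = padded_everygram_pipeline(2, sents)
smoothed = Laplace(2)
smoothed.fit(train_data, padded_vocab)
smoothed.score('a')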
NgramCounter is used to count ngrams. For example:
from nltk.util import ngrams

text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
text_bigrams = [ngrams(sent, 2) for sent in text]
text_unigrams = [ngrams(sent, 1) for sent in text]
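Passing these ngram streams to NgramCounter then gives per-order count lookups; a short sketch following the examples in the NgramCounter docstring:

from nltk.lm import NgramCounter

ngram_counts = NgramCounter(text_bigrams + text_unigrams)
ngram_counts['a']             # unigram count of 'a' -> 2
ngram_counts[['a']]           # FreqDist of words seen after the context ('a',)
ngram_counts[2][('a',)]['b']  # count of the bigram ('a', 'b') -> 1
ngram_counts.N()              # total number of ngrams stored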
Vocabulary stores the vocabulary and supports word lookup; words that occur fewer than unk_cutoff times are treated as out-of-vocabulary. For example:
from nltk.lm import Vocabulary

words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
vocab = Vocabulary(words, unk_cutoff=2)
'b' in vocab  # False: 'b' occurs only once, below the cutoff
'c' in vocab  # True: 'c' occurs three times
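Beyond membership tests, lookup maps in-vocabulary words to themselves and everything else to the OOV label; continuing with the vocab built above:

vocab.lookup('c')         # 'c' -- count 3 meets the cutoff
vocab.lookup('b')         # '<UNK>' -- count 1 falls below the cutoff
vocab.lookup(['a', 'b'])  # ('a', '<UNK>') -- sequences are mapped elementwise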
2.nltk.stem
A module for reducing words to their base (stem) form.
The unified interface:
from abc import ABCMeta, abstractmethod

from six import add_metaclass  # py2/py3 compatibility shim used by NLTK at the time


@add_metaclass(ABCMeta)
class StemmerI(object):
    """
    A processing interface for removing morphological affixes from words.
    This process is known as stemming.
    """

    @abstractmethod
    def stem(self, token):
        """
        Strip affixes from the token and return the stem.

        :param token: The token that should be stemmed.
        :type token: str
        """
It contains subclasses such as ARLSTem, RegexpStemmer, and Cistem.
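For instance, RegexpStemmer implements stem() by stripping any suffix that matches a user-supplied regular expression; this sketch follows the example in the nltk.stem documentation:

from nltk.stem import RegexpStemmer

# Strip common English suffixes; min=4 leaves shorter words untouched.
st = RegexpStemmer('ing$|s$|e$|able$', min=4)
st.stem('cars')  # 'car'
st.stem('mass')  # 'mas'
st.stem('was')   # 'was' (too short, returned unchanged)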
3.nltk.metrics
A module of distance measures.
It includes jaccard_distance, interval_distance, and masi_distance.
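The set-based metrics compare label sets, while interval_distance works on numeric labels; a small sketch with made-up sets:

from nltk.metrics.distance import (
    interval_distance,
    jaccard_distance,
    masi_distance,
)

a = {'a', 'b', 'c', 'd'}
b = {'a', 'c', 'd', 'e'}

jaccard_distance(a, b)    # |symmetric difference| / |union| = 2/5 = 0.4
masi_distance(a, b)       # Jaccard-style distance weighted by how the sets overlap
interval_distance(1, 10)  # squared difference of numeric labels: 81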