


  • 扩展缩写词。
  • 通过词形还原实现文本处理规范化。
  • 去除特殊字符与符号。
  • 去停用词。


contraction.py 折叠源码
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 01 01:11:02 2016

@author: DIP
"""

# Lookup table from an English contraction to its expanded form.
# Keys are matched against raw text by expand_contractions(); both the
# capitalised and the lowercase spelling of "I ..." contractions are listed.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    # "can't" restored: the table had "can't've" but not the base form.
    "can't": "cannot",
    "can't've": "cannot have",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    # Was "he he will have" (duplicated word).
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}


from contractions import CONTRACTION_MAP
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
# English stopword list from NLTK's stopwords corpus; consulted by
# remove_stopwords() to filter tokens.
stopword_list = nltk.corpus.stopwords.words('english')
# Single shared WordNetLemmatizer instance; used by lemmatize_text().
wnl = WordNetLemmatizer()

上面的代码载入了英文停用词表和 CONTRACTION_MAP 缩写映射,并创建了一个 WordNetLemmatizer 实例用于词形还原。首先,定义一个用于文本切分的函数,它将被其他规范化函数使用。下面的函数实现词语切分,并去除切分后各个标识(token)两端的多余空格。

def tokenize_text(text):
    """Split *text* into word tokens and trim whitespace around each one.

    Tokenization is delegated to ``nltk.word_tokenize``; the result is a
    list of stripped token strings.
    """
    return [raw_token.strip() for raw_token in nltk.word_tokenize(text)]


def expand_contractions(text, contraction_mapping):
    """Replace every contraction found in *text* with its expanded form.

    Args:
        text: The string to process.
        contraction_mapping: Dict mapping a contraction (e.g. ``"don't"``)
            to its expansion (e.g. ``"do not"``).

    Returns:
        The text with known contractions expanded (matched
        case-insensitively, keeping the case of the first letter of each
        match) and every remaining apostrophe removed.
    """
    # Longest keys first: regex alternation picks the first alternative that
    # matches, so "couldn't" must not shadow "couldn't've". Empty keys are
    # dropped because they would match at every position.
    keys = sorted((key for key in contraction_mapping if key),
                  key=len, reverse=True)
    if keys:
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)
        # Case-insensitive fallback so e.g. "i'm" still resolves when only
        # "I'm" is present in the mapping.
        lowercase_mapping = {key.lower(): value
                             for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or lowercase_mapping.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched.
                return match
            first_char = match[0]
            # Carry over only the *case* of the first letter; copying the
            # letter itself would turn "ain't" -> "is not" into "as not".
            if first_char.isupper():
                return expanded_contraction[0].upper() + expanded_contraction[1:]
            if first_char.islower():
                return expanded_contraction[0].lower() + expanded_contraction[1:]
            return expanded_contraction

        text = contractions_pattern.sub(expand_match, text)
    # Strip any apostrophes that were not part of a known contraction.
    expanded_text = re.sub("'", "", text)
    return expanded_text


from pattern.en import tag
from nltk.corpus import wordnet as wn
# Annotate text tokens with POS tags
def pos_tag_text(text):
    """Tag each word of *text* and map the tags to WordNet POS constants.

    The text is tagged with ``pattern.en.tag``; each resulting
    ``(word, pos_tag)`` pair is turned into ``(word.lower(), wn_tag)``
    where ``wn_tag`` is ``wn.ADJ``, ``wn.VERB``, ``wn.NOUN``, ``wn.ADV``
    or ``None`` when the tag's first letter is none of J/V/N/R.

    Returns:
        A list of ``(lowercased_word, wordnet_tag_or_None)`` tuples.
    """
    def penn_to_wn_tags(pos_tag):
        # Only the first letter of the tag decides the WordNet category.
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            # No WordNet equivalent; callers keep the word unchanged.
            return None
    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]
    return tagged_lower_text
# lemmatize text based on POS tags
def lemmatize_text(text):
    """Return *text* with each word replaced by its lemma.

    Words are first tagged via ``pos_tag_text``. A word with a WordNet tag
    is lemmatized with that tag; a word without one is kept as is. The
    resulting words are joined with single spaces.
    """
    lemmas = []
    for token, wn_tag in pos_tag_text(text):
        if wn_tag:
            lemmas.append(wnl.lemmatize(token, wn_tag))
        else:
            lemmas.append(token)
    return ' '.join(lemmas)

上面的代码片段描述了两个词形还原函数。主函数是 lemmatize_text,该函数接受文本数据,基于每个词的词性标签还原词形,然后给用户返回词形还原处理后的文本。为实现这个功能,需要标注每个文本符号的词性标签。这里使用 pattern 函数库中的 tag 函数对每个符号标注词性标签。因为 WordNetLemmatizer 基于 WordNet 的词性标签格式,所以要将每个单词符号转换为小写,纠正拼写,把词性标签转换为 WordNet 词性标签,并返回这些标注好的单词符号,最后将这些符号送入 lemmatize_text 函数。


def remove_special_characters(text):
    """Strip all ``string.punctuation`` characters from *text*.

    The text is tokenized, punctuation is deleted from every token, tokens
    that end up empty are discarded, and the rest are joined with single
    spaces.
    """
    words = tokenize_text(text)
    punctuation_re = re.compile('[{}]'.format(re.escape(string.punctuation)))
    cleaned_words = (punctuation_re.sub('', word) for word in words)
    return ' '.join(word for word in cleaned_words if word)

通过文本切分去除了一些特殊字符,因此可以去除一些实际上是缩写的标识,但无法在第一步中去除 “s” “re” 等。将在去除停用词时去除它们。然而,也可以不通过文本切分来去除这些特殊字符。通过正则表达式匹配来去除 string.punctuation 中定义的特殊字符。下面的函数有助于去除文本数据中的停用词。

def remove_stopwords(text):
    """Drop every token of *text* that appears in ``stopword_list``.

    The comparison is an exact string match against the module-level
    stopword list; surviving tokens are joined with single spaces.
    """
    kept = []
    for word in tokenize_text(text):
        if word in stopword_list:
            continue
        kept.append(word)
    return ' '.join(kept)


def normalize_corpus(corpus, tokenize=False):
    """Normalize every document in *corpus*.

    Each text goes through, in order: contraction expansion (using
    ``CONTRACTION_MAP``), lemmatization, special-character removal and
    stopword removal.

    Args:
        corpus: Iterable of text strings.
        tokenize: When true, each normalized text is additionally split
            into a list of tokens; otherwise it stays a single string.

    Returns:
        A list with one normalized entry per input document, in input
        order.
    """
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = lemmatize_text(text)
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
        # Collect the result; without this the function always returned [].
        normalized_corpus.append(text)
    return normalized_corpus


