NLP (5): Part-of-Speech Tagging and Grammars

Original post: http://www.one2know.cn/nlp5/

  • NLTK's built-in POS tagger
    Use the nltk.pos_tag() function to do part-of-speech tagging
import nltk
nltk.download('averaged_perceptron_tagger')

simpleSentence = 'Bangalore is the capital of Karnataka.'

# Tokenize the sentence
wordsInSentence = nltk.word_tokenize(simpleSentence)
print(wordsInSentence)

# POS tagging
partsOfSpeechTags = nltk.pos_tag(wordsInSentence)
print(partsOfSpeechTags)

Output:

['Bangalore', 'is', 'the', 'capital', 'of', 'Karnataka', '.']
[('Bangalore', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('Karnataka', 'NNP'), ('.', '.')]
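
The tags come from the Penn Treebank tagset. NLTK can explain what each tag means; a quick sketch, assuming the 'tagsets' resource has been downloaded:

import nltk
nltk.download('tagsets')

# Print the definition and examples for a Penn Treebank tag, e.g. NNP
nltk.help.upenn_tagset('NNP')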
  • Writing your own POS taggers
import nltk

# Default tagger: every token it does not know is tagged NN
def learnDefaultTagger(simpleSentence):
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    tagger = nltk.DefaultTagger('NN')
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

# Regular-expression tagger: assign tags by matching word patterns
def learnRETagger(simpleSentence):
    # A list of (pattern, tag) tuples; the r prefix marks raw regex strings
    customPatterns = [
        (r'.*ing$','ADJECTIVE'),
        (r'.*ly$','ADVERB'),
        (r'.*ion$','NOUN'),
        (r'(.*ate|.*en|is)$','VERB'),
        (r'^an$','INDEFINITE-ARTICLE'),
        (r'^(with|on|at)$','PREPOSITION'),
        (r'^[0-9]*$','NUMBER'),
        (r'.*$',None),
    ]
    tagger = nltk.RegexpTagger(customPatterns)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

# Lookup tagger: tag words from a hand-built word-to-tag dictionary
def learnLookupTagger(simpleSentence):
    mapping = {
        '.':'.','place':'NN','on':'IN','earth':'NN','Mysore':'NNP',
        'is':'VBZ','an':'DT','amazing':'JJ',
    }
    tagger = nltk.UnigramTagger(model=mapping)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

if __name__ == "__main__":
    testSentence = 'Mysore is an amazing place on earth. I have visited Mysore 10 times.'
    learnDefaultTagger(testSentence)
    learnRETagger(testSentence)
    learnLookupTagger(testSentence)

Output:

[('Mysore', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('earth', 'NN'), ('.', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Mysore', 'NN'), ('10', 'NN'), ('times', 'NN'), ('.', 'NN')]
[('Mysore', None), ('is', 'VERB'), ('an', 'INDEFINITE-ARTICLE'), ('amazing', 'ADJECTIVE'), ('place', None), ('on', 'PREPOSITION'), ('earth', None), ('.', None), ('I', None), ('have', None), ('visited', None), ('Mysore', None), ('10', 'NUMBER'), ('times', None), ('.', None)]
[('Mysore', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('earth', 'NN'), ('.', '.'), ('I', None), ('have', None), ('visited', None), ('Mysore', 'NNP'), ('10', None), ('times', None), ('.', '.')]
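
Taggers can be chained with the backoff parameter: whenever one tagger returns None, the next one in the chain gets a chance. A minimal sketch, reusing the lookup dictionary above, that falls back to a default NN tag so no token is left untagged:

import nltk

mapping = {
    '.':'.','place':'NN','on':'IN','earth':'NN','Mysore':'NNP',
    'is':'VBZ','an':'DT','amazing':'JJ',
}

# The lookup tagger answers first; unknown words fall through to NN
tagger = nltk.UnigramTagger(model=mapping, backoff=nltk.DefaultTagger('NN'))
print(tagger.tag(nltk.word_tokenize('Mysore is an amazing place on earth.')))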
  • Training your own POS tagger
import nltk
import pickle

# Training sentences
def sampleData():
    return [
        'Bangalore is the capital of Karnataka.',
        'Steve Jobs was the CEO of Apple.',
        'iPhone was Invented by Apple.',
        'Books can be purchased in Market.',
    ]

# Tokenize and POS-tag each sentence, collecting word -> tag pairs in a dictionary
def buildDictionary():
    dictionary = {}
    for sent in sampleData():
        partsOfSpeechTags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in partsOfSpeechTags:
            value = tag[0]
            pos = tag[1]
            dictionary[value] = pos
    return dictionary

def saveMyTagger(tagger,fileName):
    fileHandle = open(fileName,'wb')
    pickle.dump(tagger,fileHandle) # serialize the tagger in binary mode
    fileHandle.close()

# Build a UnigramTagger from the learned dictionary and save it
def saveMyTraining(fileName):
    tagger = nltk.UnigramTagger(model=buildDictionary())
    saveMyTagger(tagger,fileName)

# Load the saved tagger from disk
def loadMyTagger(fileName):
    return pickle.load(open(fileName,'rb'))

sentence = 'IPhone is purchased by Steve Jobs in Bangalore Market.'
fileName = 'myTagger.pickle'

saveMyTraining(fileName)

myTagger = loadMyTagger(fileName)

print(myTagger.tag(nltk.word_tokenize(sentence)))

Output:

[('IPhone', None), ('is', 'VBZ'), ('purchased', 'VBN'), ('by', 'IN'), ('Steve', 'NNP'), ('Jobs', 'NNP'), ('in', 'IN'), ('Bangalore', 'NNP'), ('Market', 'NNP'), ('.', '.')]
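
'IPhone' comes back as None because the training data only ever contained 'iPhone'; a unigram tagger is a pure lookup with no fallback. Instead of building the dictionary from pos_tag output, a UnigramTagger can also be trained directly on a tagged corpus. A short sketch, assuming the treebank sample has been downloaded:

import nltk
nltk.download('treebank')

# Train on the Penn Treebank sample bundled with NLTK,
# with a DefaultTagger backoff for out-of-vocabulary words
trainSents = nltk.corpus.treebank.tagged_sents()
tagger = nltk.UnigramTagger(train=trainSents, backoff=nltk.DefaultTagger('NN'))

print(tagger.tag(nltk.word_tokenize('Books can be purchased in Market.')))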
  • Writing your own grammar
    A context-free grammar (CFG) consists of:
    1. A start symbol/token
    2. A set of terminal symbols
    3. A set of nonterminal symbols
    4. Rules (productions) defined over the start symbol and nonterminals
    5. If the language is English, a-z are the symbols/tokens/letters
    6. If the language is digits, 0-9 are the symbols/tokens/letters
    Productions are written in Backus-Naur Form (BNF)
import nltk
import string
from nltk.parse.generate import generate

# Define a grammar whose start symbol is ROOT
productions = [
    'ROOT -> WORD',
    'WORD -> \' \'',
    'WORD -> NUMBER LETTER',
    'WORD -> LETTER NUMBER',
]

# Add the productions NUMBER -> '0' | '1' | '2' | '3'
digits = list(string.digits) # the digits '0'-'9' as strings
for digit in digits[:4]:
    productions.append('NUMBER -> \'{w}\''.format(w=digit))

# Add the production LETTER -> 'a' | 'b' | 'c' | 'd'
letters = "' | '".join(list(string.ascii_lowercase)[:4])
productions.append('LETTER -> \'{w}\''.format(w=letters))

# Store the grammar in grammarString, one rule per line
grammarString = '\n'.join(productions)

# Create the grammar object and inspect it
grammar = nltk.CFG.fromstring(grammarString)
print(grammar)

# Generate from the grammar: at most 5 results, at most 4 levels deep
for sentence in generate(grammar,n=5,depth=4):
    word = ''.join(sentence).replace(' ','')
    print('Generated Word: {} , Size : {}'.format(word,len(word)))

Output:

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    NUMBER -> '0'
    NUMBER -> '1'
    NUMBER -> '2'
    NUMBER -> '3'
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'
Generated Word:   , Size : 0
Generated Word: 0a , Size : 2
Generated Word: 0b , Size : 2
Generated Word: 0c , Size : 2
Generated Word: 0d , Size : 2
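
The same kind of grammar can also be used in the other direction: nltk.ChartParser recovers the derivation tree for a token sequence. A minimal sketch with a cut-down version of the grammar above:

import nltk

# Two digits and two letters are enough to parse the word '0a'
grammar = nltk.CFG.fromstring("""
ROOT -> WORD
WORD -> NUMBER LETTER
NUMBER -> '0' | '1'
LETTER -> 'a' | 'b'
""")

parser = nltk.ChartParser(grammar)
for tree in parser.parse(['0','a']):
    print(tree) # (ROOT (WORD (NUMBER 0) (LETTER a)))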
  • Probabilistic context-free grammar (PCFG)
    For each nonterminal on the left-hand side, the probabilities of its productions must sum to 1.

    Description          Content
    Start symbol         ROOT
    Nonterminal symbols  WORD, P1, P2, P3, P4
    Terminal symbols     'A','B','C','D','E','F','G','H'
import nltk
from nltk.parse.generate import generate

productions = [
    "ROOT -> WORD [1.0]",
    "WORD -> P1 [0.25]",
    "WORD -> P1 P2 [0.25]",
    "WORD -> P1 P2 P3 [0.25]",
    "WORD -> P1 P2 P3 P4 [0.25]",
    "P1 -> 'A' [1.0]",
    "P2 -> 'B' [0.5]",
    "P2 -> 'C' [0.5]",
    "P3 -> 'D' [0.3]",
    "P3 -> 'E' [0.3]",
    "P3 -> 'F' [0.4]",
    "P4 -> 'G' [0.9]",
    "P4 -> 'H' [0.1]",
]
grammarString = '\n'.join(productions)

# Create the grammar object
grammar = nltk.PCFG.fromstring(grammarString)
print(grammar)

for sentence in generate(grammar,n=5,depth=4):
    word = ''.join(sentence).replace(' ','')
    print('String : {} , Size : {}'.format(word,len(word)))

Output:

Grammar with 13 productions (start state = ROOT)
    ROOT -> WORD [1.0]
    WORD -> P1 [0.25]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 P3 [0.25]
    WORD -> P1 P2 P3 P4 [0.25]
    P1 -> 'A' [1.0]
    P2 -> 'B' [0.5]
    P2 -> 'C' [0.5]
    P3 -> 'D' [0.3]
    P3 -> 'E' [0.3]
    P3 -> 'F' [0.4]
    P4 -> 'G' [0.9]
    P4 -> 'H' [0.1]
String : A , Size : 1
String : AB , Size : 2
String : AC , Size : 2
String : ABD , Size : 3
String : ABE , Size : 3
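
A PCFG assigns each parse a probability: the product of the probabilities of the productions used. nltk.ViterbiParser returns the most probable parse together with that probability. A short sketch with a cut-down version of the grammar above; parsing 'AB' uses P2 -> 'B' [0.5], so the tree probability is 1.0 * 1.0 * 1.0 * 0.5 = 0.5:

import nltk

grammar = nltk.PCFG.fromstring("""
ROOT -> WORD [1.0]
WORD -> P1 P2 [1.0]
P1 -> 'A' [1.0]
P2 -> 'B' [0.5]
P2 -> 'C' [0.5]
""")

# The most probable parse, annotated with its probability
parser = nltk.ViterbiParser(grammar)
for tree in parser.parse(['A','B']):
    print(tree)        # (ROOT (WORD (P1 A) (P2 B))) (p=0.5)
    print(tree.prob()) # 0.5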
  • Writing a recursive context-free grammar
    As an example, generate palindromes recursively. A palindrome reads the same in both directions, e.g. 010010 over the alphabet {0, 1}.
# Generate even-length digit palindromes
import nltk
import string
from nltk.parse.generate import generate

productions = [
    'ROOT -> WORD',
    "WORD -> ' '",
]

alphabets = list(string.digits) # the ten digit characters

for alphabet in alphabets:
    productions.append("WORD -> '{w}' WORD '{w}'".format(w=alphabet))

grammarString = '\n'.join(productions)

grammar = nltk.CFG.fromstring(grammarString)
print(grammar)

for sentence in generate(grammar,n=5,depth=5):
    palindrome = ''.join(sentence).replace(' ','')
    print('Palindrome : {} , Size : {}'.format(palindrome,len(palindrome)))

Output:

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> '0' WORD '0'
    WORD -> '1' WORD '1'
    WORD -> '2' WORD '2'
    WORD -> '3' WORD '3'
    WORD -> '4' WORD '4'
    WORD -> '5' WORD '5'
    WORD -> '6' WORD '6'
    WORD -> '7' WORD '7'
    WORD -> '8' WORD '8'
    WORD -> '9' WORD '9'
Palindrome :  , Size : 0
Palindrome : 00 , Size : 2
Palindrome : 0000 , Size : 4
Palindrome : 0110 , Size : 4
Palindrome : 0220 , Size : 4
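
Each production WORD -> 'd' WORD 'd' adds the same digit to both ends, so every generated string is a palindrome by construction, which is easy to verify:

# A string is a palindrome iff it equals its own reverse
for s in ['', '00', '0000', '0110', '0220']:
    assert s == s[::-1]
print('All generated words are palindromes')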