9 建立基于特征的文法
1.文法特征
在本章中,我们将探讨特征在基于规则的文法中的作用。与特征提取(记录已经自动检测到的特征)不同,我们现在要声明词和短语的特征。
特征结构包含各种有关文法实体的信息
文法实体的信息
CAT(文法类别) ORTH(拼写) REF(指示物) REL(关系)
# Lexical feature structures.  Features used:
#   CAT (grammatical category), ORTH (orthography / spelling),
#   REF (referent), REL (relation name).
kim = dict(CAT='NP', ORTH='Kim', REF='k')
chase = dict(CAT='V', ORTH='chased', REL='chase')
lee = dict(CAT='NP', ORTH='Lee', REF='l')
# AGT (agent role) / PAT (patient role): which grammatical position
# supplies each thematic role of 'chase'.
chase.update(AGT='sbj', PAT='obj')
sent = "Kim chased Lee"
tokens = sent.split()
def lex2fs(word, lexicon=None):
    """Look up the lexical feature structure whose ORTH matches *word*.

    Args:
        word: surface form (orthography) to look up.
        lexicon: iterable of feature-structure dicts to search; defaults
            to the module-level entries [kim, lee, chase], preserving the
            original behavior.

    Returns:
        The first feature structure whose 'ORTH' equals *word*, or None
        when no entry matches (the original fell off the end and returned
        None implicitly; made explicit here).
    """
    if lexicon is None:
        lexicon = [kim, lee, chase]
    for fs in lexicon:
        if fs['ORTH'] == word:
            return fs
    return None
# Look up the feature structure for each word of the sentence.
subj, verb, obj = (lex2fs(word) for word in tokens)
print(subj, verb, obj)
# Bind the thematic roles: the agent of 'chase' is Kim (k),
# the patient of 'chase' is Lee (l).
verb['AGT'] = subj['REF']
verb['PAT'] = obj['REF']
# Inspect the resulting feature structure of 'chase'.
for feat in ['ORTH', 'REL', 'AGT', 'PAT']:
    print(f"{feat:<5} => {verb[feat]}")
surprise = {'CAT':'V','ORTH':'surprised','REL':'surprised','SRC':'sbj','EXP':'obj'}
句法协议
动词的形态属性同主语名词短语的句法属性一起变化,该过程被称为协议(agreement)
使用属性和约束
非正式的语言类别具有属性,例如:名词具有复数的属性。
import nltk

# Display a feature-based grammar with NUM (number) and TENSE agreement.
# NOTE: show_cfg() prints the grammar itself and returns None, so the
# original print(...) wrapper appended a spurious "None" line (visible
# at the end of the old transcript); call it directly instead.
nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')
# % start S
# # ###################
# # Grammar Productions
# # ###################
# # S expansion productions
# S -> NP[NUM=?n] VP[NUM=?n]
# # NP expansion productions
# NP[NUM=?n] -> N[NUM=?n]
# NP[NUM=?n] -> PropN[NUM=?n]
# NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
# NP[NUM=pl] -> N[NUM=pl]
# # VP expansion productions
# VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n]
# VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP
# # ###################
# # Lexical Productions
# # ###################
# Det[NUM=sg] -> 'this' | 'every'
# Det[NUM=pl] -> 'these' | 'all'
# Det -> 'the' | 'some' | 'several'
# PropN[NUM=sg]-> 'Kim' | 'Jody'
# N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child'
# N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children'
# IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks'
# TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes'
# IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk'
# TV[TENSE=pres, NUM=pl] -> 'see' | 'like'
# IV[TENSE=past] -> 'disappeared' | 'walked'
# TV[TENSE=past] -> 'saw' | 'liked'
跟踪基于特征的图表分析器
# Trace a feature-based chart parse of "Kim likes children".
# trace=2 makes the parser print each chart edge as it is added.
tokens1 = 'Kim likes children'.split()
from nltk import load_parser
cp = load_parser('grammars/book_grammars/feat0.fcfg',trace=2)
trees = cp.parse(tokens1)  # iterator over the parse trees found
# |.Kim .like.chil.|
# Leaf Init Rule:
# |[----] . .| [0:1] 'Kim'
# |. [----] .| [1:2] 'likes'
# |. . [----]| [2:3] 'children'
# Feature Bottom Up Predict Combine Rule:
# |[----] . .| [0:1] PropN[NUM='sg'] -> 'Kim' *
# Feature Bottom Up Predict Combine Rule:
# |[----] . .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] *
# Feature Bottom Up Predict Combine Rule:
# |[----> . .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'}
# Feature Bottom Up Predict Combine Rule:
# |. [----] .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'likes' *
# Feature Bottom Up Predict Combine Rule:
# |. [----> .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'}
# Feature Bottom Up Predict Combine Rule:
# |. . [----]| [2:3] N[NUM='pl'] -> 'children' *
# Feature Bottom Up Predict Combine Rule:
# |. . [----]| [2:3] NP[NUM='pl'] -> N[NUM='pl'] *
# Feature Bottom Up Predict Combine Rule:
# |. . [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'pl'}
# Feature Single Edge Fundamental Rule:
# |. [---------]| [1:3] VP[NUM='sg', TENSE='pres'] -> TV[NUM='sg', TENSE='pres'] NP[] *
# Feature Single Edge Fundamental Rule:
# |[==============]| [0:3] S[] -> NP[NUM='sg'] VP[NUM='sg'] *
# Uncomment to print the resulting tree(s):
# for tree in trees:
# print(tree)
# (S[]
# (NP[NUM='sg'] (PropN[NUM='sg'] Kim))
# (VP[NUM='sg', TENSE='pres']
# (TV[NUM='sg', TENSE='pres'] likes)
# (NP[NUM='pl'] (N[NUM='pl'] children))))
术语
2.处理特征结构
# Build a feature structure directly with keyword arguments.
fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
print(fs1)
# [ NUM   = 'sg'   ]
# [ TENSE = 'past' ]
# (The chart-parse trace that previously sat between print(fs1) and its
# expected output was a copy-paste duplicate of the earlier feat0.fcfg
# trace and unrelated to this example; it has been removed.)
fs2 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
print(fs2['GND'])  # feature values are read by indexing: prints 'fem'
指定特征结构
# Nesting: an entire feature structure (fs1) becomes the value of
# another structure's AGR feature.
fs3 = nltk.FeatStruct(POS = 'N',AGR = fs1)
print(fs3)
# [ AGR = [ NUM = 'sg' ] ]
# [ [ TENSE = 'past' ] ]
# [ ]
# [ POS = 'N' ]
# The same nested structure can be built from a bracketed string spec.
fs4 = nltk.FeatStruct("[POS='N',AGR = [PER = 3, NUM = 'pl',GND = 'fem']]")
print(fs4)
# [ [ GND = 'fem' ] ]
# [ AGR = [ NUM = 'pl' ] ]
# [ [ PER = 3 ] ]
# [ ]
# [ POS = 'N' ]
一个更一般的特征结构包含一个较一般的。
合并两个特征结构的信息被称为统一
包含的特征结构是偏序的。
$$FS_0 \text{ 包含 } FS_1,\ \text{当 } FS_0 \text{ 比 } FS_1 \text{ 更一般(信息较少)的时候}$$
# Structure sharing (reentrancy): the tag (1) labels the ADDRESS value,
# and ADDRESS->(1) makes SPOUSE's ADDRESS the very same object.
fs0 = nltk.FeatStruct("""[NAME = Lee,
ADDRESS = (1)[NUMBER = 74,
STREET = 'rue Pascal'],
SPOUSE = [NAME = Kim,
ADDRESS->(1)]]""")
print(fs0)
# Unification merges the information of two feature structures;
# information unified into a shared value shows up on every path
# that shares it.
fs0_1 = nltk.FeatStruct("[SPOUSE = [ADDRESS = [CITY = Paris]]]")
print(fs0_1.unify(fs0))
fs0_2 = nltk.FeatStruct("[ADDRESS1 = [NUMBER = 74,STREET = 'rue Pascal']]")
# NOTE(review): 'f20_3' breaks the fs0_N naming pattern — presumably
# meant fs0_3; left unchanged in case later code references it.
f20_3 = nltk.FeatStruct("[ADDRESS1 = ?x,ADDRESS2 = ?x]")  # structure sharing written with the variable ?x
print(f20_3.unify(fs0_2))
3.扩展基于特征的文法
我们可以使用特征结构对大量广泛语言学现象进行简洁的分析,包括动词子类别、倒装结构、无限制依赖结构和格支配。
在本节,将会探索各种语言问题,并展示将特征纳入文法的好处。
子类别
核心词回顾
助动词与倒装
无限制依赖成分
具有倒装从句和长距离依赖的产生式的文法,使用斜线类别
# Grammar with inverted clauses and long-distance dependencies, using
# slash categories: S/NP is "an S missing an NP somewhere inside it".
# NOTE: show_cfg() prints the grammar itself and returns None, so the
# original print(...) wrapper appended a spurious "None" line (visible
# at the end of the old transcript); call it directly instead.
nltk.data.show_cfg('grammars/book_grammars/feat1.fcfg')
# % start S
# # ###################
# # Grammar Productions
# # ###################
# S[-INV] -> NP VP
# S[-INV]/?x -> NP VP/?x
# S[-INV] -> NP S/NP
# S[-INV] -> Adv[+NEG] S[+INV]
# S[+INV] -> V[+AUX] NP VP
# S[+INV]/?x -> V[+AUX] NP VP/?x
# SBar -> Comp S[-INV]
# SBar/?x -> Comp S[-INV]/?x
# VP -> V[SUBCAT=intrans, -AUX]
# VP -> V[SUBCAT=trans, -AUX] NP
# VP/?x -> V[SUBCAT=trans, -AUX] NP/?x
# VP -> V[SUBCAT=clause, -AUX] SBar
# VP/?x -> V[SUBCAT=clause, -AUX] SBar/?x
# VP -> V[+AUX] VP
# VP/?x -> V[+AUX] VP/?x
# # ###################
# # Lexical Productions
# # ###################
# V[SUBCAT=intrans, -AUX] -> 'walk' | 'sing'
# V[SUBCAT=trans, -AUX] -> 'see' | 'like'
# V[SUBCAT=clause, -AUX] -> 'say' | 'claim'
# V[+AUX] -> 'do' | 'can'
# NP[-WH] -> 'you' | 'cats'
# NP[+WH] -> 'who'
# Adv[+NEG] -> 'rarely' | 'never'
# NP/NP ->
# Comp -> 'that'
# Parse a "gapped" question: the fronted 'who' fills the NP gap after
# 'like'; the gap is threaded through the tree by the /NP slash feature.
tokens1 = 'who do you claim that you like'.split()
from nltk import load_parser
cp = load_parser('grammars/book_grammars/feat1.fcfg')
for tree in cp.parse(tokens1):
    print(tree)  # the sentence with a gap (filler-gap dependency)
# (S[-INV]
# (NP[+WH] who)
# (S[+INV]/NP[]
# (V[+AUX] do)
# (NP[-WH] you)
# (VP[]/NP[]
# (V[-AUX, SUBCAT='clause'] claim)
# (SBar[]/NP[]
# (Comp[] that)
# (S[-INV]/NP[]
# (NP[-WH] you)
# (VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] )))))))
没有缺口的句子
# Parse the gap-free counterpart: no slash categories appear in the tree.
tokens2 = 'you claim that you like cats'.split()
for tree in cp.parse(tokens2):
    print(tree)
# NOTE(review): indentation was lost in these notes; tree.draw() is taken
# as following the loop, drawing the last parse in a Tk window — confirm.
tree.draw()
# (S[-INV]
# (NP[-WH] you)
# (VP[]
# (V[-AUX, SUBCAT='clause'] claim)
# (SBar[]
# (Comp[] that)
# (S[-INV]
# (NP[-WH] you)
# (VP[] (V[-AUX, SUBCAT='trans'] like) (NP[-WH] cats))))))
# Negative adverb triggers subject-auxiliary inversion: Adv[+NEG] S[+INV].
tokens3 = 'rarely do you sing'.split()
for tree in cp.parse(tokens3):
    print(tree)
# NOTE(review): as above, tree.draw() is taken as following the loop
# (indentation lost in these notes) — confirm against the original.
tree.draw()
# (S[-INV]
# (Adv[+NEG] rarely)
# (S[+INV]
# (V[+AUX] do)
# (NP[-WH] you)
# (VP[] (V[-AUX, SUBCAT='intrans'] sing))))