CRF++地名实体识别(特征为词性和词)
http://x-algo.cn/index.php/2016/02/29/crf-name-entity-recognition/
类似使用CRF实现分词和词性标注,地名识别也需要生成相应的tag进行标注。这里使用的语料库是1998年1月人民日报语料集。最终学习出来的模型,对复杂地名的识别准确率(F值)非常低,推测是语料中对地名的标注多处前后矛盾。例如 [华南/ns 地区/n]ns 标为地名实体,但是 东北/f 地区/n 却分开标注,类似错误还有很多。将来有时间可以考虑使用微软的词库 戳我下载-微软词库。
本文仍然使用人民日报语料,在分词后的词粒度上按BMES思路进行标注(实际使用的标签为 LOC_B、LOC_I、LOC_E、LOC_S,其余为 O),最终效果如下:
1
2
3
4
5
6
7
8
9
10
|
------ LOC_E -------
[LOC_E] P = 0.832215, R = 0.629442, F-score = 0.716763
------ LOC_B -------
[LOC_B] P = 0.781022, R = 0.543147, F-score = 0.640719
------ LOC_S -------
[LOC_S] P = 0.986800, R = 0.994489, F-score = 0.990629
------ LOC_I -------
[LOC_I] P = 0.736842, R = 0.442105, F-score = 0.552632
------ All -------
[All] P = 0.975204, R = 0.957399, F-score = 0.966219
|
由于单字识别F值很高,并且数量多,所以整个识别的效果还是很高。 语料、相关代码下载:[戳我下载]crf++地名实体识别,下面为具体流程。
文章目录 [展开]
生成训练和测试数据
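通过一个python脚本按照一定比例(每5行语料取1行作为测试集,其余作为训练集)生成训练和测试数据,生成过程中按照BMES思路对语料进行标识,具体规则如下:单个词的词性为ns时标为LOC_S;方括号词组整体标注为ns时,首词标为LOC_B、中间词标为LOC_I、末词标为LOC_E;方括号词组整体不是ns时,组内词性为ns的词分别标为LOC_S;其余的词标为O。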
通过调用脚本: cat people-daily.txt | python get_ner_loc_train_test_data.py >log 生成所需要的训练和测试数据(脚本从标准输入读取语料),中间过程会打印出很多调试信息,打印到标准输出会花费较多时间。具体代码如下(已折叠):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
#coding=utf8
import sys
home_dir = "./"
def saveDataFile(trainobj, testobj, isTest, word, handle, tag):
    """Write one token row to either the test file or the training file.

    Args:
        trainobj: writable file object receiving training rows.
        testobj: writable file object receiving test rows.
        isTest: when true the row goes to ``testobj``, otherwise to
            ``trainobj``.
        word: the token text; an empty string produces a blank separator
            line (see ``saveTrainFile``).
        handle: the part-of-speech label of the token.
        tag: the location tag of the token.
    """
    # The row format is identical for both files; only the destination differs.
    target = testobj if isTest else trainobj
    saveTrainFile(target, word, handle, tag)
def saveTrainFile(fiobj, word, handle, tag):
    """Write one row of CRF++ data to ``fiobj``.

    A regular token is written as ``word<TAB>handle<TAB>tag``. An empty
    word, or one of the punctuation marks listed below, is replaced by a
    blank line, which ends the current sequence in the output file.

    Args:
        fiobj: writable file object.
        word: the token text.
        handle: the part-of-speech label of the token.
        tag: the location tag of the token.
    """
    if word and word not in ("。", ","):
        fiobj.write(word + '\t' + handle + '\t' + tag + '\n')
    else:
        # Sequence boundary: the token itself is not written.
        fiobj.write('\n')
# Fill in location tags; tokens that are not locations keep their current tag.
def fill_local_tag(words, tags):
    """Assign location tags to the tokens of one sentence, in place.

    ``words`` holds tokens of the form ``word/pos``. A bracketed group such
    as ``[w1/p1 w2/p2]label`` arrives as several tokens: the first starts
    with ``[`` and the last carries ``]label``.

    Tagging rules:
      * a stand-alone token whose part of speech is ``ns`` gets ``LOC_S``;
      * a group labelled ``ns`` gets ``LOC_B`` on its first token,
        ``LOC_I`` on the inner tokens and ``LOC_E`` on the last one
        (a one-token group gets ``LOC_S``);
      * in a group with any other label, each member whose own part of
        speech is ``ns`` gets ``LOC_S``.
    Every other entry of ``tags`` is left untouched.

    Args:
        words: list of token strings for one sentence.
        tags: list of the same length as ``words``; modified in place.

    Raises:
        SystemExit: with status 255 when brackets are nested or an opening
            bracket is never closed.
    """
    pos = 0
    total = len(words)
    while pos < total:
        print("pos: %s  len: %s" % (pos, total))
        word = words[pos]

        if "[" not in word:
            print("单个词 %s" % word)
            # Empty tokens (doubled spaces in the corpus) and tokens without
            # a part-of-speech suffix cannot be locations; skip them instead
            # of failing on the unpack. rsplit keeps words containing "/".
            if "/" in word:
                w, h = word.rsplit("/", 1)
                print("%s %s" % (w, h))
                if h == "ns":  # a single token that is a location
                    tags[pos] = "LOC_S"
            print("本轮tag %s" % tags[pos])
            pos += 1
            continue

        print("发现词组 %s" % word)
        search_pos = pos
        # Only look further when the group is not closed in this very token.
        if "]" not in word:
            for search_pos in range(pos + 1, total):
                candidate = words[search_pos]
                print(candidate)
                if "[" in candidate:
                    print("括号配对异常")
                    sys.exit(255)
                if "]" in candidate:
                    break
            else:
                # Reached the end of the sentence without a closing bracket.
                print("括号配对异常,搜索到句尾没有找都另一半括号")
                sys.exit(255)

        group = words[pos:search_pos + 1]
        print("match到一个组 %s" % (group,))
        h = words[search_pos].split("]")[-1]  # label attached to the group
        if h == "ns":
            if search_pos == pos:
                tags[pos] = "LOC_S"
            else:
                tags[pos] = "LOC_B"
                for p in range(pos + 1, search_pos):
                    tags[p] = "LOC_I"
                tags[search_pos] = "LOC_E"
        else:
            # The group is not a location, but its members may be.
            for offset, member in enumerate(group):
                print("hhhhhhh %s" % member)
                inner = member.strip("[").split("]")[0]
                if "/" in inner and inner.rsplit("/", 1)[1] == "ns":
                    tags[pos + offset] = "LOC_S"

        print("本轮添加的tag %s" % (tags[pos:search_pos + 1],))
        pos = search_pos + 1  # continue after the group

    print("pos: %s  len: %s" % (pos, total))
    print("添加地点tag执行结束")
    print(tags)
def convertTag():
    """Convert the corpus read from standard input into CRF++ data files.

    Each non-blank input line is one sentence of space-separated
    ``word/pos`` tokens; its first token is dropped. Every line whose
    1-based line number is a multiple of 5 goes to ``test.data``, the
    others to ``train.data`` (both created under ``home_dir``). Each token
    becomes one ``word<TAB>pos<TAB>tag`` row and each sentence is followed
    by a blank line.

    The corpus is taken from stdin only, so the script no longer opens
    ``people-daily.txt`` itself, and both output files are closed when the
    function returns.
    """
    line_no = 0
    with open(home_dir + 'train.data', 'w') as trainobj, \
            open(home_dir + 'test.data', 'w') as testobj:
        for line in sys.stdin:
            line_no += 1
            line = line.strip('\r\n\t ')
            if line == "":
                continue
            test = line_no % 5 == 0

            # Drop the leading token, then discard empty tokens produced by
            # doubled spaces so that words and tags stay aligned.
            words = []
            for raw in line.split(" ")[1:]:
                token = raw.strip('\t ')
                if token:
                    words.append(token)
                else:
                    print("Warning 发现空词")
            if not words:
                continue

            tags = ["O"] * len(words)
            fill_local_tag(words, tags)

            for word, tag in zip(words, tags):
                print("----> %s" % word)
                # Remove the group markers: "[w/p" -> "w/p", "w/p]label" -> "w/p".
                left = word.find('[')
                if left >= 0:
                    word = word[left + 1:]
                right = word.find(']')
                if right >= 0:
                    word = word[:right]
                # Split on the last "/" so a word containing "/" survives.
                w, h = word.rsplit('/', 1)
                saveDataFile(trainobj, testobj, test, w, h, tag)
            # A blank row marks the end of the sentence.
            saveDataFile(trainobj, testobj, test, "", "", "")


if __name__ == '__main__':
    convertTag()
|
模板文件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
# Unigram feature templates.
# %x[row,col]: row is the offset from the current token, col selects the
# input column (0 = word, 1 = part of speech, as written by the data script).
# Every template needs its own identifier; the word/previous-word template
# below used to reuse U18 and now has the otherwise unused id U21.
U01:%x[-1,0]
U02:%x[0,0]
U03:%x[1,0]
U04:%x[2,0]
U05:%x[-2,1]
U06:%x[-1,1]
U07:%x[0,1]
U08:%x[1,1]
U09:%x[2,1]
U0:%x[-2,0]
U10:%x[0,0]/%x[0,1]
U11:%x[-2,1]%x[-1,1]
U21:%x[0,0]/%x[-1,0]
U12:%x[0,0]%x[1,0]
U13:%x[0,1]%x[-1,0]
U14:%x[0,0]%x[1,1]
U15:%x[-1,0]%x[-1,1]
U16:%x[-1,0]%x[-2,0]
U17:%x[-2,0]%x[-2,1]
U18:%x[1,0]%x[2,0]
U19:%x[-1,0]%x[1,0]
U20:%x[1,0]%x[0,1]
U22:%x[-2,1]%x[0,1]
U23:%x[-1,1]%x[0,1]
U24:%x[-1,1]%x[1,1]
U25:%x[0,1]%x[1,1]
U26:%x[0,1]%x[2,1]
U27:%x[1,1]%x[2,1]
|
开始训练和测试
通过下面命令执行训练和测试过程:
1
2
|
# Train a model from train.data using the feature template file "template".
# Per the CRF++ documentation: -f 4 keeps only features seen at least 4 times,
# -p 4 trains with 4 threads, -c 3 sets the cost hyper-parameter.
# The training log is redirected to train.rst.
crf_learn -f 4 -p 4 -c 3 template train.data model > train.rst
# Label test.data with the trained model; the output (input columns plus the
# predicted tag as the last column) is redirected to test.rst.
crf_test -m model test.data > test.rst
|
分类型计算F值
通过执行 python clc.py test.rst 运行评估脚本,脚本内容如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
# Per-label counters for the four location tags.
god_dic = {"LOC_S": 0, "LOC_B": 0, "LOC_I": 0, "LOC_E": 0}      # gold labels
pre_dic = {"LOC_S": 0, "LOC_B": 0, "LOC_I": 0, "LOC_E": 0}      # predicted labels
correct_dic = {"LOC_S": 0, "LOC_B": 0, "LOC_I": 0, "LOC_E": 0}  # gold == predicted


def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if denominator is 0."""
    if not denominator:
        return 0.0
    return numerator / float(denominator)


def prf(correct, gold, predicted):
    """Return (precision, recall, F-score) for the given counts.

    A ratio whose denominator is zero is reported as 0.0 instead of raising
    ZeroDivisionError, so labels absent from the data can still be printed.
    """
    precision = _ratio(correct, predicted)
    recall = _ratio(correct, gold)
    f_score = _ratio(2 * precision * recall, precision + recall)
    return precision, recall, f_score


def count_loc_tags(lines, gold_counts, predicted_counts, correct_counts):
    """Accumulate per-label counts from tagged result lines.

    The last two whitespace-separated columns of each line are read as the
    gold tag and the predicted tag. Lines with fewer than two columns
    (blank sequence separators) are skipped, as are lines where neither
    tag contains "LOC".

    Args:
        lines: iterable of result lines.
        gold_counts, predicted_counts, correct_counts: dicts keyed by tag,
            updated in place.

    Returns:
        (total number of lines read, number of lines involving a LOC tag).
    """
    wc = 0
    loc_wc = 0
    for line in lines:
        wc += 1
        fields = line.split()
        if len(fields) < 2:
            continue
        g, r = fields[-2], fields[-1]
        # Not related to location entity recognition.
        if "LOC" not in g and "LOC" not in r:
            continue
        loc_wc += 1
        if "LOC" in g:
            gold_counts[g] += 1
        if "LOC" in r:
            predicted_counts[r] += 1
        if g == r:
            correct_counts[r] += 1
    return wc, loc_wc


def main(argv):
    """Read the result file named by ``argv[1]`` and print P/R/F per label."""
    try:
        result_file = open(argv[1], "r")
    except (IndexError, IOError):
        print("result file is not specified, or open failed!")
        sys.exit(1)
    with result_file:
        wc, loc_wc = count_loc_tags(result_file, god_dic, pre_dic, correct_dic)

    print("WordCount from result: %s" % wc)
    print("WordCount of loc_wc post : %s" % loc_wc)
    print("真实位置标记个数: %s" % (god_dic,))
    print("预估位置标记个数: %s" % (pre_dic,))
    print("正确标记个数: %s" % (correct_dic,))

    all_gold = 0
    all_correct = 0
    all_pre = 0
    for k in god_dic:
        print("------ %s -------" % (k))
        P, R, F = prf(correct_dic[k], god_dic[k], pre_dic[k])
        print("[%s] P = %f, R = %f, F-score = %f" % (k, P, R, F))
        all_pre += pre_dic[k]
        all_correct += correct_dic[k]
        all_gold += god_dic[k]

    print("------ All -------")
    all_P, all_R, all_F = prf(all_correct, all_gold, all_pre)
    print("[%s] P = %f, R = %f, F-score = %f" % ("All", all_P, all_R, all_F))


if __name__ == "__main__":
    main(sys.argv)
|
参考文献
基于 CRF和规则相结合的地理命名实体识别方法 何炎祥1,2 罗楚威2 胡彬尧