将BMES标记的NER语料库转换为BIO标记的语料库

将BMES标记的NER语料库转换为BIO标记的语料库

读取

# Read the BMES-tagged corpus into `sentences`.
# Each non-blank line is split on single spaces; the first field is taken as
# the character and the last field as its label. A line that is empty or
# starts with a newline ends the current sentence.
sentences = []   # list of sentences; each sentence is a list of [char, label]
sentence = []    # tokens of the sentence currently being accumulated
label_set = set()  # distinct labels seen (stored without the line terminator)
cnt_line = 0     # 1-based number of the line being processed

# `with` guarantees the handle is closed even if a malformed line raises.
with open("./data/msra/dev.char.bmes", encoding='utf-8') as f:
    for line in f:
        cnt_line += 1
        if len(line) == 0 or line[0] == "\n":
            # Sentence boundary: flush the accumulated tokens, if any.
            if sentence:
                sentences.append(sentence)
                sentence = []
            continue

        ends_with_newline = line.endswith("\n")
        # Strip only the line terminator. Blindly dropping the last character
        # would truncate the label when the final line of the file has no
        # trailing newline.
        splits = line.rstrip("\n").split(' ')
        char, label = splits[0], splits[-1]
        sentence.append([char, label])
        label_set.add(label)

        if not ends_with_newline:
            # Diagnostic kept from the original script: report the line that
            # is missing its terminator (normally only the last line).
            print(char, label)
            print(cnt_line)

# The file may not end with a blank line; flush the trailing sentence.
if sentence:
    sentences.append(sentence)
    sentence = []

转换

# Write the corpus back out with BIO tags.
# Only the first character of each label is rewritten: S -> B, M -> I, E -> I.
# Any other prefix (e.g. B or O) and the rest of the label are left unchanged.
# Each token is written as "<char> <label>" on its own line, and every
# sentence is followed by one blank line.
bmes_to_bio_prefix = {'S': 'B', 'M': 'I', 'E': 'I'}

# `with` ensures buffered output is flushed and the handle is closed even if
# a write fails part-way through.
with open("./output/msra-bio/dev.char.bmes", "w+", encoding="utf-8") as f:
    for sen in sentences:
        for word in sen:
            char = word[0]
            label = word[1]
            # Slicing (label[:1]) instead of indexing (label[0]) avoids an
            # IndexError when a label is empty.
            prefix = label[:1]
            label = bmes_to_bio_prefix.get(prefix, prefix) + label[1:]
            f.write(f'{char} {label}\n')
        f.write('\n')

posted @ 2021-03-16 15:10  NTS100K  阅读(597)  评论(0编辑  收藏  举报
浏览器标题切换end