利用hf datasets库包构建自己的数据集

!wget https://raw.githubusercontent.com/zhangbo2008/data_ner/main/aomanyupianjian -O aaaa
!pip install datasets


#=======加载自己数据集

# Parse the CoNLL-style file: one "token tag" pair per line, sentences
# separated by blank lines. Produces two parallel lists of space-joined
# strings: save1 = token sequences, save2 = tag sequences.
# encoding fixed to utf-8 so the result does not depend on the platform default.
with open('aaaa', encoding='utf-8') as f:
    t = f.readlines()

save1 = []   # one space-joined token string per sentence
save2 = []   # one space-joined tag string per sentence
tokens = []  # tokens of the sentence currently being accumulated
tags = []    # tags of the current sentence
for line in t:
    if line == '\n':
        # Sentence boundary: flush the accumulated tokens/tags.
        # Consecutive blank lines yield '' entries (filtered out later).
        save1.append(' '.join(tokens))
        save2.append(' '.join(tags))
        tokens = []
        tags = []
    else:
        # Line format: "<token> <tag>\n"; split once instead of twice.
        parts = line.split(' ')
        tokens.append(parts[0])
        tags.append(parts[1].replace('\n', ''))
all2 = []
import copy  # kept for compatibility; no longer needed (each example dict is built fresh)


# Canonical label set: a tag's position in this list is its integer id.
aaa = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
# O(1) lookup table from label string to integer id (replaces the original
# O(len(tags) * len(aaa)) nested scan).
_LABEL_TO_ID = {label: idx for idx, label in enumerate(aaa)}


def _make_example(sentence, tag_string):
    """Turn one 'tok tok ...' / 'tag tag ...' pair into a dataset record.

    Returns {'tokens': [...], 'ner_tags': [...]} where known tags are
    replaced by their integer id; unknown tag strings are left as-is,
    matching the original fallback behavior.
    """
    tokens = sentence.split(' ')
    ner_tags = tag_string.split(' ')

    # Detach a sentence-final '.' or ',' glued to the last token into its
    # own token tagged 'O'. Fires only when the punctuation is actually
    # the LAST character — the original used `'.' in token`, which also
    # fired on an interior dot (e.g. 'U.S') and wrongly chopped off the
    # real last character.
    for punct in ('.', ','):
        last = tokens[-1]
        if last.endswith(punct) and last != punct:
            tokens[-1] = last[:-1]
            tokens.append(punct)
            ner_tags.append('O')

    # Normalize the long-form tag names emitted by the source data.
    ner_tags = [
        tag.replace('B-Person', 'B-PER').replace('I-Person', 'I-PER')
           .replace('B-Location', 'B-LOC').replace('I-Location', 'I-LOC')
           .replace('B-Misc', 'B-MISC').replace('I-Misc', 'I-MISC')
        for tag in ner_tags
    ]
    # Encode tags as integer ids.
    ner_tags = [_LABEL_TO_ID.get(tag, tag) for tag in ner_tags]
    return {'tokens': tokens, 'ner_tags': ner_tags}


# Skip the empty sentences produced by consecutive blank lines.
# (Per-example debug print removed.)
for sentence, tag_string in zip(save1, save2):
    if sentence:
        all2.append(_make_example(sentence, tag_string))


from datasets import Dataset

# Wrap the examples built above into a Hugging Face Dataset.
ds = Dataset.from_list(all2)

# ===== tag encoding reference =====
# 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'

ds[0]  # notebook-style peek at the first record

# Hold out 30% of the examples as a test split; downstream code uses ds2.
test_fraction = 0.3
ds2 = ds.train_test_split(test_size=test_fraction)
ds2

 

posted on 2023-05-05 10:38  张博的博客  阅读(45)  评论(0)  编辑  收藏  举报

导航