# Build a custom NER dataset with the HF `datasets` library.
#
# Input file format (CoNLL-style): one "token tag" pair per line, sentences
# separated by blank lines.  Tags use the scheme O / B-PER / I-PER / B-ORG /
# I-ORG / B-LOC / I-LOC / B-MISC / I-MISC (some files spell them B-Person,
# B-Location, B-Misc — those are normalized below).
#
# NOTE(review): the two lines below were Jupyter shell magics in the original
# notebook cell; run them in a shell/notebook before executing this script.
#   !wget https://raw.githubusercontent.com/zhangbo2008/data_ner/main/aomanyupianjian -O aaaa
#   !pip install datasets

# Canonical label set; a tag string is encoded as its index in this list.
NER_LABELS = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG',
              'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# Alternate spellings seen in the raw data, mapped to the canonical labels.
_TAG_ALIASES = {
    'B-Person': 'B-PER', 'I-Person': 'I-PER',
    'B-Location': 'B-LOC', 'I-Location': 'I-LOC',
    'B-Misc': 'B-MISC', 'I-Misc': 'I-MISC',
}


def read_sentences(lines):
    """Group raw "token tag" lines into per-sentence strings.

    Args:
        lines: iterable of raw file lines (as from ``readlines()``).

    Returns:
        Two parallel lists of equal length: space-joined token strings and
        space-joined tag strings, one entry per non-empty sentence.
    """
    sent_tokens, sent_tags = [], []
    cur_tokens, cur_tags = [], []
    for line in lines:
        if not line.strip():
            # Blank line = sentence boundary; skip runs of blank lines.
            if cur_tokens:
                sent_tokens.append(' '.join(cur_tokens))
                sent_tags.append(' '.join(cur_tags))
                cur_tokens, cur_tags = [], []
        else:
            fields = line.split(' ')
            cur_tokens.append(fields[0])
            cur_tags.append(fields[1].rstrip('\n'))
    # Flush the trailing sentence when the file lacks a final blank line
    # (the original version silently dropped it).
    if cur_tokens:
        sent_tokens.append(' '.join(cur_tokens))
        sent_tags.append(' '.join(cur_tags))
    return sent_tokens, sent_tags


def _split_trailing(tokens, tags, punct):
    """Detach a trailing ``punct`` character from the last token, tagging it O.

    Mutates ``tokens``/``tags`` in place.  Uses ``endswith`` so tokens with an
    interior period/comma (e.g. "U.S") are left untouched — the original
    ``punct in token`` test wrongly stripped their last character.
    """
    last = tokens[-1]
    if last.endswith(punct) and last != punct:
        tokens[-1] = last[:-1]
        tokens.append(punct)
        tags.append('O')


def build_examples(sentence_tokens, sentence_tags):
    """Turn per-sentence token/tag strings into HF-style example dicts.

    Args:
        sentence_tokens: list of space-joined token strings.
        sentence_tags:   parallel list of space-joined tag strings.

    Returns:
        List of ``{'tokens': [...], 'ner_tags': [...]}`` dicts, where each tag
        is encoded as its index in ``NER_LABELS`` (unknown tags are kept as
        strings, matching the original behavior).
    """
    label_to_id = {label: idx for idx, label in enumerate(NER_LABELS)}
    examples = []
    for tok_str, tag_str in zip(sentence_tokens, sentence_tags):
        if not tok_str:  # defensive: skip empty sentences
            continue
        tokens = tok_str.split(' ')
        tags = [_TAG_ALIASES.get(t, t) for t in tag_str.split(' ')]
        # A trailing period/comma glued to the final word gets its own O token.
        _split_trailing(tokens, tags, '.')
        _split_trailing(tokens, tags, ',')
        examples.append({
            'tokens': tokens,
            'ner_tags': [label_to_id.get(t, t) for t in tags],
        })
    return examples


if __name__ == '__main__':
    with open('aaaa') as f:
        raw_lines = f.readlines()
    sents, tags = read_sentences(raw_lines)
    examples = build_examples(sents, tags)

    # Third-party import kept at runtime only (installed by the pip line above).
    from datasets import Dataset

    ds = Dataset.from_list(examples)
    print(ds[0])
    # 70/30 split; downstream code should use ds2.
    ds2 = ds.train_test_split(test_size=0.3)
    print(ds2)