# Read the OntoNotes 4 data, reformat it, and convert the tags to BIO

# Read the OntoNotes 4 training split and reshape it into per-sentence lists.
#
# Each input line is parsed as "<chars>\t<labels>", where both halves are
# space-separated and aligned by position.  After this section:
#   sentences  list of sentences, each a list of [char, label] pairs
#   label_set  every distinct label string encountered
#   cnt_line   number of lines read from the file (blank ones included)
#
# NOTE: the line's trailing newline is intentionally left on the last label of
# every sentence; the export step further down in this file relies on it to
# separate sentences in its output.
sentences = []
sentence = []
label_set = set()
cnt_line = 0
# Plain read mode is enough (the original "r+" also demanded write access),
# and the context manager closes the handle instead of leaking it.
with open("./data/ontonote4/train.char.bmes", "r", encoding="utf-8") as f:
    for line in f:
        cnt_line += 1
        if not line.strip():
            # A blank line has no tab-separated label half to parse.
            continue
        fields = line.split("\t")
        chars = fields[0].split(" ")
        labels = fields[1].split(" ")
        sentence = []
        # Index by position so a line with fewer labels than characters still
        # fails loudly instead of being silently truncated.
        for idx, char in enumerate(chars):
            sentence.append([char, labels[idx]])
            label_set.add(labels[idx])
        sentences.append(sentence)
# Export the corpus with BMES tags rewritten as BIO: one "char label" pair per
# line and an empty line after every sentence.
#   S-xxx       -> B-xxx   (single-character entity starts an entity)
#   M-xxx/E-xxx -> I-xxx   (middle / end are both "inside")
#   B-xxx and O are written unchanged.
# Only the written text is converted; the labels stored in `sentences` are
# left untouched.
with open("./data/ontonote4/train-trans.char.bmes", "w", encoding="utf-8") as f:
    for sen in sentences:
        for word in sen:
            char = word[0]
            # The last label of a sentence may still carry the newline of the
            # input line; drop it so the separator below is the only one.
            label = word[1].rstrip('\n')
            if label[:1] == 'S':
                label = 'B' + label[1:]
            elif label[:1] in ('E', 'M'):
                label = 'I' + label[1:]
            f.write(f'{char} {label}\n')
        # Explicit sentence separator (previously produced implicitly by the
        # newline left on the sentence's final label).
        f.write('\n')
# Dataset overview.  Input: `sentences` and `label_set`.  Output:
#   class_set  the entity class names (the part of a label after "B-", "S-", ...)
#   class_map  {class name: [tagged characters, entity-start characters,
#                            entity-inside characters]}
# Labels may be in either BIO or BMES form: S counts as an entity start (like B)
# and M/E count as inside (like I), the same mapping the BIO export uses.  A
# trailing newline left on a sentence's last label is ignored.
class_set = {
    tag.rstrip('\n')[2:]
    for tag in label_set
    if tag[:1] in ('B', 'S', 'I', 'M', 'E')
}
# The original author recorded {'GPE', 'LOC', 'ORG', 'PER'} for this corpus.
class_map = {name: [0, 0, 0] for name in class_set}

# Occurrence counts.
for sen in sentences:
    for word in sen:
        label = word[1].rstrip('\n')
        prefix = label[:1]
        if prefix in ('B', 'S'):
            slot = 1
        elif prefix in ('I', 'M', 'E'):
            slot = 2
        else:
            # "O" (or an empty / unrecognised label) belongs to no class.
            continue
        counts = class_map[label[2:]]
        counts[0] += 1
        counts[slot] += 1
# Distinct entities per class: class_entity[class name] is the set of entity
# strings seen in `sentences` for that class.
# Both tagging schemes are understood: B/S open a new entity, I/M/E extend the
# current one, and anything else (normally "O") closes it.  A trailing newline
# left on a sentence's last label is ignored.
class_entity = {name: set() for name in class_set}
for sen in sentences:
    entity = ''        # characters of the entity currently being assembled
    entity_class = ''  # class suffix of the last character added to `entity`
    for char, raw_label in sen:
        label = raw_label.rstrip('\n')
        prefix = label[:1]
        if prefix in ('B', 'S'):
            # A new entity begins: store the one in progress, if any.
            if entity:
                # setdefault tolerates a class that is missing from class_set.
                class_entity.setdefault(entity_class, set()).add(entity)
            entity = char
            entity_class = label[2:]
        elif prefix in ('I', 'M', 'E'):
            entity += char
            entity_class = label[2:]
        else:
            if entity:
                class_entity.setdefault(entity_class, set()).add(entity)
            entity = ''
    # A sentence may end while an entity is still open.
    if entity:
        class_entity.setdefault(entity_class, set()).add(entity)

# Bare expression: shows the per-class counts in an interactive session and
# has no effect when the file is run as a script.
class_map
# Original post: https://www.cnblogs.com/Tony100K/p/14602506.html