python

超轻量级php框架startmvc

Python 利用已有 NER 模型进行数据清洗合并代码

更新时间:2020-08-15 23:06:01 作者:startmvc
我就废话不多说了,直接上代码吧!下面的代码先用 kashgari 的 DataReader 读取 CoNLL 格式的标注文件,再用已有的 NER 模型逐句预测,并把模型识别出的实体标签合并到原标注为 O 的位置,从而完成数据清洗。

我就废话不多说了,直接上代码吧!


# -*- coding: utf-8 -*-
from kashgari.corpus import DataReader
import re
from tqdm import tqdm


def cut_text(text, lenth):
    """Split ``text`` into consecutive pieces of at most ``lenth`` characters.

    Args:
        text: The string to split.
        lenth: Maximum number of characters per piece; must be a positive
            integer. (The misspelled name is kept so that existing keyword
            callers keep working.)

    Returns:
        A list of substrings that concatenate back to ``text``. Every piece
        except possibly the last has exactly ``lenth`` characters. An empty
        ``text`` yields an empty list, and no empty trailing piece is
        produced when ``len(text)`` is an exact multiple of ``lenth``.

    Raises:
        ValueError: If ``lenth`` is not positive.
    """
    if lenth <= 0:
        raise ValueError('lenth must be a positive integer')
    # Slice by position instead of matching '.{n}': the regex dot does not
    # match newlines, which made the pieces skip characters and left the
    # tail slice starting at the wrong offset.
    return [text[start:start + lenth] for start in range(0, len(text), lenth)]


def clean_data(source_file, target_file, ner_model, max_len=100):
    """Merge entity tags predicted by an existing NER model into a CoNLL file.

    The sentences in ``source_file`` are read with
    ``DataReader().read_conll_format_file`` and each one is passed to
    ``ner_model.predict``. Wherever the file's label is ``'O'`` and the model
    predicts a tag starting with ``B`` or ``I``, the label is replaced by the
    prediction. Labels other than ``'O'`` are never overwritten. The merged
    result is appended to ``target_file`` as ``"token label"`` lines with a
    blank line between sentences.

    Args:
        source_file: Path of the CoNLL-format file to clean.
        target_file: Path of the output file. It is opened in append mode,
            so any existing content is kept and the new data follows it.
        ner_model: Object whose ``predict`` takes a list of token lists and
            returns one list of tag strings per input list.
        max_len: Maximum number of tokens sent to the model in one call;
            longer sentences are predicted in consecutive chunks of this
            size. Must be positive.

    Raises:
        ValueError: If ``max_len`` is not positive.
    """
    if max_len <= 0:
        raise ValueError('max_len must be a positive integer')

    data_x, data_y = DataReader().read_conll_format_file(source_file)

    for idx, text_array in enumerate(tqdm(data_x)):
        labels = data_y[idx]

        # Chunk the token list itself (not the joined string) so that each
        # predicted tag stays aligned with its token even when a token is
        # longer than one character.
        predicted = []
        for start in range(0, len(text_array), max_len):
            chunk = text_array[start:start + max_len]
            predicted.extend(ner_model.predict([chunk])[0])

        # zip stops at the shorter sequence, so a prediction that comes back
        # shorter than the sentence leaves the remaining labels untouched
        # instead of raising IndexError.
        for jdx, (label, tag) in enumerate(zip(labels, predicted)):
            if label == 'O' and tag.startswith(('B', 'I')):
                labels[jdx] = tag

    # The context manager closes the file even if a write fails.
    with open(target_file, 'a', encoding="utf-8") as f:
        for idx, text_array in enumerate(data_x):
            if idx != 0:
                # Extra newline: blank line between sentences.
                f.write('\n')
            for jdx, token in enumerate(text_array):
                line = token + ' ' + data_y[idx][jdx]
                # Every line except the very first is preceded by a newline,
                # so the output has no trailing newline.
                if idx == 0 and jdx == 0:
                    f.write(line)
                else:
                    f.write('\n' + line)

    # NOTE(review): this re-reads source_file, not target_file, so it only
    # confirms that the source parses to the same tokens and sentence count
    # a second time; it does not inspect the file that was just written.
    data_x2, data_y2 = DataReader().read_conll_format_file(source_file)
    print(data_x == data_x2, len(data_y) == len(data_y2), '数据清洗完成')

# -*- coding: utf-8 -*-
import kashgari
from data_tools import clean_data
# Load the saved NER model from 'time_ner.h5'.
time_ner = kashgari.utils.load_model(
    'time_ner.h5',
)

# Merge the model's predictions into the labels of ./data/example.dev and
# write the result to example.dev in the current directory.
clean_data(
    source_file='./data/example.dev',
    target_file='example.dev',
    ner_model=time_ner,
)

以上这篇python 利用已有Ner模型进行数据清洗合并代码就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持脚本之家。

python Ner 数据清洗 合并