
NLP.TM | 命名实体识别基线 BiLSTM+CRF(下)

  • 数据预处理

  • 模型部分

  • 训练主程序

  存在缺陷










  # Preprocessing step 1: initial extraction from the raw tagged corpus.
  #
  # Reads SOUTCE_DATA line by line and writes SOURCE_2_DATA with:
  #   * the first token of every line dropped (the loop starts at index 1),
  #   * bracketed compounds "[a/x b/y]zz" merged into one token "ab/zz",
  #   * two consecutive "nr" tokens merged into a single "nr" token.
  # NOTE: the constant is spelled SOUTCE_DATA in this listing; it is defined
  # outside this block, so the name is kept as-is.
  with open(SOURCE_2_DATA, "w") as fout, open(SOUTCE_DATA, "r") as f:
      for line in f:
          tokens = line.split(' ')
          last = len(tokens) - 1  # the final element carries the newline
          i = 1
          while i < last:
              token = tokens[i]
              if token == '':
                  # Consecutive spaces yield empty tokens; indexing them
                  # would raise IndexError, so skip them.
                  i += 1
                  continue
              if token[0] == '[':
                  # Opening part of a bracketed compound: drop the '['.
                  fout.write(token.split('/')[0][1:])
                  i += 1
                  # Middle parts: keep the text, ignore their own tags.
                  while i < last and tokens[i].find(']') == -1:
                      if tokens[i] != '':
                          fout.write(tokens[i].split('/')[0])
                      i += 1
                  # Closing part "word/tag]zz": keep the last two characters
                  # after the slash as the tag of the merged token.
                  closing = tokens[i].split('/')
                  fout.write(closing[0].strip() + '/' + closing[1][-2:] + ' ')
              elif token.split('/')[1] == 'nr':
                  word = token.split('/')[0]
                  i += 1
                  # Skip empty tokens so the look-ahead sees the next real one.
                  while i < last and tokens[i] == '':
                      i += 1
                  following = tokens[i].split('/') if i < last else []
                  if len(following) > 1 and following[1] == 'nr':
                      # Two adjacent "nr" tokens are written as one name.
                      fout.write(word + following[0] + '/nr ')
                  else:
                      fout.write(word + '/nr ')
                      # tokens[i] has not been consumed yet: re-examine it.
                      continue
              else:
                  fout.write(token + ' ')
              i += 1
          fout.write('\n')


  • 剔除"19980101-01-001-001/m"之类的有关时间、行数之类的信息

  • 方括号处理

  • 有关nr的切词:可以看到语料中nr对姓和名是分开标注的,但实际上我们需要把它们组合成一个完整的人名。

  # Preprocessing step 2: keep only the nr / ns / nt tags and switch to
  # character-level labels.
  #
  # Every "word/tag" token of SOURCE_2_DATA is expanded to one
  # "char/label" token per character in SOURCE_3_DATA:
  #   * tag in {nr, ns, nt}: first char "B_<tag>", inner chars "M_<tag>",
  #     last char "E_<tag>"; a one-character entity gets "S_<tag>",
  #   * any other tag: every char is labelled "O".
  # The one-character case used to emit the same character twice (once as
  # B_ and once as E_), which corrupted the text; it now emits a single
  # "S_<tag>" token. NOTE(review): this adds S_* labels to the tag set.
  ENTITY_TAGS = ('nr', 'ns', 'nt')
  with open(SOURCE_3_DATA, "w") as fout, open(SOURCE_2_DATA, "r") as f:
      for line in f:
          # The last element of the split holds the trailing newline and
          # is never processed.
          for token in line.split(' ')[:-1]:
              if token == '':
                  continue
              fields = token.split('/')
              word, tag = fields[0], fields[1]
              if not word:
                  # Nothing to label (token was "/tag").
                  continue
              if tag in ENTITY_TAGS:
                  if len(word) == 1:
                      fout.write(word + "/S_" + tag + " ")
                      continue
                  fout.write(word[0] + "/B_" + tag + " ")
                  for ch in word[1:-1]:
                      if ch != ' ':
                          fout.write(ch + "/M_" + tag + " ")
                  fout.write(word[-1] + "/E_" + tag + " ")
              else:
                  for ch in word:
                      fout.write(ch + '/O ')
          fout.write('\n')


  # Preprocessing step 3: remove punctuation and split into sentences.
  #
  # The whole of SOURCE_3_DATA is split on punctuation tokens labelled "O";
  # each non-empty chunk becomes one line of SOURCE_4_DATA. Chunks that are
  # empty after stripping are skipped (the previous `!= " "` test let them
  # through as blank lines).
  with open(SOURCE_4_DATA, "w") as fout, open(SOURCE_3_DATA, "r") as f:
      texts = f.read()
      for sentence in re.split('[,。!?、‘’“”:]/[O]', texts):
          sentence = sentence.strip()
          if sentence:
              fout.write(sentence + '\n')


  • 机器学习训练非常要求数据量,而这个数据量并不是体现在存储大小上,而是数据的条数,即样本量上,长文本虽然更有利于分析,但是对于机器学习模型而言,样本量更为重要,因此通过断句的方式,将一条样本转化为多条样本在当前条件下更为合适。


  # Final dataset construction.
  #
  # Reads SOURCE_4_DATA (one sentence per line, "char/label" tokens),
  # builds the word->id and tag->id dictionaries, keeps only sentences that
  # contain at least one non-"O" label, then writes the dictionaries and a
  # 60/20/20 train/valid/test split under ../../data/people_daily/.


  def _write_rows(path, rows):
      """Write each row of `rows` as one tab-separated line to `path`."""
      # utf-8 is explicit because load_dataset reads these files as utf8.
      with open(path, "w", encoding="utf-8") as out:
          for row in rows:
              out.write("%s\n" % ("\t".join([str(i) for i in row])))


  def _write_dict_pair(forward_path, reverse_path, mapping):
      """Write `mapping` as "key<TAB>id" and as "id<TAB>key" files."""
      with open(forward_path, "w") as fwd, open(reverse_path, "w") as rev:
          for key, idx in mapping.items():
              fwd.write("%s\t%s\n" % (key, idx))
              rev.write("%s\t%s\n" % (idx, key))


  datas = []
  labels = []
  linedata = []
  linelabel = []
  tags = {}
  tags[''] = 0  # id 0 is taken by the empty (padding) tag
  tag_id_tmp = 1
  words = {}
  words["unk_"] = 0  # id 0 is taken by the unknown-word entry
  word_id_tmp = 1
  with open(SOURCE_4_DATA, "r") as f:
      for line in f:
          linedata = []
          linelabel = []
          numNotO = 0
          for token in line.split():
              word = token.split('/')
              linedata.append(word[0])
              linelabel.append(word[1])
              if word[0] not in words:
                  words[word[0]] = word_id_tmp
                  word_id_tmp = word_id_tmp + 1
              if word[1] not in tags:
                  tags[word[1]] = tag_id_tmp
                  tag_id_tmp = tag_id_tmp + 1
              if word[1] != 'O':
                  numNotO += 1
          # Sentences made only of "O" labels are not kept as samples
          # (their characters still enter the vocabulary above).
          if numNotO != 0:
              datas.append(linedata)
              labels.append(linelabel)
  # The empty (padding) word gets the next free id.
  words[""] = word_id_tmp

  # word & id
  _write_dict_pair("../../data/people_daily/word2id_dict",
                   "../../data/people_daily/id2word_dict", words)
  # tag & id
  _write_dict_pair("../../data/people_daily/tag2id_dict",
                   "../../data/people_daily/id2tag_dict", tags)

  # 20% test, then 25% of the remainder (20% overall) for validation.
  x_train, x_test, y_train, y_test = train_test_split(
      datas, labels, test_size=0.2, random_state=43)
  x_train, x_valid, y_train, y_valid = train_test_split(
      x_train, y_train, test_size=0.25, random_state=43)

  _write_rows("../../data/people_daily/x_train", x_train)
  _write_rows("../../data/people_daily/x_test", x_test)
  _write_rows("../../data/people_daily/x_valid", x_valid)
  _write_rows("../../data/people_daily/y_train", y_train)
  _write_rows("../../data/people_daily/y_test", y_test)
  _write_rows("../../data/people_daily/y_valid", y_valid)


  • 避免使用pandas。pandas虽然有很多操作比较方便,但是个人认为在数据量较大的环境下,IO流操作比pandas更加省内存,在一些操作下甚至可以达到常数级别的空间复杂度(读一条操作一条输出一条)

  • 避免使用pkl。pkl以bytes形式存储数据,python3虽然解决了大部分中文问题,但在编码方面仍然有坑。




  def __init__(self, config):
      """Create the input placeholders and build the graph.

      Args:
          config: configuration object; `config.seq_length` is read here
              to size the placeholders.
      """
      self.config = config
      # The three inputs to be fed at run time. Token ids and tag ids are
      # both [batch, seq_length] int32 placeholders.
      self.input_x = tf.placeholder(
          tf.int32, [None, self.config.seq_length], name='input_x')
      self.input_y = tf.placeholder(
          tf.int32, [None, self.config.seq_length], name='input_y')
      # Passed to tf.nn.dropout as the keep probability.
      self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
      self.bilstm_crf()


  # Graph body: embedding -> BiLSTM -> dense -> CRF.
  # (The enclosing method header is not part of this listing.)
  #
  # Change from the earlier listing: the dense weights and the CRF sequence
  # lengths no longer depend on config.batch_size, so the graph accepts any
  # number of rows per feed. NOTE(review): W_dense / b_dense change shape,
  # so checkpoints saved with the old shapes will not load.
  hidden_units = 100  # size of each LSTM direction

  with tf.name_scope("embedding"):
      # embedding layer
      w2v_matrix = tf.get_variable(
          name="w2v_matrix",
          shape=[self.config.vocab_size, self.config.embedding_dim],
          dtype=tf.float32,
          initializer=tf.truncated_normal_initializer())
      embedding_inputs = tf.nn.embedding_lookup(w2v_matrix, self.input_x)
      embedding_inputs = tf.nn.dropout(embedding_inputs, self.keep_prob)

  with tf.name_scope("BiLSTM"):
      # BiLSTM layer
      lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(
          hidden_units, forget_bias=1.0, state_is_tuple=True)
      lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(
          hidden_units, forget_bias=1.0, state_is_tuple=True)
      (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(
          lstm_fw_cell,
          lstm_bw_cell,
          embedding_inputs,
          dtype=tf.float32,
          time_major=False,
          scope=None)
      # Forward and backward outputs are concatenated on the feature axis.
      bilstm_out = tf.concat([output_fw, output_bw], axis=2)
      self.bilstm_tmp = bilstm_out

  with tf.name_scope("dense"):
      # One projection shared by every example and every time step.
      W = tf.get_variable(
          name="W_dense",
          shape=[2 * hidden_units, self.config.num_classes],
          dtype=tf.float32,
          initializer=tf.truncated_normal_initializer())
      b = tf.get_variable(
          name="b_dense",
          shape=[self.config.num_classes],
          dtype=tf.float32,
          initializer=tf.zeros_initializer())
      # Flatten (batch, time) so a plain 2-D matmul can be used, then
      # restore the sequence layout. The last two dimensions are given as
      # Python ints so the static num_tags dimension stays known to the CRF.
      flat_out = tf.reshape(bilstm_out, [-1, 2 * hidden_units])
      dense_out = tf.tanh(tf.matmul(flat_out, W) + b)
      dense_out = tf.reshape(
          dense_out,
          [-1, self.config.seq_length, self.config.num_classes])

  with tf.name_scope("crf"):
      # CRF. Every sequence is treated as having the full padded length;
      # the batch dimension is taken from the fed input at run time.
      sequence_lengths = tf.fill(
          [tf.shape(self.input_x)[0]], self.config.seq_length)
      # NOTE(review): this attribute used to hold a numpy array; it is now
      # an int32 tensor.
      self.shape1 = sequence_lengths
      log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
          dense_out, self.input_y, sequence_lengths)
      self.viterbi_sequence, self.viterbi_score = tf.contrib.crf.crf_decode(
          dense_out, self.transition_params, sequence_lengths)



  def crf_log_likelihood(inputs,
                         tag_indices,
                         sequence_lengths,
                         transition_params=None):
      """Computes the log-likelihood of tag sequences in a CRF.

      Args:
        inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
            to use as input to the CRF layer.
        tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we
            compute the log-likelihood.
        sequence_lengths: A [batch_size] vector of true sequence lengths.
        transition_params: A [num_tags, num_tags] transition matrix, if available.
      Returns:
        log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of
          each example, given the sequence of tag indices.
        transition_params: A [num_tags, num_tags] transition matrix. This is either
            provided by the caller or created in this function.
      """
      # Get shape information. The last dimension must be statically known.
      num_tags = inputs.get_shape()[2].value

      # Get the transition matrix if not provided.
      if transition_params is None:
          transition_params = vs.get_variable("transitions", [num_tags, num_tags])

      sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths,
                                           transition_params)
      log_norm = crf_log_norm(inputs, sequence_lengths, transition_params)

      # Normalize the scores to get the log-likelihood per example.
      log_likelihood = sequence_scores - log_norm
      return log_likelihood, transition_params


  # Loss: mean negative log-likelihood over the batch, minimised with Adam.
  self.loss = tf.reduce_mean(-log_likelihood)
  optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
  self.train = optimizer.minimize(self.loss)



  # Smoke test: build a tiny model, print loss and Viterbi output before
  # and after 500 training steps on the same two sequences.
  input_x = [[0, 1, 2], [2, 3, 4]]
  input_y = [[1, 1, 0], [2, 2, 1]]

  model_config = modelConfig()
  model_config.batch_size = 2
  model_config.embedding_dim = 5
  model_config.num_classes = 3
  model_config.seq_length = 3
  model_config.vocab_size = 5
  model = BiLSTM_CRF(model_config)

  # The same feed is used for every run below; placeholders are addressed
  # by tensor name.
  feed = {
      "input_x:0": input_x,
      "input_y:0": input_y,
      "keep_prob:0": model_config.keep_prob,
  }

  with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())

      # Before training.
      crf_out = sess.run(
          [model.viterbi_sequence, model.viterbi_score], feed_dict=feed)
      loss_out = sess.run([model.loss], feed_dict=feed)
      print(loss_out)
      print(crf_out)

      for i in range(500):
          sess.run(model.train, feed_dict=feed)

      # After training.
      crf_out = sess.run(
          [model.viterbi_sequence, model.viterbi_score], feed_dict=feed)
      loss_out = sess.run([model.loss], feed_dict=feed)
      print(loss_out)
      print(crf_out)





  # Load the batched, padded training data.
  x_train = utils.load_dataset(x_train_PATH, pad_len=SEQ_LEN)
  y_train = utils.load_dataset(y_train_PATH, pad_len=SEQ_LEN)

  # Load the tag->id and word->id dictionaries.
  t2id_dict = utils.load_2id_dic(t2id_PATH)
  w2id_dict = utils.load_2id_dic(w2id_PATH)

  # Convert characters and tags to their integer ids.
  x_train = utils.item2id_batch(x_train, w2id_dict)
  y_train = utils.item2id_batch(y_train, t2id_dict)


  def load_dataset(path, batch_size=64, pad_len=30, drop_last=True):
      """Load a tab-separated file as a list of fixed-size batches.

      Each line is split on tabs, right-padded with "" up to `pad_len`
      items and truncated to `pad_len` items.

      Args:
          path: file to read (decoded as utf8).
          batch_size: number of lines per batch.
          pad_len: number of items every line is padded/truncated to.
          drop_last: when True (the default, and the previous behaviour) a
              final batch with fewer than `batch_size` lines is discarded;
              when False it is returned as a shorter last batch.

      Returns:
          A list of batches; each batch is a list of `pad_len`-item lists.
      """
      dataset = []
      data_batch = []
      with open(path, encoding="utf8") as f:
          for line in f:
              items = line.strip().split("\t")
              # A non-positive repeat count yields [], so long lines are
              # left alone here and cut by the slice below.
              items += [""] * (pad_len - len(items))
              data_batch.append(items[:pad_len])
              if len(data_batch) == batch_size:
                  dataset.append(data_batch)
                  data_batch = []
      if data_batch and not drop_last:
          dataset.append(data_batch)
      return dataset

  def load_2id_dic(path):
      """Load an "item<TAB>id" file into a dict mapping item -> int id.

      Only the line terminator is removed before splitting, so the line for
      the empty (padding) item, which starts with a tab, keeps its real id.
      Stripping the whole line used to erase that tab and force the id to 0.
      A line without any tab still falls back to mapping "" to 0.

      Args:
          path: dictionary file to read.

      Returns:
          dict of item string -> int id.
      """
      dic_get = {}
      with open(path) as f:
          for line in f:
              fields = line.rstrip("\r\n").split("\t")
              if len(fields) < 2:
                  dic_get[""] = 0
              else:
                  dic_get[fields[0].strip()] = int(fields[1])
      return dic_get

  def item2id_batch(items_batch, dic_get, unk_key="unk_"):
      """Convert batches of item sequences to batches of id sequences.

      Args:
          items_batch: list of batches; each batch is a list of sequences.
          dic_get: dict mapping item -> id.
          unk_key: dictionary key whose id stands in for unknown items.

      Returns:
          A nested list with the same batch/sequence structure holding ids.
          If `unk_key` is in `dic_get`, unknown items map to its id, so the
          sequence length is preserved. Otherwise unknown items are dropped,
          as before.
      """
      has_unk = unk_key in dic_get
      unk_id = dic_get[unk_key] if has_unk else None
      res = []
      for batch_ in items_batch:
          res_batch = []
          for item in batch_:
              if has_unk:
                  sentence = [dic_get.get(i, unk_id) for i in item]
              else:
                  sentence = [dic_get[i] for i in item if i in dic_get]
              res_batch.append(sentence)
          res.append(res_batch)
      return res


  # Model initialisation: sizes are taken from the loaded data and
  # dictionaries rather than from the modelConfig defaults.
  modelConf = modelConfig()
  modelConf.seq_length = len(x_train[-1][-1])  # sequence length
  modelConf.num_classes = len(t2id_dict)  # number of tag classes
  modelConf.batch_size = len(x_train[-1])  # rows per training batch
  modelConf.num_batches = len(x_train)  # number of batches
  modelConf.vocab_size = len(w2id_dict)  # vocabulary size
  modelConf.num_epochs = 10  # number of epochs
  model = BiLSTM_CRF(modelConf)


  class modelConfig(object):
      """Default hyper-parameters required by the model."""
      embedding_dim = 300  # word-embedding dimension
      seq_length = 20  # sequence length
      num_classes = 11  # number of classes
      # hidden_dim = 64  # dense-layer units
      keep_prob = 0.5  # dropout keep probability
      learning_rate = 1e-4  # learning rate
      batch_size = 64  # rows per training batch
      num_batches = 263  # total number of batches
      num_epochs = 20  # total number of epochs
      print_per_batch = 100  # print results every this many rounds


  # training: one pass over all training batches
  tmp_batch_id = 0
  while tmp_batch_id < len(x_train):
      sess.run(model.train, feed_dict={
          "input_x:0": x_train[tmp_batch_id],
          "input_y:0": y_train[tmp_batch_id],
          "keep_prob:0": modelConf.keep_prob})
      tmp_batch_id = tmp_batch_id + 1

  # The loss reported for the epoch is measured on the first training
  # batch. Dropout is switched off (keep_prob=1.0) so the printed value is
  # deterministic and not inflated by randomly dropped units.
  loss = sess.run(model.loss, feed_dict={
      "input_x:0": x_train[0],
      "input_y:0": y_train[0],
      "keep_prob:0": 1.0})


  # validating: decode every validation batch and flatten predictions and
  # gold labels into two parallel lists of tag ids.
  tmp_batch_id = 0
  y_pred = []
  y_valid_combine = []
  while tmp_batch_id < len(x_valid):
      # Dropout must be off at inference time, hence keep_prob=1.0.
      y_pred_batch = sess.run(model.viterbi_sequence, feed_dict={
          "input_x:0": x_valid[tmp_batch_id],
          "input_y:0": y_valid[tmp_batch_id],
          "keep_prob:0": 1.0})
      for idx in range(len(y_pred_batch)):
          # extend() avoids re-copying the whole list for every sentence.
          y_pred.extend(y_pred_batch[idx].tolist())
          y_valid_combine.extend(y_valid[tmp_batch_id][idx])
      tmp_batch_id = tmp_batch_id + 1

  # Gold labels go first: model_rep / model_conf take (y_true, y_pred).
  p, r, f1score = utils.model_rep(y_valid_combine, y_pred)
  print("epoch: %s, loss:%s, f1: %s" %
        (i, loss, f1score))
  utils.print_matrix(utils.model_conf(y_valid_combine, y_pred))
  print("-----------------------------")



  from sklearn.metrics import precision_score, recall_score, f1_score


  def model_rep(y_true, y_pred, average="micro", labels=None):
      """Return (precision, recall, f1) for flat label sequences.

      Args:
          y_true: gold labels.
          y_pred: predicted labels.
          average: averaging mode forwarded to scikit-learn.
          labels: optional list of labels to include. The default None keeps
              the previous behaviour (all labels). Passing only the entity
              labels keeps padding and "O" from dominating the scores.

      Returns:
          Tuple (precision, recall, f1).
      """
      p = precision_score(y_true, y_pred, labels=labels, average=average)
      r = recall_score(y_true, y_pred, labels=labels, average=average)
      f1score = f1_score(y_true, y_pred, labels=labels, average=average)
      return p, r, f1score


  from sklearn.metrics import confusion_matrix


  def model_conf(y_true, y_pred, labels=None):
      """Return the confusion matrix of `y_true` against `y_pred`.

      Args:
          y_true: gold labels.
          y_pred: predicted labels.
          labels: optional list of labels selecting and ordering the matrix
              rows/columns; None keeps the previous behaviour.
      """
      return confusion_matrix(y_true, y_pred, labels=labels)



  def print_matrix(mat):
      """Print `mat` row by row, then one empty line.

      Every cell is followed by a tab (including the last cell of a row).

      Args:
          mat: a sequence of rows, each row an iterable of cells.
      """
      for idx in range(len(mat)):
          print("".join("%s\t" % cell for cell in mat[idx]))
      print()



  • 在进行预测阶段,仍需要凑够batch_size个才能够进行预测,不能一个一个预测,主要原因在于条件随机场计算下输入矩阵有要求。

  • 虽然评价指标数据都非常好看,但是看了混淆矩阵就会发现并不理想。原因在于:受padding补长策略以及实际问题本身的影响,补充标签(padding)和无属性(O)的点过多,样本极度不平衡。



