[深度学习] 自然语言处理--- 基于Keras Bert使用(下)
原标题:[深度学习] 自然语言处理--- 基于Keras Bert使用(下)
原文来自:CSDN 原文链接:https://blog.csdn.net/zwqjoy/article/details/103733662
bert4keras 使用最新版本
本文所用 bert4keras 的代码版本日期:2019-12-23
https://github.com/bojone/bert4keras
import os

# The article targets TensorFlow 2.0.0.
# NOTE(review): TF_KERAS is presumably read by bert4keras at import time to
# select tf.keras, so it is set before anything else is imported -- confirm.
os.environ['TF_KERAS'] = '1'

import json

import numpy as np
import tensorflow as tf

print(tf.__version__)
import tensorflow as tf

print(tf.__version__)

# Allow ops to be placed on another device when the requested one is unavailable.
tf.config.set_soft_device_placement(True)

# Turn on memory growth for every physical GPU instead of using the default
# allocation behaviour.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
# Optional: restrict the run to the first GPU only.
# if gpus:
#     tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
print("Physical GPU Devices Num:", len(gpus))

# Report the logical device count from the logical list (the original printed
# len(gpus) here, i.e. the physical count a second time).
logical_gpus = tf.config.experimental.list_logical_devices('GPU')
print("Logical GPU Devices Num:", len(logical_gpus))

# One strategy object is enough; it is used later to scope model creation.
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: %d' % strategy.num_replicas_in_sync)
from bert4keras.backend import set_gelu
from bert4keras.bert import build_bert_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.tokenizer import Tokenizer, load_vocab

# Directory holding the pretrained model, and the three files read from it.
pretrained_path = 'chinese_L-12_H-768_A-12'
config_path, checkpoint_path, vocab_path = (
    os.path.join(pretrained_path, filename)
    for filename in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)
import codecs


def _read_lines(path):
    """Return every line of the UTF-8 text file at *path*, whitespace-stripped."""
    with codecs.open(path, 'r', 'utf-8') as reader:
        return [line.strip() for line in reader]


def get_data(pos_path='./dataset/pos.txt', neg_path='./dataset/neg.txt'):
    """Read the positive and negative sample files.

    Each line of a file is one sample. Lines are stripped of surrounding
    whitespace; blank lines are kept as empty strings.

    :param pos_path: path of the file with positive samples
    :param neg_path: path of the file with negative samples
    :return: tuple ``(pos, neg)`` of two lists of strings
    """
    return _read_lines(pos_path), _read_lines(neg_path)


# Load the corpus.
pos, neg = get_data()
data, tokens = [], {}

# Load the full vocabulary and build a temporary tokenizer on top of it; the
# temporary tokenizer is only used to count token occurrences in the corpus.
_token_dict = load_vocab(vocab_path)
_tokenizer = Tokenizer(_token_dict)

# Negative samples (label 0) are appended before positive ones (label 1).
for _label, _texts in enumerate((neg, pos)):
    for d in _texts:
        data.append((d, _label))
        for t in _tokenizer.tokenize(d):
            tokens[t] = tokens.get(t, 0) + 1

# Discard tokens that occur fewer than 4 times.
tokens = {tok: count for tok, count in tokens.items() if count >= 4}

# token_dict: the reduced vocabulary this task actually uses.
# keep_words: for each kept token, its entry in the original vocabulary.
token_dict, keep_words = {}, []

# The special tokens always come first, in this fixed order.
for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

# Then every frequent token that the original vocabulary knows about.
for t in tokens:
    if t not in _token_dict or t in token_dict:
        continue
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

# Final tokenizer, built on the reduced vocabulary.
tokenizer = Tokenizer(token_dict)

# Alternative without vocabulary trimming (disabled in the article):
#
#     data = []
#     for d in neg:
#         data.append((d, 0))
#     for d in pos:
#         data.append((d, 1))
#     # build the tokenizer
#     tokenizer = Tokenizer(vocab_path)
#     # build the dictionary
#     token_dict = tokenizer._token_dict
# The shuffled index order is cached on disk so that the train/validation
# split stays the same across runs.
if not os.path.exists('./random_order.json'):
    random_order = list(range(len(data)))
    np.random.shuffle(random_order)
    # Use a context manager so the file is flushed and closed deterministically.
    with open('./random_order.json', 'w') as f:
        json.dump(random_order, f, indent=4)
else:
    with open('./random_order.json') as f:
        random_order = json.load(f)

# Split 9:1 into training and validation sets: every 10th position of the
# shuffled order goes to validation, the rest to training.
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
def seq_padding(X, padding=0):
    """Right-pad every sequence in *X* to the length of the longest one.

    :param X: list of sequences (lists or 1-D arrays)
    :param padding: value appended to sequences shorter than the longest
    :return: numpy array with one row per input sequence; an empty array when
        *X* is empty (the original crashed in ``max()`` on an empty batch)
    """
    if len(X) == 0:
        return np.array([])
    max_len = max(len(x) for x in X)
    return np.array([
        np.concatenate([x, [padding] * (max_len - len(x))]) if len(x) < max_len else x
        for x in X
    ])


class data_generator:
    """Endless batch generator over ``(text, label)`` samples.

    Iteration relies on two module-level names defined elsewhere in the
    script: ``tokenizer`` (its ``encode`` method turns a text into two
    sequences) and ``maxlen`` (texts are cut to that many characters).
    """

    def __init__(self, data, batch_size=16):
        """Store the samples and compute the number of batches per epoch.

        :param data: sequence of ``(text, label)`` pairs
        :param batch_size: maximum number of samples per batch
        """
        self.data = data
        self.batch_size = batch_size
        # Ceiling division: a trailing partial batch counts as one step.
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        """Return the number of batches in one pass over the data."""
        return self.steps

    def __iter__(self):
        """Yield ``([X1, X2], Y)`` batches forever, reshuffling on every pass.

        With no samples nothing is yielded; the original looped forever
        without ever producing a batch in that case.
        """
        if len(self.data) == 0:
            return
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            last = len(idxs) - 1
            X1, X2, Y = [], [], []
            for n, i in enumerate(idxs):
                d = self.data[i]
                text = d[0][:maxlen]
                # NOTE(review): presumably token ids and segment ids -- confirm
                # against the bert4keras Tokenizer.encode documentation.
                x1, x2 = tokenizer.encode(text)
                X1.append(x1)
                X2.append(x2)
                Y.append([d[1]])
                # Emit a batch when it is full, or when the pass is over and a
                # partial batch remains.
                if len(X1) == self.batch_size or n == last:
                    yield [seq_padding(X1), seq_padding(X2)], seq_padding(Y)
                    X1, X2, Y = [], [], []
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
# This rebinds the name Adam, which the script imported earlier from
# bert4keras.optimizers; from here on Adam is the tf.keras optimizer.
from tensorflow.keras.optimizers import Adam

# Texts are truncated to this many characters before tokenization.
maxlen = 100
# Not referenced anywhere in the visible script.
droup_out_rate = 0.5
# Learning rate handed to the Adam optimizer.
learning_rate = 1e-5
# Reassigned to 5 right before training starts.
epochs = 1
# Everything that creates variables -- the pretrained encoder, the
# classification head, the Model and its compile step -- goes inside the
# strategy scope. The original built the encoder and the head outside the
# scope and only wrapped Model()/compile().
# NOTE(review): bert4keras' own multi-GPU example is believed to load the
# checkpoint weights only after compile(); verify that loading them at build
# time works under this strategy.
with strategy.scope():
    bert = build_bert_model(
        config_path,
        checkpoint_path,
        # Keep only the tokens listed in keep_words to shrink the original
        # vocabulary; this argument is optional.
        keep_words=keep_words,
        # NOTE(review): the checkpoint directory is named like a Google BERT
        # release (chinese_L-12_H-768_A-12); confirm that 'albert' is the
        # intended model type for these weights.
        model='albert',
        with_pool=True,
        return_keras_model=False,
    )

    # Binary classification head: dropout followed by a single sigmoid unit.
    output = Dropout(rate=0.1)(bert.model.output)
    output = Dense(
        units=1,
        activation='sigmoid',
        kernel_initializer=bert.initializer,
    )(output)

    model = Model(bert.model.input, output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )

model.summary()
# Batch generators for the training and validation splits.
train_D = data_generator(train_data, batch_size=16)
valid_D = data_generator(valid_data, batch_size=16)

# Overrides the value of `epochs` assigned earlier in the script.
epochs = 5

# The generators never terminate, so the number of batches per epoch is
# passed explicitly for both training and validation.
model.fit_generator(
    iter(train_D),
    steps_per_epoch=len(train_D),
    epochs=epochs,
    validation_data=iter(valid_D),
    validation_steps=len(valid_D),
)
参考:https://www.cnblogs.com/dogecheng/p/11824494.html#_label1
免责声明:本文来自互联网新闻客户端自媒体,不代表本网的观点和立场。
合作及投稿邮箱:E-mail:editor@tusaishared.com
热门资源
Python 爬虫(二)...
所谓爬虫就是模拟客户端发送网络请求,获取网络响...
TensorFlow从1到2...
原文第四篇中,我们介绍了官方的入门案例MNIST,功...
TensorFlow从1到2...
“回归”这个词,既是Regression算法的名称,也代表...
机器学习中的熵、...
熵 (entropy) 这一词最初来源于热力学。1948年,克...
TensorFlow2.0(10...
前面的博客中我们说过,在加载数据和预处理数据时...
智能在线
400-630-6780
聆听.建议反馈
E-mail: support@tusaishared.com