[Deep Learning] Natural Language Processing --- Using BERT with Keras (Part 2)
Source: CSDN. Original article: https://blog.csdn.net/zwqjoy/article/details/103733662
This article uses what was then the latest version of bert4keras (snapshot dated 2019-12-23):
https://github.com/bojone/bert4keras
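bert4keras's API changed quickly around this time (later releases renamed bert4keras.tokenizer to bert4keras.tokenizers and moved build_bert_model to build_transformer_model in bert4keras.models), so it helps to confirm up front that the installed snapshot matches the imports used below. A quick check, not part of the original script:

# Confirm the late-2019 bert4keras module layout used in this article;
# newer releases renamed these modules and will raise ImportError here
from bert4keras.tokenizer import Tokenizer
from bert4keras.bert import build_bert_model
print('bert4keras module layout OK')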
import os

# Using TF 2.0.0; this tells bert4keras to use tf.keras instead of standalone Keras
os.environ['TF_KERAS'] = '1'

import numpy as np
import json
import tensorflow as tf

print(tf.__version__)
tf.config.set_soft_device_placement(True)

# Enable memory growth so TF does not grab all GPU memory up front
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
# if gpus:
#     tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
print("Physical GPU Devices Num:", len(gpus))

logical_gpus = tf.config.experimental.list_logical_devices('GPU')
print("Logical GPU Devices Num:", len(logical_gpus))

# Data-parallel training across all visible GPUs
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: %d' % strategy.num_replicas_in_sync)
from bert4keras.tokenizer import Tokenizer
from bert4keras.bert import build_bert_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.backend import set_gelu
from bert4keras.tokenizer import load_vocab

# Paths to the pretrained model
pretrained_path = 'chinese_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')
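Note that bert_model.ckpt is a TensorFlow checkpoint prefix rather than a single file on disk, so a fail-fast existence check (an optional sketch, not part of the original script) should look for the .index file:

# Optional sanity check: bert_model.ckpt is a checkpoint *prefix*;
# the real files are bert_model.ckpt.index and bert_model.ckpt.data-*
for p in (config_path, vocab_path, checkpoint_path + '.index'):
    assert os.path.exists(p), 'missing pretrained file: %s' % p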
import codecs

def get_data():
    '''
    Read the corpus; pos.txt and neg.txt each hold one review per line.
    :return: two lists of sentences (positive, negative)
    '''
    pos = []
    neg = []
    with codecs.open('./dataset/pos.txt', 'r', 'utf-8') as reader:
        for line in reader:
            pos.append(line.strip())
    with codecs.open('./dataset/neg.txt', 'r', 'utf-8') as reader:
        for line in reader:
            neg.append(line.strip())
    return pos, neg
# Read the data
pos, neg = get_data()

data, tokens = [], {}

# Load the full BERT vocabulary
_token_dict = load_vocab(vocab_path)
# Build a temporary tokenizer over the full vocabulary
_tokenizer = Tokenizer(_token_dict)

# Count token frequencies over the whole corpus
for d in neg:
    data.append((d, 0))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1
for d in pos:
    data.append((d, 1))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1

# Keep only tokens that appear at least 4 times in the corpus
tokens = {i: j for i, j in tokens.items() if j >= 4}

# token_dict: the reduced vocabulary used by this task
# keep_words: indices of the kept tokens within the original BERT vocabulary
token_dict, keep_words = {}, []

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

for t in tokens:
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

# Build the final tokenizer over the reduced vocabulary
tokenizer = Tokenizer(token_dict)
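To see what the reduced-vocabulary tokenizer feeds the model, a quick check like the following helps (the sample sentence is made up; in this bert4keras snapshot, encode returns token ids and segment ids):

# Illustrative only: inspect the tokenizer output
token_ids, segment_ids = tokenizer.encode(u'这部电影很好看')
print(token_ids)    # [CLS] ... [SEP], indexed into the reduced vocabulary
print(segment_ids)  # all zeros for a single-sentence input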
'''
# Alternative: skip the vocabulary reduction and use the full BERT vocabulary
data = []
for d in neg:
    data.append((d, 0))
for d in pos:
    data.append((d, 1))
# Build the tokenizer
tokenizer = Tokenizer(vocab_path)
# Vocabulary dictionary
token_dict = tokenizer._token_dict
'''
# Fix the shuffle order on first run so the train/valid split is reproducible
if not os.path.exists('./random_order.json'):
    random_order = list(range(len(data)))
    np.random.shuffle(random_order)
    json.dump(
        random_order,
        open('./random_order.json', 'w'),
        indent=4
    )
else:
    random_order = json.load(open('./random_order.json'))

# Split train/validation 9:1
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
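A quick sanity check on the split (an optional addition; the counts depend on your corpus):

# Every example lands in exactly one of the two splits
print('train: %d, valid: %d' % (len(train_data), len(valid_data)))
assert len(train_data) + len(valid_data) == len(data)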
def seq_padding(X, padding=0):
    # Pad every sequence in X to the length of the longest one
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])

class data_generator:
    def __init__(self, data, batch_size=16):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            # Shuffle the data order each epoch
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                x1, x2 = tokenizer.encode(text)
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Hyperparameters
maxlen = 100
dropout_rate = 0.5
learning_rate = 1e-5
epochs = 1
bert = build_bert_model(
    config_path,
    checkpoint_path,
    # Keep only the tokens in keep_words, shrinking the original vocabulary
    # (optional; drop this argument to keep the full vocabulary)
    keep_words=keep_words,
    model='bert',  # chinese_L-12_H-768_A-12 is a BERT (not ALBERT) checkpoint
    with_pool=True,
    return_keras_model=False,
)
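The keep_words argument re-indexes the embedding matrix down from Chinese BERT's full 21,128-token vocabulary to just the characters kept above, which noticeably shrinks the model. An optional consistency check:

# The reduced vocabulary and the kept-row index list must line up
print(len(token_dict), len(keep_words))  # these two should be equal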
output = Dropout(rate=0.1)(bert.model.output)
output = Dense(
    units=1,
    activation='sigmoid',
    kernel_initializer=bert.initializer
)(output)

with strategy.scope():
    model = Model(bert.model.input, output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )

model.summary()
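One caveat: MirroredStrategy mirrors only variables created inside strategy.scope(), and here the BERT weights are built before the scope is entered. On a multi-GPU machine you may need to move the build_bert_model call (and the two head layers) inside the with block for distributed training to work.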
train_D = data_generator(train_data, batch_size=16)
valid_D = data_generator(valid_data, batch_size=16)
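Before training it is worth pulling one batch to confirm the shapes (a sketch; the padded length L varies from batch to batch):

# Peek at one batch: [token_ids, segment_ids], labels
[x1, x2], y = next(iter(train_D))
print(x1.shape, x2.shape, y.shape)  # e.g. (16, L), (16, L), (16, 1)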
epochs = 5

model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=epochs,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D)
)
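After training, scoring a new sentence follows the same encode path as the generator; a minimal inference sketch (the sample text is made up):

# Minimal inference sketch; the sample sentence is illustrative only
text = u'这部电影太精彩了'
x1, x2 = tokenizer.encode(text[:maxlen])
prob = model.predict([np.array([x1]), np.array([x2])])[0][0]
print('positive' if prob > 0.5 else 'negative', prob)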


Reference: https://www.cnblogs.com/dogecheng/p/11824494.html#_label1