Task Description
This tutorial demonstrates how to perform text classification on the IMDB dataset with an RNN. The IMDB dataset consists of movie reviews labeled as positive or negative, with 25,000 reviews for training and another 25,000 for testing. The official home of the dataset is: IMDB Dataset
1. Environment Setup
This example is based on version 2.0 of the PaddlePaddle open-source framework.
import paddle
import numpy as np
import matplotlib.pyplot as plt
import paddle.nn as nn
print(paddle.__version__) # print the installed version
# Choose the CPU/GPU device by passing it to paddle.set_device().
device = paddle.set_device('gpu')
2.0.1
2. Data Preparation
Since IMDB is a common dataset in NLP, PaddlePaddle ships it built in as paddle.text.datasets.Imdb. The mode parameter selects the training or test split.
print('loading dataset...')
train_dataset = paddle.text.datasets.Imdb(mode='train')
test_dataset = paddle.text.datasets.Imdb(mode='test')
print('loading finished')
With the training and test sets constructed, the dataset's vocabulary can be obtained through word_idx.
word_dict = train_dataset.word_idx # get the dataset's vocabulary
# add a pad token to the dict for padding the sequences later
word_dict['<pad>'] = len(word_dict)

for k in list(word_dict)[:5]:
    print("{}:{}".format(k.decode('ASCII'), word_dict[k]))
print("...")
for k in list(word_dict)[-5:]:
    print("{}:{}".format(k if isinstance(k, str) else k.decode('ASCII'), word_dict[k]))
print("totally {} words".format(len(word_dict)))
2.1 Parameter Settings
Here we set the vocabulary size, embedding size, batch size, and the other hyperparameters.
vocab_size = len(word_dict) + 1
print(vocab_size)
emb_size = 256
seq_len = 200
batch_size = 32
epochs = 2
pad_id = word_dict['<pad>']
classes = ['negative', 'positive']
# convert a list of token ids back into a readable sentence
def ids_to_str(ids):
    words = []
    for k in ids:
        w = list(word_dict)[k]
        words.append(w if isinstance(w, str) else w.decode('ASCII'))
    return " ".join(words)
2.2 Aligning Sequences with Padding
In text data, every sentence has a different length. To make the downstream computation in the neural network uniform, sequences are usually aligned by padding (and, here, truncating) them to a fixed length.
# pad or truncate every review to exactly seq_len tokens
def create_padded_dataset(dataset):
    padded_sents = []
    labels = []
    for batch_id, data in enumerate(dataset):
        sent, label = data[0], data[1]
        # truncate to seq_len, then pad with pad_id up to seq_len
        padded_sent = np.concatenate([sent[:seq_len], [pad_id] * (seq_len - len(sent))]).astype('int32')
        padded_sents.append(padded_sent)
        labels.append(label)
    return np.array(padded_sents), np.array(labels)
# build padded versions of the train and test sets
train_sents, train_labels = create_padded_dataset(train_dataset)
test_sents, test_labels = create_padded_dataset(test_dataset)
# check the array shapes and a few example sentences
print(train_sents.shape)
print(train_labels.shape)
print(test_sents.shape)
print(test_labels.shape)
for sent in train_sents[:3]:
    print(ids_to_str(sent))
2.3 Loading with Dataset and DataLoader
Wrap the prepared training and test sets with Dataset and DataLoader to complete the data-loading pipeline.
class IMDBDataset(paddle.io.Dataset):
    '''
    Wrap the data by subclassing paddle.io.Dataset
    '''
    def __init__(self, sents, labels):
        self.sents = sents
        self.labels = labels

    def __getitem__(self, index):
        data = self.sents[index]
        label = self.labels[index]
        return data, label

    def __len__(self):
        return len(self.sents)

train_dataset = IMDBDataset(train_sents, train_labels)
test_dataset = IMDBDataset(test_sents, test_labels)

train_loader = paddle.io.DataLoader(train_dataset, return_list=True,
                                    shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = paddle.io.DataLoader(test_dataset, return_list=True,
                                   shuffle=True, batch_size=batch_size, drop_last=True)
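As a small optional sanity check (an addition to the original flow, assuming each batch unpacks into a sentence tensor and a label tensor), one batch can be drawn from the loader to confirm that the shapes look right:

# Optional sanity check: fetch a single batch and inspect its shapes.
# sents should come out as [batch_size, seq_len], i.e. [32, 200] here.
for sents, labels in train_loader:
    print('sents:', sents.shape, 'labels:', labels.shape)
    break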
3. Model Configuration
This example uses a sequential RNN network: after looking up the embedding of each word, the sequence is fed through a two-layer SimpleRNN, and the final hidden states of the two layers are concatenated to form the sentence representation. A Linear layer then performs the classification, with Dropout used to prevent overfitting.
class MyRNN(paddle.nn.Layer):
    def __init__(self):
        super(MyRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 256)
        self.rnn = nn.SimpleRNN(256, 256, num_layers=2, direction='forward', dropout=0.5)
        # the final hidden states of the two RNN layers are concatenated, hence 256 * 2
        self.linear = nn.Linear(in_features=256 * 2, out_features=2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        emb = self.dropout(self.embedding(inputs))
        # hidden has shape [num_layers, batch_size, hidden_size]
        output, hidden = self.rnn(emb)
        # concatenate the last hidden state of each of the two layers
        hidden = paddle.concat((hidden[-2, :, :], hidden[-1, :, :]), axis=1)
        hidden = self.dropout(hidden)
        return self.linear(hidden)
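A minimal shape check (illustrative, assuming the definitions above) confirms that the model emits one pair of logits per example:

# Illustrative shape check with a random batch of token ids.
dummy = paddle.randint(low=0, high=vocab_size, shape=[batch_size, seq_len], dtype='int64')
logits = MyRNN()(dummy)
print(logits.shape)  # expected: [32, 2]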
4. Model Training
# plotting helper for the training curves
def draw_process(title, color, iters, data, label):
    plt.title(title, fontsize=24)
    plt.xlabel("iter", fontsize=20)
    plt.ylabel(label, fontsize=20)
    plt.plot(iters, data, color=color, label=label)
    plt.legend()
    plt.grid()
    plt.show()
# training loop
def train(model):
    model.train()
    opt = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters())
    steps = 0
    Iters, total_loss, total_acc = [], [], []

    for epoch in range(epochs):
        for batch_id, data in enumerate(train_loader):
            steps += 1
            sent = data[0]
            label = data[1]

            logits = model(sent)
            loss = paddle.nn.functional.cross_entropy(logits, label)
            acc = paddle.metric.accuracy(logits, label)

            if batch_id % 500 == 0: # log every 500 batches
                Iters.append(steps)
                total_loss.append(loss.numpy()[0])
                total_acc.append(acc.numpy()[0])
                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, loss.numpy()))

            loss.backward()
            opt.step()
            opt.clear_grad()

        # evaluate the model after each epoch
        model.eval()
        accuracies = []
        losses = []
        for batch_id, data in enumerate(test_loader):
            sent = data[0]
            label = data[1]

            logits = model(sent)
            loss = paddle.nn.functional.cross_entropy(logits, label)
            acc = paddle.metric.accuracy(logits, label)

            accuracies.append(acc.numpy())
            losses.append(loss.numpy())

        avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)
        print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))
        model.train()

        # save the model after each epoch
        paddle.save(model.state_dict(), str(epoch) + "_model_final.pdparams")

    # plot the training curves
    draw_process("training loss", "red", Iters, total_loss, "training loss")
    draw_process("training acc", "green", Iters, total_acc, "training acc")

model = MyRNN()
train(model)
5. Model Evaluation
model_state_dict = paddle.load('1_model_final.pdparams') # load the saved parameters
model = MyRNN()
model.set_state_dict(model_state_dict)
model.eval()
accuracies = []
losses = []

for batch_id, data in enumerate(test_loader):
    sent = data[0]
    label = data[1]

    logits = model(sent)
    loss = paddle.nn.functional.cross_entropy(logits, label)
    acc = paddle.metric.accuracy(logits, label)

    accuracies.append(acc.numpy())
    losses.append(loss.numpy())

avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)
print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))
6. Model Prediction
def ids_to_str(ids):
    words = []
    for k in ids:
        w = list(word_dict)[k]
        words.append(w if isinstance(w, str) else w.decode('UTF-8'))
    return " ".join(words)
label_map = {0: "negative", 1: "positive"}

# load the trained model
model_state_dict = paddle.load('1_model_final.pdparams')
model = MyRNN()
model.set_state_dict(model_state_dict)
model.eval()

for batch_id, data in enumerate(test_loader):
    sent = data[0]
    results = model(sent)

    predictions = []
    for probs in results:
        # map the predicted class index to its text label
        idx = np.argmax(probs.numpy())
        labels = label_map[idx]
        predictions.append(labels)

    for i, pre in enumerate(predictions):
        print(' Review: {} \n Sentiment: {}'.format(ids_to_str(sent[i].numpy()), pre))
        break
    break
This concludes the walkthrough of IMDB movie-review sentiment analysis with an RNN: we preprocessed the text data, built and trained an RNN network, and evaluated it, ending up with a classifier for review sentiment. In practice, the network structure and hyperparameters can be adjusted as needed to improve performance; one such adjustment is sketched below.
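As one illustration of such an adjustment (an addition to this write-up, not part of the original tutorial), the SimpleRNN could be swapped for an LSTM, which typically copes better with long-range dependencies in reviews; the rest of the pipeline is unchanged:

# Illustrative variant: a drop-in replacement for MyRNN that uses an LSTM.
class MyLSTM(paddle.nn.Layer):
    def __init__(self):
        super(MyLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 256)
        self.lstm = nn.LSTM(256, 256, num_layers=2, direction='forward', dropout=0.5)
        self.linear = nn.Linear(in_features=256 * 2, out_features=2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        emb = self.dropout(self.embedding(inputs))
        # an LSTM returns (output, (h, c)); h has shape [num_layers, batch_size, hidden_size]
        output, (hidden, cell) = self.lstm(emb)
        hidden = paddle.concat((hidden[-2, :, :], hidden[-1, :, :]), axis=1)
        hidden = self.dropout(hidden)
        return self.linear(hidden)

Training then proceeds exactly as before, with model = MyLSTM() and train(model).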