Below is an example of a PyTorch-based text classification model that assigns a given text to one of several predefined categories:
import torch
import torch.nn as nn
import torch.nn.functional as F
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 num_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                           bidirectional=bidirectional, dropout=dropout)
        # A bidirectional LSTM yields two final hidden states, so the
        # classifier input is twice the hidden size
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        # text: (seq_len, batch) of token ids; text_lengths: (batch,)
        embedded = self.dropout(self.embedding(text))
        # Pack the padded batch so the LSTM skips padding positions
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), enforce_sorted=False)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        # Use the last layer's final hidden state(s) as the sentence representation
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)  # (batch, output_dim) raw logits
The model takes the input text as a sequence of integer token ids, passes it through an embedding layer and a multi-layer (optionally bidirectional) LSTM, and outputs one raw score (logit) per class; softmax turns these into probabilities at prediction time.
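As a quick sanity check, you can run a batch of random token ids through the model and confirm the output shape (a minimal sketch; the hyperparameters here are illustrative, not tuned):
toy_model = TextClassifier(vocab_size=100, embedding_dim=32, hidden_dim=64,
                           output_dim=4, num_layers=2, bidirectional=True, dropout=0.5)
text = torch.randint(0, 100, (12, 3))  # (seq_len=12, batch=3) of token ids
lengths = torch.tensor([12, 9, 5])     # true lengths before padding
print(toy_model(text, lengths).shape)  # torch.Size([3, 4])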
Before training, the text must be converted into integer token ids, which is usually done with a tokenizer. You also need to define an optimizer and a loss function.
Below is an example of a complete training script:
import torch.optim as optim
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
# Load the training split and create the tokenizer
train_iter = AG_NEWS(split='train')
tokenizer = get_tokenizer('basic_english')
# Build the vocabulary from the training tokens; '<unk>' and '<eos>' are
# registered as specials because both are looked up below
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=['<unk>', '<eos>'])
vocab.set_default_index(vocab['<unk>'])

# Text processing: convert a raw string into a tensor of token ids
def text_transform(tokenizer, vocab, data):
    """Convert a raw text string into a 1-D tensor of token ids."""
    data = [vocab[token] for token in tokenizer(data)]
    return torch.tensor(data)
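For instance (a quick illustration; the exact ids depend on the vocabulary built above):
ids = text_transform(tokenizer, vocab, "Wall St. rallies on upbeat earnings")
print(ids)  # a 1-D LongTensor of token ids; exact values vary per vocabulary build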
# Batch generator: pad each batch to a common length and record the true
# lengths, matching the (text, text_lengths) interface of TextClassifier.forward
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for (_label, _text) in batch:
        label_list.append(_label - 1)  # AG_NEWS labels are 1..4; shift to 0..3
        processed_text = torch.cat([text_transform(tokenizer, vocab, _text),
                                    torch.tensor([vocab['<eos>']])])
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    labels = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    # pad_sequence returns (max_seq_len, batch); padded positions are skipped
    # by the packing inside the model
    padded = nn.utils.rnn.pad_sequence(text_list)
    return labels, padded, lengths
# Build the datasets and data loaders (materialize the iterators into lists)
train_iter, test_iter = AG_NEWS()
train_dataset = list(train_iter)
test_dataset = list(test_iter)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)
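Inspecting one batch confirms the collate function produces what the model expects (shapes vary with the sampled batch):
labels, text, lengths = next(iter(train_loader))
print(labels.shape, text.shape, lengths.shape)  # e.g. (64,), (max_len, 64), (64,)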
# Create the model and optimizer (4 output classes for AG_NEWS)
model = TextClassifier(len(vocab), 64, 128, 4, 2, True, 0.5)
optimizer = optim.Adam(model.parameters())
# Define the loss function and the training loop
criterion = nn.CrossEntropyLoss()
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    model.train()
    for (label, text, lengths) in iterator:
        optimizer.zero_grad()
        predictions = model(text, lengths)
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)
# Train the model
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')
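The test_loader built above is not used during training; a minimal evaluation sketch in the same style (the evaluate helper here is illustrative) reports average loss and accuracy on the held-out split:
def evaluate(model, iterator, criterion):
    epoch_loss, correct, total = 0, 0, 0
    model.eval()
    with torch.no_grad():
        for (label, text, lengths) in iterator:
            predictions = model(text, lengths)
            epoch_loss += criterion(predictions, label).item()
            correct += (predictions.argmax(dim=1) == label).sum().item()
            total += label.size(0)
    return epoch_loss / len(iterator), correct / total

test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.3f}')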
After training, the model can classify new text: convert the text into a sequence of integer token ids, then run the model on it:
# Classify a new text
def predict(model, sentence):
    model.eval()
    token_ids = torch.tensor([vocab[token] for token in tokenizer(sentence)])
    length = torch.tensor([len(token_ids)])
    with torch.no_grad():
        # unsqueeze adds the batch dimension: (seq_len,) -> (seq_len, 1)
        prediction = model(token_ids.unsqueeze(1), length)
    return F.softmax(prediction, dim=1).numpy()[0]
# Run a prediction
test_sentence = "World markets are reacting to the news that the UK is set to leave the European Union."
pred_probs = predict(model, test_sentence)
print(pred_probs)
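To turn the probability vector into a readable label, map its argmax back to the class names (the list below follows AG_NEWS's documented label order):
ag_news_labels = ['World', 'Sports', 'Business', 'Sci/Tech']  # classes 0..3
print(ag_news_labels[pred_probs.argmax()])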
The code above uses the AG_NEWS dataset as example training data; it can be loaded as follows:
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(split='train')
test_iter = AG_NEWS(split='test')
The dataset contains news articles in four classes, with 30,000 training examples and 1,900 test examples per class (120,000 and 7,600 in total). The complete training script and dataset are available in the official PyTorch/torchtext documentation.