Competition link: iFLYTEK Open Platform
Source: DataWhale AI Summer Camp 3 (NLP)
roberta-base (an improved variant of BERT)
① RoBERTa drops the next-sentence prediction (NSP) objective during pre-training;
② it uses dynamic masking; ③ it uses a text encoding that mixes character-level and word-level representations.
Paper: https://arxiv.org/pdf/1907.11692.pdf
Improvements in the DataWhale top-line solution:
Feature 1: mean pooling over all token representations, MeanPooling (768-d) -> fully connected layer fc (128-d)
Feature 2: last hidden-layer representation, Last_hidden (768-d) -> fully connected layer fc (128-d)
Runtime environment: Alibaba Cloud machine learning platform PAI, interactive modeling (DSW)
Image: pytorch:1.12-gpu-py39-cu113-ubuntu20.04
Upload the code, then unzip it:
unzip [filename]
Run the Python script (if you hit a network error, simply rerun it):
python [python_filename]
① Data processing module
Import the required modules:
from transformers import AutoTokenizer  # tokenizer for the input text
import pandas as pd
import numpy as np
from tqdm import tqdm  # progress bars
import torch
from torch.nn.utils.rnn import pad_sequence
# pad_sequence pads the sequences so that every sample in a batch has the same length
MAX_LENGTH = 128  # maximum sequence length
Building the training set:
def get_train(model_name, model_dict):
    model_index = model_dict[model_name]  # index of the current model
    train = pd.read_csv('./dataset/train.csv')  # read the training data into a DataFrame
    train['content'] = train['title'] + train['author'] + train['abstract']  # concatenate title, author and abstract as the training text
    tokenizer = AutoTokenizer.from_pretrained(model_name, max_length=MAX_LENGTH, cache_dir=f'./premodels/{model_name}_saved')  # instantiate the tokenizer
    # Tokenize the training data and collect input IDs, attention masks and token type IDs (the latter are optional)
    input_ids_list, attention_mask_list, token_type_ids_list = [], [], []
    y_train = []  # labels of the training samples
    for i in tqdm(range(len(train['content']))):  # iterate over the training samples
        sample = train['content'][i]  # text of the current sample
        tokenized = tokenizer(sample, truncation='longest_first', max_length=MAX_LENGTH)  # tokenize with longest-first truncation to MAX_LENGTH
        input_ids, attention_mask = tokenized['input_ids'], tokenized['attention_mask']  # input IDs and attention mask
        input_ids, attention_mask = torch.tensor(input_ids), torch.tensor(attention_mask)  # convert to PyTorch tensors
        try:
            token_type_ids = tokenized['token_type_ids']  # token type IDs
            token_type_ids = torch.tensor(token_type_ids)  # convert to a PyTorch tensor
        except KeyError:
            token_type_ids = torch.zeros_like(input_ids)  # models such as RoBERTa return no token_type_ids; fall back to zeros (unused downstream)
        input_ids_list.append(input_ids)            # collect input IDs
        attention_mask_list.append(attention_mask)  # collect attention masks
        token_type_ids_list.append(token_type_ids)  # collect token type IDs
        y_train.append(train['label'][i])           # collect the label
    # Pad input IDs, attention masks and token type IDs so all sequences share the same length, then build tensors
    input_ids_tensor = pad_sequence(input_ids_list, batch_first=True, padding_value=0)
    attention_mask_tensor = pad_sequence(attention_mask_list, batch_first=True, padding_value=0)
    token_type_ids_tensor = pad_sequence(token_type_ids_list, batch_first=True, padding_value=0)
    x_train = torch.stack([input_ids_tensor, attention_mask_tensor, token_type_ids_tensor], dim=1)  # stack the three tensors into one input tensor
    x_train = x_train.numpy()  # convert to a NumPy array
    np.save(f'./models_input_files/x_train{model_index}.npy', x_train)  # save the training inputs
    y_train = np.array(y_train)  # convert to a NumPy array
    np.save(f'./models_input_files/y_train{model_index}.npy', y_train)  # save the training labels
Building the test set:
def get_test(model_name, model_dict):
    model_index = model_dict[model_name]  # index of the current model
    test = pd.read_csv('./dataset/testB.csv')  # read the test data into a DataFrame
    test['content'] = test['title'] + ' ' + test['author'] + ' ' + test['abstract']  # concatenate title, author and abstract as the test text
    tokenizer = AutoTokenizer.from_pretrained(model_name, max_length=MAX_LENGTH, cache_dir=f'./premodels/{model_name}_saved')  # instantiate the tokenizer
    # Tokenize the test data and collect input IDs, attention masks and token type IDs (the latter are optional)
    input_ids_list, attention_mask_list, token_type_ids_list = [], [], []
    for i in tqdm(range(len(test['content']))):  # iterate over the test samples
        sample = test['content'][i]  # text of the current sample
        tokenized = tokenizer(sample, truncation='longest_first', max_length=MAX_LENGTH)  # tokenize with longest-first truncation to MAX_LENGTH
        input_ids, attention_mask = tokenized['input_ids'], tokenized['attention_mask']  # input IDs and attention mask
        input_ids, attention_mask = torch.tensor(input_ids), torch.tensor(attention_mask)  # convert to PyTorch tensors
        try:
            token_type_ids = tokenized['token_type_ids']  # token type IDs
            token_type_ids = torch.tensor(token_type_ids)  # convert to a PyTorch tensor
        except KeyError:
            token_type_ids = torch.zeros_like(input_ids)  # models such as RoBERTa return no token_type_ids; fall back to zeros (unused downstream)
        input_ids_list.append(input_ids)            # collect input IDs
        attention_mask_list.append(attention_mask)  # collect attention masks
        token_type_ids_list.append(token_type_ids)  # collect token type IDs
    # Pad input IDs, attention masks and token type IDs so all sequences share the same length, then build tensors
    input_ids_tensor = pad_sequence(input_ids_list, batch_first=True, padding_value=0)
    attention_mask_tensor = pad_sequence(attention_mask_list, batch_first=True, padding_value=0)
    token_type_ids_tensor = pad_sequence(token_type_ids_list, batch_first=True, padding_value=0)
    x_test = torch.stack([input_ids_tensor, attention_mask_tensor, token_type_ids_tensor], dim=1)  # stack the three tensors into one input tensor
    x_test = x_test.numpy()  # convert to a NumPy array
    np.save(f'./models_input_files/x_test{model_index}.npy', x_test)  # save the test inputs
Splitting into training and validation sets:
def split_train(model_name, model_dict):
    # train : validation = 9 : 1
    split_rate = 0.90
    # Shuffle and split the features
    model_index = model_dict[model_name]  # index of the current model
    train = np.load(f'./models_input_files/x_train{model_index}.npy')  # load the training inputs
    state = np.random.get_state()  # remember the RNG state so the same shuffle can be replayed on the labels
    # (alternatively, a classic fixed random seed such as 42 could be set)
    np.random.shuffle(train)  # shuffle the training inputs
    val = train[int(train.shape[0] * split_rate):]    # validation split
    train = train[:int(train.shape[0] * split_rate)]  # training split
    np.save(f'./models_input_files/x_train{model_index}.npy', train)  # save the training inputs
    np.save(f'./models_input_files/x_val{model_index}.npy', val)      # save the validation inputs
    # Shuffle and split the labels with the same permutation
    train = np.load(f'./models_input_files/y_train{model_index}.npy')  # load the labels
    np.random.set_state(state)  # restore the RNG state so the labels are shuffled identically to the inputs
    np.random.shuffle(train)    # shuffle the labels
    val = train[int(train.shape[0] * split_rate):]    # validation labels
    train = train[:int(train.shape[0] * split_rate)]  # training labels
    np.save(f'./models_input_files/y_train{model_index}.npy', train)  # save the training labels
    np.save(f'./models_input_files/y_val{model_index}.npy', val)      # save the validation labels
    print('split done.')
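As an alternative to the get_state/set_state trick above, here is a minimal sketch that splits features and labels together with scikit-learn (an illustration only; it assumes scikit-learn is installed and is not part of the original pipeline). stratify=y additionally keeps the label distribution identical in both splits, which a plain random shuffle does not guarantee:
# Sketch: split x and y together with a fixed random_state instead of shuffling them separately.
from sklearn.model_selection import train_test_split
import numpy as np

x = np.load('./models_input_files/x_train2.npy')  # model_index 2 (roberta-base), as in this walkthrough
y = np.load('./models_input_files/y_train2.npy')
x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=0.1, random_state=42, stratify=y)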
Main function for data processing:
if __name__ == '__main__':
    model_dict = {'xlm-roberta-base': 1,
                  'roberta-base': 2,
                  'bert-base-uncased': 3,
                  'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext': 4,
                  'dmis-lab/biobert-base-cased-v1.2': 5,
                  'marieke93/MiniLM-evidence-types': 6,
                  'microsoft/MiniLM-L12-H384-uncased': 7,
                  'cambridgeltl/SapBERT-from-PubMedBERT-fulltext': 8,
                  'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract': 9,
                  'microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract': 10}
    model_name = 'roberta-base'
    get_train(model_name, model_dict)    # build the training set
    get_test(model_name, model_dict)     # build the test set
    split_train(model_name, model_dict)  # split into training and validation sets
② Model training
Import the required modules:
import numpy as np
import torch
import torch.nn as nn
from sklearn import metrics
import os
import time
from transformers import AutoModel, AutoConfig
# AutoModel and AutoConfig are used to load the pretrained model and its configuration
from tqdm import tqdm  # progress bars
Hyperparameter class (all tunable hyperparameters):
class opt:
    seed = 42            # random seed
    batch_size = 16      # batch size
    set_epoch = 5        # number of training epochs
    early_stop = 5       # early-stopping patience (epochs)
    learning_rate = 1e-5 # learning rate
    weight_decay = 2e-6  # weight decay (L2 regularisation)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU if available, otherwise the CPU
    gpu_num = 1          # number of GPUs
    use_BCE = False      # whether to use the BCE loss
    models = ['xlm-roberta-base', 'roberta-base', 'bert-base-uncased',
              'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext', 'dmis-lab/biobert-base-cased-v1.2', 'marieke93/MiniLM-evidence-types',
              'microsoft/MiniLM-L12-H384-uncased', 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
              'microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract']  # list of candidate model names
    model_index = 2      # index of the model to use (see the list above)
    model_name = models[model_index-1]  # name of the chosen model
    continue_train = False  # whether to resume training from a checkpoint
    show_val = False        # whether to evaluate on the validation set during training
Define the model class:
# Model definition
class MODEL(nn.Module):
    def __init__(self, model_index):
        super(MODEL, self).__init__()
        # On first use the weights are downloaded into ./premodels/ next to the code, to avoid filling the home directory
        self.model = AutoModel.from_pretrained(opt.models[model_index-1], cache_dir='./premodels/'+opt.models[model_index-1]+'_saved', from_tf=False)  # load the pretrained language model
        # Load the model config so the hidden size of the last layer can be read instead of hard-coding it
        config = AutoConfig.from_pretrained(opt.models[model_index-1], cache_dir='./premodels/'+opt.models[model_index-1]+'_saved')
        last_dim = config.hidden_size  # dimension of the last hidden layer
        if opt.use_BCE:
            out_size = 1  # with the BCE loss the output size is 1
        else:
            out_size = 2  # with cross-entropy (CE) the output size is 2
        feature_size = 128                                   # dimension of the fused feature
        self.fc1 = nn.Linear(last_dim, feature_size)         # fully connected layer 1
        self.fc2 = nn.Linear(last_dim, feature_size)         # fully connected layer 2
        self.classifier = nn.Linear(feature_size, out_size)  # classifier
        self.dropout = nn.Dropout(0.3)                       # dropout layer

    def forward(self, x):  # forward pass
        input_ids, attention_mask, token_type_ids = x[:, 0], x[:, 1], x[:, 2]  # unpack the stacked inputs
        x = self.model(input_ids, attention_mask)  # run the pretrained backbone
        all_token = x[0]      # representations of all tokens in the sequence
        pooled_output = x[1]  # [CLS] representation passed through a dense layer and Tanh
        feature1 = all_token.mean(dim=1)  # mean over all token representations
        feature1 = self.fc1(feature1)     # project to feature1
        feature2 = pooled_output          # pooled [CLS] representation
        feature2 = self.fc2(feature2)     # project to feature2
        feature = 0.5 * feature1 + 0.5 * feature2  # weighted fusion of the two features
        feature = self.dropout(feature)   # dropout
        x = self.classifier(feature)      # classification
        return x
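One detail worth noting: all_token.mean(dim=1) in the forward pass averages over padding positions as well. A mask-aware mean pooling (shown here only as a sketch; the original top-line model uses the plain mean) would weight real tokens only:
# Sketch: attention-mask-aware mean pooling over the token representations.
def masked_mean_pooling(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).float()     # (batch, seq_len, 1)
    summed = (last_hidden_state * mask).sum(dim=1)  # sum over real tokens only
    counts = mask.sum(dim=1).clamp(min=1e-9)        # number of real tokens per sample
    return summed / counts                          # (batch, hidden_size)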
Data loading:
def load_data():
    # Paths of the prepared datasets
    train_data_path  = f'models_input_files/x_train{model_index}.npy'
    train_label_path = f'models_input_files/y_train{model_index}.npy'
    val_data_path    = f'models_input_files/x_val{model_index}.npy'   # validation inputs
    val_label_path   = f'models_input_files/y_val{model_index}.npy'   # validation labels
    test_data_path   = f'models_input_files/x_test{model_index}.npy'  # test inputs
    # Load the datasets
    train_data  = torch.tensor(np.load(train_data_path, allow_pickle=True).tolist())
    train_label = torch.tensor(np.load(train_label_path, allow_pickle=True).tolist()).long()
    val_data    = torch.tensor(np.load(val_data_path, allow_pickle=True).tolist())
    val_label   = torch.tensor(np.load(val_label_path, allow_pickle=True).tolist()).long()
    test_data   = torch.tensor(np.load(test_data_path, allow_pickle=True).tolist())
    # Build the training, validation and test datasets
    train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
    val_dataset   = torch.utils.data.TensorDataset(val_data, val_label)
    test_dataset  = torch.utils.data.TensorDataset(test_data)
    return train_dataset, val_dataset, test_dataset  # return the datasets
Model training (fine-tuning the pretrained backbone):
def model_pretrain(model_index, train_loader, val_loader):
    # Hyperparameters
    set_epoch = opt.set_epoch            # number of training epochs
    early_stop = opt.early_stop          # early-stopping patience (epochs)
    learning_rate = opt.learning_rate    # learning rate
    weight_decay = opt.weight_decay      # weight decay
    device = opt.device                  # device
    gpu_num = opt.gpu_num                # number of GPUs
    continue_train = opt.continue_train  # whether to resume training
    model_save_dir = 'checkpoints'       # directory where checkpoints are saved
    # If training is not being resumed and a best checkpoint already exists,
    # load it and skip training; otherwise train (or resume training) below.
    if not continue_train:
        if os.path.exists(f'checkpoints/best_model{model_index}.pth'):
            best_model = MODEL(model_index)
            best_model.load_state_dict(torch.load(f'checkpoints/best_model{model_index}.pth'))  # load the existing checkpoint
            return best_model
        else:
            pass
    # Model initialisation
    model = MODEL(model_index).to(device)
    if continue_train:
        model.load_state_dict(torch.load(f'checkpoints/best_model{model_index}.pth'))  # resume from the saved checkpoint
    # Optimizer initialisation
    if device.type != 'cpu' and gpu_num > 1:  # several GPUs: wrap the model (not the optimizer) in DataParallel
        model = torch.nn.DataParallel(model, device_ids=list(range(gpu_num)))
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    else:  # single GPU or CPU
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # Loss function
    if opt.use_BCE:
        loss_func = nn.BCEWithLogitsLoss()  # BCE loss
    else:
        loss_func = nn.CrossEntropyLoss()   # cross-entropy (CE) loss
    # Training loop
    best_epoch = 0            # best epoch so far
    best_train_loss = 100000  # lowest training loss so far
    train_acc_list = []       # training accuracy per epoch
    train_loss_list = []      # training loss per epoch
    val_acc_list = []         # validation accuracy per epoch
    val_loss_list = []        # validation loss per epoch
    start_time = time.time()  # training start time
    for epoch in range(set_epoch):  # loop over epochs
        model.train()   # switch to training mode
        train_loss = 0  # accumulated training loss
        train_acc = 0   # accumulated number of correct predictions
        for x, y in tqdm(train_loader):  # iterate over the training set
            # Move the batch to the device first
            x = x.to(device)
            y = y.to(device)
            outputs = model(x)  # forward pass
            if opt.use_BCE:  # BCE loss
                loss = loss_func(outputs, y.float().unsqueeze(1))
            else:            # cross-entropy loss
                loss = loss_func(outputs, y)
            train_loss += loss.item()  # accumulate the training loss
            optimizer.zero_grad()  # clear the gradients
            loss.backward()        # backpropagation
            optimizer.step()       # update the parameters
            if not opt.use_BCE:  # cross-entropy: take the arg-max class as the prediction
                _, predicted = torch.max(outputs.data, 1)
            else:                # BCE: threshold the output at 0.5
                predicted = (outputs > 0.5).int()
                predicted = predicted.squeeze(1)
            train_acc += (predicted == y).sum().item()  # accumulate correct predictions
        average_mode = 'binary'
        # F1, precision and recall (note: computed on the last mini-batch only)
        train_f1 = metrics.f1_score(y.cpu(), predicted.cpu(), average=average_mode)
        train_pre = metrics.precision_score(y.cpu(), predicted.cpu(), average=average_mode)
        train_recall = metrics.recall_score(y.cpu(), predicted.cpu(), average=average_mode)
        train_loss /= len(train_loader)         # average the loss over all steps of this epoch
        train_acc /= len(train_loader.dataset)  # accuracy over all training samples of this epoch
        train_acc_list.append(train_acc)    # record the training accuracy
        train_loss_list.append(train_loss)  # record the training loss
        print('-'*50)
        print('Epoch [{}/{}]\n Train Loss: {:.4f}, Train Acc: {:.4f}'.format(epoch + 1, set_epoch, train_loss, train_acc))
        print('Train-f1: {:.4f}, Train-precision: {:.4f} Train-recall: {:.4f}'.format(train_f1, train_pre, train_recall))
        if opt.show_val:  # run validation after each epoch
            model.eval()  # switch to evaluation mode
            val_loss = 0  # accumulated validation loss
            val_acc = 0   # accumulated number of correct predictions
            for x, y in tqdm(val_loader):  # iterate over the validation set
                # Move the batch to the device first
                x = x.to(device)
                y = y.to(device)
                outputs = model(x)  # forward pass
                if opt.use_BCE:  # BCE loss
                    loss = loss_func(outputs, y.float().unsqueeze(1))
                else:            # cross-entropy loss
                    loss = loss_func(outputs, y)
                val_loss += loss.item()  # accumulate the validation loss
                if not opt.use_BCE:  # cross-entropy: take the arg-max class
                    _, predicted = torch.max(outputs.data, 1)
                else:                # BCE: threshold at 0.5
                    predicted = (outputs > 0.5).int()
                    predicted = predicted.squeeze(1)
                val_acc += (predicted == y).sum().item()  # accumulate correct predictions
            # F1, precision and recall (computed on the last mini-batch only)
            val_f1 = metrics.f1_score(y.cpu(), predicted.cpu(), average=average_mode)
            val_pre = metrics.precision_score(y.cpu(), predicted.cpu(), average=average_mode)
            val_recall = metrics.recall_score(y.cpu(), predicted.cpu(), average=average_mode)
            val_loss /= len(val_loader)         # average validation loss
            val_acc /= len(val_loader.dataset)  # validation accuracy
            val_acc_list.append(val_acc)    # record the validation accuracy
            val_loss_list.append(val_loss)  # record the validation loss
            print('\nVal Loss: {:.4f}, Val Acc: {:.4f}'.format(val_loss, val_acc))
            print('Val-f1: {:.4f}, Val-precision: {:.4f} Val-recall: {:.4f}'.format(val_f1, val_pre, val_recall))
        if train_loss < best_train_loss:  # new best training loss
            best_train_loss = train_loss
            best_epoch = epoch + 1
            if device.type == 'cuda' and gpu_num > 1:  # multi-GPU: save the weights of the underlying module
                torch.save(model.module.state_dict(), f'{model_save_dir}/best_model{model_index}.pth')
            else:
                torch.save(model.state_dict(), f'{model_save_dir}/best_model{model_index}.pth')  # single GPU
        # Early stopping
        if epoch + 1 - best_epoch == early_stop:
            print(f'{early_stop} epochs later, the training loss no longer decreases, so training is stopped early.')
            end_time = time.time()
            print(f'Total time is {end_time - start_time}s.')
            break
    best_model = MODEL(model_index)  # re-instantiate the model
    best_model.load_state_dict(torch.load(f'checkpoints/best_model{model_index}.pth'))  # load the best checkpoint
    return best_model  # return the best model
Model inference:
def model_predict(model, model_index, test_loader):
    device = 'cuda'
    model.to(device)  # move the model to the GPU
    model.eval()      # switch to evaluation mode
    test_outputs = None
    with torch.no_grad():  # disable gradient computation
        for i, data in enumerate(tqdm(test_loader)):
            data = data[0].to(device)  # move the test batch to the GPU
            outputs = model(data)      # forward pass
            if i == 0:
                test_outputs = outputs  # first batch: assign directly
            else:
                test_outputs = torch.cat([test_outputs, outputs], dim=0)  # later batches: concatenate along dim 0
            del data, outputs  # free tensors that are no longer needed
    # Save the predictions
    if not opt.use_BCE:
        test_outputs = torch.softmax(test_outputs, dim=1)  # convert logits to probabilities
    torch.save(test_outputs, f'./models_prediction/{model_index}_prob.pth')  # save the probabilities
Main function for model training:
def run(model_index):
    # Fix the random seeds
    seed = opt.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    train_dataset, val_dataset, test_dataset = load_data()  # load the datasets
    # Print dataset information
    print('-Dataset information:')
    print(f'-Number of training samples: {len(train_dataset)}, number of test samples: {len(test_dataset)}')
    train_labels = len(set(train_dataset.tensors[1].numpy()))
    # Check how balanced the training classes are
    print(f'-Number of label classes in the training set: {train_labels}')
    numbers = [0] * train_labels
    for i in train_dataset.tensors[1].numpy():
        numbers[i] += 1
    print(f'-Number of samples per class in the training set:')
    for i in range(train_labels):
        print(f'-Class {i}: {numbers[i]} samples')
    batch_size = opt.batch_size  # batch size
    # Build the DataLoaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
    best_model = model_pretrain(model_index, train_loader, val_loader)  # train the model (or load the best checkpoint)
    model_predict(best_model, model_index, test_loader)  # run inference on the test set

if __name__ == '__main__':
    model_index = opt.model_index  # index of the model to train
    run(model_index)               # run the pipeline
③ Model evaluation
import torch
import pandas as pd
from models_training import MODEL  # import the MODEL class from the local file models_training.py
from tqdm import tqdm
from sklearn.metrics import classification_report
import numpy as np
# Inference on the validation set
def inference(model_indexs, use_BCE):
    device = 'cuda'  # run on the GPU
    for model_index in model_indexs:
        # Load the model
        model = MODEL(model_index).to(device)  # instantiate MODEL and move it to the device
        model.load_state_dict(torch.load(f'checkpoints/best_model{model_index}.pth'))  # load the trained weights
        model.eval()  # switch to evaluation mode
        # Load the validation data
        val_data_path = f'models_input_files/x_val{model_index}.npy'  # path of the validation inputs
        val_data = torch.tensor(np.load(val_data_path, allow_pickle=True).tolist())  # load as a tensor
        val_dataset = torch.utils.data.TensorDataset(val_data)  # build the validation dataset
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)  # validation DataLoader
        val_outputs = None
        with torch.no_grad():  # disable gradient computation
            for i, data in enumerate(tqdm(val_loader)):  # iterate over the validation set with a progress bar
                data = data[0].to(device)  # move the batch to the GPU
                outputs = model(data)      # forward pass
                if i == 0:
                    val_outputs = outputs  # first batch: assign directly
                else:
                    val_outputs = torch.cat([val_outputs, outputs], dim=0)  # later batches: concatenate along dim 0
                del data, outputs  # free tensors that are no longer needed
        # Save the predicted probabilities
        if not use_BCE:
            val_outputs = torch.softmax(val_outputs, dim=1)  # convert logits to probabilities
        torch.save(val_outputs, f'evaluate_prediction/{model_index}_prob.pth')  # save the probabilities
def run(model_indexs, use_BCE):
    # Evaluate each model's saved validation predictions against the true labels
    for i in model_indexs:
        pred = torch.load(f'evaluate_prediction/{i}_prob.pth').data  # load the predicted probabilities
        if use_BCE:
            # Threshold at 0.5 to obtain the predicted labels
            pred = (pred > 0.5).int()
            pred = pred.reshape(-1)
        else:
            # Take the class with the highest probability as the prediction
            pred = torch.argmax(pred, dim=1)
        pred = pred.cpu().numpy()  # move to the CPU and convert to a NumPy array
        # Load the true labels
        val_label_path = f'models_input_files/y_val{i}.npy'
        y_true = np.load(val_label_path)
        # Classification report (precision, recall, F1, ...)
        print(f'model_index = {i}:')
        print(classification_report(y_true, pred, digits=4))
        zero_acc = 0; one_acc = 0  # numbers of correct predictions per class
        zero_num = 0; one_num = 0  # numbers of samples per class
        for j in range(pred.shape[0]):
            if y_true[j] == 0:
                zero_num += 1  # count class-0 samples
            elif y_true[j] == 1:
                one_num += 1   # count class-1 samples
            if pred[j] == y_true[j]:
                if pred[j] == 0:
                    zero_acc += 1  # correct class-0 predictions
                elif pred[j] == 1:
                    one_acc += 1   # correct class-1 predictions
        zero = np.sum(pred == 0) / pred.shape[0]  # fraction of samples predicted as class 0
        zero_acc /= zero_num                      # accuracy on class 0
        print(f'Predicted share of class 0: {zero}  class-0 accuracy: {zero_acc}')
        one = np.sum(pred == 1) / pred.shape[0]   # fraction of samples predicted as class 1
        one_acc /= one_num                        # accuracy on class 1
        print(f'Predicted share of class 1: {one}  class-1 accuracy: {one_acc}')
        print('-' * 80)
if __name__ == '__main__':
    use_BCE = False  # whether the BCE loss was used; only cross-entropy (CE) is used here, so False
    inference([2], use_BCE=use_BCE)  # run validation inference for the given model indices
    model_indexs = [2]               # model indices to evaluate
    run(model_indexs, use_BCE=use_BCE)  # print the evaluation reports
④ Test-set inference
import torch
import pandas as pd
import warnings  # suppress warnings
warnings.filterwarnings('ignore')

def run(model_indexs, use_BCE):
    # Number of models in the ensemble
    model_num = len(model_indexs)
    # Load every model's test-set probability file and sum them up
    for i in model_indexs:
        # Predictions produced for the test set after training
        pred = torch.load(f'./models_prediction/{i}_prob.pth', map_location='cpu').data
        # Accumulate the probabilities of all models
        if i == model_indexs[0]:
            avg_pred = pred
        else:
            avg_pred += pred
    # Average over the models
    avg_pred /= model_num  # divide the summed probabilities by the number of models
    if use_BCE:
        # Threshold at 0.5 to obtain the predicted labels
        pred = (avg_pred > 0.5).int()
        pred = pred.reshape(-1)
    else:
        # Post-processing: adjust the decision threshold based on feedback about the label distribution
        avg_pred[:, 0][avg_pred[:, 0] > 0.001] = 1
        avg_pred[:, 1][avg_pred[:, 1] > 0.999] = 1.2
        # Take the class with the highest (adjusted) score as the prediction
        pred = torch.argmax(avg_pred, dim=1)
    pred = pred.cpu().numpy()
    # Build the submission
    # Read the sample submission file
    test = pd.read_csv('./dataset/testB_submit_exsample.csv')
    # Write the predictions into the label column
    for i in range(len(pred)):
        test.loc[i, 'label'] = pred[i]
    print(test['label'].value_counts())
    # Save the submission file
    test.to_csv('submit.csv', index=False)

if __name__ == '__main__':
    run([2], use_BCE=False)
    # run([1,2,3,4,5,6,7,8,9,10], use_BCE=False)
Ideas for further model improvement:
hyperparameter tuning, adjusting the maximum sequence length, changing the loss function, freezing part of the model parameters,
feature engineering, model ensembling, contrastive learning, prompt learning (a sketch of the parameter-freezing idea follows below).
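To illustrate parameter freezing, here is a minimal sketch (an illustration only, not part of the original top-line code; the layer names assume the BERT/RoBERTa-style backbone loaded by the MODEL class in section ②):
# Sketch: freeze the embeddings and the lower encoder layers of the backbone; only the upper
# layers and the task heads (fc1, fc2, classifier) keep receiving gradients.
model = MODEL(model_index=2)                         # roberta-base backbone, as above
for name, param in model.model.named_parameters():  # model.model is the pretrained backbone
    if name.startswith('embeddings') or any(f'encoder.layer.{k}.' in name for k in range(6)):
        param.requires_grad = False                  # freeze the embeddings and encoder layers 0-5
print(sum(p.requires_grad for _, p in model.named_parameters()), 'parameter tensors remain trainable')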
ChatGLM2-6B
LLMs: autoregressive models
Pretrained => prompting / fine-tuning => RLHF (reinforcement learning for alignment)

LoRA (low-rank adaptation): freeze the pretrained model's weights and, with the original parameters kept frozen, insert additional trainable layers into the model and train only those newly added parameters; a minimal peft sketch follows below.
Each training sample is formatted as 「instruction -->」「input: X」「output: Y」.
P-tuning v2: adds a small number of new parameters on top of the existing large language model; these new parameters help the model better understand and handle a specific task.
Typical fine-tuning applications: vertical domains and personalization.
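For reference, a minimal LoRA setup with the peft library might look like the sketch below (the hyperparameters and target module name are illustrative assumptions mirroring the --lora_rank 8 setting; the actual fine-tuning in this walkthrough is driven by the train_bash.py script that follows):
# Sketch: attach a LoRA adapter to the ChatGLM2-6B base model with peft (assumed settings).
from transformers import AutoModel
from peft import LoraConfig, get_peft_model, TaskType

base = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True)
lora_cfg = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=32, lora_dropout=0.1,
                      target_modules=["query_key_value"])  # attention projection in ChatGLM blocks
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()  # only the low-rank adapter weights are trainable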
In the Alibaba Cloud PyTorch environment, clone the code, download the chatglm2-6b model, install the dependencies, and then run the training script.
xfg_train.sh
CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
    --model_name_or_path chatglm2-6b \
    --stage sft \
    --use_v2 \
    --do_train \
    --dataset paper_label \
    --finetuning_type lora \
    --lora_rank 8 \
    --output_dir ./output/label_xfg \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 1000 \
    --learning_rate 5e-5 \
    --num_train_epochs 4.0 \
    --fp16
Argument notes:
--model_name_or_path: directory of the local base model
--stage sft: supervised fine-tuning
--use_v2: fine-tune the GLM2 model (default: true)
--do_train: run training (default: true)
--dataset: name of the dataset (paper_label)
--finetuning_type lora: fine-tune with LoRA
--lora_rank: rank used in LoRA fine-tuning
--output_dir: directory where the LoRA weights are saved
--per_device_train_batch_size: training batch size
--gradient_accumulation_steps: number of gradient accumulation steps
--lr_scheduler_type cosine: cosine learning-rate schedule
--logging_steps: logging interval
--save_steps: checkpoint saving interval
--learning_rate: learning rate
--num_train_epochs: number of training epochs
--fp16: use fp16 half precision (default: False)
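Assuming the command above is saved as xfg_train.sh (the script name given earlier), it can be launched with:
bash xfg_train.sh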
Import the data
import pandas as pd
train_df = pd.read_csv('./csv_data/train.csv')
testB_df = pd.read_csv('./csv_data/testB.csv')
Build the dataset
res = []  # list of training samples
for i in range(len(train_df)):  # iterate over every row of the training data
    paper_item = train_df.loc[i]  # current row
    # Build a dict with the LoRA instruction, input and output
    tmp = {
        "instruction": "Please judge whether it is a medical field paper according to the given paper title and abstract, output 1 or 0, the following is the paper title and abstract -->",
        "input": f"title:{paper_item[1]},abstract:{paper_item[3]}",
        "output": str(paper_item[5])
    }
    res.append(tmp)  # collect the sample
import json  # used to save the dataset
# Save the dataset into the data directory
with open('./data/paper_label.json', mode='w', encoding='utf-8') as f:
    json.dump(res, f, ensure_ascii=False, indent=4)
Modify data/data_info.json
{
    "paper_label": {
        "file_name": "paper_label.json"
    }
}
Load the trained LoRA weights and run prediction
from peft import PeftModel
from transformers import AutoTokenizer, AutoModel, GenerationConfig, AutoModelForCausalLM
# Path of the pretrained base model
model_path = "../chatglm2-6b"
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Load the trained label LoRA weights
model = PeftModel.from_pretrained(model, './output/label_xfg').half()
model = model.eval()
# Chat with the loaded model and tokenizer to generate a reply (sanity check)
response, history = model.chat(tokenizer, "你好", history=[])
response
Prediction function:
def predict(text):
    # Chat with the loaded model and tokenizer to generate a reply
    response, history = model.chat(tokenizer, f"Please judge whether it is a medical field paper according to the given paper title and abstract, output 1 or 0, the following is the paper title and abstract -->{text}", history=[],
                                   temperature=0.01)
    return response
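Note that int(predict(test_input)) in the loop below raises a ValueError if the model ever replies with anything other than a bare 0 or 1. A slightly defensive variant (a sketch only; the fallback to 0 is an assumption, not part of the original code) could be:
# Sketch: extract the first 0/1 character from the model's reply instead of calling int() directly.
def predict_label(text, default=0):
    response = predict(text)  # predict() is defined above
    for ch in response:
        if ch in ('0', '1'):
            return int(ch)
    return default            # assumed fallback when the reply contains no 0/1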
Predict and export the CSV
from tqdm import tqdm  # progress bar for the prediction loop
label = []  # predicted labels
for i in tqdm(range(len(testB_df))):  # iterate over every test sample
    test_item = testB_df.loc[i]  # current test sample
    # Build the prompt that is fed to the prediction function
    test_input = f"title:{test_item[1]},author:{test_item[2]},abstract:{test_item[3]}"
    label.append(int(predict(test_input)))  # store the predicted label
testB_df['label'] = label  # write the labels back into testB_df
# Task 1 only needs the label, but the submission requires a Keywords column, so fill it with a placeholder string
testB_df['Keywords'] = ['tmp' for _ in range(len(testB_df))]
# Build and save the submission file
submit = testB_df[['uuid', 'Keywords', 'label']]
submit.to_csv('submit.csv', index=False)
Submission result: