After studying the FastSpeech2 paper, I worked through a reproduction repository on GitHub to better understand the details of the implementation. The chosen repository is a PyTorch implementation: https://github.com/ming024/FastSpeech2. It builds on the FastSpeech reproduction code at https://github.com/xcmyz/FastSpeech, and much of the code is essentially identical. I have already annotated and analyzed that FastSpeech repository in earlier posts; interested readers can find them in this column.
As the paper shows, the overall architecture of FastSpeech2 is essentially the same as FastSpeech, except that besides the Duration Predictor it adds a Pitch Predictor and an Energy Predictor, and all three predictors share the same network architecture. Accordingly, the files under the transformer directory of this repository are largely identical to those in https://github.com/xcmyz/FastSpeech; for building the FastSpeech2 model they mainly supply the Encoder, Decoder, and PostNet modules, which are covered in detail in this column. In this repository, building the FastSpeech2 model mainly involves two files: model/modules.py and model/fastspeech2.py.
model/modules.py
This file mainly defines the Variance Adaptor, which comprises the Duration Predictor, Length Regulator, Pitch Predictor, and Energy Predictor. The detailed code with annotations is shown below.
import os
import json
import copy
import math
from collections import OrderedDict
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from utils.tools import get_mask_from_lengths, pad
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The complete Variance Adaptor
class VarianceAdaptor(nn.Module):
"""Variance Adaptor"""
def __init__(self, preprocess_config, model_config):
super(VarianceAdaptor, self).__init__()
self.duration_predictor = VariancePredictor(model_config)
self.length_regulator = LengthRegulator()
self.pitch_predictor = VariancePredictor(model_config)
self.energy_predictor = VariancePredictor(model_config)
        # feature level (phoneme_level or frame_level) for pitch and energy
self.pitch_feature_level = preprocess_config["preprocessing"]["pitch"]["feature"]
self.energy_feature_level = preprocess_config["preprocessing"]["energy"]["feature"]
assert self.pitch_feature_level in ["phoneme_level", "frame_level"]
assert self.energy_feature_level in ["phoneme_level", "frame_level"]
        # quantization scheme (linear or log) for pitch and energy
pitch_quantization = model_config["variance_embedding"]["pitch_quantization"]
energy_quantization = model_config["variance_embedding"]["energy_quantization"]
n_bins = model_config["variance_embedding"]["n_bins"]
assert pitch_quantization in ["linear", "log"]
assert energy_quantization in ["linear", "log"]
        # load the pitch/energy min-max statistics from preprocessing, used to build the quantization bins
        # (with log quantization, the min/max values must be positive for np.log to be valid)
with open(os.path.join(preprocess_config["path"]["preprocessed_path"], "stats.json")) as f:
stats = json.load(f)
pitch_min, pitch_max = stats["pitch"][:2]
energy_min, energy_max = stats["energy"][:2]
if pitch_quantization == "log":
self.pitch_bins = nn.Parameter(
torch.exp(torch.linspace(np.log(pitch_min), np.log(pitch_max), n_bins - 1)),
requires_grad=False,)
else:
self.pitch_bins = nn.Parameter(
torch.linspace(pitch_min, pitch_max, n_bins - 1),
requires_grad=False,)
if energy_quantization == "log":
self.energy_bins = nn.Parameter(
torch.exp(torch.linspace(np.log(energy_min), np.log(energy_max), n_bins - 1)),
requires_grad=False,)
else:
self.energy_bins = nn.Parameter(
torch.linspace(energy_min, energy_max, n_bins - 1),
requires_grad=False,)
        # embedding layers for the quantized pitch and energy (n_bins - 1 boundaries define n_bins buckets)
self.pitch_embedding = nn.Embedding(n_bins, model_config["transformer"]["encoder_hidden"])
self.energy_embedding = nn.Embedding(n_bins, model_config["transformer"]["encoder_hidden"])
    # compute the pitch embedding
    def get_pitch_embedding(self, x, target, mask, control):
        prediction = self.pitch_predictor(x, mask)  # value predicted by the pitch predictor
        if target is not None:  # target given: training, compute the embedding from the target
            embedding = self.pitch_embedding(torch.bucketize(target, self.pitch_bins))
        else:  # no target: inference, compute the embedding from the prediction
            prediction = prediction * control  # control scales the predicted pitch
            embedding = self.pitch_embedding(torch.bucketize(prediction, self.pitch_bins))
        return prediction, embedding  # prediction is used for the training loss; embedding is added to x downstream
    # compute the energy embedding
    def get_energy_embedding(self, x, target, mask, control):
        prediction = self.energy_predictor(x, mask)  # value predicted by the energy predictor
        if target is not None:  # target given: training, compute the embedding from the target
            embedding = self.energy_embedding(torch.bucketize(target, self.energy_bins))
        else:  # no target: inference, compute the embedding from the prediction
            prediction = prediction * control  # control scales the predicted energy
            embedding = self.energy_embedding(torch.bucketize(prediction, self.energy_bins))
        return prediction, embedding  # prediction is used for the training loss; embedding is added to x downstream
def forward(
self,
x,
src_mask,
mel_mask=None,
max_len=None,
pitch_target=None,
energy_target=None,
duration_target=None,
p_control=1.0,
e_control=1.0,
d_control=1.0,
):
        log_duration_prediction = self.duration_predictor(x, src_mask)  # log-domain duration predicted for each phoneme
        if self.pitch_feature_level == "phoneme_level":
            pitch_prediction, pitch_embedding = self.get_pitch_embedding(x, pitch_target, src_mask, p_control)
            x = x + pitch_embedding  # add the pitch embedding
        if self.energy_feature_level == "phoneme_level":
            # note: the upstream repository passes p_control here; e_control appears to be the intended argument
            energy_prediction, energy_embedding = self.get_energy_embedding(x, energy_target, src_mask, e_control)
            x = x + energy_embedding  # add the energy embedding
        if duration_target is not None:  # duration_target given: training, use the ground-truth durations
            x, mel_len = self.length_regulator(x, duration_target, max_len)  # expand x with duration_target
            duration_rounded = duration_target
        else:  # inference
            # build duration_rounded from log_duration_prediction, then use it to expand x
duration_rounded = torch.clamp((torch.round(torch.exp(log_duration_prediction) - 1) * d_control), min=0,)
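            # note: the duration predictor is trained against log(d + 1) in this repo's loss,
            # so exp(.) - 1 above maps the prediction back to linear durations before scaling
            # by d_control, rounding, and clamping at zero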
x, mel_len = self.length_regulator(x, duration_rounded, max_len)
mel_mask = get_mask_from_lengths(mel_len)
        # same as above, mirroring the phoneme_level branch but on the expanded frame-level sequence with mel_mask
        if self.pitch_feature_level == "frame_level":
            pitch_prediction, pitch_embedding = self.get_pitch_embedding(x, pitch_target, mel_mask, p_control)
            x = x + pitch_embedding
        if self.energy_feature_level == "frame_level":
            # note: the upstream repository passes p_control here as well; e_control appears intended
            energy_prediction, energy_embedding = self.get_energy_embedding(x, energy_target, mel_mask, e_control)
            x = x + energy_embedding
return (
x,
            pitch_prediction,  # these three predictions are used later to compute the losses
energy_prediction,
log_duration_prediction,
duration_rounded,
mel_len,
mel_mask,
)
# The Length Regulator
class LengthRegulator(nn.Module):
"""Length Regulator"""
def __init__(self):
super(LengthRegulator, self).__init__()
    # adjust the length of the input phoneme sequence x
def LR(self, x, duration, max_len):
"""
基于音素持續(xù)時(shí)間將音素序列長(zhǎng)度與mel譜圖長(zhǎng)度對(duì)齊
@param x: 經(jīng)過(guò)FFT塊轉(zhuǎn)換后的音素序列,[batch_size, max_sequence_len, encoder_dim]
@param duration: 音素持續(xù)時(shí)間矩陣,[batch_size, max_sequence_len]
@param max_len: 音素譜圖序列中最大長(zhǎng)度
@return: 長(zhǎng)度經(jīng)過(guò)調(diào)整后的音素序列,[batch_size, max_len, encoder_dim]
"""
output = list()
mel_len = list()
for batch, expand_target in zip(x, duration):
            expanded = self.expand(batch, expand_target)  # one sample's phoneme sequence fully expanded to frame level
output.append(expanded)
            mel_len.append(expanded.shape[0])  # record the mel length, used later to build masks
        # pad to max_len if it is given, otherwise pad to the longest sequence in output
if max_len is not None:
output = pad(output, max_len)
else:
output = pad(output)
return output, torch.LongTensor(mel_len).to(device)
def expand(self, batch, predicted):
"""
將輸入的一個(gè)音素序列的長(zhǎng)度按其對(duì)應(yīng)的持續(xù)時(shí)間調(diào)整
@param batch:一個(gè)音頻對(duì)應(yīng)文本的音素序列,[max_sequence_len, encoder_dim]
@param predicted:音素序列中每個(gè)音素對(duì)應(yīng)的持續(xù)序列,長(zhǎng)度為max_sequence_len
@return:長(zhǎng)度調(diào)整后的音素序列,長(zhǎng)度與mel譜圖長(zhǎng)度一致
"""
out = list()
for i, vec in enumerate(batch):
            expand_size = predicted[i].item()  # duration of the i-th phoneme, i.e. how many times to repeat it
            out.append(vec.expand(max(int(expand_size), 0), -1))  # repeat the i-th phoneme's vector expand_size times
        out = torch.cat(out, 0)  # concatenate back into one sequence along the time axis
return out
def forward(self, x, duration, max_len):
output, mel_len = self.LR(x, duration, max_len)
return output, mel_len
class VariancePredictor(nn.Module):
"""Duration, Pitch and Energy Predictor"""
def __init__(self, model_config):
super(VariancePredictor, self).__init__()
        self.input_size = model_config["transformer"]["encoder_hidden"]  # input size
        self.filter_size = model_config["variance_predictor"]["filter_size"]  # number of conv filters (hidden size)
        self.kernel = model_config["variance_predictor"]["kernel_size"]  # convolution kernel size
        self.conv_output_size = model_config["variance_predictor"]["filter_size"]
        self.dropout = model_config["variance_predictor"]["dropout"]
        # a conv stack with activation and regularization: [Conv1D + ReLU + LayerNorm + Dropout] x 2
self.conv_layer = nn.Sequential(
OrderedDict(
[
(
"conv1d_1",
Conv(
self.input_size,
self.filter_size,
kernel_size=self.kernel,
padding=(self.kernel - 1) // 2,
),
),
("relu_1", nn.ReLU()),
("layer_norm_1", nn.LayerNorm(self.filter_size)),
("dropout_1", nn.Dropout(self.dropout)),
(
"conv1d_2",
Conv(
self.filter_size,
self.filter_size,
kernel_size=self.kernel,
                            padding=1,  # hard-coded; equals (kernel - 1) // 2 only for the default kernel_size of 3
),
),
("relu_2", nn.ReLU()),
("layer_norm_2", nn.LayerNorm(self.filter_size)),
("dropout_2", nn.Dropout(self.dropout)),
]
)
)
self.linear_layer = nn.Linear(self.conv_output_size, 1)
def forward(self, encoder_output, mask):
        out = self.conv_layer(encoder_output)  # [Conv1D + ReLU + LayerNorm + Dropout] x 2
        out = self.linear_layer(out)  # final linear projection to a single channel
        out = out.squeeze(-1)  # the linear layer outputs 1 in the last dimension; squeeze it away
        if mask is not None:
            out = out.masked_fill(mask, 0.0)  # zero out the padded positions indicated by the mask
return out
# custom 1D convolution module operating on [batch, time, channels] tensors
class Conv(nn.Module):
"""
Convolution Module
"""
def __init__(
self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
dilation=1,
bias=True,
w_init="linear",
):
"""
:param in_channels: dimension of input
:param out_channels: dimension of output
:param kernel_size: size of kernel
:param stride: size of stride
:param padding: size of padding
:param dilation: dilation rate
:param bias: boolean. if True, bias is included.
:param w_init: str. weight inits with xavier initialization.
"""
super(Conv, self).__init__()
self.conv = nn.Conv1d(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias,
)
    def forward(self, x):
        x = x.contiguous().transpose(1, 2)  # [B, T, C] -> [B, C, T] for nn.Conv1d
        x = self.conv(x)
        x = x.contiguous().transpose(1, 2)  # [B, C, T] -> back to [B, T, C]
        return x
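Before moving on, here is a small self-contained sketch of the two core mechanisms above: torch.bucketize turning a continuous pitch value into a bin index for the embedding lookup, and the Length Regulator's expand repeating each phoneme vector by its duration. It is not part of the repository; n_bins, the hidden size, and the pitch range are made up for illustration.
import torch
import torch.nn as nn

# quantization: continuous pitch -> bin index -> embedding (illustrative values)
n_bins = 8
pitch_min, pitch_max = 50.0, 400.0  # hypothetical stats.json values
pitch_bins = torch.linspace(pitch_min, pitch_max, n_bins - 1)  # n_bins - 1 boundaries
pitch_embedding = nn.Embedding(n_bins, 4)  # hidden size 4 for readability

pitch_values = torch.tensor([60.0, 180.0, 390.0])  # pitch targets or predictions
indices = torch.bucketize(pitch_values, pitch_bins)  # tensor([1, 3, 6]); values above pitch_max map to n_bins - 1
emb = pitch_embedding(indices)  # [3, 4]; the model adds this to x

# length regulation: repeat each phoneme vector by its duration
phonemes = torch.arange(12, dtype=torch.float).view(3, 4)  # [seq_len=3, dim=4]
durations = torch.tensor([2, 1, 3])
expanded = torch.cat([vec.expand(int(d), -1) for vec, d in zip(phonemes, durations)], 0)
print(expanded.shape)  # torch.Size([6, 4]), i.e. durations.sum() frames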
model/fastspeech2.py
This file integrates the Encoder, Decoder, PostNet, and Variance Adaptor modules to build the complete FastSpeech2 model.
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformer import Encoder, Decoder, PostNet
from .modules import VarianceAdaptor
from utils.tools import get_mask_from_lengths
class FastSpeech2(nn.Module):
""" FastSpeech2 """
def __init__(self, preprocess_config, model_config):
super(FastSpeech2, self).__init__()
self.model_config = model_config
        self.encoder = Encoder(model_config)  # network before the Variance Adaptor: the encoder
        self.variance_adaptor = VarianceAdaptor(preprocess_config, model_config)  # Variance Adaptor
        self.decoder = Decoder(model_config)  # network after the Variance Adaptor: the decoder
self.mel_linear = nn.Linear(
model_config["transformer"]["decoder_hidden"],
preprocess_config["preprocessing"]["mel"]["n_mel_channels"],)
self.postnet = PostNet()
self.speaker_emb = None
if model_config["multi_speaker"]: # 如果為多speaker
# 加載speaker文件
with open(os.path.join(preprocess_config["path"]["preprocessed_path"], "speakers.json"),) as f:
n_speaker = len(json.load(f))
# 構(gòu)建speaker嵌入層
self.speaker_emb = nn.Embedding(n_speaker, model_config["transformer"]["encoder_hidden"],)
def forward(
self,
speakers,
texts,
src_lens,
max_src_len,
mels=None,
mel_lens=None,
max_mel_len=None,
p_targets=None,
e_targets=None,
d_targets=None,
        p_control=1.0,  # control coefficients for pitch, energy, and duration
e_control=1.0,
d_control=1.0,
):
        src_masks = get_mask_from_lengths(src_lens, max_src_len)  # mask for the source text sequence
mel_masks = (
get_mask_from_lengths(mel_lens, max_mel_len)
if mel_lens is not None
            else None)  # mask for the mel-spectrogram sequence
        output = self.encoder(texts, src_masks)  # encode
        if self.speaker_emb is not None:  # if a speaker embedding layer exists, add it to the encoder output
            output = output + self.speaker_emb(speakers).unsqueeze(1).expand(-1, max_src_len, -1)  # broadcast one speaker vector across all time steps
        # run the Variance Adaptor
(
output,
p_predictions,
e_predictions,
log_d_predictions,
d_rounded,
mel_lens,
mel_masks,
) = self.variance_adaptor(
output,
src_masks,
mel_masks,
max_mel_len,
p_targets,
e_targets,
d_targets,
p_control,
e_control,
d_control,
)
        output, mel_masks = self.decoder(output, mel_masks)  # decode
        output = self.mel_linear(output)  # linear projection to mel channels
        postnet_output = self.postnet(output) + output  # PostNet residual refinement
return (
output,
postnet_output,
p_predictions,
e_predictions,
log_d_predictions,
d_rounded,
src_masks,
mel_masks,
src_lens,
mel_lens,
)
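The forward pass above relies on get_mask_from_lengths from utils.tools to build the padding masks. For reference, a minimal implementation consistent with how it is used here (True marks padded positions, which masked_fill later zeroes out) might look like the following sketch; treat it as an illustration rather than the repository's exact code:
import torch

def get_mask_from_lengths(lengths: torch.Tensor, max_len: int = None) -> torch.Tensor:
    """Boolean mask of shape [batch, max_len]; True marks padding positions."""
    if max_len is None:
        max_len = int(lengths.max().item())
    ids = torch.arange(max_len, device=lengths.device).unsqueeze(0)  # [1, max_len]
    return ids >= lengths.unsqueeze(1)  # broadcast compare against each length

lengths = torch.tensor([3, 5, 2])
print(get_mask_from_lengths(lengths, 5))
# tensor([[False, False, False,  True,  True],
#         [False, False, False, False, False],
#         [False, False,  True,  True,  True]])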
These notes cover the model-construction code of the chosen FastSpeech2 reproduction repository, and are best read alongside the model section of my earlier FastSpeech2 paper-reading notes. The focus is on annotating the code in detail; if readers find problems or mistakes, please point them out in the comments so we can learn from each other.