1.whisper部署
詳細(xì)過程可以參照:??
創(chuàng)建項目文件夾
mkdir whisper && cd whisper
conda創(chuàng)建虛擬環(huán)境
conda create -n py310 python=3.10 -c conda-forge -y
安裝pytorch
pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
下載whisper
pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git
安裝相關(guān)包
pip install tqdm
pip install numba
pip install tiktoken==0.3.3
brew install ffmpeg
測試一下whisper是否安裝成功(默認(rèn)識別為中文)
whisper test.wav --model small #test.wav為自己的測試wav文件,mp3也支持 small是指用小模型
whisper識別中文的時候經(jīng)常會輸出繁體,加入以下參數(shù)可以避免:
whisper test.wav --model small --language zh --initial_prompt "以下是普通話的句子。" #注意"以下是普通話的句子。"不能隨便修改,只能是這句話才有效果。
2.腳本批量測試
創(chuàng)建test.sh腳本,輸入以下內(nèi)容,可以實現(xiàn)對某一文件夾下的wav文件逐個中文語音識別。
#!/bin/bash
# Transcribe wav/A13_0.wav .. wav/A13_299.wav in order, stopping at the
# first missing file. Results are written to the "denied" directory.
i=0
while [ "$i" -lt 300 ]; do
    file="wav/A13_${i}.wav"
    [ -f "$file" ] || break
    # medium model, Mandarin; the fixed initial prompt nudges whisper to
    # emit simplified Chinese instead of traditional characters.
    whisper "$file" --model medium --output_dir denied --language zh --initial_prompt "以下是普通話的句子。"
    i=$((i + 1))
done
?實現(xiàn)英文語音識別需要修改為:
#!/bin/bash
# English ASR over en/0.wav .. en/299.wav, stopping at the first gap in
# the numbering. Results are written to the "denied" directory.
for i in $(seq 0 299); do
    file="en/${i}.wav"
    if [ ! -f "$file" ]; then
        break
    fi
    # small model is sufficient for English here.
    whisper "$file" --model small --output_dir denied --language en
done
3.對運(yùn)行出來的結(jié)果進(jìn)行評測
一般地,語音識別通常采用WER,即詞錯誤率,評估語音識別和文本轉(zhuǎn)換質(zhì)量。
這里我們主要采用 github上的開源項目:???編寫的python-wer代碼對結(jié)果進(jìn)行評價。
其中,我們的正確樣本形式為:
?whisper輸出的預(yù)測結(jié)果形式為:
?因此要對文本進(jìn)行處理(去空格、去標(biāo)點符號)后進(jìn)行wer評價,相關(guān)代碼如下:
(可根據(jù)具體情況修改calculate_WER)
import re
import sys

import numpy
def editDistance(r, h):
    """
    Compute the edit-distance (Levenshtein) DP matrix of a reference
    sentence and a hypothesis sentence via dynamic programming.

    Attributes:
        r -> the list of words produced by splitting reference sentence.
        h -> the list of words produced by splitting hypothesis sentence.

    Returns:
        A (len(r)+1) x (len(h)+1) numpy matrix d where d[i][j] is the edit
        distance between r[:i] and h[:j]; d[len(r)][len(h)] is the total
        edit distance between r and h.
    """
    # NOTE: the original used dtype=numpy.uint8, which silently wraps once
    # a distance exceeds 255 (long sentences) and corrupts the WER; use a
    # native integer dtype instead.
    d = numpy.zeros((len(r) + 1, len(h) + 1), dtype=int)
    # Base cases: converting to/from an empty prefix costs its length.
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            if r[i - 1] == h[j - 1]:
                d[i][j] = d[i - 1][j - 1]  # match: no cost
            else:
                substitute = d[i - 1][j - 1] + 1
                insert = d[i][j - 1] + 1
                delete = d[i - 1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return d
def getStepList(r, h, d):
    """
    Recover the list of edit operations by backtracking through the DP
    matrix built by editDistance.

    Attributes:
        r -> the list of words produced by splitting reference sentence.
        h -> the list of words produced by splitting hypothesis sentence.
        d -> the matrix built when calculating the edit distance of h and r.

    Returns:
        A list of one-letter step codes in sentence order:
        "e" = equal, "i" = insertion, "s" = substitution, "d" = deletion.
    """
    x = len(r)
    y = len(h)
    steps = []  # renamed from `list`, which shadowed the builtin
    while True:
        if x == 0 and y == 0:
            break
        # Branch order matters: it fixes the tie-breaking among equally
        # cheap paths and must stay consistent with alignedPrint.
        elif x >= 1 and y >= 1 and d[x][y] == d[x - 1][y - 1] and r[x - 1] == h[y - 1]:
            steps.append("e")
            x = x - 1
            y = y - 1
        elif y >= 1 and d[x][y] == d[x][y - 1] + 1:
            steps.append("i")
            y = y - 1
        elif x >= 1 and y >= 1 and d[x][y] == d[x - 1][y - 1] + 1:
            steps.append("s")
            x = x - 1
            y = y - 1
        else:
            steps.append("d")
            x = x - 1
    # steps were collected back-to-front; reverse into reading order.
    return steps[::-1]
def alignedPrint(list, r, h, result):
    """
    Print reference and hypothesis sentences aligned word-by-word,
    followed by an evaluation row (D/I/S marks) and the WER.

    Attributes:
        list   -> the list of steps ("e"/"i"/"s"/"d") from getStepList.
        r      -> the list of words produced by splitting reference sentence.
        h      -> the list of words produced by splitting hypothesis sentence.
        result -> the WER string (e.g. "12.50%") computed from the edit distance.

    Returns:
        The same `result` string, so the caller can keep using it.
    """
    # Parameter name kept for interface compatibility; alias it so we do
    # not rely on the shadowed builtin below.
    ops = list

    def _idx(i, skipped):
        # Map step position i to an index into r (skipped="i") or h
        # (skipped="d") by discounting steps that consumed no word on
        # that side. This replaces the counting loop duplicated nine
        # times in the original.
        return i - sum(1 for op in ops[:i] if op == skipped)

    print("REF:", end=" ")
    for i, op in enumerate(ops):
        if op == "i":
            # Insertion: reference has no word here; pad to hyp word width.
            print(" " * len(h[_idx(i, "d")]), end=" ")
        elif op == "s":
            w_r, w_h = r[_idx(i, "i")], h[_idx(i, "d")]
            print(w_r.ljust(len(w_h)) if len(w_r) < len(w_h) else w_r, end=" ")
        else:
            print(r[_idx(i, "i")], end=" ")

    print("\nHYP:", end=" ")
    for i, op in enumerate(ops):
        if op == "d":
            # Deletion: hypothesis has no word here; pad to ref word width.
            print(" " * len(r[_idx(i, "i")]), end=" ")
        elif op == "s":
            w_r, w_h = r[_idx(i, "i")], h[_idx(i, "d")]
            print(w_h.ljust(len(w_r)) if len(w_r) > len(w_h) else w_h, end=" ")
        else:
            print(h[_idx(i, "d")], end=" ")

    print("\nEVA:", end=" ")
    for i, op in enumerate(ops):
        if op == "d":
            print("D" + " " * (len(r[_idx(i, "i")]) - 1), end=" ")
        elif op == "i":
            print("I" + " " * (len(h[_idx(i, "d")]) - 1), end=" ")
        elif op == "s":
            # Pad the S mark to the wider of the two aligned words.
            width = max(len(r[_idx(i, "i")]), len(h[_idx(i, "d")]))
            print("S" + " " * (width - 1), end=" ")
        else:
            print(" " * len(r[_idx(i, "i")]), end=" ")

    print("\nWER: " + result)
    return result
def wer(r, h):
    """
    Calculate the word error rate (WER) in ASR, print the aligned
    comparison, and return the WER as a percentage string.

    You can use it like this: wer("what is it".split(), "what is".split())
    For Chinese, passing raw strings compares character-by-character (CER).

    Attributes:
        r -> reference as a sequence of words (or characters).
        h -> hypothesis as a sequence of words (or characters).

    Returns:
        The WER formatted as a percentage string, e.g. "25.00%".

    Raises:
        ZeroDivisionError: if the reference `r` is empty.
    """
    # build the DP matrix of edit distances
    d = editDistance(r, h)
    # find out the manipulation steps (renamed from `list` — it shadowed
    # the builtin in the original)
    steps = getStepList(r, h, d)
    # total edit distance divided by reference length, as a percentage
    rate = float(d[len(r)][len(h)]) / len(r) * 100
    formatted = "%.2f%%" % rate
    # alignedPrint echoes the alignment and returns the same string
    return alignedPrint(steps, r, h, formatted)
# Compute the average WER over all utterance pairs (計算總WER)
def calculate_WER():
    """
    Compute and print the average WER between whisper's output
    ("whisper_out.txt") and the reference transcripts ("A13.txt").

    Both files are expected to hold one utterance per line; the first 11
    characters of each line are an utterance-id prefix and are stripped.
    Scoring starts at the first CJK character of each line; the hypothesis
    is stripped of full-width punctuation and the reference of spaces, so
    the comparison is effectively character-based (CER) for Chinese.
    """
    import re  # local import: the original file used re without importing it

    with open("whisper_out.txt", "r", encoding="utf-8") as f:
        hyp_lines = [line[11:].strip("\n") for line in f.readlines()]
    with open("A13.txt", "r", encoding="utf-8") as f:
        ref_lines = [line[11:].strip("\n") for line in f.readlines()]
    if not hyp_lines:
        print("whisper_out.txt is empty — nothing to score")
        return
    WER = 0
    # Full-width punctuation to drop from the hypothesis before scoring.
    symbols = ",@#¥%……&*()——+~!{}【】;‘:“”‘。?》《、"
    # calculate distance between each pair of texts
    for hyp_raw, ref_raw in zip(hyp_lines, ref_lines):
        match1 = re.search('[\u4e00-\u9fa5]', hyp_raw)
        index1 = match1.start() if match1 else len(hyp_raw)
        match2 = re.search('[\u4e00-\u9fa5]', ref_raw)
        index2 = match2.start() if match2 else len(ref_raw)
        hyp = hyp_raw[index1:].translate(str.maketrans('', '', symbols))
        ref = ref_raw[index2:].replace(" ", "")
        print(hyp)
        print(ref)
        # BUGFIX: wer(r, h) expects the reference first; the original
        # passed the whisper output as the reference, so the error rate
        # was normalized by the hypothesis length.
        result = wer(ref, hyp)
        WER += float(result.strip('%')) / 100
    WER = WER / len(hyp_lines)
    print("總WER:", WER)
    print("總WER:", WER.__format__('0.2%'))
calculate_WER()
評價結(jié)果形如:
4.與paddlespeech的測試對比:
數(shù)據(jù)集 |
數(shù)據(jù)量 |
paddle (中英文分開) |
paddle (同一模型) |
whisper(small) (同一模型) |
whisper(medium) (同一模型) |
||
zhthchs30 (中文錯字率) |
250 |
11.61% |
45.53% |
24.11% |
13.95% |
||
LibriSpeech (英文錯字率) |
125 |
7.76% |
50.88% |
9.31% |
9.31% |
5.測試所用數(shù)據(jù)集
自己處理過的開源wav數(shù)據(jù)文章來源地址http://www.zghlxwxcb.cn/news/detail-496316.html
到了這里,關(guān)于whisper語音識別部署及WER評價的文章就介紹完了。如果您還想了解更多內(nèi)容,請在右上角搜索TOY模板網(wǎng)以前的文章或繼續(xù)瀏覽下面的相關(guān)文章,希望大家以后多多支持TOY模板網(wǎng)!