Continuing the LangChain series, this post reads a YouTube video's transcript and answers questions about it using indexes for information retrieval. Previous posts in the series:
- LangChain: naming an animal with an LLM
- LangChain 2: modularizing the prompt template and building a Streamlit site for the animal-naming app
- LangChain 3: using an Agent with Wikipedia and llm-math to calculate the average age of a dog
1. Install youtube-transcript-api, FAISS, and tiktoken
pip install youtube-transcript-api
pip install faiss-cpu
pip install tiktoken
faiss-cpu provides the FAISS vector database used below; tiktoken is needed by the OpenAI embeddings tokenizer.
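Before wiring everything into LangChain, it can help to confirm that youtube-transcript-api works on its own and that your OpenAI key is available (the helper below reads OPENAI_API_KEY from a .env file via load_dotenv). The following is an optional sanity check, a minimal sketch assuming the classic pre-1.0 youtube-transcript-api interface; the video ID comes from the URL used later in this post.
# Optional sanity check: fetch the raw captions directly (assumes youtube-transcript-api < 1.0)
from youtube_transcript_api import YouTubeTranscriptApi

video_id = "-Osca2Zax4Y"  # the ID portion of https://youtu.be/-Osca2Zax4Y
entries = YouTubeTranscriptApi.get_transcript(video_id)  # list of {'text', 'start', 'duration'} dicts
print(len(entries), "caption entries")
print(entries[0])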
2. Write the code that loads the video transcript and stores it in a FAISS vector database, file langchain_helper.py
# Import the required modules from the langchain package and other libraries
from langchain.document_loaders import YoutubeLoader  # loads transcript data from YouTube videos
from langchain.text_splitter import RecursiveCharacterTextSplitter  # splits long documents into chunks
from langchain.embeddings.openai import OpenAIEmbeddings  # generates embedding vectors with OpenAI
from langchain.vectorstores import FAISS  # efficient similarity search over large document sets
from langchain.llms import OpenAI  # access to OpenAI language models
from langchain import PromptTemplate  # templated prompts for the language model
from langchain.chains import LLMChain  # chains a prompt and a language model together
from dotenv import load_dotenv  # loads environment variables from a .env file

load_dotenv()  # load environment variables (e.g. OPENAI_API_KEY) from the .env file

embedding = OpenAIEmbeddings()  # embedding model used to embed the transcript chunks

# URL of the YouTube video
video_url = "https://youtu.be/-Osca2Zax4Y?si=iy0iePxzUy_bUayO"

def create_vector_db_from_youtube_url(video_url: str) -> FAISS:
    # Load the transcript of the YouTube video
    loader = YoutubeLoader.from_youtube_url(video_url)
    transcript = loader.load()

    # Split the transcript into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_documents(transcript)

    # Build a FAISS index from the document chunks
    db = FAISS.from_documents(docs, embedding)
    return db

# Example: create a vector database from the given YouTube URL
print(create_vector_db_from_youtube_url(video_url))
zgpeaces-MBP at ~/Workspace/LLM/langchain-llm-app (feature/infoRetrievel) $ python langchain_helper.py
/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/langchain/__init__.py:39: UserWarning: Importing PromptTemplate from langchain root module is no longer supported.
warnings.warn(
<langchain.vectorstores.faiss.FAISS object at 0x11b1e96f0>
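The script above rebuilds the index (and calls the embeddings API) on every run. As an optional extension, not part of the original script, the LangChain FAISS wrapper can persist the index to disk with save_local and reload it with load_local; the folder name below is just an example, and on newer LangChain versions load_local may additionally require allow_dangerous_deserialization=True.
# Optional: cache the FAISS index on disk so the transcript is only embedded once
db = create_vector_db_from_youtube_url(video_url)
db.save_local("faiss_youtube_index")  # writes index.faiss and index.pkl into this folder

# In a later run, reload the cached index instead of rebuilding it
cached_db = FAISS.load_local("faiss_youtube_index", embedding)
print(cached_db.similarity_search("ransomware", k=2))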
3. Query the information in the vector database
Check which OpenAI model to use (see the models documentation linked in the references).
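If you are unsure which models your API key can call, you can also list them programmatically. This is a sketch that assumes the pre-1.0 openai Python SDK (the generation this LangChain release works with); newer SDK versions expose client.models.list() instead.
# List the models available to your OpenAI API key (assumes openai SDK < 1.0)
import os
import openai
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

for model in openai.Model.list()["data"]:
    print(model["id"])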
3.1 Add a query method
# Import the required modules from the langchain package and other libraries
from langchain.document_loaders import YoutubeLoader  # loads transcript data from YouTube videos
from langchain.text_splitter import RecursiveCharacterTextSplitter  # splits long documents into chunks
from langchain.embeddings.openai import OpenAIEmbeddings  # generates embedding vectors with OpenAI
from langchain.vectorstores import FAISS  # efficient similarity search over large document sets
from langchain.llms import OpenAI  # access to OpenAI language models
from langchain import PromptTemplate  # structured prompts for the language model
from langchain.chains import LLMChain  # chains a prompt and a language model together
from dotenv import load_dotenv  # loads environment variables from a .env file

load_dotenv()  # load environment variables from the .env file

embedding = OpenAIEmbeddings()  # embedding model used to embed the transcript chunks

# URL of the YouTube video
video_url = "https://youtu.be/-Osca2Zax4Y?si=iy0iePxzUy_bUayO"

def create_vector_db_from_youtube_url(video_url: str) -> FAISS:
    # Load the transcript of the YouTube video
    loader = YoutubeLoader.from_youtube_url(video_url)
    transcript = loader.load()

    # Split the transcript into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_documents(transcript)

    # Build a FAISS index from the document chunks
    db = FAISS.from_documents(docs, embedding)
    return db

def get_response_from_query(db, query, k=4):
    # Run a similarity search against the database for the given query
    docs = db.similarity_search(query, k=k)

    # Concatenate the content of the top-k documents
    docs_page_content = " ".join([d.page_content for d in docs])

    # Initialize an OpenAI language model
    llm = OpenAI(model="text-davinci-003")

    # Define the prompt template for the language model
    prompt = PromptTemplate(
        input_variables=["question", "docs"],
        template="""
        You are a helpful assistant that can answer questions about YouTube videos
        based on the video's transcript.

        Answer the following question: {question}
        By searching the following video transcript: {docs}

        Only use the factual information from the transcript to answer the question.

        If you feel like you don't have enough information to answer the question, say "I don't know".

        Your answers should be verbose and detailed.
        """,
    )

    # Create a language model chain with the prompt defined above
    chain = LLMChain(llm=llm, prompt=prompt)

    # Run the chain with the query and the concatenated documents
    response = chain.run(question=query, docs=docs_page_content)

    # Format the response by replacing newlines with spaces
    response = response.replace("\n", " ")
    return response, docs

# Example usage: create a vector database from the YouTube video URL
# print(create_vector_db_from_youtube_url(video_url))
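To try the helper without the Streamlit front end, a quick console test could look like the sketch below; the question string is only an example.
# Quick local test of the two helpers (example question)
db = create_vector_db_from_youtube_url(video_url)
answer, sources = get_response_from_query(db, "What did they say about ransomware?", k=4)
print(answer)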
3.2 A Streamlit app that takes the video URL and a question as input
main.py
import streamlit as st  # Streamlit, used to build the web app
import langchain_helper as lch  # our helper module with the langchain logic
import textwrap  # used to wrap the answer text

st.title("YouTube Assistant")  # title of the Streamlit page

# Use Streamlit's sidebar to hold the input form
with st.sidebar:
    # Create a form in the sidebar
    with st.form(key='my_form'):
        # Text area for the YouTube video URL
        youtube_url = st.sidebar.text_area(
            label="What is the YouTube video URL?",
            max_chars=50
        )
        # Text area for the question about the video
        query = st.sidebar.text_area(
            label="Ask me about the video?",
            max_chars=50,
            key="query"
        )
        # Button that submits the form
        submit_button = st.form_submit_button(label='Submit')

# Only run when both a question and a YouTube URL have been provided
if query and youtube_url:
    # Build the vector database from the YouTube video URL
    db = lch.create_vector_db_from_youtube_url(youtube_url)
    # Get the answer to the question from the vector database
    response, docs = lch.get_response_from_query(db, query)
    # Show the "Answer:" subheader
    st.subheader("Answer:")
    # Display the answer, wrapped at 85 characters per line
    st.text(textwrap.fill(response, width=85))
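Since get_response_from_query also returns the matching transcript chunks, you can optionally show the supporting context in the app. The snippet below is an optional addition (not part of the original main.py) meant to be appended inside the if query and youtube_url block, which is why it is indented.
    # Optional: show the transcript chunks that were retrieved as supporting context
    with st.expander("Transcript excerpts used for this answer"):
        for d in docs:
            st.write(d.page_content)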
Run the app:
$ streamlit run main.py
You can now view your Streamlit app in your browser.
Local URL: http://localhost:8501
Network URL: http://192.168.50.10:8501
For better performance, install the Watchdog module:
In the browser, enter the video URL and a question in the sidebar, for example:
What is the YouTube video URL?
https://youtu.be/-Osca2Zax4Y?si=iy0iePxzUy_bUayO
Ask me about the video?
What did they talk about Ransomware?
References:
- https://github.com/zgpeace/pets-name-langchain/tree/feature/infoRetrievel
- https://python.langchain.com/docs/integrations/document_loaders/youtube_transcript
- https://youtu.be/lG7Uxts9SXs?si=H1CISGkoYiKRSF5V
- https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/
- https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
That concludes LangChain 4: storing a YouTube video transcript in a FAISS vector database and querying it for question answering (indexes for information retrieval).