Building a document chat bot with LangChain + GPT + the Chroma vector database (chromadb)
1. The result looks like this: (screenshot omitted)
2. Install the packages
pip install langchain
pip install chromadb
pip install unstructured
pip install jieba
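Depending on your setup you will likely need two more packages as well (an assumption based on LangChain's usual dependencies: ChatOpenAI and OpenAIEmbeddings call the openai client library, and TokenTextSplitter tokenizes with tiktoken), so install them if you hit import errors:
pip install openai
pip install tiktoken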
3. The code
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os  # operating-system helpers: paths and environment variables

import chromadb
import jieba as jb  # jieba Chinese word segmentation
from langchain.chains import ConversationalRetrievalChain  # conversational retrieval chain
from langchain.chat_models import ChatOpenAI  # ChatOpenAI chat-model wrapper
from langchain.document_loaders import DirectoryLoader  # loads all files in a directory
from langchain.embeddings import OpenAIEmbeddings  # OpenAI embeddings
from langchain.text_splitter import TokenTextSplitter  # token-based document splitter
from langchain.vectorstores import Chroma  # Chroma vector store

os.environ["OPENAI_API_KEY"] = 'xxxxxx'

# Pre-process the input documents: segment the Chinese text with jieba so the
# token splitter sees space-separated words.
def init():
    files = ['2023NBA.txt']  # files to process
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    os.makedirs(os.path.join(cur_dir, 'data/cut'), exist_ok=True)  # make sure the output directory exists
    for file in files:  # process each file in turn
        data_path = os.path.join(cur_dir, f'data/{file}')
        with open(data_path, 'r', encoding='utf-8') as f:  # open the source file for reading
            data = f.read()  # read the raw text
        cut_data = " ".join(jb.cut(data))  # segment the text into space-separated words
        cut_file = os.path.join(cur_dir, f"data/cut/cut_{file}")
        with open(cut_file, 'w', encoding='utf-8') as f:  # open the output file for writing
            f.write(cut_data)  # write the segmented copy
# Load documents from a directory
def load_documents(directory):
    # DirectoryLoader picks up every .txt file under the directory
    # (this is what the unstructured package is needed for)
    loader = DirectoryLoader(directory, glob='**/*.txt')
    docs = loader.load()  # load the files
    return docs  # return the loaded documents

# Split the documents into chunks
def split_documents(docs):
    # TokenTextSplitter splits by token count: 1000-token chunks, no overlap
    text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs_texts = text_splitter.split_documents(docs)  # split the loaded documents
    return docs_texts  # return the chunks

# Create the embedding function
def create_embeddings(api_key):
    # OpenAIEmbeddings calls the OpenAI embeddings API
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    return embeddings  # return the embedding function

# Build the vector store
def create_chroma(docs_texts, embeddings):
    # EphemeralClient keeps the index in memory only; it is rebuilt on every run
    new_client = chromadb.EphemeralClient()
    vectordb = Chroma.from_documents(
        docs_texts, embeddings, client=new_client, collection_name="openai_collection"
    )
    return vectordb  # return the vector store
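
# A persistent variant (my own sketch, not part of the original post): chromadb's
# PersistentClient (available in chromadb >= 0.4) writes the index to disk, so the
# embeddings are not recomputed (and re-billed) every time the script starts.
def create_persistent_chroma(docs_texts, embeddings, persist_directory):
    client = chromadb.PersistentClient(path=persist_directory)  # store the index under persist_directory
    return Chroma.from_documents(
        docs_texts, embeddings, client=client, collection_name="openai_collection"
    )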
# load(): wire the steps above together and return the conversation chain
def load():
    docs = load_documents('data/cut')  # load the segmented documents
    docs_texts = split_documents(docs)  # split them into chunks
    api_key = os.environ.get('OPENAI_API_KEY')  # read the OpenAI API key from the environment
    embeddings = create_embeddings(api_key)  # create the embedding function
    vectordb = create_chroma(docs_texts, embeddings)  # build the vector store
    # ChatOpenAI is the chat model that answers the questions
    openai_ojb = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
    # Build a ConversationalRetrievalChain from the model and the vector retriever
    chain = ConversationalRetrievalChain.from_llm(openai_ojb, vectordb.as_retriever())
    return chain  # return the chain
init()
# Call load() to get the ConversationalRetrievalChain
chain = load()
# Answer a single question against the document index
def get_ans(question):
    chat_history = []  # the history is reset on every call, so each question is answered independently
    result = chain({  # run the chain
        'chat_history': chat_history,  # pass the (empty) chat history
        'question': question,  # pass the question
    })
    return result['answer']  # return the answer

if __name__ == '__main__':  # when run as a script
    s = input('please input:')  # read the user's question
    while s != 'exit':  # loop until the user types 'exit'
        ans = get_ans(s)  # get the answer
        print(ans)  # print it
        s = input('please input:')  # read the next question
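Note that get_ans passes an empty chat_history on every call, so follow-up questions lose all context. Below is a minimal multi-turn sketch (my own variant, not from the original post) that feeds each question/answer pair back into the chain; ConversationalRetrievalChain uses that history to rewrite a follow-up into a standalone question before retrieval:
chat_history = []
s = input('please input:')
while s != 'exit':
    result = chain({'chat_history': chat_history, 'question': s})
    print(result['answer'])  # show the answer
    chat_history.append((s, result['answer']))  # remember this turn so follow-ups keep context
    s = input('please input:')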
File layout (reconstructed from the paths used in the script):
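main.py
data/
    2023NBA.txt            # raw source document read by init()
    cut/
        cut_2023NBA.txt    # segmented copy written by init(), indexed by load()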
References:
https://python.langchain.com/docs/use_cases/chatbots
https://python.langchain.com/docs/integrations/vectorstores/chroma
https://blog.csdn.net/v_JULY_v/article/details/131552592