# PDF content loading experiments: comparing langchain's PDF loader strategies
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain import document_loaders
# Summary of all loaders tried below (x = rejected):
# load_pdf_file_langchain_unstructed    # x line-based output, no document structure
# load_pdf_file_pypdf                   # x page-based output, no document structure
# load_pdf_file_MathPix                 # x needs app_id/app_key (paid company API); converts to markdown with multi-level headings, but cannot distinguish similar font sizes
# load_pdf_file_unstructed              # x line- or full-text-based output, no document structure
# load_pdf_file_PyPDFium2               # x page-based output, no document structure
# load_pdf_file_PDFMiner                # x no structure, not even page breaks
# load_pdf_file_html                    # promising, but the algorithm needs improvement
# load_pdf_file_PyPDFDirectory          # x no structure; only adds reading PDFs from a directory, results still page-based
# load_pdf_file_AmazonTextractPDFLoader # x no structure; official docs only mention plain text extraction, no headers/structure
def load_pdf_file_langchain_unstructed(content_path):
    """Load a PDF with UnstructuredPDFLoader in ``mode="elements"``.

    Prints each element's text and metadata for inspection.

    Args:
        content_path: Path to the PDF file.

    Returns:
        The list of Document elements produced by the loader.
    """
    # Bug fix: original had a duplicated assignment (`loader = loader = ...`).
    loader = UnstructuredPDFLoader(content_path, mode="elements")
    data = loader.load()
    for page in data:
        print('-------------------')
        print('content')
        print(page.page_content)
        print('metadata')
        print(page.metadata)
    return data
def load_pdf_file_pypdf(content_path):
    """Load and split a PDF per page via PyPDFLoader, printing each page.

    Args:
        content_path: Path to the PDF file.

    Returns:
        The list of per-page Documents from ``load_and_split()``.
    """
    from langchain_community.document_loaders import PyPDFLoader
    pages = PyPDFLoader(content_path).load_and_split()
    for doc in pages:
        # Single print with '\n' separator emits the same lines as five prints.
        print('-------------------', 'content', doc.page_content,
              'metadata', doc.metadata, sep='\n')
    return pages
def load_pdf_file_MathPix(content_path):
    """Load a PDF through the Mathpix API (requires app_id/app_key credentials).

    Args:
        content_path: Path to the PDF file.

    Returns:
        The list of Documents produced by MathpixPDFLoader.
    """
    from langchain_community.document_loaders import MathpixPDFLoader
    docs = MathpixPDFLoader(content_path).load()
    for doc in docs:
        # Single print with '\n' separator emits the same lines as five prints.
        print('-------------------', 'content', doc.page_content,
              'metadata', doc.metadata, sep='\n')
    return docs
def load_pdf_file_unstructed(content_path):
    """Load a PDF with UnstructuredPDFLoader in ``mode="elements"``.

    Prints each element's text and metadata for inspection.

    Args:
        content_path: Path to the PDF file.

    Returns:
        The list of Document elements produced by the loader.
    """
    from langchain_community.document_loaders import UnstructuredPDFLoader
    # Fix: removed an unused second loader (`loader2`) the original built
    # without mode="elements" but never called.
    loader = UnstructuredPDFLoader(content_path, mode="elements")
    data = loader.load()
    for page in data:
        print('-------------------')
        print('content')
        print(page.page_content)
        print('metadata')
        print(page.metadata)
    return data
def load_pdf_file_PyPDFium2(content_path):
    """Load a PDF page-by-page via the pypdfium2 backend, printing each page.

    Args:
        content_path: Path to the PDF file.

    Returns:
        The list of Documents produced by PyPDFium2Loader.
    """
    from langchain_community.document_loaders import PyPDFium2Loader
    docs = PyPDFium2Loader(content_path).load()
    for doc in docs:
        # Single print with '\n' separator emits the same lines as five prints.
        print('-------------------', 'content', doc.page_content,
              'metadata', doc.metadata, sep='\n')
    return docs
def load_pdf_file_PDFMiner(content_path):
    """Load a PDF as one unpaginated blob via PDFMinerLoader, printing it.

    Args:
        content_path: Path to the PDF file.

    Returns:
        The list of Documents produced by PDFMinerLoader.
    """
    from langchain_community.document_loaders import PDFMinerLoader
    docs = PDFMinerLoader(content_path).load()
    for doc in docs:
        # Same two lines per document as the original (no separator here).
        print(doc.page_content, doc.metadata, sep='\n')
    return docs
def load_pdf_file_html(content_path):
    """Split a PDF into heading/content sections using font-size heuristics.

    Steps:
      1. Convert the PDF to HTML with PDFMinerPDFasHTMLLoader.
      2. Parse the HTML with BeautifulSoup and walk all ``div`` tags.
      3. Read each div's first span's ``font-size`` from its ``style`` attr.
      4. Merge consecutive runs of text sharing a font size into snippets.
      5. Classify each snippet against the previous section:
         - font size > previous heading font  -> start a new heading;
         - font size <= previous content font -> append to that section;
         - in between                         -> also a new section (so a
           large title page does not swallow every later section).

    Assumption: headings use a larger font than their body text.

    Args:
        content_path: Path to the PDF file.

    Returns:
        list of Document, each with the heading in ``metadata['heading']``
        and the accumulated body text in ``page_content``.
    """
    from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader
    loader = PDFMinerPDFasHTMLLoader(content_path)
    data = loader.load()[0]  # entire PDF is loaded as a single Document
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(data.page_content, 'html.parser')
    content = soup.find_all('div')
    import re
    # Fix: raw string — '\d' in a plain string is an invalid escape
    # (SyntaxWarning on Python 3.12+). Compiled once, hoisted out of the loop.
    font_size_re = re.compile(r'font-size:(\d+)px')
    cur_fs = None
    cur_text = ''
    snippets = []  # first collect all snippets that have the same font size
    for c in content:
        sp = c.find('span')
        if not sp:
            continue
        st = sp.get('style')
        if not st:
            continue
        fs = font_size_re.findall(st)
        if not fs:
            continue
        fs = int(fs[0])
        if cur_fs is None:  # fix: 'is None' so a 0px size is not treated as unset
            cur_fs = fs
        if fs == cur_fs:
            cur_text += c.text
        else:
            snippets.append((cur_text, cur_fs))
            cur_fs = fs
            cur_text = c.text
    snippets.append((cur_text, cur_fs))
    # Note: headers/footers repeat on every page; deduplicating snippets would
    # drop that redundant text but is intentionally left out here.
    from langchain.docstore.document import Document
    cur_idx = -1
    semantic_snippets = []
    for s in snippets:
        # font size > previous section's heading => it is a new heading
        if not semantic_snippets or s[1] > semantic_snippets[cur_idx].metadata['heading_font']:
            metadata = {'heading': s[0], 'content_font': 0, 'heading_font': s[1]}
            metadata.update(data.metadata)
            semantic_snippets.append(Document(page_content='', metadata=metadata))
            cur_idx += 1
            continue
        # font size <= previous section's content => same section's content
        # (content_font == 0 means the section has no body text yet)
        if not semantic_snippets[cur_idx].metadata['content_font'] or s[1] <= semantic_snippets[cur_idx].metadata['content_font']:
            semantic_snippets[cur_idx].page_content += s[0]
            semantic_snippets[cur_idx].metadata['content_font'] = max(
                s[1], semantic_snippets[cur_idx].metadata['content_font'])
            continue
        # between previous content font and heading font => new section
        metadata = {'heading': s[0], 'content_font': 0, 'heading_font': s[1]}
        metadata.update(data.metadata)
        semantic_snippets.append(Document(page_content='', metadata=metadata))
        cur_idx += 1
    return semantic_snippets
def load_pdf_file_PyPDFDirectory(content_path):
    """Load every PDF found under a directory, printing each page Document.

    Args:
        content_path: Path to a directory containing PDF files.

    Returns:
        The list of Documents produced by PyPDFDirectoryLoader.
    """
    from langchain_community.document_loaders import PyPDFDirectoryLoader
    documents = PyPDFDirectoryLoader(content_path).load()
    for document in documents:
        # Single print with '\n' separator emits the same lines as five prints.
        print('-------------------', 'content', document.page_content,
              'metadata', document.metadata, sep='\n')
    return documents
def load_pdf_file_AmazonTextractPDFLoader(content_path):
    """Extract PDF text via AWS Textract (requires AWS credentials/config).

    Args:
        content_path: Path or URI of the PDF file.

    Returns:
        The list of Documents produced by AmazonTextractPDFLoader.
    """
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    docs = AmazonTextractPDFLoader(content_path).load()
    for doc in docs:
        # Single print with '\n' separator emits the same lines as five prints.
        print('-------------------', 'content', doc.page_content,
              'metadata', doc.metadata, sep='\n')
    return docs
# Sample inputs for manual testing; paths are machine-specific.
content_path= r"/home/xinrui/project/xinren-rag-inti/tests/data/測試-導入文本策略.pdf"  # a single PDF file
Directory_path= r"/home/xinrui/project/xinren-rag-inti/tests/data/"  # a directory of PDFs for load_pdf_file_PyPDFDirectory
# load_pdf_file_AmazonTextractPDFLoader(content_path)
# References (article residue, commented out — the bare text was a Python syntax error):
# langchain_community.document_loaders.pdf.AmazonTextractPDFLoader — http://www.zghlxwxcb.cn/news/detail-856788.html
# "How to Extract Data From PDFs Using AWS Textract With Python"
# Amazon Textract
# langchain-pdf — http://www.zghlxwxcb.cn/news/detail-856788.html