使用python讀取文件,其中pdf、docx、pptx可以直接讀,.ppt和.doc文件不能直接讀,需要先轉換成.pptx和.docx文件,并且需要區分操作系統。
如果是linux系統,請先安裝以下組件:(文章來源:http://www.zghlxwxcb.cn/news/detail-668601.html)
# doc2docx: LibreOffice packages. The Python code below shells out to
# `soffice --headless ... --convert-to docx` to convert .doc files on Linux.
yum install -y libreoffice-headless
yum install -y libreoffice-writer
# ppt2pptx: aspose.slides is the library the Python code below imports to
# convert .ppt files on Linux.
# NOTE(review): epel-release and libgdiplus are presumably prerequisites of
# aspose.slides on Linux -- confirm against the Aspose installation guide.
yum install epel-release -y
yum install libgdiplus -y
pip3 install aspose.slides
python代碼如下:(文章來源地址:http://www.zghlxwxcb.cn/news/detail-668601.html)
import os
def read_pptx(fp):
    """Print the text of every text-bearing shape in a .pptx file.

    Each printed line is prefixed with the zero-based index of the slide the
    shape belongs to.

    Args:
        fp: Path (or file-like object) accepted by ``pptx.Presentation``.
    """
    from pptx import Presentation

    deck = Presentation(fp)
    for slide_no, current_slide in enumerate(deck.slides):
        # To extract only specific slides, filter on ``slide_no`` here
        # (for example ``if slide_no == 1``).
        prefix = str(slide_no) + '頁(yè):'
        for item in current_slide.shapes:
            if not item.has_text_frame:
                continue
            print(prefix + item.text_frame.text)
def read_ppt(fp):
    """Convert a legacy .ppt file to .pptx and print its text.

    The converted presentation is written next to the input as ``fp + "x"``
    and then handed to ``read_pptx``. On Windows the conversion is performed
    by PowerPoint through COM (pywin32); on Linux by Aspose.Slides.

    Args:
        fp: Path to the .ppt file.

    Raises:
        NotImplementedError: If the platform is neither Windows nor Linux,
            because no converter is available there.
    """
    import os
    import platform

    target = fp + "x"
    os_type = platform.system()
    if os_type == "Windows":
        import win32com.client as wc

        # PowerPoint runs as a separate COM server and does not share Python's
        # working directory, so always hand it absolute paths.
        source_abs = os.path.abspath(fp)
        # PpSaveAsFileType.ppSaveAsOpenXMLPresentation
        pp_save_as_pptx = 24

        # EnsureDispatch both builds the type-library cache and returns the
        # application object, so a separate Dispatch call is unnecessary.
        powerpoint = wc.gencache.EnsureDispatch("PowerPoint.Application")
        try:
            powerpoint.Visible = 1
            ppt = powerpoint.Presentations.Open(source_abs)
            try:
                ppt.SaveAs(source_abs + "x", pp_save_as_pptx)
            finally:
                ppt.Close()
        finally:
            # Always shut PowerPoint down, even when conversion fails, so no
            # orphaned process is left behind.
            powerpoint.Quit()
    elif os_type == "Linux":
        import aspose.slides as slides

        with slides.Presentation(fp) as presentation:
            presentation.save(target, slides.export.SaveFormat.PPTX)
    else:
        raise NotImplementedError(
            "Converting .ppt files is only supported on Windows and Linux, "
            "not on " + repr(os_type)
        )
    read_pptx(target)
def read_docx(fp):
    """Print the paragraph count of a .docx file, then each paragraph's text.

    Args:
        fp: Path (or file-like object) accepted by ``docx.Document``.
    """
    from docx import Document

    paragraphs = Document(fp).paragraphs
    # Every hard line break in the document starts a new paragraph, so empty
    # lines are counted too (the author's sample file had 13).
    print("段落數(shù):" + str(len(paragraphs)))
    # Emit the content of each paragraph on its own line.
    for paragraph in paragraphs:
        print(paragraph.text)
def read_doc(fp):
    """Convert a legacy .doc file to .docx and print its text.

    On Windows the conversion is done with ``doc2docx``; on Linux with
    LibreOffice (``soffice``) in headless mode. The converted file is placed
    next to the input and then handed to ``read_docx``.

    Args:
        fp: Path to the .doc file.

    Raises:
        NotImplementedError: If the platform is neither Windows nor Linux.
        subprocess.CalledProcessError: If ``soffice`` exits with an error
            (Linux only).
    """
    import os
    import platform

    os_type = platform.system()
    if os_type == "Windows":
        import doc2docx

        converted = fp + "x"
        doc2docx.convert(fp, converted)
    elif os_type == "Linux":
        import subprocess

        # --outdir expects a DIRECTORY; soffice names the output
        # "<input stem>.docx" inside it. Passing the target file name here
        # would create a directory with that name instead of a document.
        out_dir = os.path.dirname(os.path.abspath(fp))
        stem = os.path.splitext(os.path.basename(fp))[0]
        converted = os.path.join(out_dir, stem + ".docx")
        subprocess.check_output(
            [
                "soffice",
                "--headless",
                "--invisible",  # plain ASCII hyphens; an en dash is not an option
                "--convert-to",
                "docx",
                "--outdir",
                out_dir,
                fp,
            ]
        )
    else:
        raise NotImplementedError(
            "Converting .doc files is only supported on Windows and Linux, "
            "not on " + repr(os_type)
        )
    read_docx(converted)
def read_pdf(fp):
    """Print the extracted text of every page of a PDF file.

    Args:
        fp: Path to the PDF file.
    """
    import pdfplumber

    # Context managers make sure both the file handle and the pdfplumber
    # document are closed, even if text extraction raises part-way through.
    with open(fp, "rb") as pdf_file, pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            print(text)
# Path of the document to read. The extension selects the reader, so this may
# point at a .pptx, .ppt, .docx, .doc or .pdf file, e.g.:
#   "*.pptx", "*.docx", "*.doc", "*.pdf"
file_path = "*.ppt"

# Compare extensions case-insensitively.
file_extension = os.path.splitext(file_path)[1].lower()
print("文件后綴:" + file_extension)

# Maps each supported extension to (message to print, reader function).
_readers_by_extension = {
    '.pptx': ("讀取pptx文件", read_pptx),
    '.ppt': ("讀取ppt文件", read_ppt),
    ".docx": ("讀取docx文件", read_docx),
    ".doc": ("讀取doc文件", read_doc),
    ".pdf": ("讀取pdf文件", read_pdf),
}

# Unsupported extensions are silently ignored.
_selected = _readers_by_extension.get(file_extension)
if _selected is not None:
    _message, _reader = _selected
    print(_message)
    _reader(file_path)
到了這里,關于python讀取pdf、doc、docx、ppt、pptx文件內容的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!