前言:項目用到了導出文檔,綜合考慮使用python-docx
模塊
python-docx
安裝
pip install python-docx
docx文檔布局詞匯
三個部分
文檔Document 段落Paragraph 文字塊Run
文檔
就是docx文檔
段落
就是尋常段落
文字塊
如下,短句子中有多種不同的樣式,則會被劃分成多個文字塊。
如圖所示,這個paragraph一共四個run。
四級結構(表格)
Document - Table - Row/Column - Cell四級結構
使用
導入word
from docx import Document
# Document(path) opens the .docx file at `path`; when no path is given,
# a new Word document is created instead.
wordfile = Document(path)
讀操作
獲取段落
三個部分:一個doc由多個paragraph組成
paragraphs = wordfile.paragraphs
# The result is a list of paragraph objects:
# [ p1,p2,p3...]
print(paragraphs)
獲取段落文本內容
# Print the text of every paragraph in the document.
for paragraph in wordfile.paragraphs:
    print(paragraph.text)
獲取文字塊文本內容
一個paragraph段落由一個或者多個run文字塊組成
# Print the text of every run, paragraph by paragraph.
for paragraph in wordfile.paragraphs:
    for run in paragraph.runs:
        print(run.text)
遍歷表格
# Walk every table row by row.
for table in wordfile.tables:
    for row in table.rows:
        for cell in row.cells:
            print(cell.text)

# Walk every table column by column.
for table in wordfile.tables:
    for column in table.columns:
        for cell in column.cells:
            print(cell.text)
表格設置字體樣式
表格中文字樣式修改,與在段落中的樣式修改一樣,只是在添加文本時(shí)調(diào)用的方法不同。
# Append a run to the first paragraph of the cell at (row, col), then style it
# the same way as a run in an ordinary paragraph.
# NOTE(review): `table`, `row`, `col` and `qn` are not defined in this snippet,
# and `str` looks like a placeholder for the text to insert — confirm.
run=table.cell(row,col).paragraphs[0].add_run(str) # add_run is the call that adds text to the cell
run.font.name = u'宋體'
# Set the w:eastAsia font attribute on the run's rFonts element as well.
run._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋體')
run.font.bold=True
寫操作
保存文件
wordfile.save(...)
... 放需要保存的路徑
添加標(biāo)題
wordfile.add_heading(…, level=…)
添加段落
wordfile.add_paragraph(...)
--------------------------------------------------
# Create a new document, then add a level-1 heading and a paragraph.
wordfile = Document()
wordfile.add_heading('一級標題', level=1)
wordfile.add_paragraph('新的段落')
添加文字塊
paragraph.add_run(...)
添加空白頁
wordfile.add_page_break(...)
添加圖片
wordfile.add_picture(..., width=…, height=…)
設置樣式
-
字體設置
-
文字其他樣式設置
from docx import Document
from docx.shared import RGBColor, Pt

wordfile = Document(file)
# Apply the same character formatting to every run of every paragraph.
for paragraph in wordfile.paragraphs:
    for run in paragraph.runs:
        run.font.bold = True  # bold
        run.font.italic = True  # italic
        run.font.underline = True  # underline
        run.font.strike = True  # strikethrough
        run.font.shadow = True  # shadow
        run.font.size = Pt(20)  # font size
        run.font.color.rgb = RGBColor(255, 0, 0)  # font colour
-
段落樣式設置
默認左對齊
word轉pdf,html
word---->html
pip install pydocx
from pydocx import PyDocX

# Pass either the path of a .docx file or the file's content.
html = PyDocX.to_html("./test.docx")
# to_html returns the HTML as a string; write it out as UTF-8.
# The with-block closes the file even if the write fails.
with open("test.html", 'w', encoding="utf-8") as f:
    f.write(html)
word---->pdf
pip install pdfkit
依賴軟件:https://wkhtmltopdf.org/downloads.html
# Absolute path of the wkhtmltopdf.exe executable.
path_wkthmltopdf = r'E:\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
# Generate the PDF file; `to_file` is the output file path.
# NOTE(review): `pdfkit`, `html` and `to_file` are not defined in this snippet —
# presumably `import pdfkit` is needed and `html` should be the path of the
# HTML file to convert (not the HTML string); confirm against pdfkit's docs.
pdfkit.from_file(html, to_file, configuration=config)
實例
輸入文件:
輸出文件:
demo代碼:
from docx import Document

doc = Document("./templates.docx")
# Attributes available on the document object (output of dir(doc)):
# 'add_heading', 'add_page_break', 'add_paragraph', 'add_picture','add_section', 'add_table',
# 'core_properties', 'element', 'inline_shapes', 'paragraphs', 'part', 'save', 'sections', 'settings', 'styles',
# 'tables'
# print(dir(doc))

ps = doc.paragraphs
# print(ps)
# Append a value to the paragraphs that contain a known label.
for p in ps:
    text = p.text
    if "分析人" in text:
        p.text = text + "General_zy"
    elif "分析效果" in text:
        p.text = text + "高危漏洞"

tables = doc.tables
# Take the first table; the template docx is expected to contain exactly one.
table = tables[0]
# Fill rows 1-2, columns 0-2 with the concatenated row and column index.
for i in range(1, 3):
    for j in range(3):
        table.cell(i, j).text = str(i) + str(j)

p3 = doc.add_paragraph("三.")
# Attributes available on a paragraph object:
# 'add_run', 'alignment', 'clear', 'insert_paragraph_before', 'paragraph_format', 'part', 'runs', 'style', 'text'
p4 = doc.add_paragraph("分析團隊:")
p4.add_run("dddd")
doc.save("./xxx.docx")
實際案例
-
python-docx
庫給我的直觀感受就是難!難!難! - 一大堆的私有屬性,連pycharm都沒有屬性方法提示
- 連chatgpt都頻繁寫出錯誤代碼
- 以下是我的一些總結,希望可以幫到你們
按順序讀取word文檔中的所有信息(文本,圖片,表格)
- 鬼知道
CT_P
,CT_Tbl
是什么意思 - 鬼知道還有這么一些神奇的xpath
//a:blip/@r:embed
- 磕磕巴巴的一個函數寫了4天,完成了。
def read_from_word(self, src_filepath: str):
    """Read a Word attachment and return its content in document order.

    Args:
        src_filepath: Path of the .docx file to read.

    Returns:
        A list with one entry per retained body element, in the order the
        elements appear in the document body:
          * str  - the stripped text of a paragraph that has no picture
                   (paragraphs whose stripped text is empty are skipped);
          * str  - the path of a file written under ``self.tmp_folder`` that
                   holds the first picture of a paragraph containing one
                   (the paragraph's text is not recorded in that case);
          * list - a table, as a list of rows, each row being a list with
                   exactly one string per cell.
    """
    fp = Document(src_filepath)

    # Collect the top-level body elements in order, tagged by kind. Only the
    # element classes named CT_P (paragraph) and CT_Tbl (table) are kept.
    elements = []
    for block in fp.element.body:
        class_name = block.__class__.__name__
        if class_name == 'CT_P':
            elements.append(('paragraph', block))
        elif class_name == 'CT_Tbl':
            elements.append(('table', block))

    # Build the result in the order the elements were found.
    content = []
    for index, (el_type, el) in enumerate(elements):
        if el_type == 'paragraph':
            paragraph = Paragraph(parse_xml(el.xml), parent=None)
            img = paragraph._element.xpath('.//pic:pic')
            if not img:
                txt = paragraph.text.strip()
                if txt != "":
                    content.append(txt)
            else:
                # Resolve the picture's relationship id to the image part.
                picture = img[0]
                embed = picture.xpath('.//a:blip/@r:embed')[0]
                related_part = fp.part.related_parts[embed]
                image = related_part.image
                # Save the image to disk and record the file path so that the
                # picture keeps its position in `content`. The file is always
                # named "<index>.png"; the blob is written unchanged.
                filepath = os.path.join(self.tmp_folder, str(index) + ".png")
                with open(filepath, "wb") as f:
                    f.write(image.blob)
                content.append(filepath)
        elif el_type == 'table':
            # A table is stored as a two-dimensional list.
            table = Table(el, parent=None)
            tables = []
            for row in table.rows:
                # One entry per cell: a cell holding several paragraphs must
                # not produce several entries, or the columns would shift.
                row_content = [
                    "\n".join(p.text.strip() for p in cell.paragraphs)
                    for cell in row.cells
                ]
                tables.append(row_content)
            content.append(tables)
    return content
寫入表格并增加邊框
- 寫入表格樣式需要自己設置
- 邊框也成了一個難點
def add_other_text(self, word, text: str):
    """Add ``text`` to ``word`` through ``add_text_with_style`` with single line spacing.

    Text containing any non-ASCII character uses the font 仿宋_GB2312;
    pure-ASCII text uses Times New Roman. The ``False`` and ``14`` arguments
    are forwarded to ``add_text_with_style`` unchanged (presumably a bold flag
    and a font size — confirm against that helper).
    """
    # Choose the attachment font from the character set of the text.
    font_name = u"Times New Roman" if text.isascii() else u"仿宋_GB2312"
    styled = add_text_with_style(word, text, False, font_name, 14)
    styled.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE
def merge_from_word(self, doc, data):
    """Append previously extracted content to ``doc`` and return ``doc``.

    Args:
        doc: The python-docx document to write into.
        data: Iterable of items, handled by type:
            * list - a table given as a list of rows (each a list of cell
              strings); written as a bordered table with a bold first row.
              Empty tables are skipped; a row shorter than the first row is
              padded with empty cells.
            * str ending in "png" - path of a picture; the picture is
              inserted and the file is then deleted.
            * any other str - written with ``self.add_other_text``.

    Returns:
        The same ``doc`` object, after modification.
    """
    # add_style() raises ValueError when the document already contains the
    # name (e.g. on a second call for the same document), so reuse the style
    # when it is present and only create it otherwise.
    styles = doc.styles
    if 'Table Grid' in styles:
        style = styles['Table Grid']
    else:
        style = styles.add_style('Table Grid', WD_STYLE_TYPE.TABLE)
    style.paragraph_format.alignment = WD_TABLE_ALIGNMENT.CENTER  # centered
    style.font.name = '仿宋_GB2312'
    style.font.size = Pt(16)
    style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋_GB2312')  # East Asian font
    style._element.rPr.rFonts.set(qn('w:ascii'), 'Times New Roman')  # ASCII font

    # The same thin black border is applied to every edge of every cell.
    border = {"sz": 1, "val": "single", "color": "#000000", "space": "0"}

    for text in data:
        if isinstance(text, list):
            # A table without rows or without columns cannot be created.
            if not text or not text[0]:
                continue
            table = doc.add_table(len(text), len(text[0]))
            # Table style settings.
            table.autofit = False
            table.style = 'Table Grid'
            # NOTE(review): kept from the original; whether assigning
            # `width` on the table object has any effect should be confirmed.
            table.width = Cm(15)
            for index, row in enumerate(table.rows):
                line = text[index]
                for i, cell in enumerate(row.cells):
                    # Tolerate rows that are shorter than the first row.
                    cell.text = line[i] if i < len(line) else ""
                    # Cell borders.
                    set_cell_border(
                        cell,
                        top=border,
                        bottom=border,
                        left=border,
                        right=border,
                        insideH=border,
                        end=border,
                    )
            # Make the header row bold.
            for cell in table.rows[0].cells:
                cell.paragraphs[0].runs[0].font.bold = True
        else:
            if text.endswith("png"):
                doc.add_picture(text, height=Cm(7.31), width=Cm(14.63))
                # The picture file is no longer needed once it is embedded.
                os.remove(text)
            else:
                self.add_other_text(doc, text)
    return doc
# Set the borders of a table cell.
def set_cell_border(cell, **kwargs):
    """
    Set cell's border.

    Each keyword names an edge and maps to a dict of border attributes.
    Only the edges 'left', 'top', 'right', 'bottom', 'insideH' and 'insideV'
    are honoured; any other keyword is ignored. For each edge, only the
    attributes 'sz', 'val', 'color', 'space' and 'shadow' are written.

    Usage:
    set_cell_border(
    cell,
    top={"sz": 12, "val": "single", "color": "#FF0000", "space": "0"},
    bottom={"sz": 12, "color": "#00FF00", "val": "single"},
    left={"sz": 24, "val": "dashed", "shadow": "true"},
    right={"sz": 12, "val": "dashed"},
    )
    """
    # OxmlElement is not part of the file's import list, so import it here to
    # keep the function usable on its own.
    from docx.oxml import OxmlElement

    tc = cell._tc
    tcPr = tc.get_or_add_tcPr()

    # Check for tag existence; if none is found, create one.
    tcBorders = tcPr.first_child_found_in("w:tcBorders")
    if tcBorders is None:
        tcBorders = OxmlElement('w:tcBorders')
        tcPr.append(tcBorders)

    # Go over all supported edge tags.
    for edge in ('left', 'top', 'right', 'bottom', 'insideH', 'insideV'):
        edge_data = kwargs.get(edge)
        if not edge_data:
            continue
        tag = 'w:{}'.format(edge)

        # Check for tag existence; if none is found, create one.
        element = tcBorders.find(qn(tag))
        if element is None:
            element = OxmlElement(tag)
            tcBorders.append(element)

        # Looks like the order of attributes is important.
        for key in ["sz", "val", "color", "space", "shadow"]:
            if key in edge_data:
                element.set(qn('w:{}'.format(key)), str(edge_data[key]))
最后是用到的包
import os
import re
import xlrd
from docx.shared import Cm
from docx.oxml.ns import qn
from docx import Document
from docx.oxml import parse_xml
from docx.shared import Pt
from docx.table import Table
from docx.text.paragraph import Paragraph
from xlrd import xldate_as_datetime
from docx.enum.text import WD_LINE_SPACING
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.table import WD_TABLE_ALIGNMENT
python-docx-template
python-docx-template 模塊主要依賴兩個庫, python-docx用于讀取,編寫和創建子文檔 , jinja2用于管理插入到模板docx中的標簽 。 其基本思路是利用jinja2制作Word模板,并動態向模板中插入文字、圖片、表格等內容。
安裝
pip install docxtpl
模板語法
由于使用的jinja2模板,所以模板語法基本如下:
## 迭代列表
{% for var in list %}
{{ var }}
循環邏輯
{{loop.index}}表示當前是第幾次循環,從1開始
{% endfor %}
## 迭代字典
{% for key, value in dict.items() %}
{{ key }} {{ value }}
{% endfor %}
## 另一種迭代字典的方法,這種用的比較多
{% for var in dict %}
{{ var.key }} #key為字典的鍵
{% endfor %}
{% if score>=90 %} <p>優秀</p>
{% elif score>=80 %} <p>良好</p>
{% elif score>=60 %} <p>及格</p>
{% else %} <p>不及格</p>
{% endif %}
{% if val.isascii() %}
{{ val }}
{% else %}
fuck off
{% endif %}
插入圖片
- 準備word,寫入如下模板
這是一個模板:{{ template }}
這是一個Word文件
這里插入一個圖片:{{ myimage }}
- 利用python渲染
from docxtpl import InlineImage, DocxTemplate
from docx.shared import Mm
import jinja2
# Open the docx template file.
tpl = DocxTemplate('test.docx')
# Data to fill in; the keys match the {{ template }} and {{ myimage }} tags.
context = {
'template': 'Hello World!',
'myimage': InlineImage(tpl, 'happy.jpg', width=Mm(20)),
}
jinja_env = jinja2.Environment(autoescape=True)
# Fill the template with the data.
tpl.render(context, jinja_env)
# Save the rendered document.
tpl.save('test_temp.docx')
操作表格
from docxtpl import DocxTemplate, RichText

tpl = DocxTemplate('templates/cellbg_tpl.docx')

# One dict per alert. 'desc' is a RichText value and 'bg' a hex colour string;
# how each key is used is decided by the template.
context = {
    'alerts': [
        {
            'date': '2015-03-10',
            'desc': RichText('Very critical alert', color='FF0000', bold=True),
            'type': 'CRITICAL',
            'bg': 'FF0000',
        },
        {
            'date': '2015-03-11',
            'desc': RichText('Just a warning'),
            'type': 'WARNING',
            'bg': 'FFDD00',
        },
        {
            'date': '2015-03-12',
            'desc': RichText('Information'),
            'type': 'INFO',
            'bg': '8888FF',
        },
        {
            'date': '2015-03-13',
            'desc': RichText('Debug trace'),
            'type': 'DEBUG',
            'bg': 'FF00FF',
        },
    ],
}

# Render the template with the context and write the result.
tpl.render(context)
tpl.save('output/cellbg.docx')
合并word文檔docxcompose
安裝
pip install docxcompose
使用
import docx
import os
from glob import glob
from docxcompose.composer import Composer
base_dir = "C:\\Users\\KK.JustDoIT\\Downloads\\匯總\\報(bào)修單\\日常維修-報(bào)修單-2月"
save_path = "C:\\Users\\KK.JustDoIT\\Downloads\\匯總\\報(bào)修單"
def combine_all_docx(files_list):
    """Append every document in ``files_list`` to a new blank document.

    The documents are appended in list order and the result is saved as
    ``merge.docx`` inside the module-level ``save_path`` directory.
    """
    composer = Composer(docx.Document())
    for docx_path in files_list:
        composer.append(docx.Document(docx_path))
    composer.save(os.path.join(save_path, 'merge.docx'))
# Run: collect every .docx file directly under base_dir and merge them.
path_list = glob(os.path.join(base_dir, '*.docx'))
combine_all_docx(path_list)
字體文件ttf
docx設置字體失敗,那么可能是因為沒有找到字體文件。
linux中存放位置
/usr/share/fonts
安裝字體:
-
mkdir chinese
-
將下載的字體拷貝到chinese目錄下
-
執行:
1、mkfontscale 2、mkfontdir 3、fc-cache
-
查看:
fc-list :lang=zh
文章來源:http://www.zghlxwxcb.cn/news/detail-555002.html
windows中存放位置
將字體文件拖入即可。文章來源地址http://www.zghlxwxcb.cn/news/detail-555002.html
到了這里,關于python操作word——python-docx和python-docx-template模塊的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!