Python办公自动化（2）对word&pdf的操作-EW帮帮网

一、操作word文档

终端下载操作word文件的工具库：

pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple python-docx

1.遍历文档中内容

paragraphs：段落属性，返回由段落对象组成的列表；遍历其中每一个段落对象，通过其 text 属性获得文本

# 操作docx文档的工具
from docx import Document
# Open the requirements document that sits next to this script.
doc = Document('需求规约v1.0.docx')
# Walk the body paragraphs in order and print the plain text of each one.
for paragraph in doc.paragraphs:
    print(paragraph.text)

2.遍历文档中所有表格

from docx import Document
doc = Document('需求规约v1.0.docx')
# Print every table of the document, one output line per table row.
for table in doc.tables:
    for row in table.rows:
        # Join the cell texts of the row with '|' so the columns stay
        # distinguishable on the console. The row string is built here
        # directly; the original printed a variable that was never assigned.
        print('|'.join(cell.text for cell in row.cells))

3.检索文档中的关键字

批量查找word文档，检索哪些文档中含有关键字。

1.引库

无需多言

import glob
from docx import Document

2.创建ReadDoc类

用于读取 Word 文档中的段落和表格内容，并将它们分别存储到类的属性中。

class ReadDoc(object):
    """Load a Word document and cache its paragraph and table text.

    Attributes populated by the constructor:
        doc: the object returned by ``Document(path)``.
        p_text: text of every paragraph, each followed by a newline.
        table_text: one line per table row; every cell text is followed
            by a comma and every row by a newline.
    """

    def __init__(self, path):
        """Open the document at *path* and collect its text immediately."""
        self.doc = Document(path)
        self.p_text = ''
        self.table_text = ''
        self.get_para()
        self.get_table()

    def get_para(self):
        """Append each paragraph's text plus a newline to ``p_text``."""
        self.p_text += ''.join(
            para.text + '\n' for para in self.doc.paragraphs
        )

    def get_table(self):
        """Append every table row to ``table_text`` as comma-terminated cells."""
        row_lines = []
        for tbl in self.doc.tables:
            for row in tbl.rows:
                cells_joined = ''.join(cell.text + ',' for cell in row.cells)
                row_lines.append(cells_joined + '\n')
        self.table_text += ''.join(row_lines)

3.创建search_word函数

用于在指定路径下的 Word 文档中查找是否包含所有指定的关键字。

def search_word(path,targets):
    """Return the .docx files matched by *path* that contain every keyword.

    Args:
        path: glob pattern selecting the candidates, e.g. ``/some/dir/*``.
        targets: keyword strings; a file is reported only when all of them
            occur in its combined paragraph and table text.

    Returns:
        List of matching file paths, in the order ``glob.glob`` yields them.
    """
    # Imported here because the surrounding file only imports ``glob``;
    # reaching ``os`` through ``glob.os`` depends on an undocumented detail.
    import os

    final_result = []
    for candidate in glob.glob(path):
        # Only regular files with a .docx suffix are Word documents.
        if not os.path.isfile(candidate) or not candidate.endswith('.docx'):
            continue
        # Skip the "~$name.docx" owner files Word keeps beside an open
        # document: they are not real documents and cannot be parsed.
        if os.path.basename(candidate).startswith('~$'):
            continue
        doc = ReadDoc(candidate)
        all_text = doc.p_text + doc.table_text
        if all(target in all_text for target in targets):
            final_result.append(candidate)
    return final_result

4.主程序

用于在当前目录下的所有文件中查找包含指定关键字的 Word 文档。

if __name__ == '__main__':
    # Use the ``os`` module directly rather than the undocumented ``glob.os``.
    import os

    # Pattern matching every entry of the current working directory.
    path = os.path.join(os.getcwd(),'*')
    # Report the .docx files that contain all three keywords.
    res = search_word(path,['python','golang','最佳'])
    print(res)

4.生成word文件

1.创建一个docx文件

from docx import Document
# Start a new, empty document.
doc = Document()
# ... add the document content here ...
doc.save('text.docx') # Write the document to disk

2.添加/追加标题

添加标题
参数1：标题内容；参数2：标题级别0-9（0为文档总标题样式，1-9对应各级标题），并非字号

# Add a heading with the given text; the second argument is the heading level.
title = doc.add_heading('My Title',0)

追加标题
即在同一个标题段落内换行追加文字（add_run追加的内容仍属于该标题段落，并不会生成新的下级标题）

# Append more text to the heading created above; the leading '\n' puts it on a new line.
title.add_run('\n123456')

3.添加段落

# Add a paragraph with some initial text.
p = doc.add_paragraph('今天下雨辣')
# Append a second run of text to that same paragraph, starting on a new line.
p.add_run('\n其实也可能不下雨')

4.添加图片

参数1：图片的名称；参数2：图片的宽度，Inches：英寸单位

from docx.shared import Inches

# Insert the image file with a width of 2 inches.
doc.add_picture('tupian.jpg',width=Inches(2))

5.添加表格

# Column labels for the header row.
table_title = ['name','age','sex']
# Create the table with a single row (the header) and three columns.
table = doc.add_table(rows=1,cols=3)
# Fill the header row, pairing each cell with its label.
title_cells = table.rows[0].cells
for header_cell, label in zip(title_cells, table_title):
    header_cell.text = label
# Body records: (name, age, sex).
data = [
    ('Adela','18','woman'),
    ('Hecate','15','woman'),
    ('Hela','14','woman'),
]
# Append one table row per record and copy the record's values into its cells.
for d in data:
    row_cells = table.add_row().cells
    for body_cell, value in zip(row_cells, d):
        body_cell.text = value

6.添加分页

# Insert a page break; content added afterwards starts on the next page.
doc.add_page_break()
# title1 = doc.add_heading('My Title2',0)  (example: add any content after the break)

5.设置word样式

1.创建一个docx文件

from docx import Document
# ... import any further helper modules here ...
# Start a new, empty document.
doc = Document()
# ... add content and styling here ...
# Write the document to disk.
doc.save('test.docx')

2. 定义全局样式

from docx.shared import RGBColor, Pt
from docx.oxml.ns import qn
# Adjust the 'Normal' style so the settings apply to ordinary body text.
style = doc.styles['Normal']
style.font.name='微软雅黑'
# font.name does not fill the East Asian font slot, so Chinese text would
# keep its default font; set w:eastAsia explicitly (same technique the
# notice generator in this article uses for its title run).
style.element.rPr.rFonts.set(qn('w:eastAsia'),'微软雅黑')
# Red, 16 pt text.
style.font.color.rgb = RGBColor(255,0,0)
style.font.size = Pt(16)
doc.add_paragraph('Java语言和Python已经成为开发者的必备语言')

3.定义标题样式

from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
# Add a heading and keep a handle to it for styling.
title = doc.add_heading('My Title',0)
# Centre the heading paragraph horizontally.
title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
# NOTE(review): the size is set on the paragraph's style object, so presumably
# every paragraph using that style is affected - confirm.
title.style.font.size = Pt(20)

4.添加表格样式

from docx.enum.style import WD_STYLE_TYPE
# Print the names of the table styles available in this document; this is
# only a lookup aid for choosing a value for the ``style`` argument below.
for i in doc.styles:
    if i.type == WD_STYLE_TYPE.TABLE:
        print(i.name)
# Build the table with a named table style. The highlight markers that
# surrounded the ``style`` argument in the article were not valid Python
# and have been removed.
table_title = ['name','age','sex'] # header labels
table = doc.add_table(rows=1,cols=3,style='Colorful Grid Accent 3') # one header row, three columns
title_cells = table.rows[0].cells # cells of the header row
title_cells[0].text = table_title[0]
title_cells[1].text = table_title[1]
title_cells[2].text = table_title[2]
# Body records: (name, age, sex).
data = [
    ('Adela','18','woman'),
    ('Hecate','15','woman'),
    ('Hela','14','woman'),
]
# Append one row per record and fill its cells.
for d in data:
    row_cells = table.add_row().cells # add a row and get its cells
    row_cells[0].text = d[0] #name
    row_cells[1].text = d[1] #age
    row_cells[2].text = d[2] #sex

二、操作PDF文档

1.安装工具

1.wkhtmltopdf

将 HTML 页面或网页转换为 PDF 文件工具：

下载网址：https://wkhtmltopdf.org/downloads.html

配置环境变量后可在pycharm终端中查询版本：

wkhtmltopdf -V

2.pdfkit

将 HTML、CSS 和 JavaScript 转换为 PDF 格式的工具：

在pycharm终端下载工具库即可：

pip3 install pdfkit

3.pywin32

在 Python 环境中直接使用 Windows 的各种功能:

包括操作系统的文件系统、注册表、图形用户界面等

pip3 install pywin32

4.pypdf2

用于处理 PDF 文件的 Python 库:

它提供了丰富的功能，包括读取、合并、拆分、加密、解密 PDF 文件等。

pip3 install pypdf2

5.pdfplumber

也是用于处理 PDF 文件的 Python 库:

主要功能是从 PDF 文件中提取文本、表格、图像等数据。

pip3 install pdfplumber

2.生成PDF文件

1.html转换成pdf

参数1：html文件；参数2：转换pdf文件的名字

import pdfkit
# Convert the local HTML file into a PDF (first argument: input HTML, second: output PDF).
pdfkit.from_file('htmldemo.html','test0.pdf')

2.网址的html转换成pdf

# Render the listed web pages into a single PDF (needs network access).
# The author reported that this call failed; the second URL was malformed
# ('https://www/jd.com') and is corrected here.
pdfkit.from_url(['https://www.baidu.com','https://www.jd.com'],'test1.pdf')

3.字符串转pdf

import pdfkit
# HTML source kept in memory; it declares its charset as utf-8 in a <meta> tag.
html = '''
<html>
    <head>
        <meta charset="utf-8"/>
    </head>
    <body>
        <p>你好</p>
    </body>
</html>
'''
# Convert the HTML string into a PDF file.
pdfkit.from_string(html,'test2.pdf')

3.生成通知书

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor, Inches

def create_doc(car_no,year,month,day,hour,minute,money,type_info,output_path='通知书.docx'):
    """Generate a vehicle-violation penalty notice as a Word document.

    Args:
        car_no: plate number without the '辽A' prefix, which is added here.
        year, month, day, hour, minute: time of the violation.
        money: amount of the fine.
        type_info: description of the violation.
        output_path: where the document is saved; defaults to the file
            name that was previously hard-coded.
    """
    doc = Document()

    # Title line: 30 pt, red, centred.
    title = doc.add_paragraph()
    p1 = title.add_run('车辆违章处罚通知单')
    p1.font.size = Pt(30)
    p1.font.color.rgb = RGBColor(255,0,0)
    # Name the font explicitly instead of writing an empty font name, then
    # set the East Asian slot too, which font.name does not fill.
    p1.font.name = '黑体'
    p1._element.rPr.rFonts.set(qn('w:eastAsia'),'黑体')
    title.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # NOTE(review): the wording below looks like it contains transcription
    # typos; it is user-facing text and is left unchanged - confirm it.
    info = f'''辽A{car_no}车于{year}年{month}月{day}日{hour}时{minute}分在营过程中出现{type_info}(违章)现象。公可按票安企法规和公司相关制度发定决定对该车驾驶员处以{money}元款，要求你在今后的营运过程中严格按照相关法律法规运行。(注，罚款金请在返程后立即到公司缴纳)
                                                                
                                                                        驾驶员签字:                  年       月       日     '''
    # Body paragraph with a first-line indent.
    content = doc.add_paragraph()
    content.add_run(info)
    content.paragraph_format.first_line_indent = Inches(0.25)

    doc.save(output_path)

if __name__ == '__main__':
    # Sample violation record used to produce one demo notice.
    sample = dict(
        car_no='123456',
        year=2030,
        month=8,
        day=8,
        hour=16,
        minute=25,
        money=200,
        type_info='违停',
    )
    create_doc(**sample)

4.通过模板生成文档

from docx import Document
import os
# One record per notice: plate, year, month, day, hour, minute, violation, fine.
# The position of each value is the number of the template placeholder
# ('{0}' ... '{7}') it replaces.
infos = [
    ['辽A00001',2030,12,12,12,12,'违停',200],
    ['辽A00002',2030,11,11,11,11,'闯红灯',500],
    ['辽A00003',2030,10,10,10,10,'压线',200],
]
# Create the output folder once; exist_ok avoids the separate exists() check.
os.makedirs('./通知', exist_ok=True)
for info in infos:
    # Reload the template for every record so each notice starts clean.
    doc = Document('word_模板.docx')
    for p in doc.paragraphs:
        for run in p.runs:
            new_text = run.text
            for index, value in enumerate(info):
                new_text = new_text.replace('{%d}' % index, str(value))
            # Assigning run.text replaces the whole content of the run, so
            # only write back when a placeholder was actually substituted.
            if new_text != run.text:
                run.text = new_text

    # The notice is named after the plate number.
    doc.save(f'./通知/{info[0]}.docx')

5.word转换pdf文件

from win32com.client import constants,gencache

def createPdf(wordPath,pdfPath):
    """Convert a Word document to PDF through the Word COM interface.

    Args:
        wordPath: path of the source .docx file.
        pdfPath: path of the PDF file to create.

    Requires Windows with Microsoft Word installed (pywin32).
    """
    import os

    # Start (or attach to) Word through COM.
    word = gencache.EnsureDispatch('Word.Application')
    try:
        # Pass absolute paths: Word presumably resolves relative paths
        # against its own working directory rather than the script's.
        doc = word.Documents.Open(os.path.abspath(wordPath),ReadOnly=1)
        # Export as PDF, keeping markup and creating bookmarks from headings.
        doc.ExportAsFixedFormat(
            os.path.abspath(pdfPath),
            constants.wdExportFormatPDF,
            Item=constants.wdExportDocumentWithMarkup,
            CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
        )
    finally:
        # Always shut Word down, even when opening or exporting fails, so
        # no Word process is left running in the background.
        word.Quit(constants.wdDoNotSaveChanges)

if __name__ == '__main__':
    # Folder holding the demo files; both paths are built from it.
    base_dir = 'D:/workspace/demo/PythonOfficeAutomation/example2/'
    source_docx = base_dir + '简历1.docx'
    target_pdf = base_dir + '简历1copy.pdf'
    createPdf(source_docx, target_pdf)

6.读取pdf文件

def read_pdf2(path):
    """Print the extracted text of every page of the PDF at *path*."""
    import pdfplumber

    with pdfplumber.open(path) as document:
        # Visit the pages in order and print what each one yields.
        for current_page in document.pages:
            page_text = current_page.extract_text()
            print(page_text)
if __name__ == '__main__':
    # Demo: dump the text of the PDF produced by the Word-to-PDF example.
    sample_pdf = '简历1copy.pdf'
    read_pdf2(sample_pdf)

7.合并pdf文件

def merger_pdf(path1,path2):
    """Concatenate two PDF files into './合并pdf.pdf'.

    Args:
        path1: PDF whose pages come first.
        path2: PDF whose pages are appended after those of *path1*.
    """
    from PyPDF2 import PdfWriter,PdfReader

    write = PdfWriter()
    for path in (path1, path2):
        # Hand the path itself to PdfReader instead of an open() handle
        # that is never closed.
        for page in PdfReader(path).pages:
            write.add_page(page)

    # Write the result once, after all pages have been collected; the
    # original wrote the output file again on every loop iteration.
    with open('./合并pdf.pdf','wb') as out:
        write.write(out)
if __name__ == '__main__':
    # Demo: merge the two PDFs produced by the earlier examples.
    first_pdf, second_pdf = '简历1copy.pdf', 'test0.pdf'
    merger_pdf(first_pdf, second_pdf)

8.拆分pdf文件

def chaifen_pdf(path):
    """Split the PDF at *path* into one single-page file per page.

    The pages are written to './拆分_1.pdf', './拆分_2.pdf', ... in the
    current working directory.
    """
    from PyPDF2 import PdfWriter,PdfReader

    # Hand the path itself to PdfReader instead of an open() handle that
    # is never closed.
    pdf = PdfReader(path)
    for i,page in enumerate(pdf.pages):
        # A fresh writer per page so each output holds exactly one page.
        writer = PdfWriter()
        writer.add_page(page)
        with open(f'./拆分_{i+1}.pdf','wb') as out:
            writer.write(out)
if __name__ == '__main__':
    # Demo: split the merged PDF produced by the previous example.
    merged_pdf = './合并pdf.pdf'
    chaifen_pdf(merged_pdf)

9.加密解密pdf文件

def jiami(path, password='123456'):
    """Write a password-protected copy of the PDF at *path* to '加密pdf.pdf'.

    Args:
        path: PDF file to protect.
        password: password set on the copy; the default keeps the value
            that was previously hard-coded. Use a real secret outside of
            demos.
    """
    from PyPDF2 import PdfWriter,PdfReader

    # Hand the path itself to PdfReader instead of an open() handle that
    # is never closed.
    pdf = PdfReader(path)
    # If the source PDF is itself encrypted, unlock it first:
    # pdf.decrypt('its-password')
    writer = PdfWriter()
    for page in pdf.pages:
        writer.add_page(page)
    # Set the password once all pages are in the writer.
    writer.encrypt(password)
    with open('加密pdf.pdf','wb') as target:
        writer.write(target)

if __name__ == '__main__':
    # Demo: protect the first page produced by the split example.
    single_page_pdf = '拆分_1.pdf'
    jiami(single_page_pdf)

Python办公自动化（2）对word&pdf的操作

一、操作word文档

1.遍历文档中内容

2.遍历文档中所有表格

3.检索文档中的关键字

1.引库

2.创建ReadDoc类

3.创建search_word函数

4.主程序

4.生成word文件

1.创建一个docx文件

2.添加/追加标题

3.添加段落

4.添加图片

5.添加表格

6.添加分页

5.设置word样式

1.创建一个docx文件

2. 定义全局样式

3.定义标题样式

4.添加表格样式

二、操作PDF文档

1.安装工具

1.wkhtmltopdf

2.pdfkit

3.pywin32

4.pypdf2

5.pdfplumber

2.生成PDF文件

1.html转换成pdf

2.网址的html转换成pdf

3.字符串转pdf

3.生成通知书

4.通过模板生成文档

5.word转换pdf文件

6.读取pdf文件

7.合并pdf文件

8.拆分pdf文件

9.加密解密pdf文件

网站公告

今日签到

热门文章

最新发布