4.1-python操作word/pdf 文件

发布于:2025-04-02 ⋅ 阅读:(16) ⋅ 点赞:(0)

1.读取word文件

首先安装软件包  pip3 install python-docx 

from docx import Document


import os 

# Build the path of the document, expected in the current working directory.
path = os.path.join(os.getcwd(), '你的文档名字.docx')

# Load the document.
doc = Document(path)

# Print the text of every paragraph, one paragraph per line.
for paragraph in doc.paragraphs:
    print(paragraph.text)

# Print every table row on its own line: a leading space, then each cell's
# text followed by a '|' separator.
for table in doc.tables:
    for row in table.rows:
        print(' ' + ''.join(cell.text + '|' for cell in row.cells))

2. 简历筛选的例子

class ReadDoc(object):
    """Collect the paragraph text and the table text of a .docx document.

    Attributes set by the constructor:
        doc: the document object returned by ``Document(path)``.
        p_text: paragraph text produced by the last ``get_patra()`` call.
        table_text: table text produced by the last ``get_table()`` call.
    """

    def __init__(self, path):
        """Open the document at *path*."""
        self.doc = Document(path)
        self.p_text = ''
        self.table_text = ''

    def get_patra(self):
        """Return the text of all paragraphs, each followed by a newline.

        The text is rebuilt on every call, so calling the method more than
        once never duplicates content. (The method name is kept as-is for
        existing callers.)
        """
        self.p_text = ''.join(para.text + '\n' for para in self.doc.paragraphs)
        return self.p_text

    def get_table(self):
        """Return the text of every table cell, each followed by '/'.

        Cells are visited table by table, row by row. The text is rebuilt on
        every call, so repeated calls return the same value.
        """
        self.table_text = ''.join(
            cell.text + '/'
            for table in self.doc.tables
            for row in table.rows
            for cell in row.cells
        )
        return self.table_text

def search_word(path, targets):
    """Return the .docx files matching *path* that contain every target word.

    Args:
        path: A glob pattern (for example ``/some/dir/*``) selecting the
            candidate files.
        targets: Iterable of keywords; a file is kept only when each keyword
            occurs in its paragraph or table text (case-sensitive substring
            match).

    Returns:
        A list with the paths of the matching files, in glob order.
    """
    # Imported locally because the module never imports glob at top level.
    import glob

    final_result = []
    for file in glob.glob(path):
        # Only regular files with a real ".docx" extension are candidates.
        if not (os.path.isfile(file) and file.endswith('.docx')):
            continue
        # "~$name.docx" is the lock file Word creates while a document is
        # open; it is not a valid document, so skip it.
        if os.path.basename(file).startswith('~$'):
            continue
        doc = ReadDoc(file)
        all_text = doc.get_patra() + doc.get_table()
        # Keep the file only when no keyword is missing.
        if all(target in all_text for target in targets):
            final_result.append(file)
    return final_result



if __name__ == '__main__':
    # Scan every entry of the current working directory and print the files
    # that mention all of the keywords.
    path = os.path.join(os.getcwd(), '*')
    targets = ['python', 'golang']
    print(search_word(path, targets))

3. 生成word文件

from docx import Document
from docx.shared import Inches

doc = Document()

# Add a heading.
# Arg 1: heading text.
# Arg 2: heading level 0-9 (default 1); the font size shrinks as the level grows.
title = doc.add_heading("My Title", 0)

# Append more text to the heading.
title.add_run("123")

# Add a paragraph (arg 1: paragraph text), then append a run to it.
p = doc.add_paragraph("嘿嘿嘿")
p.add_run('fjdsaklfjalkfjakls')

# Add a picture.
# Arg 1: image path; width: displayed width.
doc.add_picture("1.jpg", width=Inches(2))

# Add a table.
# The header labels; the table gets one column per label.
table_title = ["name", "age", "sex"]
table = doc.add_table(rows=1, cols=len(table_title))  # one header row, three columns
table.style = "Light List Accent 1"  # table style

# Fill the header row.
for i, heading in enumerate(table_title):
    table.cell(0, i).text = heading

# Table body data.
data = [
    ("方杰", "76", "man"),
    ("哥斯拉", "1000", 'master'),
    ("金刚", "900", 'master'),
]

# Append one row per record and fill its cells with the body data.
for record in data:
    row_cells = table.add_row().cells  # add a row and get its cells
    for i, value in enumerate(record):
        row_cells[i].text = value
# Insert a page break, then start the next page with another heading.
doc.add_page_break()
title1 = doc.add_heading("My Title2", 0)

# Save the document.
doc.save("test.docx")

4. 生成pdf文件

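首先安装 pdfkit:pip3 install pdfkit

读取 pdf 还需要安装 pdfplumber:pip3 install pdfplumber
pdfkit 依赖 wkhtmltopdf:到官网下载页(https://wkhtmltopdf.org/downloads.html)下载对应系统的安装包并安装,安装后把它的 bin 目录配置到环境变量 PATH 中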

import pdfkit 


# Convert an HTML file to a PDF file:
# first the path of the HTML file, then the path of the PDF to create.
pdfkit.from_file('htmldemo.html', output_path='test0.pdf')


# Convert web pages to a PDF.
# Converting URLs is very slow, so the call is left disabled.
# pdfkit.from_url(['https://www.baidu.com','https://www.jd.com'],'test1.pdf')


# Convert an HTML string to a PDF file.

html = '''
<html>
    <head>
        <meta charset="utf-8">
        <title>测试</title>
    </head>
    <body>
        <h1>你好</h1>
        <p>这是一个测试文件</p>
    </body>
</html>
'''

pdfkit.from_string(html, output_path='test2.pdf')

5.word转pdf

from PIL.ImageChops import constant
from win32com.client import constants,gencache

def createPdf(wordPath, pdfPath):
    """Export a Word document to PDF by driving the Word application over COM.

    Args:
        wordPath: Path of the source Word document.
        pdfPath: Path of the PDF file to create.

    The export keeps document markup and creates bookmarks from headings.
    Word is always quit afterwards, even when opening or exporting fails.
    """
    # Imported locally so the function does not depend on module-level imports.
    import os

    # Word runs as its own process, so a relative path would not be resolved
    # against this script's working directory; make both paths absolute.
    wordPath = os.path.abspath(wordPath)
    pdfPath = os.path.abspath(pdfPath)

    # Start (or attach to) the Word application.
    word = gencache.EnsureDispatch('Word.Application')
    try:
        # Open the document read-only; it is only a conversion source.
        doc = word.Documents.Open(wordPath, ReadOnly=1)
        # Convert to PDF with the export settings described above.
        doc.ExportAsFixedFormat(
            pdfPath,
            constants.wdExportFormatPDF,
            Item=constants.wdExportDocumentWithMarkup,
            CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
        )
    finally:
        # Quit without saving so a failure never leaves Word running.
        word.Quit(constants.wdDoNotSaveChanges)


if __name__ == '__main__':
    # Convert the sample resume in the example folder to a PDF next to it.
    path = 'D:/workspace/demo/PythonOfficeAutomation/example2/'
    source_docx = path + '简历1.docx'
    target_pdf = path + 'pdf1.pdf'
    createPdf(source_docx, target_pdf)

6. 读取pdf文件

def read_pdf(path):
    """Print the extracted text of every page of a PDF file, in page order.

    Args:
        path: Path of the PDF file to read.
    """
    import pdfplumber

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # NOTE(review): extract_text() may give None for a page without
            # text, depending on the pdfplumber version - confirm. Print an
            # empty line in that case instead of the literal "None".
            print(page.extract_text() or '')


if __name__ == '__main__':
    # Script entry point: print the text of pdf1.pdf.
    read_pdf('pdf1.pdf')

7.合并pdf文件




def merge_pdfs(path1, path2, output_path='./合并pdf.pdf'):
    """Merge two PDF files into one: all pages of *path1*, then all of *path2*.

    Args:
        path1: Path of the first PDF.
        path2: Path of the second PDF.
        output_path: Path of the merged PDF to write; defaults to
            './合并pdf.pdf'.
    """
    from contextlib import ExitStack
    from PyPDF2 import PdfReader, PdfWriter

    writer = PdfWriter()
    # The inputs stay open until the merged file has been written, and
    # ExitStack then closes every one of them, even on error.
    with ExitStack() as stack:
        for path in (path1, path2):
            source = stack.enter_context(open(path, 'rb'))
            for page in PdfReader(source).pages:
                writer.add_page(page)

        with open(output_path, 'wb') as out:
            writer.write(out)


if __name__ == '__main__':
    # Script entry point: merge pdf1.pdf and test0.pdf into one file.
    merge_pdfs('pdf1.pdf','test0.pdf')

8.拆分pdf文件


def split_pdf(path):
    """Split a PDF into one file per page.

    Page N (counting from 1) is written to './拆分_N.pdf'.

    Args:
        path: Path of the PDF file to split.
    """
    from PyPDF2 import PdfReader, PdfWriter

    # Keep the source open while its pages are written out, then close it.
    with open(path, 'rb') as source:
        for number, page in enumerate(PdfReader(source).pages, start=1):
            # A fresh writer per page, so each output holds exactly one page.
            writer = PdfWriter()
            writer.add_page(page)
            with open(f'./拆分_{number}.pdf', 'wb') as out:
                writer.write(out)



if __name__ == '__main__':
    # Script entry point: split the merged PDF into single-page files.
    split_pdf('./合并pdf.pdf')

9.加密解密


def jiami(path, password='123456', output_path='./加密pdf.pdf'):
    """Write a password-protected copy of a PDF file.

    Args:
        path: Path of the PDF file to encrypt.
        password: Password to set on the copy; defaults to '123456'.
        output_path: Path of the encrypted PDF to write; defaults to
            './加密pdf.pdf'.

    To process a source file that is itself encrypted, the reader would
    first need ``reader.decrypt(<password>)`` before its pages are copied.
    """
    from PyPDF2 import PdfReader, PdfWriter

    writer = PdfWriter()
    # Keep the source open until the encrypted copy is written, then close it.
    with open(path, 'rb') as source:
        for page in PdfReader(source).pages:
            writer.add_page(page)

        writer.encrypt(password)  # set the password on the copy

        with open(output_path, 'wb') as out:
            writer.write(out)



if __name__ == '__main__':
    # Script entry point: encrypt the first split page.
    jiami('拆分_1.pdf')


网站公告

今日签到

点亮在社区的每一天
去签到