1.读取word文件
首先安装软件包 pip3 install python-docx
from docx import Document
import os
# Build the path to the document, expected in the current working directory.
path = os.path.join(os.getcwd(), '你的文档名字.docx')

# Load the document.
doc = Document(path)

# Print the text of every paragraph.
for p in doc.paragraphs:
    print(p.text)

# Print every table, one output line per table row.
for t in doc.tables:
    for row in t.rows:
        # A row is rendered as a leading space followed by each cell's text,
        # every cell terminated by '|'.
        _row_str = ' ' + ''.join(cell.text + '|' for cell in row.cells)
        print(_row_str)
2. 简历筛选的例子
class ReadDoc(object):
    """Extract the plain text of a .docx file's paragraphs and tables.

    Attributes:
        doc: The document object returned by ``Document(path)``.
        p_text: Paragraph text produced by the latest ``get_patra`` call.
        table_text: Table text produced by the latest ``get_table`` call.
    """

    def __init__(self, path):
        """Load the document located at ``path``."""
        self.doc = Document(path)
        self.p_text = ''
        self.table_text = ''

    def get_patra(self):
        """Return the text of all paragraphs, each followed by a newline.

        The text is rebuilt from scratch on every call, so calling this
        method repeatedly never duplicates content. The result is also
        stored in ``self.p_text``.
        """
        self.p_text = ''.join(
            para.text + '\n' for para in self.doc.paragraphs
        )
        return self.p_text

    def get_table(self):
        """Return the text of all table cells, each followed by '/'.

        Cells are visited table by table, row by row. The text is rebuilt
        from scratch on every call and also stored in ``self.table_text``.
        """
        parts = []
        for table in self.doc.tables:
            for row in table.rows:
                parts.extend(cell.text + '/' for cell in row.cells)
        self.table_text = ''.join(parts)
        return self.table_text
def search_word(path, targets):
    """Return the .docx files matching ``path`` that contain every target.

    Args:
        path: A glob pattern (for example ``/some/dir/*``) selecting the
            candidate files.
        targets: An iterable of keywords. A file qualifies only when all of
            them occur in its paragraph or table text. Matching is
            case-sensitive.

    Returns:
        A list with the paths of the qualifying files, in glob order.
    """
    # Imported locally: the surrounding file never imports glob itself.
    import glob

    final_result = []
    for file in glob.glob(path):
        # Only regular .docx files can be parsed; directories and other
        # file types must never end up in the result.
        if not (os.path.isfile(file) and file.endswith('.docx')):
            continue
        doc = ReadDoc(file)
        all_text = doc.get_patra() + doc.get_table()
        # Keep the file only when no keyword is missing from its text.
        if all(target in all_text for target in targets):
            final_result.append(file)
    return final_result
if __name__ == '__main__':
    # Consider every entry of the current working directory.
    path = os.path.join(os.getcwd(), '*')
    # A document is reported only when it contains all of these keywords.
    targets = ['python', 'golang']
    res = search_word(path, targets)
    # Show the list of matching files.
    print(res)
3. 生成word文件
from docx import Document
from docx.shared import Inches
# Start from an empty document.
doc = Document()

# Add a heading.
# Argument 1: heading text.
# Argument 2: heading level 0-9 (default 1); the font shrinks as the level grows.
title = doc.add_heading("My Title", 0)
# Append more text to the heading.
title.add_run("123")

# Add a paragraph (argument 1: paragraph text) and append a run to it.
p = doc.add_paragraph("嘿嘿嘿")
p.add_run('fjdsaklfjalkfjakls')

# Add a picture.
# Argument 1: image path.
# Argument 2: display width.
doc.add_picture("1.jpg", width=Inches(2))

# Add a table whose first row holds the column headers.
table_title = ["name", "age", "sex"]
# One header row; the column count follows the header list.
table = doc.add_table(rows=1, cols=len(table_title))
table.style = "Light List Accent 1"  # table style

# Fill the header row.
for i, heading in enumerate(table_title):
    table.cell(0, i).text = heading

# Body rows of the table.
data = [
    ("方杰", "76", "man"),
    ("哥斯拉", "1000", 'master'),
    ("金刚", "900", 'master'),
]

# Append one table row per record and fill in its cells.
for d in data:
    row_cells = table.add_row().cells  # add a row and fetch its cells
    for i, value in enumerate(d):
        row_cells[i].text = value

# Insert a page break, then a heading on the new page.
doc.add_page_break()
title1 = doc.add_heading("My Title2", 0)

# Save the document.
doc.save("test.docx")
4. 生成pdf文件
首先安装依赖包:pip3 install pdfkit pdfplumber
pdfkit 依赖 wkhtmltopdf:到 https://wkhtmltopdf.org/downloads.html 下载与你的系统对应的安装包并安装,
安装后把 wkhtmltopdf 的 bin 目录加入 PATH 环境变量。
import pdfkit
# Convert an HTML file to a PDF file.
# Argument 1: path of the source HTML file.
# Argument 2: path of the PDF file to create.
pdfkit.from_file('htmldemo.html', 'test0.pdf')

# Web pages can be converted too, but fetching the URLs is very slow:
# pdfkit.from_url(['https://www.baidu.com','https://www.jd.com'],'test1.pdf')

# Convert an HTML string to a PDF file.
html = '''
<html>
<head>
<meta charset="utf-8">
<title>测试</title>
</head>
<body>
<h1>你好</h1>
<p>这是一个测试文件</p>
</body>
</html>
'''
pdfkit.from_string(html, 'test2.pdf')
5.word转pdf
需要在 Windows 上运行,并且已安装 Microsoft Word;首先安装软件包 pip3 install pywin32
from PIL.ImageChops import constant
from win32com.client import constants,gencache
def createPdf(wordPath, pdfPath):
    """Convert a Word document to PDF through the Word COM application.

    Args:
        wordPath: Path of the source Word document.
        pdfPath: Path of the PDF file to create.

    The document is closed and Word is shut down even when opening or
    exporting fails, so no Word process is left running in the background.
    """
    import os

    # Word resolves relative paths against its own working directory, not
    # this script's, so hand it absolute paths.
    wordPath = os.path.abspath(wordPath)
    pdfPath = os.path.abspath(pdfPath)

    # Start (or attach to) the Word application.
    word = gencache.EnsureDispatch('Word.Application')
    try:
        # Open the document read-only.
        doc = word.Documents.Open(wordPath, ReadOnly=1)
        try:
            # Export as PDF; the constants request the document with markup
            # and bookmarks created from headings.
            doc.ExportAsFixedFormat(
                pdfPath,
                constants.wdExportFormatPDF,
                Item=constants.wdExportDocumentWithMarkup,
                CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
            )
        finally:
            doc.Close(constants.wdDoNotSaveChanges)
    finally:
        word.Quit(constants.wdDoNotSaveChanges)
if __name__ == '__main__':
    # Directory holding the sample resume; the PDF is written next to it.
    path = 'D:/workspace/demo/PythonOfficeAutomation/example2/'
    createPdf(f'{path}简历1.docx', f'{path}pdf1.pdf')
6. 读取pdf文件
def read_pdf(path):
    """Print the extracted text of every page of the PDF at ``path``."""
    import pdfplumber

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # Output the text of the current page.
            print(page.extract_text())
if __name__ == '__main__':
    # Demo: print the text of a sample PDF.
    read_pdf('pdf1.pdf')
7.合并pdf文件
首先安装软件包 pip3 install PyPDF2
def merge_pdfs(path1, path2):
    """Concatenate two PDF files into ``./合并pdf.pdf``.

    Args:
        path1: Path of the PDF whose pages come first.
        path2: Path of the PDF whose pages are appended afterwards.
    """
    from contextlib import ExitStack

    from PyPDF2 import PdfReader, PdfWriter

    write = PdfWriter()
    # The source files have to stay open until the merged file is written,
    # and must be closed afterwards; ExitStack takes care of both.
    with ExitStack() as stack:
        for path in (path1, path2):
            source = stack.enter_context(open(path, 'rb'))
            for page in PdfReader(source).pages:
                write.add_page(page)
        with open('./合并pdf.pdf', 'wb') as out:
            write.write(out)
if __name__ == '__main__':
    # Demo: merge two sample PDFs.
    merge_pdfs('pdf1.pdf', 'test0.pdf')
8.拆分
def split_pdf(path):
    """Split the PDF at ``path`` into one file per page.

    Page ``k`` (counted from 1) is written to ``./拆分_k.pdf``.
    """
    from PyPDF2 import PdfReader, PdfWriter

    # Keep the source open while its pages are copied, then close it.
    with open(path, 'rb') as source:
        pdf = PdfReader(source)
        for i, page in enumerate(pdf.pages):
            # A fresh writer per page, so each output holds a single page.
            write = PdfWriter()
            write.add_page(page)
            with open(f'./拆分_{i+1}.pdf', 'wb') as out:
                write.write(out)
if __name__ == '__main__':
    # Demo: split the merged sample PDF into single pages.
    split_pdf('./合并pdf.pdf')
9.加密解密
def jiami(path, password='123456'):
    """Write a password-protected copy of a PDF to ``./加密pdf.pdf``.

    Args:
        path: Path of the PDF to encrypt.
        password: Password to set on the copy. Defaults to ``'123456'``.

    To read an already encrypted file instead, call
    ``pdf.decrypt(password)`` on the reader right after creating it.
    """
    from PyPDF2 import PdfReader, PdfWriter

    writer = PdfWriter()
    writer.encrypt(password)  # set the password
    # Keep the source open until the copy is written, then close it.
    with open(path, 'rb') as source:
        pdf = PdfReader(source)
        for page in pdf.pages:
            writer.add_page(page)
        with open('./加密pdf.pdf', 'wb') as out:
            writer.write(out)
if __name__ == '__main__':
    # Demo: encrypt the first page produced by the split step.
    jiami('拆分_1.pdf')