作为一名程序员我们都知道Python的库可能要比C++的丰富的多,特别是在算法方面。但是有的时候我们的工程是用C++开发的,我们又想用Python的这些库,那怎么办呢?如果直接调.py程序,工程中代码有.py又有.cpp,显得工程很杂乱。那么我们可以借助Cython来帮助我们将Python程序封装成C++动态库。最近正好用到jieba分词工具,接下来让我们基于结巴一起探讨如何完成Python程序封装成C++库。
1.开发环境
- 操作系统:Windows10
- 开发IDE:PyCharm、Visual Studio 2017
- 开发工具:Python3.6.3、C++编译器MSVC14.10
前提条件:已经安装了cython和jieba库。如果没有安装,可以按照下面方式进行安装:
pip install cython
pip install jieba
# 如果安装慢,可以用国内python镜像源
# pip install cython -i https://repo.huaweicloud.com/repository/pypi/simple/
# pip install jieba -i https://repo.huaweicloud.com/repository/pypi/simple/
2.创建安装脚本和cython代码
2.1 创建setup.py脚本
在网上看了很多文章,感觉都很复杂,对于我们学习来说可以先从简单开始。我们先创建一个简单的setup.py安装脚本,如下:
# setup.py — build script that compiles the Cython source into a native extension.
from setuptools import setup
from Cython.Build import cythonize

# Compile text_segment.pyx into a platform-specific extension module
# (.pyd on Windows). zip_safe=False because a compiled extension cannot
# be imported out of a zipped egg.
extension_modules = cythonize("text_segment.pyx")

setup(
    ext_modules=extension_modules,
    zip_safe=False,
)
2.2 创建Cython脚本
创建一个.pyx格式的文件text_segment.pyx,如下:
# text_segment.pyx
from datetime import datetime
import jieba
import os
import logging
cdef void init_logging(log_dir):
    # Configure file logging under `log_dir`, one log file per day
    # (named YYYY_MM_DD.log), with timestamp/file/line in every record.
    # exist_ok=True replaces the original exists()-then-makedirs() pair,
    # which had a race between the check and the creation.
    os.makedirs(log_dir, exist_ok=True)
    log_filename = os.path.join(log_dir, datetime.now().strftime("%Y_%m_%d") + ".log")
    logging.basicConfig(filename=log_filename, level=logging.DEBUG,
                        format='%(asctime)s [%(filename)s Line %(lineno)d] - %(levelname)s - %(message)s')
cdef bint check_file_exists(str filename):
    # True when `filename` names an existing regular file.
    cdef bint present = os.path.isfile(filename)
    return present
cdef str read_last_line(str filename):
    # Return the last line of `filename` (stripped), or "" when the file
    # is missing or empty. The original returned None for a missing file
    # but "" for an empty one — inconsistent with the declared `str`
    # return type; "" is now returned in both cases.
    if not check_file_exists(filename):
        return ""
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if lines:
        return lines[-1].strip()
    return ""
cdef void write_segmented_words(list segmented_words, str output_filename):
    # Write each non-blank token to `output_filename`, one per line, UTF-8.
    # Create the parent directory first — the original crashed with
    # FileNotFoundError on first run because e.g. "words_txt/" never existed.
    out_dir = os.path.dirname(output_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_filename, 'w', encoding='utf-8') as file:
        for word in segmented_words:
            if word.strip() != "":
                file.write(word + '\n')
def segment_text_full(sentence: str):
    # Full mode: emit every word jieba's dictionary can find in the sentence.
    # Logs the input and the result, persists tokens to disk, returns them.
    logging.info(f"原始句子:{sentence} ")
    cdef list tokens
    tokens = list(jieba.cut(sentence, cut_all=True))
    logging.info(f"全模式: 【{'/'.join(tokens)}】")
    write_segmented_words(tokens, "words_txt/cut_all_segmented_words.txt")
    return tokens
def segment_text_accurate(sentence: str):
    # Accurate mode: jieba's default, non-overlapping segmentation.
    # Logs the input and the result, persists tokens to disk, returns them.
    logging.info(f"原始句子:{sentence} ")
    cdef list tokens
    tokens = list(jieba.cut(sentence, cut_all=False))
    logging.info(f"精确模式: 【{'/'.join(tokens)}】")
    write_segmented_words(tokens, "words_txt/cut_segmented_words.txt")
    return tokens
def segment_text_search(sentence: str):
    # Search-engine mode: accurate segmentation plus extra splits of long
    # words, suited to building search indexes.
    logging.info(f"原始句子:{sentence} ")
    cdef list tokens
    tokens = list(jieba.cut_for_search(sentence))
    logging.info(f"搜索引擎模式: 【{'/'.join(tokens)}】")
    write_segmented_words(tokens, "words_txt/cut_search_segmented_words.txt")
    return tokens
def init_data():
    # One-time setup, called from C++ before any segmentation:
    # configure file logging and load the user dictionary into jieba.
    cdef str log_directory = "log"
    cdef str user_dict_path = "dict/dict.txt"
    init_logging(log_directory)
    jieba.load_userdict(user_dict_path)
3.生成C++动态库
在终端项目目录下执行下面命令:
E:\jieba_segmentation> python setup.py build
命令成功后会生成build文件夹和text_segment.cp36-win_amd64.pyd(同.dll文件结构一样)文件,build的目录结构如下:
└─build
├─lib.win-amd64-3.6
└─temp.win-amd64-3.6
└─Release
Release目录下会生成text_segment.cp36-win_amd64.lib库文件。
4.C++程序调用
前面步骤顺利的话会生成text_segment.cp36-win_amd64.pyd和text_segment.cp36-win_amd64.lib两个库文件,接下来我们就开始通过C++程序调用Cython生成的C++动态库。
4.1 配置项目依赖
4.1.1配置Python依赖
配置Python头文件:
项目右键属性—>VC++目录—>包含目录,添加Python头文件路径(C:\Users\XXX\AppData\Local\Programs\Python\Python36\include)
配置Python依赖库(.lib静态库):
(1)项目右键属性—>VC++目录—>引用目录,添加Python静态库路径(C:\Users\XXX\AppData\Local\Programs\Python\Python36\libs)
(2)项目右键属性—>链接器—>输入—>附加依赖项,添加python36.lib
4.1.2 配置Cython生成的动态库依赖
因为Cython生成的库我们没有选择生成.h和.cpp文件,所以不用配置包含目录,之后我们在C++程序中直接调用就可以。
配置依赖库(.lib静态库):
(1)项目右键属性—>VC++目录—>引用目录,添加Cython生成的text_segment.cp36-win_amd64.lib静态库路径(我的放在这个下面$(ProjectDir)\jieba_cpython\lib)
(2)项目右键属性—>链接器—>输入—>附加依赖项,添加Cython生成的text_segment.cp36-win_amd64.lib名称
4.2 C++测试函数
我们在上边配好的VS2017工程里添加源码文件main.cpp,内容如下:
#include <iostream>
#include <string>
#include <vector>
#include <Windows.h>
#include <Python.h>
// Forward declaration; defined below. Segments `sentence` through the
// embedded Python interpreter and appends the tokens to `result`.
bool cutPy(const std::string& sentence, std::vector<std::string>& result, int cutType);

int main()
{
    // BUG FIX: the original called `segmenter.cutPy(...)` although no
    // `segmenter` object exists, and streamed the vector into an
    // undeclared `result` with operator<< — neither compiles. Call the
    // free function directly and print the tokens ourselves.
    std::vector<std::string> tokens;
    const bool ok = cutPy("我爱自然语言处理", tokens, 0); // 0 = full mode
    if (!ok)
    {
        std::cout << "Python 分词失败" << std::endl;
        return 1;
    }

    // Print as ["tok", "tok", ...] to match the sample output.
    std::cout << "Python 分词: [";
    for (std::size_t i = 0; i < tokens.size(); ++i)
    {
        if (i != 0)
        {
            std::cout << ", ";
        }
        std::cout << "\"" << tokens[i] << "\"";
    }
    std::cout << "]" << std::endl;
    return 0;
}
bool cutPy(const std::string& sentence, std::vector<std::string>& result, int cutType)
{
// 初始化Python解释器
Py_Initialize();
// 导入Python模块
PyObject *pModule = PyImport_ImportModule("text_segment");
if (pModule == NULL)
{
PyErr_Print();
std::cerr << "加载python分词模块失败!" << std::endl;
Py_Finalize();
return false;
}
PyObject *pInit = PyObject_GetAttrString(pModule, "init_data");
if (pInit && PyCallable_Check(pInit))
{
PyObject_CallObject(pInit, NULL);
}
else
{
if (PyErr_Occurred())
PyErr_Print();
std::cerr << "不能找到init_data方法" << std::endl;
Py_XDECREF(pInit);
Py_DECREF(pModule);
Py_Finalize();
return false;
}
// 获取python模块中的方法
PyObject *pFunc;
switch (cutType)
{
case 0:
pFunc = PyObject_GetAttrString(pModule, "segment_text_full");
break;
case 1:
pFunc = PyObject_GetAttrString(pModule, "segment_text_accurate");
break;
case 2:
pFunc = PyObject_GetAttrString(pModule, "segment_text_search");
break;
default:
pFunc = PyObject_GetAttrString(pModule, "segment_text_full");
break;
}
if (pFunc == NULL || !PyCallable_Check(pFunc))
{
if (PyErr_Occurred())
PyErr_Print();
std::cerr << "不能找到segment_text_full方法" << std::endl;
Py_XDECREF(pFunc);
Py_DECREF(pModule);
Py_Finalize();
return false;
}
// 准备调用函数的输入参数
const char* input_sentence = u8"我爱自然语言处理"; // 输入的句子
if (input_sentence == NULL)
{
std::cerr << "输入句子为空!" << std::endl;
Py_DECREF(pFunc);
Py_DECREF(pModule);
Py_Finalize();
return false;
}
// 创建Python字符串
PyObject *pInput = PyUnicode_FromString(input_sentence);
if (pInput == NULL)
{
PyErr_Print();
std::cerr << "创建Python字符串失败!" << std::endl;
Py_DECREF(pFunc);
Py_DECREF(pModule);
Py_Finalize();
return false;
}
// 封装参数
PyObject *pArgs = PyTuple_Pack(1, pInput); // 将输入封装为元组
if (pArgs == NULL)
{
PyErr_Print();
std::cerr << "参数封装失败!" << std::endl;
Py_DECREF(pInput);
Py_DECREF(pFunc);
Py_DECREF(pModule);
Py_Finalize();
return false;
}
// 调用Python方法
PyObject *pValue = PyObject_CallObject(pFunc, pArgs);
Py_DECREF(pArgs); // 释放参数引用计数
Py_DECREF(pInput); // 释放输入的Python对象
if (pValue != NULL)
{
// 处理返回值(假设返回的是一个列表)
if (PyList_Check(pValue))
{
Py_ssize_t size = PyList_Size(pValue);
std::cout << "分词结果:";
for (Py_ssize_t i = 0; i < size; ++i)
{
PyObject *pItem = PyList_GetItem(pValue, i);
const char* c_str = PyUnicode_AsUTF8(pItem);
if (!c_str)
{
PyErr_Print();
std::cerr << "转换Unicode到UTF-8失败" << std::endl;
continue; // 或其他处理
}
std::string str(c_str);
std::cout << " " << i << ":" << str << " ";
result.push_back(str);
}
std::cout << std::endl;
}
else
{
std::cerr << "返回值不是列表类型!" << std::endl;
}
Py_DECREF(pValue); // 释放返回值引用计数
}
else
{
PyErr_Print();
std::cerr << "调用Python方法失败!" << std::endl;
}
// 清理
Py_XDECREF(pFunc);
Py_DECREF(pModule);
Py_Finalize();
return true;
}
生成解决方案,运行程序,终端显示如下内容,整个动态库调用完成
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\THS\AppData\Local\Temp\jieba.cache
Loading model cost 0.506 seconds.
Prefix dict has been built successfully.
分词结果: 0:我 1:爱 2:自然 3:自然语言 4:语言 5:处理
Python 分词: ["我", "爱", "自然", "自然语言", "语言", "处理"]