🍁 前言
最近阅读论文,在做笔记的时候总是要手动输入一些latex公式,非常耗时。我使用Hapigo的Latex 公式识别,感觉还挺好用,但是缺陷是只有30次免费额度,于是在网上搜索了一下,发现可以通过本地部署Latex OCR来无限制识别latex公式。下面是我部署latex OCR的过程以及我自己总结的一些优化技巧。
🌿 部署
在 M1 上安装 LaTeX-OCR 识别工具
珠玉在前,就不班门弄斧了,需要注意的是这篇帖子的第3步的路径需要修改为你本机电脑的路径。
sudo cp -r /opt/homebrew/Cellar/pyqt@5/5.15.7_2/lib/python3.9/site-packages/* /Users/rey/miniconda3/lib/python3.9/site-packages/
主要就是修改/5.15.7_2
、python3.9
、rey
,通过按Tab
键的方式可以快速补全。
同时为了防止链接失效,我也手动将作者的步骤粘贴如下:
pip install "pix2tex[gui]"
brew install pyqt@5
sudo cp -r /opt/homebrew/Cellar/pyqt@5/5.15.7_2/lib/python3.9/site-packages/* /Users/rey/miniconda3/lib/python3.9/site-packages/
pip install pynput screeninfo
conda install pytorch torchvision
除了图片中提到的在命令行中输入python -m pix2tex latexocr
的使用方法,还可以使用latex OCR的GUI界面,只需要在终端输入latexocr
或者pix2tex_gui
,稍等片刻(打开30秒),就会打开相应的GUI界面。
🌱 优化
主要优化了两个点:
- 将代码打包成了Mac下的app
- 注册了全局快捷键,可以在程序后台运行时按下快捷键直接调用OCR识别公式
打包成app
- 打开Mac的“自动操作”App
- 搜索运行脚本,并双击“运行Shell脚本”,此时右边会出现对应的流程项
选择Shell类型,这里可以在终端输入echo $SHELL
来查看你当前使用的shell是什么类型,我这里是/bin/zsh
- 编写脚本内容,首先在终端输入
where latexocr
,会输出可执行文件的绝对路径,学过计算机组成原理都知道,在unix系统下,在终端中直接输入可执行文件的绝对路径就能够直接运行这个可执行文件,其原理就是shell解释器会去找到这个路径对应的可执行文件并运行之。综上,这个可执行文件路径就是我们要编写的脚本命令.
- 测试运行,点击右上角的运行,稍等片刻,就会弹出GUI窗口,测试成功
打包成app
更改图标,默认的自动化打包的app图标丑的一批,左边是我优化后的图标,看起来舒服多了,在搜索引擎上搜索关键词“OCR 图标”可以找到类似的图标,大家可以自己挑选
更换方式如下
- 复制你的图标图片到剪切板
- 在"应用程序"中找到刚刚打包好的app,选中,按下
command + I
,显示简介
- 选中左上角的图标
- 按下
command + V
粘贴你刚才复制的图标图片,即可替换成功
注册全局快捷键
在使用的时候,有个很不方便的地方是,你必须打开latex ocr的窗口,然后按下快捷键才能够调用OCR截图识别公式,就很麻烦。于是我在想能不能让程序在后台运行的时候,自动监听快捷键,在无需显式打开窗口的情况下就能直接调用OCR截图识别,经过我的尝试,发现通过修改源代码的方式,在代码中使用pynput库可以达到预想的效果。
在/opt/miniconda3/lib/python3.12/site-packages/pix2tex
路径下(大致路径是这样,请根据本机的具体情况微调)找到gui.py这个文件,打开并编辑之。(温馨提示:修改前记得先备份哦)
我主要是修改了这几个地方:
- 将默认的识别格式
LaTeX-$
修改成了Raw
,这样识别的结果前后就没有$
- 增加了系统托盘
- 将快捷键设置成了
option + ctrl
,这是因为输入option + 字母
组合,pynput会将其识别成特殊字符,而不是组合键,比如按下option + z
,就会识别成Ω
,所以这里选择了不会产生特殊字符的组合键option + ctrl
- 增加了后台监听快捷键的功能,app在后台运行时也能够监听到快捷键
读者感兴趣的话,也可以使用文本对比工具(比如Beyond Compare)比较我修改的代码和原有代码的区别
最后在这里直接贴一下源代码,需要源代码文件也可以通过网盘下载。
源代码(修改版)
from shutil import which
import io
import subprocess
import sys
import os
import re
import tempfile
import threading
from PyQt6 import QtCore, QtGui
from PyQt6.QtCore import Qt, pyqtSlot, pyqtSignal, QThread, QTimer, QEvent
from PyQt6.QtGui import QGuiApplication
from PyQt6.QtWebEngineWidgets import QWebEngineView
from PyQt6.QtWidgets import QMainWindow, QApplication, QMessageBox, QVBoxLayout, QWidget, \
QPushButton, QTextEdit, QFormLayout, QHBoxLayout, QDoubleSpinBox, QLabel, QRadioButton, \
QSystemTrayIcon, QMenu
from pynput.mouse import Controller
from pynput import keyboard
from pynput.keyboard import Key, Listener
from PIL import ImageGrab, Image, ImageEnhance
import numpy as np
from screeninfo import get_monitors
from pix2tex import cli
from pix2tex.utils import in_model_path
from latex2sympy2 import latex2sympy
import pix2tex.resources.resources
# File-name suffixes (without the dot) that returnFromMimeData accepts for image files.
ACCEPTED_IMAGE_SUFFIX = ['png', 'jpg', 'jpeg']
def to_sympy(latex):
    r"""Convert a LaTeX math string into a SymPy expression.

    Every ``operatorname*{name}`` wrapper is collapsed to ``name`` (so
    ``\operatorname*{sin}`` becomes ``\sin``) before the string is wrapped
    in ``$...$`` and handed to ``latex2sympy``.

    Args:
        latex: LaTeX source without surrounding dollar signs.

    Returns:
        Whatever ``latex2sympy`` returns for the normalized input.
    """
    # The replacement must be a raw string: '\g' is an invalid escape in a
    # normal literal and triggers a SyntaxWarning on recent Python versions.
    normalized = re.sub(r'operatorname\*{(\w+)}', r'\g<1>', latex)
    return latex2sympy(f'${normalized}$')
class WebView(QWebEngineView):
    """Web view that accepts drag-and-drop and hands dropped URLs to the app."""

    def __init__(self, app) -> None:
        """Remember the owning window and enable drops.

        Args:
            app: Object exposing ``returnFromMimeData(urls)``.
        """
        super().__init__()
        self._app = app
        self.setAcceptDrops(True)

    def dragEnterEvent(self, event):
        """Accept the drag only when it carries at least one URL."""
        if not event.mimeData().urls():
            event.ignore()
            return
        event.accept()

    def dropEvent(self, event):
        """Forward the dropped URLs to the owning window."""
        self._app.returnFromMimeData(event.mimeData().urls())
class App(QMainWindow):
isProcessing = False
globalHotkeyPressed = pyqtSignal() # 添加全局热键信号
def __init__(self, args=None):
    """Build the main window, the tray icon and the global-hotkey listener.

    Args:
        args: Options object passed to ``cli.LatexOCR``; ``initUI`` also
            reads ``args.temperature`` from it.
    """
    super().__init__()
    self.args = args
    self.model = cli.LatexOCR(self.args)
    self.initUI()
    self.snipWidget = SnipWidget(self)

    # System tray icon and menu.
    self.initTray()

    # The listener thread reports hotkey presses through this signal,
    # which triggers the same handler as the snip button.
    self.globalHotkeyPressed.connect(self.onClick)

    # Listen for the global hotkey on a daemon thread.
    listener_thread = threading.Thread(
        target=self.start_global_hotkey_listener,
        daemon=True,
    )
    self.hotkey_thread = listener_thread
    listener_thread.start()

    self.show()
def initTray(self):
    """Create the system tray icon with a "show window" / "quit" menu."""
    self.tray = QSystemTrayIcon(self)
    self.tray.setIcon(QtGui.QIcon(':/icons/icon.svg'))

    # setContextMenu() does not take ownership of the menu, so it is parented
    # to the window and stored on self; a bare local QMenu could be
    # garbage-collected as soon as this method returns.
    self.tray_menu = QMenu(self)
    self.show_action = self.tray_menu.addAction("显示窗口")
    self.show_action.triggered.connect(self.showNormal)

    def quit_application(*_args):
        # closeEvent() ignores close requests while the tray icon is visible,
        # and Qt 6 can cancel quit() when a window refuses to close. Hiding
        # the tray first lets the window close normally.
        self.tray.hide()
        QApplication.quit()

    self.quit_action = self.tray_menu.addAction("退出")
    self.quit_action.triggered.connect(quit_application)

    self.tray.setContextMenu(self.tray_menu)
    self.tray.show()
def start_global_hotkey_listener(self):
    """Run a pynput keyboard listener that fires on Alt/Option + Ctrl.

    Blocks until the listener stops, so it is meant to be the target of a
    background thread. When both modifiers are held, the
    ``globalHotkeyPressed`` signal is invoked through a queued connection.
    """
    # Names of the modifiers currently held down.
    held = set()

    def modifier_name(key):
        """Map a pynput key to 'alt', 'ctrl' or None."""
        if key in (Key.alt, Key.alt_l, Key.alt_r):
            return 'alt'
        if key in (Key.ctrl, Key.ctrl_l, Key.ctrl_r):
            return 'ctrl'
        return None

    def on_press(key):
        try:
            name = modifier_name(key)
            if name is not None:
                held.add(name)
            if 'alt' in held and 'ctrl' in held:
                # Queued so the connected slot runs via the receiver's event loop.
                QtCore.QMetaObject.invokeMethod(self, "globalHotkeyPressed", QtCore.Qt.ConnectionType.QueuedConnection)
                # Forget the held keys so auto-repeat does not re-trigger.
                held.clear()
        except Exception as e:
            print(f"热键监听错误: {e}")

    def on_release(key):
        try:
            name = modifier_name(key)
            if name is not None:
                held.discard(name)
        except Exception as e:
            print(f"热键监听错误 (释放): {e}")

    with Listener(on_press=on_press, on_release=on_release) as listener:
        listener.join()
def closeEvent(self, event):
    """Hide the window instead of closing it while the tray icon is visible."""
    if not self.tray.isVisible():
        return
    self.hide()
    event.ignore()
def initUI(self):
    """Create the main window's widgets, connect their signals and lay them out.

    Reads ``self.args.temperature`` for the initial value of the
    temperature spin box.
    """
    self.setWindowTitle("LaTeX OCR")
    QApplication.setWindowIcon(QtGui.QIcon(':/icons/icon.svg'))
    self.left, self.top, self.width, self.height = 300, 300, 500, 300
    self.setGeometry(self.left, self.top, self.width, self.height)
    # Output format starts as Raw, i.e. without surrounding dollar signs.
    self.format_type = 'Raw'
    self.raw_prediction = ''

    # Rendered preview of the equation.
    self.webView = WebView(self)
    self.webView.setHtml("")
    self.webView.setMinimumHeight(80)

    # Editable raw prediction.
    self.textbox = QTextEdit(self)
    self.textbox.textChanged.connect(self.onTextboxChange)
    self.textbox.setMinimumHeight(40)

    # Prediction rendered in the selected output format.
    self.format_textbox = QTextEdit(self)
    self.format_textbox.textChanged.connect(self.onFormatTextboxChange)
    self.format_textbox.setMinimumHeight(40)

    # Output-format selector.
    format_types = QHBoxLayout()
    self.format_label = QLabel('Format:', self)
    self.format_type0 = QRadioButton('Raw', self)
    self.format_type0.toggled.connect(self.onFormatChange)
    self.format_type1 = QRadioButton('LaTeX-$', self)
    # Pre-select Raw; this emits toggled on format_type0 right away.
    self.format_type0.setChecked(True)
    self.format_type1.toggled.connect(self.onFormatChange)
    self.format_type2 = QRadioButton('LaTeX-$$', self)
    self.format_type2.toggled.connect(self.onFormatChange)
    self.format_type3 = QRadioButton('Sympy', self)
    self.format_type3.toggled.connect(self.onFormatChange)
    for widget in (self.format_label, self.format_type0, self.format_type1,
                   self.format_type2, self.format_type3):
        format_types.addWidget(widget)

    # Read-only error line, shown in red.
    self.error = QTextEdit(self)
    self.error.setReadOnly(True)
    self.error.setTextColor(Qt.GlobalColor.red)
    self.error.setMinimumHeight(12)

    # Sampling temperature input.
    self.tempField = QDoubleSpinBox(self)
    self.tempField.setValue(self.args.temperature)
    self.tempField.setRange(0, 1)
    self.tempField.setSingleStep(0.1)

    # Snip button; the label names the platform's modifier keys.
    snip_label = 'Snip [Option+Ctrl]' if sys.platform == "darwin" else 'Snip [Alt+Ctrl]'
    self.snipButton = QPushButton(snip_label, self)
    self.snipButton.clicked.connect(self.onClick)

    # In-window keyboard shortcut for snipping.
    self.shortcut = QtGui.QShortcut(QtGui.QKeySequence('Ctrl+Alt+Z'), self)
    self.shortcut.activated.connect(self.onClick)

    # Retry button, disabled until there is something to retry.
    self.retryButton = QPushButton('Retry', self)
    self.retryButton.setEnabled(False)
    self.retryButton.clicked.connect(self.returnSnip)

    # Assemble the layout top to bottom.
    centralWidget = QWidget()
    centralWidget.setMinimumWidth(200)
    self.setCentralWidget(centralWidget)

    lay = QVBoxLayout(centralWidget)
    lay.addWidget(self.webView, stretch=4)
    lay.addWidget(self.textbox, stretch=2)
    lay.addLayout(format_types)
    lay.addWidget(self.format_textbox, stretch=2)
    lay.addWidget(self.error, stretch=1)

    buttons = QHBoxLayout()
    for button in (self.snipButton, self.retryButton):
        buttons.addWidget(button)
    lay.addLayout(buttons)

    settings = QFormLayout()
    settings.addRow('Temperature:', self.tempField)
    lay.addLayout(settings)

    self.installEventFilter(self)
def toggleProcessing(self, value=None):
    """Switch the UI between the idle and the processing state.

    Args:
        value: New processing state; ``None`` flips the current one.
    """
    self.isProcessing = (not self.isProcessing) if value is None else value

    if self.isProcessing:
        label, handler = 'Interrupt', self.interrupt
    else:
        label = 'Snip [Option+Ctrl]' if sys.platform == "darwin" else 'Snip [Alt+Ctrl]'
        handler = self.onClick
        self.retryButton.setEnabled(True)

    # The in-window shortcut is only active while idle.
    self.shortcut.setEnabled(not self.isProcessing)

    # Re-label the button and swap its click handler.
    self.snipButton.setText(label)
    self.snipButton.clicked.disconnect()
    self.snipButton.clicked.connect(handler)
    self.displayPrediction()
def eventFilter(self, obj, event):
    """Handle Ctrl+V by running OCR on the clipboard's image or file URL.

    The event is always passed on to the base-class filter afterwards.
    """
    pasted = (
        event.type() == QEvent.Type.KeyRelease
        and event.key() == Qt.Key.Key_V
        and event.modifiers() == Qt.KeyboardModifier.ControlModifier
    )
    if pasted:
        clipboard = QApplication.clipboard()
        img = clipboard.image()
        if img.isNull():
            # No bitmap on the clipboard; try URLs instead.
            self.returnFromMimeData(clipboard.mimeData().urls())
        else:
            self.returnSnip(Image.fromqimage(img))
    return super().eventFilter(obj, event)
@pyqtSlot()
def onClick(self):
    """Start a screenshot; runs for the snip button, the shortcut and the global hotkey.

    The capture backend is taken from the ``SCREENSHOT_TOOL`` environment
    variable when it names a known tool; otherwise the first available of
    gnome-screenshot, grim+slurp and the built-in snip overlay is used.
    """
    # Bring a hidden window forward first.
    if self.isHidden():
        self.showNormal()
        self.activateWindow()
        self.raise_()

    self.close()

    explicit = {
        "gnome-screenshot": self.snip_using_gnome_screenshot,
        "spectacle": self.snip_using_spectacle,
        "grim": self.snip_using_grim,
        "pil": self.snipWidget.snip,
    }
    capture = explicit.get(os.environ.get('SCREENSHOT_TOOL'))
    if capture is None:
        if which('gnome-screenshot'):
            capture = self.snip_using_gnome_screenshot
        elif which('grim') and which('slurp'):
            capture = self.snip_using_grim
        else:
            capture = self.snipWidget.snip
    capture()
@pyqtSlot()
def interrupt(self):
    """Abort the running OCR worker, if there is one, and return to idle."""
    # QObject already has a thread() method, so hasattr(self, 'thread') is
    # always true. Check that the attribute is an actual worker (set by
    # returnSnip) before terminating it.
    worker = getattr(self, 'thread', None)
    if isinstance(worker, QThread):
        worker.terminate()
        worker.wait()
    self.toggleProcessing(False)
def snip_using_gnome_screenshot(self):
    """Capture a screen area with gnome-screenshot and run OCR on it.

    On any failure a hint is printed and ``returnSnip()`` is called
    without an image.
    """
    try:
        with tempfile.NamedTemporaryFile() as tmp:
            subprocess.run(["gnome-screenshot", "--area", f"--file={tmp.name}"])
            # Use `tmp.name` instead of `tmp.file` due to compatibility issues between Pillow and tempfile
            self.returnSnip(Image.open(tmp.name))
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print("Failed to load saved screenshot! Did you cancel the screenshot?")
        print("If you don't have gnome-screenshot installed, please install it.")
        self.returnSnip()
def snip_using_spectacle(self):
    """Capture a screen region with spectacle and run OCR on it.

    On any failure a hint is printed and ``returnSnip()`` is called
    without an image.
    """
    try:
        with tempfile.NamedTemporaryFile() as tmp:
            subprocess.run(["spectacle", "-r", "-b", "-n", "-o", f"{tmp.name}"])
            self.returnSnip(Image.open(tmp.name))
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print("Failed to load saved screenshot! Did you cancel the screenshot?")
        print("If you don't have spectacle installed, please install it.")
        self.returnSnip()
def snip_using_grim(self):
    """Select a region with slurp, capture it with grim and run OCR on it.

    slurp's stdout is passed to ``grim -g`` as the geometry, and grim's
    stdout is read as the image. On any failure a hint is printed and
    ``returnSnip()`` is called without an image.
    """
    try:
        selection = subprocess.run('slurp',
                                   check=True,
                                   capture_output=True,
                                   text=True)
        geometry = selection.stdout.strip()
        capture = subprocess.run(['grim', '-g', geometry, '-'],
                                 check=True,
                                 capture_output=True)
        self.returnSnip(Image.open(io.BytesIO(capture.stdout)))
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print("Failed to load saved screenshot! Did you cancel the screenshot?")
        print("If you don't have slurp and grim installed, please install them.")
        self.returnSnip()
def returnFromMimeData(self, urls):
    """Run OCR on the first URL if it is a local image file.

    Args:
        urls: Sequence of URL objects, e.g. from a drop or the clipboard.

    Returns:
        The result of ``returnSnip`` for an accepted file, otherwise ``None``.
    """
    if not urls or not urls[0]:
        return
    image_url = urls[0]
    # Compare the suffix case-insensitively so files such as "IMG.PNG" or
    # "scan.JPG" are accepted as well.
    suffix = image_url.fileName().split('.')[-1].lower()
    if image_url.scheme() == 'file' and suffix in ACCEPTED_IMAGE_SUFFIX:
        image_path = image_url.toLocalFile()
        return self.returnSnip(Image.open(image_path))
def returnSnip(self, img=None):
    """Pre-process a captured image and start OCR on a worker thread.

    Args:
        img: PIL image to recognise. A falsy value skips the
            pre-processing and is passed to the model unchanged.
    """
    self.toggleProcessing(True)
    self.retryButton.setEnabled(False)

    if img:
        width, height = img.size
        if width <= 0 or height <= 0:
            # Nothing usable was captured; go back to idle.
            self.toggleProcessing(False)
            self.retryButton.setEnabled(True)
            self.show()
            return

        if width < 100 or height < 100:  # too small size will make OCR wrong
            # Upscale so both sides reach at least 100 px, then boost
            # contrast and sharpness.
            scale_factor = max(100 / width, 100 / height)
            new_width = int(width * scale_factor)
            new_height = int(height * scale_factor)
            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
            img = ImageEnhance.Contrast(img).enhance(1.5)
            img = ImageEnhance.Sharpness(img).enhance(1.5)

    self.show()
    try:
        self.model.args.temperature = self.tempField.value()
        if self.model.args.temperature == 0:
            # Replace an exact zero with a tiny positive temperature.
            self.model.args.temperature = 1e-8
    except Exception:
        # Best effort: keep the model's current temperature. Narrowed from a
        # bare `except` so KeyboardInterrupt / SystemExit still propagate.
        pass

    # Run the model in a separate thread
    self.thread = ModelThread(img=img, model=self.model)
    self.thread.finished.connect(self.returnPrediction)
    self.thread.finished.connect(self.thread.deleteLater)
    self.thread.start()
def returnPrediction(self, result):
    """Show the worker's result, or a failure dialog.

    Args:
        result: Dict with the keys ``"success"`` and ``"prediction"``.
    """
    self.toggleProcessing(False)
    success, prediction = result["success"], result["prediction"]

    if not success:
        self.webView.setHtml("")
        box = QMessageBox()
        box.setWindowTitle(" ")
        box.setText("Prediction failed.")
        box.exec()
        return

    self.raw_prediction = prediction
    self.textbox.setText(prediction)
    self.format_textbox.setText(self.formatPrediction(prediction))
    self.displayPrediction(prediction)
    self.retryButton.setEnabled(True)
def onFormatChange(self):
    """Adopt the newly checked radio button's format and re-render the output."""
    rb = self.sender()
    if rb.isChecked():
        self.format_type = rb.text()
        # initUI checks the default radio button before the error box
        # exists, and formatPrediction needs that widget. Skip the refresh
        # during that early call instead of disabling it altogether, so that
        # switching formats later updates the formatted text again.
        if hasattr(self, 'error') and hasattr(self, 'format_textbox'):
            self.format_textbox.setText(self.formatPrediction(self.raw_prediction))
def formatPrediction(self, prediction, format_type=None):
    """Render a prediction in the requested output format.

    Args:
        prediction: LaTeX string; when falsy, the current content of the
            formatted text box is used instead.
        format_type: Format name; defaults to ``self.format_type``.

    Returns:
        The formatted string, or ``''`` when nothing remains after the
        surrounding dollar signs are stripped.
    """
    self.error.setText("")
    source = prediction or self.format_textbox.toPlainText()
    raw = source.strip('$')
    if len(raw) == 0:
        return ''

    chosen = format_type or self.format_type
    if chosen == "LaTeX-$":
        return f"${raw}$"
    if chosen == "LaTeX-$$":
        return f"$${raw}$$"
    if chosen == "Sympy":
        try:
            return str(to_sympy(raw))
        except Exception as e:
            print(e)
            self.error.setText("Failed to parse Sympy expr.")
            return raw
    # "Raw", "MathJax" and any unrecognised format leave the text as is.
    return raw
def onTextboxChange(self):
    """Re-render the formatted text and the preview when the raw text changes."""
    edited = self.formatPrediction(self.textbox.toPlainText(), "Raw")
    if edited == self.raw_prediction:
        return
    self.raw_prediction = edited
    self.format_textbox.setText(self.formatPrediction(edited))
    self.displayPrediction()
def onFormatTextboxChange(self):
    """Copy the formatted text to the clipboard whenever it changes."""
    formatted = self.format_textbox.toPlainText()
    QApplication.clipboard().setText(formatted)
def displayPrediction(self, prediction=None):
    """Show the equation in the web view, or a spinner while processing.

    Args:
        prediction: LaTeX to render; ``None`` uses the raw text box content
            with surrounding dollar signs stripped.
    """
    if self.isProcessing:
        self.webView.setHtml("""<center>
<img src="qrc:/icons/processing-icon-anim.svg" width="50", height="50">
</center>""")
        return

    if prediction is None:
        prediction = self.textbox.toPlainText().strip('$')

    # Only the body goes through str.format(); the head contains literal
    # braces (JavaScript) and must stay unformatted.
    head = """
<html>
<head><script id="MathJax-script" src="qrc:MathJax.js"></script>
<script>
MathJax.Hub.Config({messageStyle: 'none',tex2jax: {preview: 'none'}});
MathJax.Hub.Queue(
function () {
document.getElementById("equation").style.visibility = "";
}
);
</script>
</head> """
    body = """
<body>
<div id="equation" style="font-size:1em; visibility:hidden">$${equation}$$</div>
</body>
</html>
""".format(equation=prediction)
    self.webView.setHtml(head + body)
class ModelThread(QThread):
    """Worker thread that runs the OCR model on one image."""

    # Emitted with {"success": bool, "prediction": str or None}.
    finished = pyqtSignal(dict)

    def __init__(self, img, model):
        """Store the image and the callable model for ``run``."""
        super().__init__()
        self.model = model
        self.img = img

    def run(self):
        """Call the model and emit the outcome through ``finished``."""
        try:
            latex = self.model(self.img)
            # replace <, > with \lt, \gt so it won't be interpreted as html code
            for char, macro in (('<', '\\lt '), ('>', '\\gt ')):
                latex = latex.replace(char, macro)
            self.finished.emit({"success": True, "prediction": latex})
        except Exception:
            import traceback
            traceback.print_exc()
            self.finished.emit({"success": False, "prediction": None})
class SnipWidget(QMainWindow):
    """Overlay window on which the user drags out the screen region to capture.

    The captured image is handed to ``parent.returnSnip``. If the
    selection is cancelled, empty or cannot be grabbed, the overlay closes
    and the parent window is shown again instead.
    """

    isSnipping = False

    def __init__(self, parent):
        """Size the overlay to span all monitors and start the geometry timer.

        Args:
            parent: Main window; must provide ``show()`` and ``returnSnip(img)``.
        """
        super().__init__()
        self.parent = parent

        # Bounding box that covers every monitor reported by screeninfo.
        monitors = get_monitors()
        bboxes = np.array([[m.x, m.y, m.width, m.height] for m in monitors])
        x, y, _, _ = bboxes.min(0)
        w, h = bboxes[:, [0, 2]].sum(1).max(), bboxes[:, [1, 3]].sum(1).max()
        self.setGeometry(x, y, w-x, h-y)

        self.begin = QtCore.QPoint()
        self.end = QtCore.QPoint()
        # Global pointer position at mouse press; None while no drag is active.
        self.startPos = None

        self.mouse = Controller()

        # Create and start the timer
        self.factor = QGuiApplication.primaryScreen().devicePixelRatio()
        self.timer = QTimer(self)
        self.timer.timeout.connect(self.update_geometry_based_on_cursor_position)
        self.timer.start(500)

    def update_geometry_based_on_cursor_position(self):
        """While snipping, move the overlay onto the screen under the cursor."""
        if not self.isSnipping:
            return

        # Update the geometry of the SnipWidget based on the current screen
        mouse_pos = QtGui.QCursor.pos()
        screen = QGuiApplication.screenAt(mouse_pos)
        if screen:
            self.factor = screen.devicePixelRatio()
            screen_geometry = screen.geometry()
            self.setGeometry(screen_geometry)

    def snip(self):
        """Show the overlay with a cross cursor and wait for a drag."""
        self.isSnipping = True
        self.setWindowFlags(QtCore.Qt.WindowType.WindowStaysOnTopHint)
        QApplication.setOverrideCursor(QtGui.QCursor(QtCore.Qt.CursorShape.CrossCursor))
        self.show()

    def paintEvent(self, event):
        """Draw the selection rectangle; fully transparent when not snipping."""
        if self.isSnipping:
            brushColor = (0, 180, 255, 100)
            opacity = 0.3
        else:
            brushColor = (255, 255, 255, 0)
            opacity = 0

        self.setWindowOpacity(opacity)
        qp = QtGui.QPainter(self)
        qp.setPen(QtGui.QPen(QtGui.QColor('black'), 2))
        qp.setBrush(QtGui.QColor(*brushColor))
        qp.drawRect(QtCore.QRect(self.begin, self.end))

    def keyPressEvent(self, event):
        """Cancel the snip on Escape and bring the parent window back."""
        if event.key() == QtCore.Qt.Key.Key_Escape.value:
            # Leave snipping mode so the geometry timer stops following the cursor.
            self.isSnipping = False
            self.startPos = None
            QApplication.restoreOverrideCursor()
            self.close()
            self.parent.show()
        event.accept()

    def mousePressEvent(self, event):
        """Record where the drag starts."""
        self.startPos = self.mouse.position
        self.begin = event.pos()
        self.end = self.begin
        self.update()

    def mouseMoveEvent(self, event):
        """Extend the selection rectangle to the current pointer position."""
        self.end = event.pos()
        self.update()

    def _grab_region(self, x1, y1, x2, y2):
        """Grab the given screen rectangle; return ``None`` if that fails."""
        try:
            return ImageGrab.grab(bbox=(x1, y1, x2, y2), all_screens=True)
        except Exception:
            if sys.platform == "darwin":
                # Retry with the coordinates divided by the device pixel ratio.
                try:
                    return ImageGrab.grab(bbox=(x1//self.factor, y1//self.factor,
                                                x2//self.factor, y2//self.factor), all_screens=True)
                except Exception:
                    pass
            import traceback
            traceback.print_exc()
            return None

    def mouseReleaseEvent(self, event):
        """Finish the drag, grab the selected region and hand it to the parent.

        A click without a drag (zero-area selection), a release without a
        matching press, or a failed grab used to raise before the overlay
        was closed, leaving it stuck on screen. These cases now close the
        overlay and show the parent window again.
        """
        self.isSnipping = False
        QApplication.restoreOverrideCursor()

        startPos, self.startPos = self.startPos, None
        endPos = self.mouse.position

        # With isSnipping cleared, paintEvent makes the overlay transparent,
        # so repaint before grabbing to keep the tint out of the screenshot.
        self.repaint()
        QApplication.processEvents()

        img = None
        if startPos is not None:
            x1 = int(min(startPos[0], endPos[0]))
            y1 = int(min(startPos[1], endPos[1]))
            x2 = int(max(startPos[0], endPos[0]))
            y2 = int(max(startPos[1], endPos[1]))
            if x2 > x1 and y2 > y1:
                img = self._grab_region(x1, y1, x2, y2)

        QApplication.processEvents()

        # Always close and reset the overlay, whatever the grab result was.
        self.close()
        self.begin = QtCore.QPoint()
        self.end = QtCore.QPoint()

        if img is None:
            self.parent.show()
        else:
            self.parent.returnSnip(img)
def main(arguments):
    """Start the Qt application and exit with its return code.

    Args:
        arguments: Options object passed to ``App``.
    """
    with in_model_path():
        on_windows = os.name == 'nt'
        if not on_windows:
            os.environ['QTWEBENGINE_DISABLE_SANDBOX'] = '1'
        qt_app = QApplication(sys.argv)
        # Keep a reference so the window lives as long as the event loop.
        window = App(arguments)
        exit_code = qt_app.exec()
        sys.exit(exit_code)
最后是我在使用时遇到一个bug,就是我在按下option + ctrl
调用OCR时,如果此时我不小心点击了鼠标或者触控板,导致截图失败,会出现如下报错:
此时,这个透明背景的窗口由于高度超出了屏幕高度,并且不能通过向上拖动窗口使其下面遮掩的部分显示出来,因此也就无法将其关闭,我研究了一下,找到下面这个方法:
打开"活动监视器"APP,搜索Latex OCR,关闭进程