import os
import sys
import logging
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from tkinterdnd2 import DND_FILES, TkinterDnD
import json
from pdf2docx import Converter
import pytesseract
from pdf2image import convert_from_path
from docx import Document
from docx.shared import Pt, Cm, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
import numexpr as ne
import tempfile
import traceback
import io
import threading
import queue
# Locate the directory the program is running from.
def get_application_path():
    """Return the directory that contains the running application.

    When ``sys.frozen`` is set (the program runs as a packaged executable)
    this is the directory of ``sys.executable``; otherwise it is the
    directory of this source file.
    """
    is_packaged = getattr(sys, 'frozen', False)
    anchor = sys.executable if is_packaged else os.path.abspath(__file__)
    return os.path.dirname(anchor)
# Set the number of threads NumExpr may use.
ne.set_num_threads(8)
# Path of the JSON configuration file, located in the application directory.
CONFIG_FILE = os.path.join(get_application_path(), "config.json")
# Path of a log file in the application directory.
# NOTE(review): setup_logging() writes to <app dir>/logs/conversion.log and
# nothing in the visible code reads LOG_FILE — confirm whether it is still needed.
LOG_FILE = os.path.join(get_application_path(), "conversion.log")
# File-like adapter that forwards everything written to it to a logger.
class StreamToLogger(io.StringIO):
    """Writable stream that turns each ``write`` call into a log record.

    Intended as a stand-in for ``sys.stdout`` / ``sys.stderr`` so that printed
    output ends up in the log. Nothing is stored in the underlying
    ``StringIO`` buffer.

    Args:
        logger: Object with a ``log(level, message)`` method, normally a
            ``logging.Logger``.
        level: Logging level used for every record emitted by this stream.
    """

    def __init__(self, logger, level):
        super().__init__()
        self.logger = logger
        self.level = level
        # Most recent message after stripping; empty when the last write
        # contained only whitespace.
        self.buf = ''

    def write(self, buf):
        """Log ``buf`` (stripped of surrounding whitespace) and report its length.

        Whitespace-only writes, such as the lone newline ``print`` emits, are
        not logged.

        Returns:
            The number of characters in ``buf``, as the file-object protocol
            requires; callers that check the result of ``write`` would
            otherwise see ``None``.
        """
        self.buf = buf.strip('\r\n\t ')
        if self.buf:
            self.logger.log(self.level, self.buf)
        return len(buf)

    def flush(self):
        """Do nothing: every write is handed to the logger immediately."""
        pass
def setup_logging():
    """Configure the root logger and return it.

    A ``logs`` directory is created inside the application directory. The
    root logger is set to DEBUG and receives two handlers sharing one format:
    a file handler (DEBUG and above, ``logs/conversion.log``, opened in write
    mode so each run starts a fresh file) and a console handler (INFO and
    above).

    Returns:
        The configured root logger, or ``None`` when any step fails; the
        failure is printed because logging is not available yet.
    """
    try:
        log_dir = os.path.join(get_application_path(), 'logs')
        os.makedirs(log_dir, exist_ok=True)
        log_file = os.path.join(log_dir, 'conversion.log')

        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)

        # One handler per destination, each with its own threshold.
        to_file = logging.FileHandler(log_file, mode='w', encoding='utf-8')
        to_file.setLevel(logging.DEBUG)
        to_console = logging.StreamHandler()
        to_console.setLevel(logging.INFO)

        layout = logging.Formatter('%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s')
        for sink in (to_file, to_console):
            sink.setFormatter(layout)
        for sink in (to_file, to_console):
            root_logger.addHandler(sink)

        logging.debug(f"日志系统初始化成功,日志文件路径:{log_file}")
        return root_logger
    except Exception as e:
        print(f"设置日志系统失败: {str(e)}")
        return None
# Initialise the module-wide logger.
logger = setup_logging()
if not logger:
    # setup_logging() returned None: warn on stdout and keep running.
    print("警告:日志系统初始化失败,程序将继续运行但不会记录日志")
    # Build a basic replacement: the root logger with a single console handler.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)
def load_config():
    """Load the configuration stored in ``CONFIG_FILE``.

    Returns:
        The parsed configuration as a dict. An empty dict is returned, and
        the reason logged, when the file does not exist, cannot be read or
        parsed, or holds valid JSON that is not an object (callers use
        ``dict`` methods on the result).

    This function never raises; it is the boundary where a damaged
    configuration is turned into "no configuration".
    """
    try:
        # Open directly instead of checking for existence first, so a file
        # removed in between cannot slip through.
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        logger.warning("配置文件不存在")
        return {}
    except Exception as e:
        logger.error(f"加载配置文件失败: {str(e)}")
        return {}

    if not isinstance(config, dict):
        # e.g. a list or a bare string: syntactically valid, unusable here.
        logger.error(f"加载配置文件失败: 配置内容不是JSON对象 ({type(config).__name__})")
        return {}

    logger.debug(f"成功加载配置文件: {config}")
    return config
def save_config(config):
    """Write ``config`` to ``CONFIG_FILE`` as indented UTF-8 JSON.

    Args:
        config: JSON-serialisable mapping of settings.

    Raises:
        TypeError: ``config`` contains a value that JSON cannot represent.
        OSError: the file cannot be opened or written.
    """
    # Serialise before opening the file: opening in 'w' mode truncates it, so
    # a serialisation error must not be able to wipe the existing settings.
    payload = json.dumps(config, ensure_ascii=False, indent=4)
    with open(CONFIG_FILE, 'w', encoding='utf-8') as f:
        f.write(payload)
def setup_config():
    """Show the first-run dialog that asks for the Poppler and Tesseract folders.

    The dialog blocks in its own main loop until the window is closed. When
    the user presses the save button with both fields filled in, the chosen
    Poppler directory and the Tesseract executable path are stored through
    ``save_config`` and the window is destroyed. If the window is closed
    without saving, nothing is written.
    """
    root = TkinterDnD.Tk()
    root.title("PDF转Word工具 - 首次配置")
    root.geometry("600x400")
    # Window styling shared by labels and buttons.
    root.configure(bg='#f0f0f0')
    style = {'bg': '#f0f0f0', 'font': ('微软雅黑', 10)}
    button_style = {'bg': '#4CAF50', 'fg': 'white', 'font': ('微软雅黑', 10), 'padx': 10, 'pady': 5}

    # Heading.
    tk.Label(root, text="首次使用需要配置以下路径:", **style).pack(pady=10)

    # Poppler directory row.
    poppler_frame = tk.Frame(root, bg='#f0f0f0')
    poppler_frame.pack(fill='x', padx=20, pady=5)
    tk.Label(poppler_frame, text="Poppler路径:", **style).pack(side='left')
    poppler_path = tk.StringVar()
    poppler_entry = tk.Entry(poppler_frame, textvariable=poppler_path, width=50)
    poppler_entry.pack(side='left', padx=5)

    def select_poppler():
        path = filedialog.askdirectory(title="选择Poppler安装目录")
        if path:
            poppler_path.set(path)

    tk.Button(poppler_frame, text="浏览", command=select_poppler, **button_style).pack(side='left')

    # Tesseract directory row.
    tesseract_frame = tk.Frame(root, bg='#f0f0f0')
    tesseract_frame.pack(fill='x', padx=20, pady=5)
    tk.Label(tesseract_frame, text="Tesseract路径:", **style).pack(side='left')
    tesseract_path = tk.StringVar()
    tesseract_entry = tk.Entry(tesseract_frame, textvariable=tesseract_path, width=50)
    tesseract_entry.pack(side='left', padx=5)

    def select_tesseract():
        path = filedialog.askdirectory(title="选择Tesseract安装目录")
        if path:
            tesseract_path.set(path)

    tk.Button(tesseract_frame, text="浏览", command=select_tesseract, **button_style).pack(side='left')

    # Usage notes shown under the two rows (text identical to the original literal).
    help_text = (
        "\n"
        "使用说明:\n"
        "1. Poppler路径:选择poppler的安装目录(包含bin文件夹的目录)\n"
        "2. Tesseract路径:选择Tesseract-OCR的安装目录\n"
        '3. 配置完成后点击"保存配置"即可开始使用\n'
    )
    tk.Label(root, text=help_text, justify='left', **style).pack(pady=10)

    def save_and_exit():
        # Treat whitespace-only input as empty.
        poppler_dir = poppler_path.get().strip()
        tesseract_dir = tesseract_path.get().strip()
        if not poppler_dir or not tesseract_dir:
            messagebox.showerror("错误", "请填写所有配置项!")
            return
        # The executable is 'tesseract.exe' on Windows and 'tesseract' elsewhere.
        executable = 'tesseract.exe' if os.name == 'nt' else 'tesseract'
        config = {
            'POPPLER_PATH': poppler_dir,
            'TESSERACT_CMD': os.path.join(tesseract_dir, executable)
        }
        try:
            save_config(config)
        except OSError as err:
            # E.g. a read-only install directory: tell the user and keep the
            # dialog open instead of failing silently inside the Tk callback.
            logger.error(f"保存配置失败: {err}")
            messagebox.showerror("错误", f"保存配置失败: {err}")
            return
        root.destroy()

    tk.Button(root, text="保存配置", command=save_and_exit, **button_style).pack(pady=20)
    root.mainloop()
class PDFConverterGUI:
    """Main window of the PDF-to-Word converter.

    The user collects PDF files through a file dialog, a directory dialog or
    drag-and-drop. ``start_conversion`` then converts them one by one with
    ``process_single_pdf`` and reports progress through a progress bar, a
    status label and a per-row background colour.
    """

    def __init__(self):
        """Create the root window, install the close handler and build the widgets."""
        self.root = TkinterDnD.Tk()
        self.root.title("PDF转Word工具")
        self.root.geometry("800x600")
        # Shared widget styling.
        self.style = {'bg': '#f0f0f0', 'font': ('微软雅黑', 10)}
        self.button_style = {'bg': '#4CAF50', 'fg': 'white', 'font': ('微软雅黑', 10), 'padx': 10, 'pady': 5}
        # True while start_conversion() is running. The conversion loop calls
        # root.update(), which dispatches queued clicks, so handlers that
        # would disturb a running batch check this flag first.
        self._converting = False
        # Route the window-manager close button through on_closing().
        self.root.protocol("WM_DELETE_WINDOW", self.on_closing)
        self.setup_ui()

    def on_closing(self):
        """Ask for confirmation, then destroy the window and exit the process."""
        try:
            if messagebox.askokcancel("退出", "确定要退出程序吗?"):
                self.root.destroy()
                # SystemExit is not an Exception subclass, so the handler
                # below does not intercept it.
                sys.exit(0)
        except Exception as e:
            logger.error(f"关闭窗口时出错: {str(e)}")
            self.root.destroy()
            sys.exit(1)

    def setup_ui(self):
        """Build every widget of the main window."""
        # Main container.
        main_frame = tk.Frame(self.root, bg='#f0f0f0')
        main_frame.pack(fill='both', expand=True, padx=20, pady=20)

        # Path selection row.
        path_frame = tk.Frame(main_frame, bg='#f0f0f0')
        path_frame.pack(fill='x', pady=10)
        tk.Label(path_frame, text="选择PDF文件或目录:", **self.style).pack(side='left')
        self.path_var = tk.StringVar()
        path_entry = tk.Entry(path_frame, textvariable=self.path_var, width=50)
        path_entry.pack(side='left', padx=5)
        tk.Button(path_frame, text="选择文件", command=self.select_file, **self.button_style).pack(side='left', padx=5)
        tk.Button(path_frame, text="选择目录", command=self.select_directory, **self.button_style).pack(side='left')

        # File list area.
        list_frame = tk.Frame(main_frame, bg='#f0f0f0')
        list_frame.pack(fill='both', expand=True, pady=10)
        tk.Label(list_frame, text="待处理文件列表(支持拖放文件):", **self.style).pack(anchor='w')

        # List box with a vertical scrollbar.
        self.listbox_frame = tk.Frame(list_frame, bg='#f0f0f0')
        self.listbox_frame.pack(fill='both', expand=True)
        self.file_listbox = tk.Listbox(self.listbox_frame, width=80, height=15)
        self.file_listbox.pack(side='left', fill='both', expand=True)
        scrollbar = tk.Scrollbar(self.listbox_frame)
        scrollbar.pack(side='right', fill='y')
        self.file_listbox.config(yscrollcommand=scrollbar.set)
        scrollbar.config(command=self.file_listbox.yview)

        # Accept files dropped onto the list box.
        self.file_listbox.drop_target_register(DND_FILES)
        self.file_listbox.dnd_bind('<<Drop>>', self.handle_drop)

        # Progress bar (0-100).
        self.progress_var = tk.DoubleVar()
        self.progress = ttk.Progressbar(main_frame, variable=self.progress_var, maximum=100)
        self.progress.pack(fill='x', pady=10)

        # Status line.
        self.status_var = tk.StringVar(value="就绪")
        tk.Label(main_frame, textvariable=self.status_var, **self.style).pack(pady=5)

        # Action buttons.
        button_frame = tk.Frame(main_frame, bg='#f0f0f0')
        button_frame.pack(pady=10)
        tk.Button(button_frame, text="开始转换", command=self.start_conversion, **self.button_style).pack(side='left', padx=5)
        tk.Button(button_frame, text="清空列表", command=self.clear_list, **self.button_style).pack(side='left', padx=5)
        tk.Button(button_frame, text="退出", command=self.root.quit, **self.button_style).pack(side='left', padx=5)

    def _add_files(self, paths):
        """Append each path to the list box unless it is already listed."""
        # Read the current contents once instead of once per candidate.
        listed = set(self.file_listbox.get(0, tk.END))
        for path in paths:
            if path not in listed:
                self.file_listbox.insert(tk.END, path)
                listed.add(path)

    def handle_drop(self, event):
        """Add dropped PDF files, or the PDFs found under dropped folders.

        ``event.data`` is a Tcl list in which paths containing spaces are
        wrapped in braces. It is split with Tcl's own list parser; splitting
        on whitespace would cut such paths into pieces.
        """
        for path in self.root.tk.splitlist(event.data):
            if os.path.isdir(path):
                # A folder: add every PDF found beneath it.
                self._add_files(find_pdf_files(path))
                self.path_var.set(path)
            elif path.lower().endswith('.pdf'):
                self._add_files([path])
                self.path_var.set(os.path.dirname(path))

    def select_file(self):
        """Let the user pick PDF files and add the new ones to the list."""
        files = filedialog.askopenfilenames(
            title="选择PDF文件",
            filetypes=[("PDF文件", "*.pdf")]
        )
        if files:
            self._add_files(files)
            self.path_var.set(os.path.dirname(files[0]))

    def select_directory(self):
        """Replace the list with every PDF found under a chosen directory."""
        if self._converting:
            # Replacing the list would invalidate the row indices in use.
            return
        directory = filedialog.askdirectory(title="选择包含PDF文件的目录")
        if directory:
            self.path_var.set(directory)
            self.clear_list()
            pdf_files = find_pdf_files(directory)
            for file in pdf_files:
                self.file_listbox.insert(tk.END, file)

    def clear_list(self):
        """Empty the file list and reset the progress bar and status text."""
        if self._converting:
            # Rows are addressed by index while a batch is running.
            return
        self.file_listbox.delete(0, tk.END)
        self.progress_var.set(0)
        self.status_var.set("就绪")

    def _report_progress(self, current, total):
        """Update the progress bar and status text; failures are only logged."""
        try:
            self.progress_var.set((current / total) * 100)
            self.status_var.set(f"正在处理: {current}/{total}")
            self.root.update()
        except Exception as e:
            logger.error(f"更新进度时出错: {str(e)}")

    def start_conversion(self):
        """Convert every listed file in order and colour each row by outcome.

        Rows turn light green on success and light pink on failure. The
        progress bar advances after every file, including files whose
        conversion raised. Calls made while a batch is already running are
        ignored.
        """
        if self._converting:
            return
        files = list(self.file_listbox.get(0, tk.END))
        if not files:
            messagebox.showwarning("警告", "请先选择要转换的PDF文件!")
            return
        total_files = len(files)
        self.progress_var.set(0)

        self._converting = True
        try:
            for i, file in enumerate(files, 1):
                try:
                    self.status_var.set(f"正在处理: {os.path.basename(file)}")
                    self.root.update()
                    succeeded = process_single_pdf(file)
                except Exception as e:
                    logger.error(f"处理文件失败: {str(e)}", exc_info=True)
                    succeeded = False
                # Light green marks success, light pink marks failure.
                self.file_listbox.itemconfig(i - 1, {'bg': '#90EE90' if succeeded else '#FFB6C1'})
                self._report_progress(i, total_files)
        finally:
            self._converting = False

        self.status_var.set("转换完成!")
        messagebox.showinfo("完成", f"处理完成!共处理 {total_files} 个文件。")
def validate_environment():
    """Check that the configured Poppler and Tesseract paths exist.

    Reads the module globals ``POPPLER_PATH`` and ``TESSERACT_CMD``. A value
    that is unset, ``None`` or empty counts as missing rather than raising,
    which is what happens when a key is absent from the configuration.

    Returns:
        ``True`` when both paths exist; otherwise ``False`` after logging one
        error per missing dependency.
    """
    # globals().get() also covers the case where the names were never assigned.
    required = (
        ("Poppler", globals().get('POPPLER_PATH')),
        ("Tesseract", globals().get('TESSERACT_CMD')),
    )
    missing_deps = [
        f"{label}路径不存在: {path}"
        for label, path in required
        if not path or not os.path.exists(path)
    ]
    for dep in missing_deps:
        logger.error(dep)
    return not missing_deps
def extract_text_from_image(image):
    """Run OCR on one page image and return the recognised text.

    Args:
        image: Image object handed straight to ``pytesseract.image_to_string``.

    Returns:
        The text reported by Tesseract, or an empty string when recognition
        raises (the error is logged with its traceback).
    """
    # Tesseract command-line options; "-l chi_sim" selects the language data.
    ocr_options = r'--oem 3 --psm 6 -l chi_sim'
    try:
        return pytesseract.image_to_string(image, config=ocr_options)
    except Exception as e:
        logger.error(f"OCR识别出错: {str(e)}", exc_info=True)
        return ""
# Code points below U+0020 that XML 1.0 does not allow in text (everything
# except tab, line feed and carriage return), mapped to None for str.translate.
_XML_FORBIDDEN_CONTROLS = {
    code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
}


def _collect_ocr_text(pdf_path):
    """Rasterise every page of ``pdf_path`` and return the combined OCR text.

    Pages on which no text was recognised are skipped. The result is an
    empty string when nothing was recognised at all.
    """
    images = convert_from_path(pdf_path, poppler_path=POPPLER_PATH)
    total_pages = len(images)
    logger.info(f" 共检测到 {total_pages} 页需要OCR识别")

    page_blocks = []
    for page_no, image in enumerate(images, 1):
        logger.info(f" 正在处理第 {page_no}/{total_pages} 页")
        text = extract_text_from_image(image)
        if text.strip():
            page_blocks.append(f"第{page_no}页识别到的文字:\n{text}\n\n")

    combined = "".join(page_blocks)
    # Normalise line endings (the text used to pass through a text-mode file,
    # which did the same), then drop control characters XML cannot carry,
    # such as a form feed at the end of a page.
    combined = combined.replace("\r\n", "\n").replace("\r", "\n")
    return combined.translate(_XML_FORBIDDEN_CONTROLS)


def process_single_pdf(pdf_path):
    """Convert one PDF to a .docx next to it and append the OCR text.

    Steps: a layout conversion with ``pdf2docx``, then OCR of every page
    image. If any text was recognised it is appended to the generated
    document under an "OCR识别结果:" heading paragraph.

    Args:
        pdf_path: Path of the PDF file. The output path is the same path
            with its extension replaced by ``.docx``.

    Returns:
        ``True`` on success, ``False`` when any step raised (the error is
        logged with its traceback). This function does not raise.
    """
    try:
        start_time = time.time()
        pdf_name = os.path.basename(pdf_path)
        word_path = os.path.splitext(pdf_path)[0] + '.docx'
        logger.info(f"▶ 开始处理: {pdf_name}")

        # Layout conversion; release the converter even if convert() fails.
        cv = Converter(pdf_path)
        try:
            cv.convert(word_path)
        finally:
            cv.close()
        logger.info(" 基础转换完成")

        # OCR pass; the text is kept in memory rather than in a temp file
        # whose second-resolution name could collide between conversions.
        ocr_text = _collect_ocr_text(pdf_path)
        if ocr_text.strip():
            logger.info(" 正在将OCR识别的文字添加到Word文档...")
            doc = Document(word_path)
            doc.add_paragraph("\nOCR识别结果:")
            doc.add_paragraph(ocr_text)
            doc.save(word_path)
            logger.info(" 已添加OCR识别结果")

        cost_time = time.time() - start_time
        logger.info(f"✓ 处理完成,耗时 {cost_time:.2f} 秒\n")
        return True
    except Exception as e:
        logger.error(f"转换失败: {str(e)}", exc_info=True)
        return False
def find_pdf_files(directory):
    """Return the paths of all PDF files under ``directory``, searched recursively.

    A file counts as a PDF when its name ends with ``.pdf``, compared
    case-insensitively. If the search raises, the error is logged and an
    empty list is returned.
    """
    matches = []
    try:
        for folder, _, names in os.walk(directory):
            matches.extend(
                os.path.join(folder, name)
                for name in names
                if name.lower().endswith('.pdf')
            )
    except Exception as e:
        logger.error(f"文件搜索失败: {str(e)}")
        return []
    return matches
def main():
    """Program entry point: load settings, verify the tools, run the GUI.

    When no configuration can be loaded, the first-run dialog is shown and
    the configuration is loaded again; if it is still empty the program
    stops. The configured paths are published through the module globals
    ``POPPLER_PATH`` and ``TESSERACT_CMD`` and handed to pytesseract. Any
    exception is printed, logged and shown in an error dialog. Every handler
    of the module logger is flushed and closed on the way out.
    """
    global POPPLER_PATH, TESSERACT_CMD
    try:
        print("程序启动...")
        logger.info("程序启动")

        # Load the settings, running the first-time dialog if there are none.
        settings = load_config()
        if not settings:
            print("首次运行,请进行配置...")
            logger.info("首次运行,启动配置界面")
            setup_config()
            settings = load_config()
        if not settings:
            print("配置失败,程序退出")
            logger.error("配置失败,程序退出")
            return

        # Publish the tool locations.
        POPPLER_PATH = settings.get('POPPLER_PATH')
        TESSERACT_CMD = settings.get('TESSERACT_CMD')
        pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
        print(f"Poppler路径: {POPPLER_PATH}")
        print(f"Tesseract路径: {TESSERACT_CMD}")

        # Refuse to start when a configured path does not exist.
        if not validate_environment():
            print("环境验证失败")
            logger.error("环境验证失败")
            messagebox.showerror("错误", "环境配置不正确,请检查配置!")
            return

        # Hand control to the main window.
        print("启动GUI界面...")
        logger.info("启动GUI界面")
        window = PDFConverterGUI()
        window.root.mainloop()
    except Exception as e:
        report = f"程序运行出错: {str(e)}\n{traceback.format_exc()}"
        print(report)
        logger.error(report)
        messagebox.showerror("错误", f"程序运行出错:{str(e)}")
    finally:
        # Make sure every log record reaches its destination.
        for sink in logger.handlers:
            sink.flush()
            sink.close()
# Run the application only when this file is executed as a script.
if __name__ == "__main__":
    main()
# --- Web-page footer captured together with the source; not part of the program. ---
# 推荐本站淘宝优惠价购买喜欢的宝贝:
# 本文链接:https://hqyman.cn/post/11256.html 非本站原创文章欢迎转载,原创文章需保留本站地址!
# 休息一下~~