Files
2026-06-06 19:05:17 +07:00

386 lines
13 KiB
Python

"""Local engine detection and conversion metadata helpers.
The app is offline-first, so high-fidelity conversion depends on tools that
are installed on the user's machine. This module centralizes that discovery so
routes and the UI agree on what is high fidelity, basic fallback, or missing.
"""
from __future__ import annotations
import importlib.util
import os
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Iterable
QUALITY_HIGH = "high"
QUALITY_BASIC = "basic"
QUALITY_UNAVAILABLE = "unavailable"
def find_soffice() -> str | None:
"""Detect LibreOffice. PATH first, then common per-OS install locations."""
found = shutil.which("soffice") or shutil.which("libreoffice")
if found:
return found
import sys
candidates: list[str] = []
if sys.platform == "win32":
program_files = [
os.environ.get("ProgramFiles", r"C:\Program Files"),
os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)"),
os.environ.get("ProgramW6432", r"C:\Program Files"),
]
for pf in program_files:
if pf:
candidates.append(os.path.join(pf, "LibreOffice", "program", "soffice.exe"))
candidates.append(os.path.join(pf, "LibreOffice", "program", "soffice.com"))
elif sys.platform == "darwin":
candidates.append("/Applications/LibreOffice.app/Contents/MacOS/soffice")
else:
candidates.extend([
"/usr/bin/soffice",
"/usr/bin/libreoffice",
"/usr/local/bin/soffice",
"/usr/local/bin/libreoffice",
"/opt/libreoffice/program/soffice",
"/snap/bin/libreoffice",
])
for candidate in candidates:
if candidate and os.path.isfile(candidate):
return candidate
return None
def _package_available(import_name: str) -> bool:
return importlib.util.find_spec(import_name) is not None
def _binary_version(path: str | None, args: Iterable[str]) -> str | None:
if not path:
return None
try:
proc = subprocess.run(
[path, *args],
capture_output=True,
text=True,
timeout=3,
)
except Exception:
return None
text = (proc.stdout or proc.stderr or "").strip()
return text.splitlines()[0][:160] if text else None
def _binary_engine(engine_id: str, label: str, path: str | None,
version_args: Iterable[str], install_hint: str,
quality: str = QUALITY_HIGH) -> dict:
version_args = list(version_args)
return {
"id": engine_id,
"label": label,
"available": bool(path),
"path": path,
"version": _binary_version(path, version_args) if path and version_args else None,
"quality": quality if path else QUALITY_UNAVAILABLE,
"install_hint": install_hint,
"kind": "binary",
}
def _package_engine(engine_id: str, label: str, import_name: str,
install_hint: str, quality: str = QUALITY_HIGH) -> dict:
available = _package_available(import_name)
return {
"id": engine_id,
"label": label,
"available": available,
"path": None,
"version": None,
"quality": quality if available else QUALITY_UNAVAILABLE,
"install_hint": install_hint,
"kind": "python-package",
}
def _combined_package_engine(engine_id: str, label: str, import_names: Iterable[str],
install_hint: str, quality: str = QUALITY_HIGH) -> dict:
missing = [name for name in import_names if not _package_available(name)]
return {
"id": engine_id,
"label": label,
"available": not missing,
"path": None,
"version": None,
"quality": quality if not missing else QUALITY_UNAVAILABLE,
"install_hint": install_hint,
"kind": "python-package",
"missing_packages": missing,
}
def _oda_path() -> str | None:
return shutil.which("ODAFileConverter") or shutil.which("oda_file_converter")
def get_capabilities() -> dict:
soffice = find_soffice()
ffmpeg = shutil.which("ffmpeg")
ffprobe = shutil.which("ffprobe")
tesseract = shutil.which("tesseract")
oda = _oda_path()
engines = {
"libreoffice": _binary_engine(
"libreoffice",
"LibreOffice",
soffice,
["--version"],
"Install LibreOffice locally, then restart this app.",
),
"ffmpeg": _binary_engine(
"ffmpeg", "FFmpeg", ffmpeg, ["-version"],
"Install FFmpeg locally and make sure it is on PATH.",
),
"ffprobe": _binary_engine(
"ffprobe", "FFprobe", ffprobe, ["-version"],
"Install FFmpeg locally; ffprobe ships with it.",
),
"tesseract": _binary_engine(
"tesseract", "Tesseract OCR", tesseract, ["--version"],
"Install the Tesseract binary and required language packs.",
),
"oda": _binary_engine(
"oda", "ODA File Converter", oda, [],
"Install ODA File Converter for DWG support.",
),
"pymupdf": _package_engine(
"pymupdf", "PyMuPDF", "fitz",
"Install PyMuPDF with pip install PyMuPDF.",
),
"pdf2docx": _package_engine(
"pdf2docx", "pdf2docx", "pdf2docx",
"Install pdf2docx with pip install pdf2docx.",
quality="medium",
),
"pdfplumber": _package_engine(
"pdfplumber", "pdfplumber", "pdfplumber",
"Install pdfplumber with pip install pdfplumber.",
quality="medium",
),
"marker": _package_engine(
"marker", "Marker PDF", "marker",
"Install marker-pdf locally; first use downloads local model weights.",
),
"pytesseract": _package_engine(
"pytesseract", "pytesseract", "pytesseract",
"Install pytesseract with pip install pytesseract.",
),
"pyzbar": _package_engine(
"pyzbar", "pyzbar", "pyzbar",
"Install pyzbar and the local ZBar shared library.",
),
"rembg": _combined_package_engine(
"rembg", "rembg", ["rembg", "onnxruntime"],
'Install rembg with CPU support: pip install "rembg[cpu]".',
),
"pillow-heif": _package_engine(
"pillow-heif", "pillow-heif", "pillow_heif",
"Install pillow-heif with pip install pillow-heif.",
),
"whisper": _package_engine(
"whisper", "Whisper", "whisper",
"Install Whisper with pip install openai-whisper.",
),
"python-pptx": _package_engine(
"python-pptx", "python-pptx", "pptx",
"Install python-pptx with pip install python-pptx.",
quality="medium",
),
}
return {
"offline": True,
"engines": engines,
"routes": _route_statuses(engines),
}
ROUTE_REQUIREMENTS = {
"/convert/to-pdf": {
"label": "Files to PDF",
"primary": ["libreoffice"],
"fallback": "Basic Python renderer for images, text, and simple DOCX.",
},
"/convert/html-to-pdf": {
"label": "HTML to PDF",
"primary": ["libreoffice"],
"fallback": "Basic PyMuPDF HTML renderer.",
},
"/spreadsheet/excel-to-pdf": {
"label": "Excel to PDF",
"primary": ["libreoffice"],
"fallback": "Basic ReportLab table renderer.",
},
"/convert/pdf-to-word": {
"label": "PDF to Word",
"primary_any": ["pdf2docx", "marker", "pymupdf"],
"fallback": "Visual-copy and flowing-text modes remain local fallbacks.",
},
"/convert/pdf-to-excel": {
"label": "PDF to Excel",
"primary_any": ["pdfplumber", "pymupdf"],
"fallback": "PyMuPDF table detection.",
},
"/convert/pdf-to-pptx": {
"label": "PDF to PowerPoint",
"primary": ["libreoffice"],
"fallback": "Image-per-slide PowerPoint output.",
},
"/convert/pptx-to-pdf": {
"label": "PowerPoint to PDF",
"primary": ["libreoffice"],
"fallback": None,
},
"/convert/ocr-pdf": {
"label": "OCR PDF",
"primary": ["tesseract", "pytesseract"],
"fallback": None,
},
"/image/svg-to-png": {
"label": "SVG to PNG",
"primary": [],
"fallback": "Browser canvas renderer; server svglib renderer remains available as fallback.",
},
"/image/ocr": {
"label": "Image OCR",
"primary": ["tesseract", "pytesseract"],
"fallback": None,
},
"/media/convert-audio": {"label": "Convert Audio", "primary": ["ffmpeg"], "fallback": None},
"/media/convert-video": {"label": "Convert Video", "primary": ["ffmpeg"], "fallback": None},
"/media/extract-audio": {"label": "Extract Audio", "primary": ["ffmpeg"], "fallback": None},
"/media/trim": {"label": "Trim Media", "primary": ["ffmpeg"], "fallback": None},
"/media/compress-video": {"label": "Compress Video", "primary": ["ffmpeg"], "fallback": None},
"/media/video-to-gif": {"label": "Video to GIF", "primary": ["ffmpeg"], "fallback": None},
"/media/burn-subtitles": {"label": "Burn Subtitles", "primary": ["ffmpeg"], "fallback": None},
"/media/normalize-audio": {"label": "Normalize Audio", "primary": ["ffmpeg"], "fallback": None},
"/media/transcribe": {"label": "Speech to Text", "primary": ["ffmpeg", "whisper"], "fallback": None},
}
def _route_statuses(engines: dict) -> dict:
statuses = {}
for endpoint, req in ROUTE_REQUIREMENTS.items():
primary = req.get("primary", [])
primary_any = req.get("primary_any", [])
if primary:
available = all(engines[e]["available"] for e in primary if e in engines)
elif primary_any:
available = any(engines[e]["available"] for e in primary_any if e in engines)
else:
available = True
if available:
quality = QUALITY_HIGH
status = "High fidelity"
elif req.get("fallback"):
quality = QUALITY_BASIC
status = "Basic fallback"
else:
quality = QUALITY_UNAVAILABLE
status = "Unavailable"
missing = [
e for e in [*primary, *primary_any]
if e in engines and not engines[e]["available"]
]
statuses[endpoint] = {
"label": req["label"],
"quality": quality,
"status": status,
"required_engines": primary or primary_any,
"missing_engines": missing,
"fallback": req.get("fallback"),
}
return statuses
def set_conversion_metadata(response, engine: str, quality: str,
warnings: str | Iterable[str] | None = None):
response.headers["X-Conversion-Engine"] = engine
response.headers["X-Conversion-Quality"] = quality
if warnings:
if isinstance(warnings, str):
warning_text = warnings
else:
warning_text = "; ".join(str(w) for w in warnings if w)
if warning_text:
response.headers["X-Fidelity-Warnings"] = warning_text[:1000]
return response
def metadata_payload(data: dict | None = None, *, engine: str, quality: str,
warnings: Iterable[str] | str | None = None) -> dict:
payload = dict(data or {})
payload["engine"] = engine
payload["quality"] = quality
if warnings:
payload["warnings"] = [warnings] if isinstance(warnings, str) else list(warnings)
return payload
def soffice_convert(file_data: bytes, source_ext: str, target_ext: str = "pdf",
timeout: int = 180) -> bytes | None:
"""Run LibreOffice headless conversion with an isolated user profile."""
soffice = find_soffice()
if not soffice:
return None
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
profile_dir = tmp_path / "lo-profile"
profile_dir.mkdir(parents=True, exist_ok=True)
in_path = tmp_path / f"input.{source_ext.lstrip('.').lower()}"
in_path.write_bytes(file_data)
profile_uri = profile_dir.resolve().as_uri()
cmd = [
soffice,
f"-env:UserInstallation={profile_uri}",
"--headless",
"--nologo",
"--nofirststartwizard",
"--norestore",
"--convert-to",
target_ext,
"--outdir",
str(tmp_path),
str(in_path),
]
try:
proc = subprocess.run(
cmd,
capture_output=True,
timeout=timeout,
)
except (subprocess.TimeoutExpired, FileNotFoundError):
return None
if proc.returncode != 0:
return None
candidates = [
p for p in tmp_path.iterdir()
if p.is_file() and p.suffix.lower() == f".{target_ext.lower()}"
]
if not candidates:
return None
candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True)
return candidates[0].read_bytes()