"""Local engine detection and conversion metadata helpers. The app is offline-first, so high-fidelity conversion depends on tools that are installed on the user's machine. This module centralizes that discovery so routes and the UI agree on what is high fidelity, basic fallback, or missing. """ from __future__ import annotations import importlib.util import os import shutil import subprocess import tempfile from pathlib import Path from typing import Iterable QUALITY_HIGH = "high" QUALITY_BASIC = "basic" QUALITY_UNAVAILABLE = "unavailable" def find_soffice() -> str | None: """Detect LibreOffice. PATH first, then common per-OS install locations.""" found = shutil.which("soffice") or shutil.which("libreoffice") if found: return found import sys candidates: list[str] = [] if sys.platform == "win32": program_files = [ os.environ.get("ProgramFiles", r"C:\Program Files"), os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)"), os.environ.get("ProgramW6432", r"C:\Program Files"), ] for pf in program_files: if pf: candidates.append(os.path.join(pf, "LibreOffice", "program", "soffice.exe")) candidates.append(os.path.join(pf, "LibreOffice", "program", "soffice.com")) elif sys.platform == "darwin": candidates.append("/Applications/LibreOffice.app/Contents/MacOS/soffice") else: candidates.extend([ "/usr/bin/soffice", "/usr/bin/libreoffice", "/usr/local/bin/soffice", "/usr/local/bin/libreoffice", "/opt/libreoffice/program/soffice", "/snap/bin/libreoffice", ]) for candidate in candidates: if candidate and os.path.isfile(candidate): return candidate return None def _package_available(import_name: str) -> bool: return importlib.util.find_spec(import_name) is not None def _binary_version(path: str | None, args: Iterable[str]) -> str | None: if not path: return None try: proc = subprocess.run( [path, *args], capture_output=True, text=True, timeout=3, ) except Exception: return None text = (proc.stdout or proc.stderr or "").strip() return text.splitlines()[0][:160] if text else None def _binary_engine(engine_id: str, label: str, path: str | None, version_args: Iterable[str], install_hint: str, quality: str = QUALITY_HIGH) -> dict: version_args = list(version_args) return { "id": engine_id, "label": label, "available": bool(path), "path": path, "version": _binary_version(path, version_args) if path and version_args else None, "quality": quality if path else QUALITY_UNAVAILABLE, "install_hint": install_hint, "kind": "binary", } def _package_engine(engine_id: str, label: str, import_name: str, install_hint: str, quality: str = QUALITY_HIGH) -> dict: available = _package_available(import_name) return { "id": engine_id, "label": label, "available": available, "path": None, "version": None, "quality": quality if available else QUALITY_UNAVAILABLE, "install_hint": install_hint, "kind": "python-package", } def _combined_package_engine(engine_id: str, label: str, import_names: Iterable[str], install_hint: str, quality: str = QUALITY_HIGH) -> dict: missing = [name for name in import_names if not _package_available(name)] return { "id": engine_id, "label": label, "available": not missing, "path": None, "version": None, "quality": quality if not missing else QUALITY_UNAVAILABLE, "install_hint": install_hint, "kind": "python-package", "missing_packages": missing, } def _oda_path() -> str | None: return shutil.which("ODAFileConverter") or shutil.which("oda_file_converter") def get_capabilities() -> dict: soffice = find_soffice() ffmpeg = shutil.which("ffmpeg") ffprobe = shutil.which("ffprobe") tesseract = shutil.which("tesseract") oda = _oda_path() engines = { "libreoffice": _binary_engine( "libreoffice", "LibreOffice", soffice, ["--version"], "Install LibreOffice locally, then restart this app.", ), "ffmpeg": _binary_engine( "ffmpeg", "FFmpeg", ffmpeg, ["-version"], "Install FFmpeg locally and make sure it is on PATH.", ), "ffprobe": _binary_engine( "ffprobe", "FFprobe", ffprobe, ["-version"], "Install FFmpeg locally; ffprobe ships with it.", ), "tesseract": _binary_engine( "tesseract", "Tesseract OCR", tesseract, ["--version"], "Install the Tesseract binary and required language packs.", ), "oda": _binary_engine( "oda", "ODA File Converter", oda, [], "Install ODA File Converter for DWG support.", ), "pymupdf": _package_engine( "pymupdf", "PyMuPDF", "fitz", "Install PyMuPDF with pip install PyMuPDF.", ), "pdf2docx": _package_engine( "pdf2docx", "pdf2docx", "pdf2docx", "Install pdf2docx with pip install pdf2docx.", quality="medium", ), "pdfplumber": _package_engine( "pdfplumber", "pdfplumber", "pdfplumber", "Install pdfplumber with pip install pdfplumber.", quality="medium", ), "marker": _package_engine( "marker", "Marker PDF", "marker", "Install marker-pdf locally; first use downloads local model weights.", ), "pytesseract": _package_engine( "pytesseract", "pytesseract", "pytesseract", "Install pytesseract with pip install pytesseract.", ), "pyzbar": _package_engine( "pyzbar", "pyzbar", "pyzbar", "Install pyzbar and the local ZBar shared library.", ), "rembg": _combined_package_engine( "rembg", "rembg", ["rembg", "onnxruntime"], 'Install rembg with CPU support: pip install "rembg[cpu]".', ), "pillow-heif": _package_engine( "pillow-heif", "pillow-heif", "pillow_heif", "Install pillow-heif with pip install pillow-heif.", ), "whisper": _package_engine( "whisper", "Whisper", "whisper", "Install Whisper with pip install openai-whisper.", ), "python-pptx": _package_engine( "python-pptx", "python-pptx", "pptx", "Install python-pptx with pip install python-pptx.", quality="medium", ), } return { "offline": True, "engines": engines, "routes": _route_statuses(engines), } ROUTE_REQUIREMENTS = { "/convert/to-pdf": { "label": "Files to PDF", "primary": ["libreoffice"], "fallback": "Basic Python renderer for images, text, and simple DOCX.", }, "/convert/html-to-pdf": { "label": "HTML to PDF", "primary": ["libreoffice"], "fallback": "Basic PyMuPDF HTML renderer.", }, "/spreadsheet/excel-to-pdf": { "label": "Excel to PDF", "primary": ["libreoffice"], "fallback": "Basic ReportLab table renderer.", }, "/convert/pdf-to-word": { "label": "PDF to Word", "primary_any": ["pdf2docx", "marker", "pymupdf"], "fallback": "Visual-copy and flowing-text modes remain local fallbacks.", }, "/convert/pdf-to-excel": { "label": "PDF to Excel", "primary_any": ["pdfplumber", "pymupdf"], "fallback": "PyMuPDF table detection.", }, "/convert/pdf-to-pptx": { "label": "PDF to PowerPoint", "primary": ["libreoffice"], "fallback": "Image-per-slide PowerPoint output.", }, "/convert/pptx-to-pdf": { "label": "PowerPoint to PDF", "primary": ["libreoffice"], "fallback": None, }, "/convert/ocr-pdf": { "label": "OCR PDF", "primary": ["tesseract", "pytesseract"], "fallback": None, }, "/image/svg-to-png": { "label": "SVG to PNG", "primary": [], "fallback": "Browser canvas renderer; server svglib renderer remains available as fallback.", }, "/image/ocr": { "label": "Image OCR", "primary": ["tesseract", "pytesseract"], "fallback": None, }, "/media/convert-audio": {"label": "Convert Audio", "primary": ["ffmpeg"], "fallback": None}, "/media/convert-video": {"label": "Convert Video", "primary": ["ffmpeg"], "fallback": None}, "/media/extract-audio": {"label": "Extract Audio", "primary": ["ffmpeg"], "fallback": None}, "/media/trim": {"label": "Trim Media", "primary": ["ffmpeg"], "fallback": None}, "/media/compress-video": {"label": "Compress Video", "primary": ["ffmpeg"], "fallback": None}, "/media/video-to-gif": {"label": "Video to GIF", "primary": ["ffmpeg"], "fallback": None}, "/media/burn-subtitles": {"label": "Burn Subtitles", "primary": ["ffmpeg"], "fallback": None}, "/media/normalize-audio": {"label": "Normalize Audio", "primary": ["ffmpeg"], "fallback": None}, "/media/transcribe": {"label": "Speech to Text", "primary": ["ffmpeg", "whisper"], "fallback": None}, } def _route_statuses(engines: dict) -> dict: statuses = {} for endpoint, req in ROUTE_REQUIREMENTS.items(): primary = req.get("primary", []) primary_any = req.get("primary_any", []) if primary: available = all(engines[e]["available"] for e in primary if e in engines) elif primary_any: available = any(engines[e]["available"] for e in primary_any if e in engines) else: available = True if available: quality = QUALITY_HIGH status = "High fidelity" elif req.get("fallback"): quality = QUALITY_BASIC status = "Basic fallback" else: quality = QUALITY_UNAVAILABLE status = "Unavailable" missing = [ e for e in [*primary, *primary_any] if e in engines and not engines[e]["available"] ] statuses[endpoint] = { "label": req["label"], "quality": quality, "status": status, "required_engines": primary or primary_any, "missing_engines": missing, "fallback": req.get("fallback"), } return statuses def set_conversion_metadata(response, engine: str, quality: str, warnings: str | Iterable[str] | None = None): response.headers["X-Conversion-Engine"] = engine response.headers["X-Conversion-Quality"] = quality if warnings: if isinstance(warnings, str): warning_text = warnings else: warning_text = "; ".join(str(w) for w in warnings if w) if warning_text: response.headers["X-Fidelity-Warnings"] = warning_text[:1000] return response def metadata_payload(data: dict | None = None, *, engine: str, quality: str, warnings: Iterable[str] | str | None = None) -> dict: payload = dict(data or {}) payload["engine"] = engine payload["quality"] = quality if warnings: payload["warnings"] = [warnings] if isinstance(warnings, str) else list(warnings) return payload def soffice_convert(file_data: bytes, source_ext: str, target_ext: str = "pdf", timeout: int = 180) -> bytes | None: """Run LibreOffice headless conversion with an isolated user profile.""" soffice = find_soffice() if not soffice: return None with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) profile_dir = tmp_path / "lo-profile" profile_dir.mkdir(parents=True, exist_ok=True) in_path = tmp_path / f"input.{source_ext.lstrip('.').lower()}" in_path.write_bytes(file_data) profile_uri = profile_dir.resolve().as_uri() cmd = [ soffice, f"-env:UserInstallation={profile_uri}", "--headless", "--nologo", "--nofirststartwizard", "--norestore", "--convert-to", target_ext, "--outdir", str(tmp_path), str(in_path), ] try: proc = subprocess.run( cmd, capture_output=True, timeout=timeout, ) except (subprocess.TimeoutExpired, FileNotFoundError): return None if proc.returncode != 0: return None candidates = [ p for p in tmp_path.iterdir() if p.is_file() and p.suffix.lower() == f".{target_ext.lower()}" ] if not candidates: return None candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True) return candidates[0].read_bytes()