mirror of
https://codeberg.org/listyantidewi/your-everyday-tools.git
synced 2026-07-01 23:17:37 +08:00
2106 lines
88 KiB
Python
2106 lines
88 KiB
Python
import io
|
||
import importlib.util
|
||
from flask import Blueprint, render_template, request, send_file, jsonify
|
||
from PIL import Image, ImageOps
|
||
import img2pdf
|
||
from docx import Document as DocxDocument
|
||
from reportlab.lib.pagesizes import A4
|
||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||
from reportlab.lib.units import inch
|
||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
|
||
from reportlab.lib import colors
|
||
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
|
||
from utils.pymupdf import import_pymupdf
|
||
|
||
# PyMuPDF module handle, obtained through the project's import helper.
fitz = import_pymupdf()

# Optional dependency: pdf2docx. HAS_PDF2DOCX records whether the import worked.
try:
    from pdf2docx import Converter as Pdf2DocxConverter
    HAS_PDF2DOCX = True
except ImportError:
    HAS_PDF2DOCX = False

# Marker is loaded lazily inside the route to avoid heavy model/module work
# on server start. We only check package presence here.
HAS_MARKER = importlib.util.find_spec("marker") is not None

# Optional dependency: pytesseract. This only proves the Python package
# imports; it does not check anything beyond that.
try:
    import pytesseract
    HAS_TESSERACT = True
except ImportError:
    HAS_TESSERACT = False

# Optional dependency: pdfplumber.
try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False

# CAD support needs both ezdxf and matplotlib. Presence check only; the real
# imports happen in _load_cad_modules().
HAS_EZDXF = (
    importlib.util.find_spec("ezdxf") is not None
    and importlib.util.find_spec("matplotlib") is not None
)

from routes._helpers import safe_int, safe_float, log_error, NO_FILE_SINGLE, NO_FILE_MULTIPLE
from utils.capabilities import (
    QUALITY_BASIC,
    QUALITY_HIGH,
    find_soffice,
    set_conversion_metadata,
    soffice_convert,
)
import shutil

# External converter executables, resolved once at import time. Both values
# are tested for truthiness by the page routes below.
ODA_CONVERTER = shutil.which("ODAFileConverter") or shutil.which("oda_file_converter")
SOFFICE = find_soffice()

# Optional dependency: python-pptx.
try:
    from pptx import Presentation
    from pptx.util import Emu
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False

# Blueprint that every route in this module registers on.
bp = Blueprint("convert", __name__)
|
||
|
||
|
||
def _load_cad_modules():
    """Import the optional CAD rendering stack on demand.

    The imports are kept out of module scope so the server can start without
    ezdxf/matplotlib installed.

    Returns:
        The tuple ``(ezdxf, RenderContext, Frontend, MatplotlibBackend, plt)``.

    Raises:
        ImportError: if ezdxf or matplotlib cannot be imported.
    """
    # Select the non-interactive backend before any other module gets a
    # chance to import pyplot (the ezdxf matplotlib backend module may do so),
    # so no GUI backend is ever picked up on a headless server.
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    import ezdxf
    from ezdxf.addons.drawing import RenderContext, Frontend
    from ezdxf.addons.drawing.matplotlib import MatplotlibBackend

    return ezdxf, RenderContext, Frontend, MatplotlibBackend, plt
|
||
|
||
|
||
# ── LibreOffice availability note (PPT/ODP/DOC conversion) ──────
|
||
|
||
def _soffice_available_notes():
    """Build the HTML status note about LibreOffice availability.

    Returns:
        An HTML string. When the module-level ``SOFFICE`` value is truthy the
        note reports the detected path; otherwise it is a warning followed by
        collapsible per-OS installation instructions.
    """
    if not SOFFICE:
        install_help = (
            '<p><i class="bi bi-exclamation-triangle-fill" style="color:#ffb703"></i> '
            '<strong>LibreOffice was not found on PATH.</strong> '
            'This tool will not work until LibreOffice is installed.</p>'
            '<details><summary>How to install LibreOffice</summary>'
            '<p><strong>Windows:</strong> Download from '
            '<a href="https://www.libreoffice.org/download/download-libreoffice/" target="_blank">libreoffice.org</a> '
            'and add the install’s <code>program</code> folder '
            '(usually <code>C:\\Program Files\\LibreOffice\\program</code>) to your PATH, '
            'then restart the server.</p>'
            '<p><strong>macOS:</strong> <code>brew install --cask libreoffice</code> '
            '(the <code>soffice</code> binary lives at '
            '<code>/Applications/LibreOffice.app/Contents/MacOS/soffice</code>).</p>'
            '<p><strong>Linux:</strong> <code>sudo apt install libreoffice</code> '
            '(Debian/Ubuntu) or <code>sudo dnf install libreoffice</code> (Fedora).</p>'
            '<p>Restart the server after installing so the new PATH is picked up.</p>'
            '</details>'
        )
        return install_help

    detected_note = (
        '<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
        f'<strong>LibreOffice detected:</strong> <code>{SOFFICE}</code></p>'
    )
    return detected_note
|
||
|
||
|
||
def _soffice_convert(file_data: bytes, source_ext: str, target_ext: str = "pdf",
                     timeout: int = 180):
    """Forward a conversion request to the shared ``soffice_convert`` helper.

    Compatibility wrapper: the arguments are passed through positionally and
    unchanged, and the helper's result is returned as-is. Callers in this
    module treat a ``None`` result as "LibreOffice conversion not available".
    """
    converted = soffice_convert(file_data, source_ext, target_ext, timeout)
    return converted
|
||
|
||
|
||
# ── Page Routes ──────────────────────────────────
|
||
|
||
@bp.route("/to-pdf")
def to_pdf_page():
    """Render the "Files to PDF" upload page.

    The notes panel depends on the module-level ``SOFFICE`` value: either a
    confirmation showing the detected LibreOffice path, or a warning listing
    the limits of the built-in .docx fallback plus install instructions.
    """
    if not SOFFICE:
        notes = (
            '<p><i class="bi bi-exclamation-triangle-fill" style="color:#ffb703"></i> '
            '<strong>LibreOffice was not found.</strong> Word files (.docx) will use a built-in '
            'fallback that only handles paragraphs, tables, basic formatting, and inline images. '
            'It will <strong>NOT</strong> preserve: custom fonts, headers/footers, columns, '
            'page breaks, text boxes, frames, SmartArt, or precise positioning.</p>'
            '<p><strong>For high-fidelity Word→PDF, install LibreOffice:</strong></p>'
            '<ul style="margin:.4rem 0 .6rem 1.2rem">'
            '<li><strong>Windows:</strong> Download from '
            '<a href="https://www.libreoffice.org/download/download-libreoffice/" target="_blank">libreoffice.org</a> '
            'and install with default options. The app auto-detects it at <code>C:\\Program Files\\LibreOffice\\</code> '
            'on next start — no PATH editing needed.</li>'
            '<li><strong>macOS:</strong> <code>brew install --cask libreoffice</code></li>'
            '<li><strong>Linux:</strong> <code>sudo apt install libreoffice</code> (Debian/Ubuntu) '
            'or <code>sudo dnf install libreoffice</code> (Fedora)</li>'
            '</ul>'
            '<p style="font-size:.9em;color:var(--muted)">Restart the server after installing.</p>'
        )
    else:
        notes = (
            '<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
            f'<strong>LibreOffice detected at <code>{SOFFICE}</code></strong> — Word documents '
            'will convert with full layout fidelity (fonts, images, tables, columns, '
            'headers/footers all preserved).</p>'
        )

    # Opt-in switch for the lower-fidelity python-docx/reportlab path.
    fallback_toggle = {
        "type": "checkbox",
        "name": "use_basic_fallback",
        "label": "Fallback",
        "check_label": "Allow basic Python fallback if LibreOffice is unavailable or fails",
        "default": False,
    }

    return render_template(
        "upload_tool.html",
        title="Files to PDF",
        description="Convert images, Word documents, and text files to PDF",
        notes=notes,
        endpoint="/convert/to-pdf",
        accept=".jpg,.jpeg,.png,.bmp,.tiff,.webp,.txt,.docx,.doc,.odt",
        multiple=True,
        options=[fallback_toggle],
    )
|
||
|
||
|
||
@bp.route("/pdf-to-word")
def pdf_to_word_page():
    """Render the "PDF to Word" upload page.

    The notes describe every entry of the ``mode`` select, in the same order
    as the choices. The Marker bullet reflects ``HAS_MARKER`` (package
    presence only).
    """
    if HAS_MARKER:
        marker_state = 'Detected and ready.'
    else:
        marker_state = (
            'Not installed — run <code>pip install marker-pdf</code>. '
            'First run downloads ~2 GB of models. Conversion is slow on CPU (30–60s/page).'
        )
    marker_status = (
        '<li><strong>Marker (ML)</strong> — uses an ML model for structure understanding. '
        'Best fidelity for academic papers, books, and complex documents. <em>'
        + marker_state
        + '</em></li>'
    )

    # The select below offers five modes, so the notes must list five; the
    # "Exact visual copy" mode was previously missing from the explanation.
    notes = (
        '<p><strong>Five conversion modes — pick the one that fits your document:</strong></p>'
        '<ul style="margin:.4rem 0 .6rem 1.2rem">'
        '<li><strong>Layout (default)</strong> — uses <code>pdf2docx</code> to preserve tables, columns, and figure positions. '
        'Best for forms, reports, invoices.</li>'
        '<li><strong>Exact visual copy</strong> — places each page in the Word document as an image '
        'rendered at the chosen DPI. Looks identical to the PDF, but the content is not editable.</li>'
        '<li><strong>Smart structure</strong> — analyses font sizes to detect headings, lists, and paragraphs, '
        'and emits a Word doc with proper heading styles (visible in Word\'s navigation pane). '
        'Best for articles, blog posts, books, and documentation. Drops tables and figures.</li>'
        '<li><strong>Flowing text</strong> — extracts text in reading order, emits one paragraph per block. '
        'No structure detection. Always produces clean output even on awkward PDFs.</li>'
        f'{marker_status}'
        '</ul>'
        '<p style="font-size:.9em;color:var(--muted)">If your PDF is a scan, run it through <a href="/convert/ocr-pdf">OCR PDF</a> first.</p>'
    )

    options = [
        {"type": "select", "name": "mode", "label": "Mode", "default": "layout",
         "choices": [
             {"value": "layout", "label": "Layout — preserve tables, columns, figures"},
             {"value": "exact", "label": "Exact visual copy — non-editable page images"},
             {"value": "structure", "label": "Smart structure — detect headings & lists"},
             {"value": "text", "label": "Flowing text — clean paragraphs, no structure"},
             {"value": "marker", "label": "Marker (ML) — best fidelity, slow, needs install"},
         ]},
        # Only relevant when the "exact" mode is selected.
        {"type": "number", "name": "exact_dpi", "label": "Exact visual copy DPI",
         "default": 180, "min": 96, "max": 300, "depends_on": {"mode": "exact"}},
        {"type": "text", "name": "pages", "label": "Pages (blank = all)",
         "placeholder": "e.g. 1-3, 5, 8-10"},
        {"type": "checkbox", "name": "extract_tables",
         "label": "Layout mode: detect borderless tables",
         "check_label": "Try harder to find tables (slower, sometimes invents tables)",
         "default": False},
    ]

    return render_template("upload_tool.html",
                           title="PDF to Word",
                           description="Convert PDF documents to Word (.docx) format",
                           notes=notes,
                           endpoint="/convert/pdf-to-word",
                           accept=".pdf",
                           multiple=False,
                           options=options)
|
||
|
||
|
||
@bp.route("/pdf-to-images")
def pdf_to_images_page():
    """Render the "PDF to Images" upload page with format and DPI options."""
    notes = (
        '<p><strong>What this does:</strong> renders each PDF page as a raster image. '
        'Output is one image per page, bundled as a ZIP if there are multiple pages.</p>'
        '<p><strong>DPI guide:</strong> 72 = screen quality, 150 = good for slides, '
        '200 = good for print preview, 300 = print quality, 600 = archival. '
        'Higher DPI = larger files (a 10-page PDF at 600 DPI can be 50+ MB).</p>'
        '<p><strong>Format:</strong> PNG is lossless and best for diagrams / text-heavy pages. '
        'JPG is smaller but lossy — best for photo-heavy pages.</p>'
        '<p style="font-size:.9em;color:var(--muted)"><strong>No external dependencies.</strong></p>'
    )
    format_option = {
        "type": "select", "name": "format", "label": "Image Format",
        "choices": [
            {"value": "png", "label": "PNG"},
            {"value": "jpg", "label": "JPG"},
        ],
    }
    dpi_option = {
        "type": "number", "name": "dpi", "label": "Resolution (DPI)",
        "default": 200, "min": 72, "max": 600,
    }
    return render_template(
        "upload_tool.html",
        title="PDF to Images",
        description="Convert each PDF page to an image",
        notes=notes,
        endpoint="/convert/pdf-to-images",
        accept=".pdf",
        multiple=False,
        options=[format_option, dpi_option],
    )
|
||
|
||
|
||
@bp.route("/pdf-to-text")
def pdf_to_text_page():
    """Render the "PDF to Text" upload page (single PDF, no options)."""
    notes = (
        '<p><strong>What this does:</strong> pulls all extractable text out of the PDF, '
        'page by page, in reading order.</p>'
        '<p><strong>Important:</strong> this only works on PDFs that <em>contain</em> '
        'real text. If your PDF is a scan (photographed/scanned pages stored as images), '
        'no text will be extracted — run it through '
        '<a href="/convert/ocr-pdf">OCR PDF</a> first to recognise the text, then come back here.</p>'
        '<p style="font-size:.9em;color:var(--muted)"><strong>No external dependencies.</strong></p>'
    )
    return render_template(
        "upload_tool.html",
        title="PDF to Text",
        description="Extract all text content from a PDF document",
        notes=notes,
        endpoint="/convert/pdf-to-text",
        accept=".pdf",
        multiple=False,
        options=[],
    )
|
||
|
||
|
||
@bp.route("/md-to-pdf")
def md_to_pdf_page():
    """Render the dedicated Markdown-to-PDF tool template."""
    template_name = "tools/md_to_pdf.html"
    return render_template(template_name)
|
||
|
||
|
||
@bp.route("/md-to-docx")
def md_to_docx_page():
    """Render the dedicated Markdown-to-DOCX tool template."""
    template_name = "tools/md_to_docx.html"
    return render_template(template_name)
|
||
|
||
|
||
@bp.route("/pdf-to-excel")
def pdf_to_excel_page():
    """Render the "PDF to Excel" upload page.

    Offers a page filter plus four selects: detection strategy, table engine,
    extraction mode, and sheet organization.
    """
    notes = (
        "<p><strong>How table detection works:</strong> we try both detection strategies in "
        "order of accuracy:</p>"
        "<ul style='margin:.4rem 0 .6rem 1.2rem'>"
        "<li><strong>Auto (recommended)</strong> — tries ruled-line detection first; if a "
        "page has no visible table borders, falls back to text-alignment detection (catches "
        "borderless tables in financial reports, invoices, schedules).</li>"
        "<li><strong>Lines only</strong> — only tables with visible borders. Most accurate "
        "but misses borderless tables.</li>"
        "<li><strong>Text alignment only</strong> — finds tables by detecting columns of "
        "aligned text. Catches borderless tables but can occasionally false-positive on "
        "multi-column body text.</li>"
        "</ul>"
        "<p style='font-size:.9em;color:var(--muted)'><strong>Still get \"no tables found\"?</strong> "
        "Try our <a href='/convert/pdf-to-word'>PDF to Word</a> tool in <em>Layout</em> mode "
        "instead — it uses <code>pdf2docx</code> which is more aggressive about table "
        "detection. If your PDF is scanned, run it through "
        "<a href='/convert/ocr-pdf'>OCR PDF</a> first.</p>"
    )

    pages_option = {
        "type": "text", "name": "pages", "label": "Pages (leave empty for all)",
        "placeholder": "e.g. 1-3, 5",
    }
    strategy_option = {
        "type": "select", "name": "strategy", "label": "Table detection strategy", "default": "auto",
        "choices": [
            {"value": "auto", "label": "Auto — lines first, fall back to text alignment"},
            {"value": "lines", "label": "Lines only (ruled tables)"},
            {"value": "text", "label": "Text alignment only (borderless tables)"},
        ],
    }
    engine_option = {
        "type": "select", "name": "table_engine", "label": "Table engine", "default": "auto",
        "choices": [
            {"value": "auto", "label": "Auto — pdfplumber if installed, then PyMuPDF"},
            {"value": "pymupdf", "label": "PyMuPDF built-in"},
            {"value": "pdfplumber", "label": "pdfplumber (optional, often better on borderless tables)"},
        ],
    }
    mode_option = {
        "type": "select", "name": "mode", "label": "Extraction mode", "default": "tables",
        "choices": [
            {"value": "tables", "label": "Tables only (recommended)"},
            {"value": "tables_text", "label": "Tables, fall back to text rows when none are found"},
            {"value": "text", "label": "Text only — every line becomes a row"},
        ],
    }
    organize_option = {
        "type": "select", "name": "organize", "label": "Sheet organization", "default": "per_table",
        "choices": [
            {"value": "per_table", "label": "One sheet per detected table"},
            {"value": "per_page", "label": "One sheet per page (tables stacked)"},
            {"value": "combined", "label": "Everything on one sheet"},
        ],
    }

    return render_template(
        "upload_tool.html",
        title="PDF to Excel",
        description="Extract tables from a PDF into an .xlsx workbook",
        notes=notes,
        endpoint="/convert/pdf-to-excel",
        accept=".pdf",
        multiple=False,
        options=[pages_option, strategy_option, engine_option, mode_option, organize_option],
    )
|
||
|
||
|
||
# Choices for the OCR language select: "value" is the language code submitted
# by the form, "label" is the text shown to the user.
OCR_LANGS = [
    {"value": code, "label": label}
    for code, label in (
        ("eng", "English"),
        ("ind", "Indonesian"),
        ("fra", "French"),
        ("deu", "German"),
        ("spa", "Spanish"),
        ("ita", "Italian"),
        ("por", "Portuguese"),
        ("rus", "Russian"),
        ("chi_sim", "Chinese (Simplified)"),
        ("chi_tra", "Chinese (Traditional)"),
        ("jpn", "Japanese"),
        ("kor", "Korean"),
        ("ara", "Arabic"),
        ("hin", "Hindi"),
    )
]
|
||
|
||
|
||
@bp.route("/ocr-pdf")
def ocr_pdf_page():
    """Render the "OCR PDF" upload page.

    The first paragraph of the notes depends on ``HAS_TESSERACT``: a "ready"
    message when the pytesseract import succeeded, otherwise install steps.
    """
    if not HAS_TESSERACT:
        status = (
            '<p><i class="bi bi-exclamation-triangle-fill" style="color:#ffb703"></i> '
            '<strong>OCR is unavailable.</strong> Two things to install:</p>'
            '<ol style="margin:.4rem 0 .6rem 1.2rem">'
            '<li>The <code>pytesseract</code> Python package: <code>pip install pytesseract</code></li>'
            '<li>The Tesseract binary itself: '
            '<a href="https://github.com/tesseract-ocr/tesseract" target="_blank">github.com/tesseract-ocr/tesseract</a> '
            '(Windows installers, <code>brew install tesseract</code> on macOS, '
            '<code>apt install tesseract-ocr</code> on Linux)</li>'
            '</ol>'
            '<p>Then for non-English OCR, download the matching <code>*.traineddata</code> '
            'file from <a href="https://github.com/tesseract-ocr/tessdata" target="_blank">tessdata</a> '
            'into your Tesseract install\'s <code>tessdata</code> folder.</p>'
        )
    else:
        status = (
            '<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
            '<strong>OCR is ready.</strong> Tesseract Python bindings detected. '
            'Make sure the language pack you select is installed in your Tesseract '
            '<code>tessdata</code> directory — you\'ll get a clear error if it isn\'t.</p>'
        )

    # Status paragraph first, then the static explanation of the output modes.
    notes = status + (
        '<p><strong>Two output modes:</strong></p>'
        '<ul style="margin:.4rem 0 .6rem 1.2rem">'
        '<li><strong>Searchable PDF</strong> — keeps the original page images and adds an '
        'invisible text layer underneath, so you can copy-paste and search. The PDF still '
        '<em>looks</em> identical to the scan.</li>'
        '<li><strong>Extracted text</strong> — just the recognised text, plain.</li>'
        '</ul>'
        '<p style="font-size:.9em;color:var(--muted)">Higher DPI = better OCR accuracy '
        'but slower. 200 DPI is the sweet spot for most scans; bump to 300+ for small '
        'fonts or low-quality scans.</p>'
    )

    output_option = {
        "type": "select", "name": "mode", "label": "Output",
        "choices": [
            {"value": "searchable", "label": "Searchable PDF (image + text layer)"},
            {"value": "text", "label": "Extracted text only"},
        ],
    }
    language_option = {
        "type": "select", "name": "lang", "label": "Language",
        "choices": OCR_LANGS,
    }
    dpi_option = {
        "type": "number", "name": "dpi", "label": "OCR Resolution (DPI)",
        "default": 200, "min": 100, "max": 400,
    }

    return render_template(
        "upload_tool.html",
        title="OCR PDF",
        description="Extract text from scanned PDFs or create a searchable PDF with a hidden text layer",
        notes=notes,
        endpoint="/convert/ocr-pdf",
        accept=".pdf",
        multiple=False,
        options=[output_option, language_option, dpi_option],
    )
|
||
|
||
|
||
@bp.route("/cad-to-pdf")
def cad_to_pdf_page():
    """Render the "CAD to PDF/Image" upload page.

    The notes depend on ``ODA_CONVERTER``: a confirmation with the detected
    executable path, or instructions for installing ODA File Converter.
    """
    if not ODA_CONVERTER:
        notes = (
            '<p><strong>DXF works out of the box.</strong> DWG files need the free '
            '<a href="https://www.opendesign.com/guestfiles/oda_file_converter" target="_blank" rel="noopener">'
            'ODA File Converter</a> installed and available on your system <code>PATH</code>.</p>'
            '<details>'
            '<summary>How to install ODA File Converter</summary>'
            '<ol>'
            '<li>Download the installer for your OS from '
            '<a href="https://www.opendesign.com/guestfiles/oda_file_converter" target="_blank" rel="noopener">opendesign.com</a> '
            '(free, guest download — no account required).</li>'
            '<li>Run the installer. Defaults are fine.</li>'
            '<li><strong>Add it to your PATH so this app can find it:</strong>'
            '<ul>'
            '<li><strong>Windows:</strong> add <code>C:\\Program Files\\ODA\\ODAFileConverter_title_version</code> '
            '(the folder containing <code>ODAFileConverter.exe</code>) to your <em>System Environment Variables</em> → <code>Path</code>.</li>'
            '<li><strong>macOS:</strong> <code>ln -s /Applications/ODAFileConverter.app/Contents/MacOS/ODAFileConverter /usr/local/bin/ODAFileConverter</code></li>'
            '<li><strong>Linux:</strong> the <code>.deb</code>/<code>.rpm</code> package installs <code>ODAFileConverter</code> on PATH automatically. Otherwise symlink the binary into <code>/usr/local/bin</code>.</li>'
            '</ul></li>'
            '<li>Open a new terminal and verify: <code>ODAFileConverter</code> (should launch the tool GUI, or exit silently).</li>'
            '<li><strong>Restart this Flask server</strong> so it picks up the updated PATH.</li>'
            '</ol>'
            '<p style="margin-top:.4rem">Alternative: open your DWG in free tools like <a href="https://www.autodesk.com/viewers" target="_blank" rel="noopener">Autodesk Viewer</a>, LibreCAD, or QCAD and export it as DXF, then upload the DXF here.</p>'
            '</details>'
        )
    else:
        notes = (
            '<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
            '<strong>DWG support is enabled.</strong> ODA File Converter was detected at '
            f'<code>{ODA_CONVERTER}</code>.</p>'
            '<p>DXF files are rendered directly. DWG files are auto-converted to DXF first.</p>'
        )

    format_option = {
        "type": "select", "name": "format", "label": "Output Format",
        "choices": [
            {"value": "pdf", "label": "PDF"},
            {"value": "png", "label": "PNG"},
        ],
    }
    # DPI applies to PNG output only, hence the dependency on the format select.
    dpi_option = {
        "type": "number", "name": "dpi", "label": "PNG Resolution (DPI)",
        "default": 150, "min": 72, "max": 600,
        "depends_on": {"format": "png"},
    }

    return render_template(
        "upload_tool.html",
        title="CAD to PDF/Image",
        description="Convert DXF drawings to PDF or PNG. DWG is supported when ODA File Converter is installed.",
        notes=notes,
        endpoint="/convert/cad-to-pdf",
        accept=".dxf,.dwg",
        multiple=False,
        options=[format_option, dpi_option],
    )
|
||
|
||
|
||
@bp.route("/html-to-pdf")
def html_to_pdf_page():
    """Render the "HTML to PDF" page, which takes pasted HTML instead of a file.

    The note shown depends on ``SOFFICE``: a confirmation when LibreOffice was
    found, otherwise a tip explaining the limits of the PyMuPDF fallback.
    """
    if not SOFFICE:
        notes = (
            '<p><i class="bi bi-info-circle-fill" style="color:#4361ee"></i> '
            '<strong>Tip:</strong> install LibreOffice for far better CSS / table / image fidelity. '
            'Without it, PDF rendering uses PyMuPDF\'s minimal HTML parser (basic text and simple tables only).</p>'
        )
    else:
        notes = (
            '<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
            '<strong>LibreOffice detected</strong> — full CSS support, tables, lists, and inline styles render correctly.</p>'
        )

    fallback_toggle = {
        "type": "checkbox",
        "name": "use_basic_fallback",
        "label": "Fallback",
        "check_label": "Allow basic PyMuPDF fallback if LibreOffice is unavailable or fails",
        "default": False,
    }

    return render_template(
        "upload_tool.html",
        title="HTML to PDF",
        description="Convert HTML content to a PDF document",
        notes=notes,
        endpoint="/convert/html-to-pdf",
        text_input=True,
        text_label="HTML Content",
        text_placeholder="<h1>Hello World</h1>\n<p>Paste your HTML here...</p>",
        accept="",
        multiple=False,
        options=[fallback_toggle],
        button_text="Convert to PDF",
    )
|
||
|
||
|
||
# ── Helpers ──────────────────────────────────────
|
||
|
||
def _docx_to_pdf(data: bytes) -> bytes:
    """Best-effort .docx → PDF conversion using python-docx + reportlab.

    This is the fallback path used when LibreOffice is not available.
    It preserves document order (paragraphs and tables interleaved correctly),
    inline images, and basic heading/paragraph styling. It does NOT preserve:
    headers/footers, columns, custom fonts, page breaks, text boxes, frames,
    SmartArt, equations, or precise layout. For those, install LibreOffice.

    Args:
        data: Raw bytes of the .docx file.

    Returns:
        The generated PDF as bytes.

    Errors raised while opening the document or building the PDF are not
    caught here; the calling route handles them.
    """
    from xml.sax.saxutils import escape

    from docx.oxml.ns import qn
    from docx.text.paragraph import Paragraph as DocxParagraph
    from docx.table import Table as DocxTable
    from reportlab.platypus import Image as RLImage
    from PIL import Image as PILImage

    doc = DocxDocument(io.BytesIO(data))
    buf = io.BytesIO()

    styles = getSampleStyleSheet()
    normal = styles["Normal"]
    normal.fontName = "Helvetica"
    normal.fontSize = 11
    normal.leading = 14

    heading_styles = {
        level: ParagraphStyle(
            f"Heading{level}", parent=normal,
            fontName="Helvetica-Bold", fontSize=size, leading=size + 4,
            spaceBefore=12, spaceAfter=6,
        )
        for level, size in ((1, 18), (2, 15), (3, 13))
    }

    pdf = SimpleDocTemplate(buf, pagesize=A4,
                            leftMargin=inch, rightMargin=inch,
                            topMargin=inch, bottomMargin=inch)
    story = []

    # Map of relationship-id → raw image bytes, used to look up images
    # referenced by <a:blip r:embed="rId123" /> elements in paragraphs.
    image_parts: dict[str, bytes] = {}
    for rel_id, rel in doc.part.rels.items():
        if "image" not in (rel.reltype or ""):
            continue
        try:
            image_parts[rel_id] = rel.target_part.blob
        except Exception:
            # Best effort: a relationship whose part cannot be read (for
            # example a linked rather than embedded image) is skipped on its
            # own, so it no longer discards every image that follows it.
            continue

    # Page content area for image scaling (A4 minus 1in margins)
    max_img_w = 6.5 * inch
    max_img_h = 4.0 * inch  # cap height so images don't dominate

    def _emit_image(rel_id: str) -> None:
        """Append the image behind *rel_id* to the story, scaled to fit."""
        img_bytes = image_parts.get(rel_id)
        if not img_bytes:
            return
        try:
            # Pillow may not handle EMF/WMF; skip those gracefully
            with PILImage.open(io.BytesIO(img_bytes)) as pil:
                w, h = pil.size
                # Convert to PNG/JPEG if needed for reportlab compatibility
                if pil.format not in ("PNG", "JPEG", "GIF"):
                    out = io.BytesIO()
                    if pil.mode in ("RGBA", "LA"):
                        pil.save(out, format="PNG")
                    else:
                        pil.convert("RGB").save(out, format="JPEG", quality=90)
                    img_data = out.getvalue()
                else:
                    img_data = img_bytes
        except Exception:
            return
        if w <= 0 or h <= 0:
            return
        # Never upscale (the 1.0 cap); only shrink to fit the content area.
        scale = min(max_img_w / w, max_img_h / h, 1.0)
        story.append(Spacer(1, 6))
        story.append(RLImage(io.BytesIO(img_data),
                             width=w * scale, height=h * scale))
        story.append(Spacer(1, 6))

    def _emit_paragraph(child) -> None:
        """Append one body paragraph (its inline images, then its text)."""
        para = DocxParagraph(child, doc)

        # Emit any inline images first (in their paragraph)
        for blip in child.findall(".//" + qn("a:blip")):
            rel_id = blip.get(qn("r:embed"))
            if rel_id:
                _emit_image(rel_id)

        text = para.text.strip()
        if not text:
            return  # already-emitted image, or genuinely empty

        # A style without a name is treated like "no style".
        style_name = (para.style.name or "").lower() if para.style else ""
        heading_level = next(
            (lvl for lvl in (1, 2, 3) if f"heading {lvl}" in style_name), None
        )
        if heading_level is not None:
            # reportlab parses Paragraph text as markup, so literal "&", "<"
            # and ">" in a heading must be escaped or they break the parser.
            story.append(Paragraph(escape(text), heading_styles[heading_level]))
        else:
            story.append(Paragraph(_build_rich_text(para), normal))

    def _emit_table(child) -> None:
        """Append one body table as a gridded reportlab table."""
        table = DocxTable(child, doc)
        tdata = [[cell.text for cell in row.cells] for row in table.rows]
        if not tdata:
            return
        t = Table(tdata, repeatRows=1)
        t.setStyle(TableStyle([
            ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
            ("BACKGROUND", (0, 0), (-1, 0), colors.Color(0.9, 0.9, 0.95)),
            ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
            ("FONTSIZE", (0, 0), (-1, -1), 10),
            ("TOPPADDING", (0, 0), (-1, -1), 4),
            ("BOTTOMPADDING", (0, 0), (-1, -1), 4),
            ("LEFTPADDING", (0, 0), (-1, -1), 6),
            ("RIGHTPADDING", (0, 0), (-1, -1), 6),
        ]))
        story.append(Spacer(1, 8))
        story.append(t)
        story.append(Spacer(1, 8))

    # Walk the document body in order so paragraphs and tables appear in their
    # original positions, not all paragraphs first then all tables.
    for child in doc.element.body.iterchildren():
        tag = child.tag.split("}", 1)[-1]
        if tag == "p":
            _emit_paragraph(child)
        elif tag == "tbl":
            _emit_table(child)
        # Section-properties (sectPr) and other elements are ignored

    if not story:
        story.append(Paragraph("(empty document)", normal))

    pdf.build(story)
    return buf.getvalue()
|
||
|
||
|
||
def _build_rich_text(para) -> str:
|
||
"""Convert a python-docx paragraph's runs into reportlab-compatible rich text."""
|
||
parts = []
|
||
for run in para.runs:
|
||
text = run.text.replace("&", "&").replace("<", "<").replace(">", ">")
|
||
if not text:
|
||
continue
|
||
if run.bold and run.italic:
|
||
parts.append(f"<b><i>{text}</i></b>")
|
||
elif run.bold:
|
||
parts.append(f"<b>{text}</b>")
|
||
elif run.italic:
|
||
parts.append(f"<i>{text}</i>")
|
||
elif run.underline:
|
||
parts.append(f"<u>{text}</u>")
|
||
else:
|
||
parts.append(text)
|
||
return "".join(parts) or para.text
|
||
|
||
|
||
# ── Processing Routes ────────────────────────────
|
||
|
||
def _append_text_pages(pdf_doc, text: str) -> bool:
    """Append *text* to *pdf_doc* as one or more A4 pages.

    ``insert_textbox`` writes nothing and returns a negative value when the
    text does not fit the rectangle, so the text is fed in line chunks that
    are shrunk until they fit, continuing on a fresh page afterwards.

    Returns:
        True when every line was written; False when at least one line was
        too long to fit on a page by itself and had to be left out.
    """
    rect = fitz.Rect(50, 50, 545, 792)
    lines = text.splitlines() or [""]
    placed_everything = True
    start = 0
    while start < len(lines):
        page = pdf_doc.new_page(width=595, height=842)  # A4
        # Optimistic first guess; reduced below until the chunk fits.
        count = min(len(lines) - start, 64)
        while True:
            chunk = "\n".join(lines[start:start + count])
            if page.insert_textbox(rect, chunk, fontsize=11, fontname="helv") >= 0:
                break
            if count == 1:
                # A single line that overflows a whole page cannot be placed.
                placed_everything = False
                break
            count -= max(1, count // 8)
        start += count
    return placed_everything


@bp.route("/to-pdf", methods=["POST"])
def to_pdf():
    """Merge the uploaded images, text files and Word documents into one PDF.

    Word documents go through LibreOffice; when that yields no result the
    python-docx/reportlab fallback is used only if the user opted in and the
    file is a .docx. Text files are laid out on A4 pages. Anything else is
    treated as an image and placed on a page of its own size.

    Returns:
        The PDF as an attachment with conversion metadata attached, or a JSON
        error with status 400 when no file was sent or a file cannot be
        converted.
    """
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_MULTIPLE), 400

    allow_basic_fallback = request.form.get("use_basic_fallback") == "on"
    # Track which engine ran on Word docs so the response can advertise it
    # (helps users diagnose "why is my output low-fidelity" without log access).
    engine_used = "pymupdf"
    quality = QUALITY_HIGH
    warnings: list[str] = []
    output = io.BytesIO()

    pdf_doc = fitz.open()
    # try/finally so the output document is closed on every early error
    # return as well, not only on the success path.
    try:
        for f in files:
            name = f.filename.lower()
            data = f.read()

            if name.endswith((".docx", ".doc", ".odt")):
                # Word document → PDF pages.
                # Prefer LibreOffice for high-fidelity layout. Fall back to the
                # python-docx + reportlab rebuilder if soffice is unavailable.
                ext = name.rsplit(".", 1)[-1]
                try:
                    pdf_bytes = _soffice_convert(data, ext, "pdf")
                    if pdf_bytes is not None:
                        engine_used = "libreoffice"
                        quality = QUALITY_HIGH
                    else:
                        if not allow_basic_fallback:
                            return jsonify(error=(
                                f"High-fidelity conversion for '{f.filename}' requires LibreOffice. "
                                "Tick 'Allow basic Python fallback' to continue with lower layout fidelity."
                            )), 400
                        if ext != "docx":
                            return jsonify(error=(
                                f"'{f.filename}' requires LibreOffice (soffice) on PATH. "
                                "Only .docx is supported by the built-in fallback. "
                                "Install LibreOffice for full layout fidelity."
                            )), 400
                        pdf_bytes = _docx_to_pdf(data)
                        engine_used = "python-docx/reportlab"
                        quality = QUALITY_BASIC
                        warnings.append(
                            "Word document used basic fallback; headers, custom layout, and precise positioning may differ."
                        )
                    with fitz.open(stream=pdf_bytes, filetype="pdf") as docx_pdf:
                        pdf_doc.insert_pdf(docx_pdf)
                except Exception as e:
                    log_error(e, f"to-pdf docx: {f.filename}")
                    return jsonify(error=f"Could not convert '{f.filename}' (Word file may be corrupted)."), 400
            elif name.endswith(".txt"):
                # Text file → one or more PDF pages
                text = data.decode("utf-8", errors="replace")
                if not _append_text_pages(pdf_doc, text):
                    warnings.append(
                        f"'{f.filename}': one or more very long lines did not fit on a page and were omitted."
                    )
            else:
                # Image → PDF page
                try:
                    with Image.open(io.BytesIO(data)) as pil_img:
                        pil_img = ImageOps.exif_transpose(pil_img)
                        # Normalise every mode other than RGB/L (RGBA, P, LA,
                        # CMYK, ...) so the JPEG save below cannot fail on an
                        # unsupported mode.
                        if pil_img.mode not in ("RGB", "L"):
                            pil_img = pil_img.convert("RGB")
                        buf = io.BytesIO()
                        pil_img.save(buf, format="JPEG", quality=95)
                        img_data = buf.getvalue()

                    with fitz.open(stream=img_data, filetype="jpeg") as img_doc:
                        rect = img_doc[0].rect
                        pdf_page = pdf_doc.new_page(width=rect.width, height=rect.height)
                        pdf_page.insert_image(rect, stream=img_data)
                except Exception as e:
                    log_error(e, f"to-pdf image: {f.filename}")
                    return jsonify(error=f"Could not convert '{f.filename}' (image may be corrupted or unsupported)."), 400

        pdf_doc.save(output)
    finally:
        pdf_doc.close()

    output.seek(0)
    resp = send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name="converted.pdf")
    return set_conversion_metadata(resp, engine_used, quality, warnings)
|
||
|
||
|
||
@bp.route("/pdf-to-word", methods=["POST"])
def pdf_to_word():
    """Convert the first uploaded PDF to .docx using the requested mode.

    Form fields read here: ``mode`` (``text``, ``structure``, ``exact`` or
    ``marker``; any other value runs the pdf2docx layout path), ``pages``,
    ``extract_tables`` and ``exact_dpi``. Every failure is answered with a
    JSON error body and HTTP 400.
    """
    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400
    upload = uploads[0]

    mode = request.form.get("mode", "layout")
    pdf_data = upload.read()
    pages_spec = (request.form.get("pages") or "").strip()
    extract_borderless_tables = request.form.get("extract_tables") == "on"
    base = upload.filename.rsplit(".", 1)[0]
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    def _reply(payload, engine, quality, note):
        # Shared tail of every mode: wrap the .docx bytes as a download and
        # attach the conversion metadata.
        resp = send_file(io.BytesIO(payload), mimetype=docx_mime,
                         as_attachment=True, download_name=f"{base}.docx")
        return set_conversion_metadata(resp, engine, quality, note)

    # Resolve the page selection once; every mode consumes target_pages.
    try:
        with fitz.open(stream=pdf_data, filetype="pdf") as probe:
            total_pages = len(probe)
        if not pages_spec:
            target_pages = list(range(total_pages))
        else:
            from routes.pdf_tools import parse_page_ranges
            target_pages = parse_page_ranges(pages_spec, total_pages)
            if not target_pages:
                return jsonify(error="No valid pages selected."), 400
    except (ValueError, IndexError):
        return jsonify(error="Invalid page range. Use e.g. '1-3, 5, 8-10'."), 400
    except Exception as exc:
        log_error(exc, "pdf-to-word probe")
        return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400

    # ── Mode dispatch ──────────────────────────────────────

    if mode == "text":
        try:
            payload = _pdf_to_docx_flowing_text(pdf_data, target_pages)
        except Exception as exc:
            log_error(exc, "pdf-to-word text")
            return jsonify(error="Could not extract text from the PDF (it may be a scan — try OCR PDF first)."), 400
        return _reply(payload, "pymupdf/python-docx", QUALITY_BASIC,
                      "Flowing text prioritizes clean editable text over visual layout.")

    if mode == "structure":
        try:
            payload = _pdf_to_docx_smart_structure(pdf_data, target_pages)
        except ValueError as exc:
            # The helper's ValueError text is forwarded to the client as-is.
            return jsonify(error=str(exc)), 400
        except Exception as exc:
            log_error(exc, "pdf-to-word structure")
            return jsonify(error="Smart-structure analysis failed. Try Flowing text mode instead."), 400
        return _reply(payload, "pymupdf/python-docx", "medium",
                      "Smart structure is editable but drops precise layout, figures, and tables.")

    if mode == "exact":
        dpi = safe_int(request.form.get("exact_dpi"), 180, min_val=96, max_val=300)
        try:
            payload = _pdf_to_docx_exact_visual(pdf_data, target_pages, dpi)
        except Exception as exc:
            log_error(exc, "pdf-to-word exact")
            return jsonify(error="Exact visual copy failed. The PDF may be corrupted or password-protected."), 400
        return _reply(payload, "pymupdf/python-docx", QUALITY_HIGH,
                      "Exact visual copy preserves appearance by embedding page images; text is not editable.")

    if mode == "marker":
        if not HAS_MARKER:
            return jsonify(error=(
                "Marker mode requires the 'marker-pdf' package. Run: "
                "pip install marker-pdf — first run will download ~2 GB of models."
            )), 400
        try:
            payload = _pdf_to_docx_via_marker(pdf_data, target_pages)
        except Exception as exc:
            log_error(exc, "pdf-to-word marker")
            return jsonify(error="Marker conversion failed. Check the server log; "
                                 "first run downloads ~2 GB and may need extra time."), 400
        return _reply(payload, "marker-pdf/python-docx", QUALITY_HIGH,
                      "Marker output is editable structured content, not pixel-perfect layout.")

    # ── Layout mode (default) ──────────────────────────────
    if not HAS_PDF2DOCX:
        return jsonify(error="Layout mode requires pdf2docx. Run: pip install pdf2docx — or switch to 'Flowing text' or 'Smart structure' mode."), 400

    import os
    import tempfile

    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "input.pdf")
        docx_path = os.path.join(workdir, "output.docx")

        with open(pdf_path, "wb") as handle:
            handle.write(pdf_data)

        # The 0-based page list is handed to pdf2docx only when the user
        # actually supplied a page specification.
        cv_kwargs = {"multi_processing": False}
        if pages_spec:
            cv_kwargs["pages"] = target_pages
        if extract_borderless_tables:
            cv_kwargs["parse_stream_table"] = True

        try:
            cv = Pdf2DocxConverter(pdf_path)
            try:
                cv.convert(docx_path, **cv_kwargs)
            finally:
                cv.close()
        except Exception as exc:
            log_error(exc, "pdf-to-word layout")
            return jsonify(error="Layout conversion failed. Try Smart structure or Flowing text mode instead, or check that the PDF isn't password-protected."), 400

        # Pull the result into memory before the temp directory is removed.
        with open(docx_path, "rb") as handle:
            converted = handle.read()

    return _reply(converted, "pdf2docx", "medium",
                  "Layout mode is editable but PDF-to-Word conversion is inherently lossy.")
|
||
|
||
|
||
# ── PDF → Word helpers (one per non-pdf2docx mode) ─────────
|
||
|
||
def _pdf_to_docx_flowing_text(pdf_data: bytes, target_pages: list[int]) -> bytes:
    """Extract text blocks in reading order and emit them as plain paragraphs.

    Args:
        pdf_data: Raw bytes of the source PDF.
        target_pages: 0-based page indices to convert, in output order.

    Returns:
        The bytes of a .docx whose pages are separated by page breaks. No
        headings, lists or other structure are detected.
    """
    import re

    from docx import Document as DocxDocument

    # A paragraph boundary inside a block is one or more blank lines.
    paragraph_break = re.compile(r"\n\s*\n")

    with fitz.open(stream=pdf_data, filetype="pdf") as src:
        doc = DocxDocument()
        for idx, pno in enumerate(target_pages):
            if idx > 0:
                doc.add_page_break()
            page = src[pno]
            blocks = page.get_text("blocks") or []
            # Top-to-bottom, then left-to-right.
            blocks.sort(key=lambda b: (round(b[1], 1), round(b[0], 1)))
            for b in blocks:
                # Block tuples are (x0, y0, x1, y1, text, block_no, block_type).
                # A non-zero block_type marks an image block whose "text" is
                # only an "<image: ...>" descriptor, so it is skipped.
                if len(b) > 6 and b[6] != 0:
                    continue
                text = (b[4] if len(b) > 4 else "").strip()
                if not text:
                    continue
                # Split on blank lines first; the previous code removed the
                # blank lines before splitting, so the split never matched.
                for chunk in paragraph_break.split(text):
                    para = " ".join(
                        piece.strip() for piece in chunk.split("\n") if piece.strip()
                    )
                    if para:
                        doc.add_paragraph(para)
    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()
|
||
|
||
|
||
def _pdf_to_docx_exact_visual(pdf_data: bytes, target_pages: list[int], dpi: int) -> bytes:
    """Build a .docx in which every selected PDF page is one full-width image.

    The result is deliberately not editable: it trades editable text for
    visual fidelity. The Word page size is taken from the first selected
    page, and all four margins are set to zero.
    """
    from docx import Document as DocxDocument
    from docx.shared import Inches

    points_per_inch = 72  # PDF user-space units per inch
    zoom = dpi / points_per_inch

    with fitz.open(stream=pdf_data, filetype="pdf") as src:
        doc = DocxDocument()
        section = doc.sections[0]
        for margin in ("top_margin", "bottom_margin", "left_margin", "right_margin"):
            setattr(section, margin, Inches(0))

        render_matrix = fitz.Matrix(zoom, zoom)
        for position, pno in enumerate(target_pages):
            page = src[pno]
            width_in = page.rect.width / points_per_inch
            height_in = page.rect.height / points_per_inch
            if position == 0:
                section.page_width = Inches(width_in)
                section.page_height = Inches(height_in)
            else:
                doc.add_page_break()

            rendered = page.get_pixmap(matrix=render_matrix, alpha=False)
            png_stream = io.BytesIO(rendered.tobytes("png"))
            holder = doc.add_paragraph()
            spacing = holder.paragraph_format
            spacing.space_before = 0
            spacing.space_after = 0
            holder.add_run().add_picture(png_stream, width=Inches(width_in))

    sink = io.BytesIO()
    doc.save(sink)
    return sink.getvalue()
|
||
|
||
|
||
def _pdf_to_docx_smart_structure(pdf_data: bytes, target_pages: list[int]) -> bytes:
    """Detect headings (by font size), bullet/numbered lists (by line prefix),
    and paragraphs. Emit a .docx with proper Word heading and list styles.

    Drops tables and figures (those need Layout or Marker mode).

    Args:
        pdf_data: Raw bytes of the source PDF.
        target_pages: 0-based page indices to convert, in output order.

    Returns:
        The bytes of the generated .docx.

    Raises:
        ValueError: If the selected pages contain no text spans.
    """
    import re
    from collections import Counter
    from docx import Document as DocxDocument

    BULLET_RE = re.compile(r"^[•▪●·\-\*]\s+")
    NUMBER_RE = re.compile(r"^(\d+|[a-zA-Z])[\.\)]\s+")

    with fitz.open(stream=pdf_data, filetype="pdf") as src:
        # Pass 1: extract the text blocks of every page once, and collect font
        # sizes to determine the body baseline. The blocks are kept so pass 2
        # does not have to parse each page a second time.
        page_blocks: list[list[dict]] = []
        sizes: list[float] = []
        for pno in target_pages:
            text_blocks = [b for b in src[pno].get_text("dict")["blocks"] if "lines" in b]
            page_blocks.append(text_blocks)
            for block in text_blocks:
                for line in block["lines"]:
                    for span in line["spans"]:
                        sizes.append(round(span["size"], 1))

        if not sizes:
            raise ValueError("No text found in the selected pages. If the PDF is a scan, run OCR PDF first.")

        # The most frequent span size is treated as body text.
        body_size = Counter(sizes).most_common(1)[0][0]

        # Pass 2: build the document.
        doc = DocxDocument()
        for idx, blocks in enumerate(page_blocks):
            if idx > 0:
                doc.add_page_break()
            # Reading order: top-to-bottom, then left-to-right within tolerance
            blocks.sort(key=lambda b: (round(b["bbox"][1] / 5) * 5, round(b["bbox"][0])))

            for block in blocks:
                lines = []
                spans_meta: list[tuple[float, bool]] = []
                for line in block["lines"]:
                    line_text = "".join(s["text"] for s in line["spans"])
                    if line_text.strip():
                        lines.append(line_text)
                    for s in line["spans"]:
                        # PyMuPDF flag bit 4 (0x10) = bold
                        spans_meta.append((s["size"], bool(s["flags"] & 16)))

                if not lines:
                    continue

                avg_size = sum(s for s, _ in spans_meta) / len(spans_meta)
                bold_ratio = sum(1 for _, b in spans_meta if b) / len(spans_meta)
                full_text = " ".join(line.strip() for line in lines).strip()

                # Heading detection by relative font size
                if avg_size >= body_size * 1.6:
                    doc.add_heading(full_text, level=1)
                elif avg_size >= body_size * 1.3:
                    doc.add_heading(full_text, level=2)
                elif avg_size >= body_size * 1.15 or (
                    avg_size >= body_size * 1.05 and bold_ratio > 0.6 and len(full_text) < 120
                ):
                    doc.add_heading(full_text, level=3)
                # List detection by line prefix
                elif BULLET_RE.match(full_text):
                    doc.add_paragraph(BULLET_RE.sub("", full_text), style="List Bullet")
                elif NUMBER_RE.match(full_text):
                    doc.add_paragraph(NUMBER_RE.sub("", full_text), style="List Number")
                else:
                    doc.add_paragraph(full_text)

    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()
|
||
|
||
|
||
def _pdf_to_docx_via_marker(pdf_data: bytes, target_pages: list[int]) -> bytes:
    """Use Marker (ML) to extract structured Markdown, then convert to .docx.

    Args:
        pdf_data: Raw bytes of the source PDF.
        target_pages: 0-based page indices to convert, in output order.

    Returns:
        The bytes of the generated .docx.
    """
    import os
    import tempfile

    # Imported before the slow Marker run so a missing package fails at once.
    import markdown as md_lib
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import text_from_rendered

    # If the selection is anything other than "every page in document order",
    # build a subset PDF first so Marker only processes what's needed (it's
    # slow per page). Comparing the actual indices, not just the count, also
    # covers a reordered or repeated selection of the same length.
    if list(target_pages) != list(range(_count_pages(pdf_data))):
        pdf_data = _extract_pages(pdf_data, target_pages)

    with tempfile.TemporaryDirectory() as tmp:
        pdf_path = os.path.join(tmp, "input.pdf")
        with open(pdf_path, "wb") as f:
            f.write(pdf_data)

        converter = PdfConverter(artifact_dict=create_model_dict())
        rendered = converter(pdf_path)
        markdown_text, _, _ = text_from_rendered(rendered)

    # Convert the markdown to docx via the HTML→docx walker.
    html = md_lib.markdown(
        markdown_text,
        extensions=["extra", "sane_lists", "nl2br", "fenced_code", "tables"],
    )
    return _md_html_to_docx_bytes(html)
|
||
|
||
|
||
def _count_pages(pdf_data: bytes) -> int:
    """Open the PDF held in ``pdf_data`` and return how many pages it has."""
    with fitz.open(stream=pdf_data, filetype="pdf") as document:
        page_total = len(document)
    return page_total
|
||
|
||
|
||
def _extract_pages(pdf_data: bytes, page_indices: list[int]) -> bytes:
    """Copy the listed 0-based pages, in the order given, into a new PDF.

    Returns the bytes of the new PDF.
    """
    with fitz.open(stream=pdf_data, filetype="pdf") as source, fitz.open() as subset:
        for page_no in page_indices:
            # One page per call, so the caller's ordering is kept.
            subset.insert_pdf(source, from_page=page_no, to_page=page_no)
        sink = io.BytesIO()
        subset.save(sink)
    return sink.getvalue()
|
||
|
||
|
||
def _md_html_to_docx_bytes(html: str) -> bytes:
    """Convert Markdown-generated HTML into a .docx and return its bytes.

    This is a small standalone HTML walker used by the Marker mode. It
    handles headings (h5/h6 are clamped to level 4), paragraphs, bullet and
    numbered lists, ``<pre>`` blocks, ``<hr>``, ``<br>`` and inline
    bold/italic/code. Table markup is not interpreted; cell text ends up as
    plain paragraph text.
    """
    import re
    from html.parser import HTMLParser as _HP

    from docx import Document as DocxDocument
    from docx.shared import Pt

    doc = DocxDocument()
    heading_tags = ("h1", "h2", "h3", "h4", "h5", "h6")
    inline_kinds = {
        "strong": "bold", "b": "bold",
        "em": "italic", "i": "italic",
        "code": "code",
    }

    class _P(_HP):
        def __init__(self):
            super().__init__()
            self.cur_para = None
            self.list_stack = []
            self.in_pre = False
            # How many tags of each inline kind are currently open.
            self.inline_depth = {"bold": 0, "italic": 0, "code": 0}
            # True between <li> and the first <p>/</li>: that first <p> must
            # reuse the bullet paragraph instead of starting a new one.
            self.item_fresh = False
            # True right after a <br>, so the newline that follows it in the
            # HTML source does not become a leading space.
            self.after_break = False

        def handle_starttag(self, tag, attrs):
            kind = inline_kinds.get(tag)
            if kind is not None:
                self.inline_depth[kind] += 1
            elif tag in heading_tags:
                self.cur_para = doc.add_heading("", level=min(int(tag[1]), 4))
            elif tag == "p":
                if not (self.item_fresh and self.cur_para is not None):
                    self.cur_para = doc.add_paragraph()
                self.item_fresh = False
            elif tag == "li":
                style = "List Number" if (self.list_stack and self.list_stack[-1] == "ol") else "List Bullet"
                self.cur_para = doc.add_paragraph(style=style)
                self.item_fresh = True
            elif tag in ("ul", "ol"):
                self.list_stack.append(tag)
            elif tag == "pre":
                self.in_pre = True
                self.cur_para = doc.add_paragraph(style="Intense Quote")
            elif tag == "hr":
                doc.add_paragraph("─" * 40)
            elif tag == "br":
                if self.cur_para is not None:
                    self.cur_para.add_run().add_break()
                    self.after_break = True

        def handle_endtag(self, tag):
            kind = inline_kinds.get(tag)
            if kind is not None:
                if self.inline_depth[kind]:
                    self.inline_depth[kind] -= 1
                return
            if tag in ("ul", "ol") and self.list_stack:
                self.list_stack.pop()
            if tag == "pre":
                self.in_pre = False
                self.cur_para = None
            if tag in heading_tags or tag in ("p", "li"):
                self.cur_para = None
            if tag == "li":
                self.item_fresh = False

        def handle_data(self, data):
            if self.in_pre:
                text = data  # keep code whitespace exactly as written
            else:
                # Collapse whitespace like HTML does.
                text = re.sub(r"\s+", " ", data)
                if self.after_break:
                    text = text.lstrip()
            self.after_break = False
            if not text:
                return
            if self.cur_para is None:
                if not text.strip():
                    return
                self.cur_para = doc.add_paragraph()
            elif not self.in_pre and not text.strip() and not self.cur_para.runs:
                # Source indentation right after a block tag carries no content.
                return
            run = self.cur_para.add_run(text)
            if self.inline_depth["bold"]:
                run.bold = True
            if self.inline_depth["italic"]:
                run.italic = True
            if self.inline_depth["code"] or self.in_pre:
                run.font.name = "Consolas"
                run.font.size = Pt(10)

    parser = _P()
    parser.feed(html)

    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()
|
||
|
||
|
||
@bp.route("/pdf-to-images", methods=["POST"])
def pdf_to_images():
    """Render every page of the first uploaded PDF to PNG or JPEG.

    Form fields: ``format`` (``jpg``/``jpeg`` select JPEG, anything else PNG)
    and ``dpi``. A single-page result is returned as one image; otherwise the
    images are returned in a zip. Errors are JSON with HTTP 400.
    """
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    # Decide the encoder once, so the bytes, the file extension and the MIME
    # type always agree. Previously an unrecognised value such as "jpeg"
    # produced PNG data labelled image/jpeg.
    requested = (request.form.get("format") or "png").strip().lower()
    as_jpeg = requested in ("jpg", "jpeg")
    encoder = "jpeg" if as_jpeg else "png"
    ext = "jpg" if as_jpeg else "png"
    mime = "image/jpeg" if as_jpeg else "image/png"

    dpi = safe_int(request.form.get("dpi"), 200, min_val=72, max_val=600)

    pdf_data = files[0].read()
    try:
        doc = fitz.open(stream=pdf_data, filetype="pdf")
    except Exception as e:
        log_error(e, "pdf-to-images open")
        return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400

    from utils.file_utils import make_zip
    images = []
    mat = fitz.Matrix(dpi / 72, dpi / 72)  # 72 PDF units per inch

    try:
        for i, page in enumerate(doc):
            pix = page.get_pixmap(matrix=mat)
            images.append((f"page_{i + 1}.{ext}", pix.tobytes(encoder)))
    finally:
        doc.close()

    if len(images) == 1:
        return send_file(io.BytesIO(images[0][1]), mimetype=mime,
                         as_attachment=True, download_name=images[0][0])

    zip_buf = make_zip(images)
    name = files[0].filename.rsplit(".", 1)[0] + "_images.zip"
    return send_file(zip_buf, mimetype="application/zip",
                     as_attachment=True, download_name=name)
|
||
|
||
|
||
@bp.route("/pdf-to-text", methods=["POST"])
def pdf_to_text():
    """Return the text of the first uploaded PDF as JSON.

    Each page's text is preceded by a ``--- Page N ---`` marker line.
    """
    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    pdf_data = uploads[0].read()
    try:
        document = fitz.open(stream=pdf_data, filetype="pdf")
    except Exception as exc:
        log_error(exc, "pdf-to-text open")
        return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400

    chunks = []
    # The context manager closes the document even if extraction raises.
    with document:
        for number, page in enumerate(document, start=1):
            chunks.extend((f"--- Page {number} ---", page.get_text()))

    return jsonify(text="\n".join(chunks))
|
||
|
||
|
||
@bp.route("/pdf-to-excel", methods=["POST"])
def pdf_to_excel():
    """Extract tables and/or text rows from the first uploaded PDF into .xlsx.

    Form fields: ``mode`` (``tables``, ``tables_text`` or ``text``),
    ``organize`` (``combined``, ``per_page``, otherwise one sheet per table),
    ``strategy`` (``auto``/``lines``/``text``), ``table_engine``
    (``auto``/``pymupdf``/``pdfplumber``) and ``pages``. Errors are JSON with
    HTTP 400.
    """
    import re
    from openpyxl import Workbook
    from openpyxl.styles import Font
    from openpyxl.utils import get_column_letter
    from routes.pdf_tools import parse_page_ranges

    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    mode = request.form.get("mode", "tables")
    organize = request.form.get("organize", "per_table")
    strategy = request.form.get("strategy", "auto")
    if strategy not in ("auto", "lines", "text"):
        strategy = "auto"
    table_engine = request.form.get("table_engine", "auto")
    if table_engine not in ("auto", "pymupdf", "pdfplumber"):
        table_engine = "auto"
    if table_engine == "pdfplumber" and not HAS_PDFPLUMBER:
        return jsonify(error="pdfplumber is not installed. Install it or choose Auto/PyMuPDF."), 400
    pages_spec = request.form.get("pages", "").strip()
    pdf_data = files[0].read()

    try:
        doc = fitz.open(stream=pdf_data, filetype="pdf")
    except Exception as e:
        log_error(e, "pdf-to-excel open")
        return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400

    # Assigned inside the try block below; the nested helpers read it lazily.
    plumber_doc = None

    wb = Workbook()
    wb.remove(wb.active)
    used_names: set[str] = set()
    total_tables = 0
    warnings: list[str] = []
    table_engines_used: set[str] = set()
    fallback_warning = "pdfplumber found no table on at least one page; PyMuPDF fallback was used."

    def _safe_name(base: str) -> str:
        """Return a worksheet title not used yet: forbidden chars replaced, max 31 chars."""
        name = re.sub(r"[\[\]\*\?\/\\:]", "_", base)[:31] or "Sheet"
        candidate = name
        i = 2
        while candidate in used_names:
            suffix = f"_{i}"
            candidate = (name[: 31 - len(suffix)] + suffix)
            i += 1
        used_names.add(candidate)
        return candidate

    def _write_rows(ws, rows: list[list], start_row: int = 1, header: bool = True) -> int:
        """Write rows as strings from start_row down; return the next free row."""
        for r_idx, row in enumerate(rows, start=start_row):
            for c_idx, cell in enumerate(row, start=1):
                ws.cell(row=r_idx, column=c_idx, value="" if cell is None else str(cell))
            if header and r_idx == start_row:
                for c_idx in range(1, len(row) + 1):
                    ws.cell(row=r_idx, column=c_idx).font = Font(bold=True)
        return start_row + len(rows)

    def _text_rows(page) -> list[list[str]]:
        """Split each non-empty text line into cells on tabs or runs of 2+ whitespace."""
        lines = page.get_text().splitlines()
        rows = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            parts = re.split(r"\s{2,}|\t+", line)
            rows.append(parts if parts else [line])
        return rows

    def _find_tables_robust(page) -> list:
        """Detect tables on a page according to the user's chosen strategy.

        PyMuPDF's default `find_tables()` only catches ruled (visible-border)
        tables. Many real-world PDFs use borderless tables where columns are
        aligned by whitespace — those need `strategy="text"`. The "auto" mode
        tries lines first and only falls back to text-based when nothing is
        found, which avoids the false-positive risk of text-detection picking
        up multi-column body text as a "table".
        """
        try:
            if strategy == "lines":
                return list(page.find_tables(strategy="lines"))
            if strategy == "text":
                return list(page.find_tables(
                    strategy="text",
                    vertical_strategy="text",
                    horizontal_strategy="text",
                ))
            # auto: lines, then text fallback
            tables = list(page.find_tables(strategy="lines"))
            if tables:
                return tables
            return list(page.find_tables(
                strategy="text",
                vertical_strategy="text",
                horizontal_strategy="text",
            ))
        except Exception as e:
            log_error(e, f"find_tables strategy={strategy}")
            return []

    def _clean_table_rows(rows) -> list[list[str]]:
        """Stringify cells (None becomes "") and drop rows that are entirely blank."""
        cleaned = []
        for row in rows or []:
            normalized = ["" if cell is None else str(cell) for cell in row]
            if any(cell.strip() for cell in normalized):
                cleaned.append(normalized)
        return cleaned

    def _pymupdf_table_rows(page) -> list[list[list[str]]]:
        """Return the cleaned rows of every non-empty table PyMuPDF finds on the page."""
        rows_list = []
        for table in _find_tables_robust(page):
            try:
                rows = _clean_table_rows(table.extract())
            except Exception as e:
                log_error(e, "pdf-to-excel table extract")
                continue
            if rows:
                rows_list.append(rows)
        if rows_list:
            table_engines_used.add("pymupdf")
        return rows_list

    def _table_rows_for_page(page, pno: int) -> list[list[list[str]]]:
        """Try pdfplumber first when it is open, then fall back to PyMuPDF."""
        if plumber_doc is not None:
            try:
                rows_list = [
                    rows for rows in
                    (_clean_table_rows(rows) for rows in (plumber_doc.pages[pno].extract_tables() or []))
                    if rows
                ]
            except Exception as e:
                log_error(e, f"pdfplumber extract page {pno + 1}")
                rows_list = []
            if rows_list:
                table_engines_used.add("pdfplumber")
                return rows_list
            if table_engine == "pdfplumber":
                return []

        rows_list = _pymupdf_table_rows(page)
        if (
            rows_list
            and plumber_doc is not None
            and table_engine == "auto"
            and fallback_warning not in warnings
        ):
            # The message already says "at least one page", so report it once
            # rather than once per affected page.
            warnings.append(fallback_warning)
        return rows_list

    # Everything that touches the open documents runs under one try/finally,
    # so they are closed on every exit path, including unexpected exceptions.
    try:
        if table_engine in ("auto", "pdfplumber") and HAS_PDFPLUMBER:
            try:
                plumber_doc = pdfplumber.open(io.BytesIO(pdf_data))
            except Exception as e:
                log_error(e, "pdf-to-excel pdfplumber open")
                if table_engine == "pdfplumber":
                    return jsonify(error="pdfplumber could not open this PDF. Try Auto or PyMuPDF."), 400

        try:
            target_pages = parse_page_ranges(pages_spec, len(doc))
        except (ValueError, IndexError):
            return jsonify(error="Invalid page range. Use e.g. '1-3, 5, 8-10'."), 400
        if not target_pages:
            return jsonify(error="No valid pages selected."), 400

        # ── "combined" — stream everything into a single sheet ────────────
        if organize == "combined":
            ws = wb.create_sheet(_safe_name("Extracted"))
            next_row = 1
            for pno in target_pages:
                page = doc[pno]
                page_had_content = False

                if mode in ("tables", "tables_text"):
                    tables = _table_rows_for_page(page, pno)
                    for rows in tables:
                        if not rows:
                            continue
                        ws.cell(row=next_row, column=1, value=f"Page {pno + 1} – table").font = Font(bold=True, italic=True)
                        next_row += 1
                        next_row = _write_rows(ws, rows, start_row=next_row)
                        next_row += 1
                        total_tables += 1
                        page_had_content = True

                if mode == "text" or (mode == "tables_text" and not page_had_content):
                    text_rows = _text_rows(page)
                    if text_rows:
                        table_engines_used.add("pymupdf")
                        ws.cell(row=next_row, column=1, value=f"Page {pno + 1} – text").font = Font(bold=True, italic=True)
                        next_row += 1
                        next_row = _write_rows(ws, text_rows, start_row=next_row, header=False)
                        next_row += 1

        # ── "per_page" and "per_table" ────────────────────────────────────
        else:
            for pno in target_pages:
                page = doc[pno]
                tables_rows = []  # list of (label, rows)

                if mode in ("tables", "tables_text"):
                    for tidx, rows in enumerate(_table_rows_for_page(page, pno), start=1):
                        if rows:
                            tables_rows.append((f"Table {tidx}", rows))
                            total_tables += 1

                if mode == "text" or (mode == "tables_text" and not tables_rows):
                    text_rows = _text_rows(page)
                    if text_rows:
                        table_engines_used.add("pymupdf")
                        tables_rows.append(("Text", text_rows))

                if not tables_rows:
                    continue

                if organize == "per_table":
                    for label, rows in tables_rows:
                        is_text = label == "Text"
                        sheet = wb.create_sheet(_safe_name(f"Page{pno + 1}_{label.replace(' ', '')}"))
                        _write_rows(sheet, rows, header=not is_text)
                else:  # per_page
                    sheet = wb.create_sheet(_safe_name(f"Page_{pno + 1}"))
                    next_row = 1
                    for label, rows in tables_rows:
                        is_text = label == "Text"
                        sheet.cell(row=next_row, column=1, value=label).font = Font(bold=True, italic=True)
                        next_row += 1
                        next_row = _write_rows(sheet, rows, start_row=next_row, header=not is_text)
                        next_row += 1
    finally:
        doc.close()
        if plumber_doc:
            plumber_doc.close()

    if not wb.sheetnames:
        msg = "No tables found on the selected pages."
        if strategy == "lines":
            msg += " Try the 'Text alignment' or 'Auto' strategy — your PDF may use borderless tables."
        elif mode == "tables":
            msg += " Try the 'Tables, fall back to text rows' mode, or use PDF to Word in Layout mode."
        else:
            msg += " If this is a scanned PDF, run it through OCR PDF first; otherwise try PDF to Word in Layout mode."
        return jsonify(error=msg), 400

    # Auto-size columns on every sheet (cap at 60 chars to avoid absurd widths)
    for ws in wb.worksheets:
        for col_idx in range(1, ws.max_column + 1):
            max_len = 0
            for row_idx in range(1, ws.max_row + 1):
                v = ws.cell(row=row_idx, column=col_idx).value
                if v is not None:
                    max_len = max(max_len, len(str(v)))
            ws.column_dimensions[get_column_letter(col_idx)].width = min(max_len + 2, 60)

    output = io.BytesIO()
    wb.save(output)
    output.seek(0)

    name = files[0].filename.rsplit(".", 1)[0] + ".xlsx"
    engine = "+".join(sorted(table_engines_used)) or "pymupdf"
    quality = "medium" if total_tables else QUALITY_BASIC
    resp = send_file(
        output,
        mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        as_attachment=True,
        download_name=name,
    )
    return set_conversion_metadata(resp, engine, quality, warnings)
|
||
|
||
|
||
@bp.route("/md-to-pdf", methods=["POST"])
def md_to_pdf():
    """Render posted Markdown to a paginated PDF download.

    Form fields: ``markdown`` (required), ``page_size`` (``a4``, ``letter``,
    ``legal`` or ``a5``; unknown values fall back to A4), ``font_size`` and
    ``file_name``. An empty Markdown body is answered with HTTP 400.
    """
    import markdown as md_lib

    md_text = request.form.get("markdown", "").strip()
    if not md_text:
        return jsonify(error="Please enter some Markdown."), 400

    page_size = request.form.get("page_size", "A4").lower()
    font_size = safe_int(request.form.get("font_size"), 11, min_val=8, max_val=18)

    # Page dimensions in PDF points.
    sizes_map = {
        "a4": (595.28, 841.89),
        "letter": (612, 792),
        "legal": (612, 1008),
        "a5": (419.53, 595.28),
    }
    page_w, page_h = sizes_map.get(page_size, sizes_map["a4"])

    html = md_lib.markdown(
        md_text,
        extensions=["extra", "sane_lists", "nl2br", "fenced_code", "tables"],
    )

    margin = 54  # 0.75 inch
    content_rect = fitz.Rect(margin, margin, page_w - margin, page_h - margin)

    css = (
        f"* {{ font-family: sans-serif; font-size: {font_size}pt; line-height: 1.45; }}"
        "h1 { font-size: 1.8em; margin: 0.4em 0 0.3em; }"
        "h2 { font-size: 1.5em; margin: 0.4em 0 0.3em; }"
        "h3 { font-size: 1.2em; margin: 0.4em 0 0.3em; }"
        "p { margin: 0.35em 0; }"
        "ul, ol { margin: 0.3em 0 0.3em 1.2em; }"
        "li { margin: 0.15em 0; }"
        "code { font-family: monospace; background: #f2f2f2; padding: 1px 3px; }"
        "pre { font-family: monospace; background: #f5f5f5; padding: 0.5em; white-space: pre-wrap; }"
        "blockquote { margin: 0.5em 0; padding-left: 0.8em; border-left: 3px solid #bbb; color: #555; }"
        "table { border-collapse: collapse; margin: 0.4em 0; }"
        "th, td { border: 1px solid #999; padding: 0.2em 0.5em; }"
        "hr { border: none; border-top: 1px solid #ccc; margin: 0.6em 0; }"
    )

    # Use PyMuPDF's Story + DocumentWriter for reliable multi-page HTML rendering
    output = io.BytesIO()
    mediabox = fitz.Rect(0, 0, page_w, page_h)
    writer = fitz.DocumentWriter(output)
    story = fitz.Story(html=html, user_css=css)
    more = 1
    safety = 0
    # Place content page by page until the story reports nothing is left.
    # The counter caps the output at 500 pages as a runaway guard.
    while more and safety < 500:
        dev = writer.begin_page(mediabox)
        more, _ = story.place(content_rect)
        story.draw(dev)
        writer.end_page()
        safety += 1
    writer.close()
    output.seek(0)

    # Strip before applying the fallback, so a whitespace-only file_name
    # yields "document.pdf" instead of a bare ".pdf".
    stem = (request.form.get("file_name") or "").strip() or "document"
    name = stem + ".pdf"
    return send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=name)
|
||
|
||
|
||
@bp.route("/md-to-docx", methods=["POST"])
def md_to_docx():
    """Markdown → .docx by walking an HTML tree built from Markdown.

    Form fields: ``markdown`` (required) and ``file_name``. An empty Markdown
    body is answered with HTTP 400.
    """
    import markdown as md_lib
    import re
    from html.parser import HTMLParser
    from docx.shared import Pt, RGBColor

    md_text = request.form.get("markdown", "").strip()
    if not md_text:
        return jsonify(error="Please enter some Markdown."), 400

    html = md_lib.markdown(
        md_text,
        extensions=["extra", "sane_lists", "fenced_code", "tables"],
    )

    docx = DocxDocument()

    class MdHTMLParser(HTMLParser):
        def __init__(self):
            super().__init__()
            self.current_para = None
            self.list_stack: list[str] = []  # "ul" or "ol"
            self.in_pre = False
            self.quote_depth = 0  # number of <blockquote> elements currently open
            self.pending_href: str | None = None
            self._run_formats: list[dict] = []

        def _new_paragraph(self, style=None):
            self.current_para = docx.add_paragraph(style=style) if style else docx.add_paragraph()
            return self.current_para

        def _new_quote_paragraph(self):
            # Fall back to a plain paragraph when the template lacks the style.
            try:
                return self._new_paragraph(style="Intense Quote")
            except KeyError:
                return self._new_paragraph()

        def _add_run(self, text):
            if not text:
                return
            if self.current_para is None:
                # Whitespace between block tags must not open a paragraph.
                if not text.strip():
                    return
                if self.quote_depth:
                    self._new_quote_paragraph()
                else:
                    self._new_paragraph()
            run = self.current_para.add_run(text)
            fmt = {}
            for f in self._run_formats:
                fmt.update(f)
            if fmt.get("bold"):
                run.bold = True
            if fmt.get("italic"):
                run.italic = True
            if fmt.get("code") or self.in_pre:
                run.font.name = "Consolas"
                run.font.size = Pt(10)
            if self.pending_href:
                run.font.color.rgb = RGBColor(0x1A, 0x0D, 0xAB)
                run.underline = True

        def handle_starttag(self, tag, attrs):
            if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
                level = min(int(tag[1]), 4)
                self._new_paragraph(style=f"Heading {level}")
            elif tag == "p":
                if self.list_stack or self.in_pre:
                    pass  # text continues in the list-item / pre paragraph
                elif self.quote_depth:
                    # Markdown wraps quote text in <p>; style that paragraph
                    # rather than leaving an empty quote paragraph behind.
                    self._new_quote_paragraph()
                else:
                    self._new_paragraph()
            elif tag in ("ul", "ol"):
                self.list_stack.append(tag)
            elif tag == "li":
                style = "List Number" if self.list_stack and self.list_stack[-1] == "ol" else "List Bullet"
                try:
                    self._new_paragraph(style=style)
                except KeyError:
                    self._new_paragraph()
            elif tag in ("strong", "b"):
                self._run_formats.append({"bold": True})
            elif tag in ("em", "i"):
                self._run_formats.append({"italic": True})
            elif tag == "code":
                self._run_formats.append({"code": True})
            elif tag == "pre":
                self.in_pre = True
                self._new_paragraph()
            elif tag == "blockquote":
                self.quote_depth += 1
                # The quoted paragraphs are created by the <p> tags inside.
                self.current_para = None
            elif tag == "a":
                href = dict(attrs).get("href", "")
                self.pending_href = href
            elif tag == "hr":
                docx.add_paragraph("─" * 40)
            elif tag == "br":
                if self.current_para is not None:
                    self.current_para.add_run().add_break()

        def handle_endtag(self, tag):
            if tag in ("ul", "ol") and self.list_stack:
                self.list_stack.pop()
            elif tag in ("strong", "b", "em", "i", "code"):
                if self._run_formats:
                    self._run_formats.pop()
            elif tag == "pre":
                self.in_pre = False
            elif tag == "blockquote":
                if self.quote_depth:
                    self.quote_depth -= 1
                # Later content must not be appended to the quote paragraph.
                self.current_para = None
            elif tag == "a":
                if self.pending_href:
                    # Append the target in parentheses, still link-styled.
                    self._add_run(f" ({self.pending_href})")
                self.pending_href = None

        def handle_data(self, data):
            if not data:
                return
            if self.in_pre:
                for line in data.splitlines():
                    if self.current_para is None:
                        self._new_paragraph()
                    r = self.current_para.add_run(line)
                    r.font.name = "Consolas"
                    r.font.size = Pt(10)
                    self.current_para.add_run().add_break()
            else:
                # Collapse whitespace like HTML does
                text = re.sub(r"\s+", " ", data)
                self._add_run(text)

    parser = MdHTMLParser()
    parser.feed(html)

    output = io.BytesIO()
    docx.save(output)
    output.seek(0)

    # Strip before applying the fallback, so a whitespace-only file_name
    # yields "document.docx" instead of a bare ".docx".
    stem = (request.form.get("file_name") or "").strip() or "document"
    name = stem + ".docx"
    return send_file(
        output,
        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        as_attachment=True, download_name=name,
    )
|
||
|
||
|
||
@bp.route("/html-to-pdf", methods=["POST"])
def html_to_pdf():
    """Convert the HTML posted in the ``text`` form field to a PDF download.

    LibreOffice is tried first. If it produces nothing, the request must have
    ``use_basic_fallback`` set to ``"on"`` for the PyMuPDF ``insert_htmlbox``
    fallback to run; otherwise a 400 JSON error is returned. The response is
    tagged with the engine, quality level and any warnings.
    """
    markup = request.form.get("text", "").strip()
    if not markup:
        return jsonify(error="Please enter some HTML content."), 400
    fallback_allowed = request.form.get("use_basic_fallback") == "on"

    # Fragments are wrapped in a minimal document skeleton.
    if "<html" not in markup.lower():
        markup = f"<!DOCTYPE html><html><body>{markup}</body></html>"

    # First choice: LibreOffice, for proper CSS rendering.
    engine, quality = "libreoffice", QUALITY_HIGH
    warnings: list[str] = []
    pdf_bytes = _soffice_convert(markup.encode("utf-8"), "html", "pdf")

    if pdf_bytes is None:
        if not fallback_allowed:
            return jsonify(error=(
                "High-fidelity HTML to PDF requires LibreOffice. Tick "
                "'Allow basic PyMuPDF fallback' to continue with limited CSS/layout support."
            )), 400

        engine, quality = "pymupdf", QUALITY_BASIC
        warnings.append("Basic HTML fallback supports only simple markup and may not preserve CSS/layout.")

        # Second choice: PyMuPDF's minimal HTML renderer on one A4 page.
        fallback_doc = fitz.open()
        try:
            a4_page = fallback_doc.new_page(width=595, height=842)
            try:
                a4_page.insert_htmlbox(fitz.Rect(50, 50, 545, 792), markup)
            except Exception as e:
                log_error(e, "html-to-pdf insert_htmlbox")
                return jsonify(error="HTML rendering failed (the markup may be invalid or use unsupported features)."), 400

            rendered = io.BytesIO()
            fallback_doc.save(rendered)
            pdf_bytes = rendered.getvalue()
        finally:
            fallback_doc.close()

    resp = send_file(
        io.BytesIO(pdf_bytes),
        mimetype="application/pdf",
        as_attachment=True,
        download_name="converted.pdf",
    )
    return set_conversion_metadata(resp, engine, quality, warnings)
|
||
|
||
|
||
@bp.route("/ocr-pdf", methods=["POST"])
def ocr_pdf():
    """OCR an uploaded PDF with Tesseract.

    Form fields:
        mode: ``"text"`` returns the recognised text as JSON; any other value
            (default ``"searchable"``) returns a PDF built from Tesseract's
            per-page PDF output.
        lang: Tesseract language code(s), default ``"eng"``.
        dpi: Render resolution, clamped to 72-400 (default 200).

    Returns a JSON error with status 400 when pytesseract/Tesseract is
    missing, no file was uploaded, the PDF cannot be opened, or OCR fails.
    """
    if not HAS_TESSERACT:
        return jsonify(error="OCR requires 'pytesseract' and the Tesseract binary. Install: pip install pytesseract, plus Tesseract from https://github.com/tesseract-ocr/tesseract"), 400

    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    mode = request.form.get("mode", "searchable")
    lang = request.form.get("lang", "eng")
    dpi = safe_int(request.form.get("dpi"), 200, min_val=72, max_val=400)

    pdf_data = files[0].read()
    try:
        src = fitz.open(stream=pdf_data, filetype="pdf")
    except Exception as e:
        log_error(e, "ocr-pdf open")
        return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400

    # PDF user space is 72 units per inch, so dpi / 72 is the render scale.
    # The matrix is the same for every page, so build it once.
    zoom = dpi / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    # Tell Tesseract the resolution the pages were actually rendered at.
    # Without it Tesseract has to guess the image resolution, and the page
    # size of its PDF output is derived from that resolution, so the
    # searchable PDF could come out with different page dimensions than the
    # source. NOTE(review): `--dpi` needs Tesseract 4.0 or newer.
    tess_config = f"--dpi {dpi}"

    try:
        if mode == "text":
            text_parts = []
            for i, page in enumerate(src):
                pix = page.get_pixmap(matrix=render_matrix)
                with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                    text = pytesseract.image_to_string(img, lang=lang, config=tess_config)
                text_parts.append(f"--- Page {i + 1} ---\n{text.strip()}")
            combined = "\n\n".join(text_parts).strip()
            return jsonify(text=combined or "(No text detected)")

        output = fitz.open()
        try:
            for page in src:
                pix = page.get_pixmap(matrix=render_matrix)
                with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                    page_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
                        img, extension="pdf", lang=lang, config=tess_config)
                # Each OCR result is a one-page PDF; append it to the output.
                with fitz.open(stream=page_pdf_bytes, filetype="pdf") as sub:
                    output.insert_pdf(sub)

            buf = io.BytesIO()
            output.save(buf)
            buf.seek(0)
        finally:
            output.close()

        name = files[0].filename.rsplit(".", 1)[0] + "_ocr.pdf"
        return send_file(buf, mimetype="application/pdf",
                         as_attachment=True, download_name=name)
    except pytesseract.TesseractNotFoundError:
        return jsonify(error="Tesseract binary not found. Install from https://github.com/tesseract-ocr/tesseract and ensure it is on PATH."), 400
    except Exception as e:
        msg = str(e)
        log_error(e, f"ocr-pdf lang={lang}")
        # Tesseract reports a missing language pack in its error text.
        if "language" in msg.lower() or "traineddata" in msg.lower():
            return jsonify(error=f"Language pack '{lang}' not installed. Download its .traineddata file into your Tesseract tessdata directory."), 400
        return jsonify(error="OCR failed (the PDF may be image-only or unreadable)."), 400
    finally:
        src.close()
|
||
|
||
|
||
@bp.route("/cad-to-pdf", methods=["POST"])
def cad_to_pdf():
    """Render an uploaded DXF or DWG drawing to PDF or PNG.

    DWG uploads are first converted to DXF with the ODA File Converter
    executable; the DXF is then parsed with ezdxf and its modelspace drawn
    through the matplotlib backend.

    Form fields:
        format: ``"pdf"`` (default) for PDF output; any other value yields PNG.
        dpi: PNG resolution, clamped to 72-600 (default 150).

    Returns a JSON error with status 400 for missing dependencies, a missing
    or unsupported upload, and conversion, parse or render failures.
    """
    if not HAS_EZDXF:
        return jsonify(error="CAD conversion requires 'ezdxf' and 'matplotlib'. Install: pip install ezdxf matplotlib"), 400

    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    target = request.form.get("format", "pdf")
    dpi = safe_int(request.form.get("dpi"), 150, min_val=72, max_val=600)
    ezdxf, RenderContext, Frontend, MatplotlibBackend, plt = _load_cad_modules()

    filename = files[0].filename
    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
    file_data = files[0].read()
    engine_used = "ezdxf/matplotlib"

    import os
    import subprocess
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        if ext == "dwg":
            if not ODA_CONVERTER:
                return jsonify(error="DWG support requires ODA File Converter. Download it free from https://www.opendesign.com/guestfiles/oda_file_converter and ensure it is on your PATH. Or convert your DWG to DXF first."), 400
            engine_used = "oda/ezdxf/matplotlib"

            # The converter works on directories, so stage the upload in one.
            in_dir = os.path.join(tmpdir, "in")
            out_dir = os.path.join(tmpdir, "out")
            os.makedirs(in_dir)
            os.makedirs(out_dir)
            dwg_path = os.path.join(in_dir, "input.dwg")
            with open(dwg_path, "wb") as f:
                f.write(file_data)

            try:
                subprocess.run(
                    [ODA_CONVERTER, in_dir, out_dir, "ACAD2018", "DXF", "0", "1", "*.DWG"],
                    check=True, capture_output=True, timeout=60,
                )
            except subprocess.CalledProcessError as e:
                log_error(e, "cad-to-pdf ODA")
                return jsonify(error="DWG to DXF conversion failed (file may be corrupted or use an unsupported version)."), 400
            except subprocess.TimeoutExpired as e:
                # Logged like every other failure path in this route.
                log_error(e, "cad-to-pdf ODA timeout")
                return jsonify(error="DWG conversion timed out."), 400

            dxf_path = os.path.join(out_dir, "input.dxf")
            if not os.path.exists(dxf_path):
                return jsonify(error="DWG to DXF conversion produced no output."), 400
            parse_context = "cad-to-pdf converted dxf parse"
            parse_error = "Converted DWG could not be read (the drawing may be corrupted or use an unsupported feature)."
        elif ext == "dxf":
            dxf_path = os.path.join(tmpdir, "input.dxf")
            with open(dxf_path, "wb") as f:
                f.write(file_data)
            parse_context = "cad-to-pdf dxf parse"
            parse_error = "Invalid DXF file (the file may be corrupted or use an unsupported feature)."
        else:
            return jsonify(error="Upload a .dxf or .dwg file."), 400

        # Both branches parse through the same guarded call, so a DXF that
        # ezdxf rejects gives a 400 with a message whether it was uploaded
        # directly or produced by the DWG conversion.
        try:
            doc = ezdxf.readfile(dxf_path)
        except Exception as e:
            log_error(e, parse_context)
            return jsonify(error=parse_error), 400

    msp = doc.modelspace()
    fig = plt.figure()
    try:
        # One axes filling the whole figure, with equal scaling and no frame.
        ax = fig.add_axes([0, 0, 1, 1])
        ax.set_aspect("equal")
        ax.set_axis_off()

        try:
            ctx = RenderContext(doc)
            backend = MatplotlibBackend(ax)
            Frontend(ctx, backend).draw_layout(msp, finalize=True)
        except Exception as e:
            log_error(e, "cad-to-pdf render")
            return jsonify(error="CAD rendering failed (the drawing may use unsupported entities)."), 400

        buf = io.BytesIO()
        if target == "pdf":
            fig.savefig(buf, format="pdf", bbox_inches="tight", pad_inches=0.2)
            mimetype, suffix = "application/pdf", ".pdf"
        else:
            fig.savefig(buf, format="png", dpi=dpi, bbox_inches="tight", pad_inches=0.2)
            mimetype, suffix = "image/png", ".png"
    finally:
        # pyplot keeps figures registered until closed; close on every exit
        # path, including a failing savefig.
        plt.close(fig)

    buf.seek(0)
    base_name = filename.rsplit(".", 1)[0]
    resp = send_file(buf, mimetype=mimetype,
                     as_attachment=True, download_name=base_name + suffix)
    return set_conversion_metadata(
        resp,
        engine_used,
        "medium",
        "CAD rendering may omit or simplify unsupported entities, fonts, and line styles.",
    )
|
||
|
||
|
||
# ── PDF → PowerPoint ─────────────────────────────────────
|
||
|
||
# MIME type sent with generated .pptx downloads.
PPTX_MIME = (
    "application/vnd.openxmlformats-officedocument"
    ".presentationml.presentation"
)

# Slide dimensions as (width, height) in English Metric Units, the unit
# python-pptx works in (914,400 EMU per inch).
SLIDE_SIZES_EMU = {
    "16:9": (12_192_000, 6_858_000),  # 13.333 x 7.5 in, default widescreen
    "4:3": (9_144_000, 6_858_000),    # 10 x 7.5 in
    "a4": (10_692_000, 7_560_000),    # about 11.69 x 8.27 in, landscape A4
}
|
||
|
||
|
||
@bp.route("/pdf-to-pptx")
def pdf_to_pptx_page():
    """Render the upload form for the PDF to PowerPoint tool.

    The preselected conversion mode and the status banner both depend on
    whether a LibreOffice executable was found (``SOFFICE``): with it,
    Editable mode is the default; without it, Image mode is.
    """
    default_mode = "image" if not SOFFICE else "editable"

    if not SOFFICE:
        editable_status = (
            '<i class="bi bi-exclamation-triangle-fill" style="color:#ffb703"></i> '
            '<strong>LibreOffice not found</strong> — Editable mode is unavailable. '
            'Install LibreOffice (see <a href="/convert/pptx-to-pdf">PowerPoint to PDF</a>) '
            'and restart the server, or use Image mode below.'
        )
    else:
        editable_status = (
            '<i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
            '<strong>LibreOffice detected</strong> — Editable mode will produce a real .pptx '
            'with text and shapes you can click and edit in PowerPoint.'
        )

    # HTML help text shown above the form.
    notes_html = "".join((
        f"<p>{editable_status}</p>",
        "<p><strong>Two conversion modes:</strong></p>",
        "<ul style='margin:.4rem 0 .6rem 1.2rem'>",
        "<li><strong>Editable</strong> — uses LibreOffice to convert each PDF page into native PowerPoint ",
        "elements (text frames, lines, shapes, images). You can click on text to edit it, change fonts, ",
        "rearrange shapes. Layout fidelity is good but not pixel-perfect — complex PDFs may show small ",
        "shifts. Slide size matches the PDF's page dimensions.</li>",
        "<li><strong>Image</strong> — renders each PDF page as a single picture and centers it on a slide. ",
        "Visually identical to the PDF, but nothing is editable. Best for archival or when you want to ",
        "guarantee the slides look exactly like the source.</li>",
        "</ul>",
    ))

    mode_option = {
        "type": "select",
        "name": "mode",
        "label": "Conversion mode",
        "default": default_mode,
        "choices": [
            {"value": "editable", "label": "Editable — text and shapes can be edited (LibreOffice)"},
            {"value": "image", "label": "Image — slides look identical to PDF, nothing editable"},
        ],
    }
    slide_size_option = {
        "type": "select",
        "name": "slide_size",
        "label": "Slide size (Image mode only)",
        "default": "16:9",
        "choices": [
            {"value": "16:9", "label": "Widescreen 16:9"},
            {"value": "4:3", "label": "Standard 4:3"},
            {"value": "a4", "label": "A4 landscape"},
        ],
    }
    dpi_option = {
        "type": "number",
        "name": "dpi",
        "label": "Render DPI (Image mode only)",
        "default": 150,
        "min": 72,
        "max": 300,
    }
    pages_option = {
        "type": "text",
        "name": "pages",
        "label": "Pages (blank = all)",
        "placeholder": "e.g. 1-3, 5, 8-10",
    }

    return render_template(
        "upload_tool.html",
        title="PDF to PowerPoint",
        description="Convert a PDF into a .pptx — either as editable text/shapes, or as page images",
        notes=notes_html,
        endpoint="/convert/pdf-to-pptx",
        accept=".pdf",
        multiple=False,
        options=[mode_option, slide_size_option, dpi_option, pages_option],
        button_text="Convert to PPTX",
    )
|
||
|
||
|
||
@bp.route("/pdf-to-pptx", methods=["POST"])
def pdf_to_pptx():
    """Convert an uploaded PDF to a .pptx download.

    Form fields:
        mode: ``"editable"`` converts through LibreOffice; ``"image"`` places
            one rendered page image per slide. Unknown values fall back to
            ``"image"``; the default depends on whether ``SOFFICE`` is set.
        pages: Optional page-range spec such as ``"1-3, 5, 8-10"``.
        slide_size: Key of ``SLIDE_SIZES_EMU`` (image mode only).
        dpi: Render resolution, clamped to 72-300 (image mode only).

    Returns a JSON error with status 400 for a missing upload, an unreadable
    PDF, an invalid or empty page selection, a missing dependency, or a
    failed LibreOffice conversion.
    """
    from routes._helpers import safe_int, log_error, NO_FILE_SINGLE
    from routes.pdf_tools import parse_page_ranges

    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    mode = request.form.get("mode", "editable" if SOFFICE else "image")
    if mode not in ("editable", "image"):
        mode = "image"
    pages_spec = (request.form.get("pages") or "").strip()
    pdf_data = files[0].read()

    # Pre-resolve the page range against the PDF (used by both modes).
    try:
        with fitz.open(stream=pdf_data, filetype="pdf") as probe:
            total_pages = len(probe)
        try:
            target_pages = parse_page_ranges(pages_spec, total_pages)
        except (ValueError, IndexError):
            return jsonify(error="Invalid page range. Use e.g. '1-3, 5, 8-10'."), 400
        if not target_pages:
            return jsonify(error="No pages selected."), 400
    except Exception as e:
        log_error(e, "pdf-to-pptx probe")
        return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400

    base = files[0].filename.rsplit(".", 1)[0]

    # ── Editable mode (LibreOffice) ───────────────────────
    if mode == "editable":
        if not SOFFICE:
            return jsonify(error=(
                "Editable mode requires LibreOffice (soffice) on PATH. "
                "Install LibreOffice and restart the server, or switch to Image mode."
            )), 400

        # When a page range was given, build a sub-PDF so LibreOffice only
        # converts the requested pages. The selection is compared with the
        # identity sequence rather than by length: a reordered or repeated
        # selection can have the same length as the document and must still
        # go through the sub-PDF step. Indices are taken as 0-based because
        # image mode below uses them directly as `doc[idx]`.
        # NOTE(review): confirm against parse_page_ranges, defined elsewhere.
        is_whole_document = list(target_pages) == list(range(total_pages))
        source_pdf = pdf_data
        if pages_spec and not is_whole_document:
            source_pdf = _extract_pages(pdf_data, target_pages)

        pptx_bytes = _soffice_convert(source_pdf, "pdf", "pptx", timeout=300)
        if pptx_bytes is None:
            return jsonify(error=(
                "LibreOffice could not convert this PDF. The file may be password-protected or "
                "use features LibreOffice's PDF importer can't handle. Try Image mode instead."
            )), 400

        resp = send_file(io.BytesIO(pptx_bytes), mimetype=PPTX_MIME,
                         as_attachment=True, download_name=f"{base}.pptx")
        return set_conversion_metadata(resp, "libreoffice", QUALITY_HIGH)

    # ── Image mode (page-image-per-slide) ─────────────────
    if not HAS_PPTX:
        return jsonify(error="Image mode requires python-pptx. Run: pip install python-pptx"), 400

    dpi = safe_int(request.form.get("dpi"), 150, min_val=72, max_val=300)
    slide_size = request.form.get("slide_size", "16:9")
    if slide_size not in SLIDE_SIZES_EMU:
        slide_size = "16:9"

    try:
        doc = fitz.open(stream=pdf_data, filetype="pdf")
    except Exception as e:
        log_error(e, "pdf-to-pptx open")
        return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400

    try:
        prs = Presentation()
        slide_w, slide_h = SLIDE_SIZES_EMU[slide_size]
        prs.slide_width = slide_w
        prs.slide_height = slide_h
        blank_layout = prs.slide_layouts[6]  # 'Blank'
        slide_ratio = slide_w / slide_h  # constant for the whole deck

        mat = fitz.Matrix(dpi / 72, dpi / 72)
        for idx in target_pages:
            page = doc[idx]
            pix = page.get_pixmap(matrix=mat, alpha=False)
            png_bytes = pix.tobytes("png")
            img_ratio = pix.width / pix.height

            # Aspect-fit: scale to the slide while preserving the aspect
            # ratio, then center on the slide.
            if img_ratio > slide_ratio:
                draw_w = slide_w
                draw_h = int(slide_w / img_ratio)
            else:
                draw_h = slide_h
                draw_w = int(slide_h * img_ratio)
            left = (slide_w - draw_w) // 2
            top = (slide_h - draw_h) // 2

            slide = prs.slides.add_slide(blank_layout)
            slide.shapes.add_picture(io.BytesIO(png_bytes),
                                     Emu(left), Emu(top),
                                     width=Emu(draw_w), height=Emu(draw_h))

        output = io.BytesIO()
        prs.save(output)
        output.seek(0)
    finally:
        doc.close()

    resp = send_file(output, mimetype=PPTX_MIME,
                     as_attachment=True, download_name=f"{base}.pptx")
    return set_conversion_metadata(
        resp,
        "pymupdf/python-pptx",
        QUALITY_HIGH,
        "Image mode preserves appearance but slides are not editable.",
    )
|
||
|
||
|
||
# ── PowerPoint → PDF ─────────────────────────────────────
|
||
|
||
@bp.route("/pptx-to-pdf")
def pptx_to_pdf_page():
    """Render the upload form for the PowerPoint to PDF tool."""
    form_settings = {
        "title": "PowerPoint to PDF",
        "description": "Convert PowerPoint or OpenDocument presentations to PDF",
        "notes": _soffice_available_notes(),
        "endpoint": "/convert/pptx-to-pdf",
        "accept": ".pptx,.ppt,.odp",
        "multiple": False,
        "options": [],
        "button_text": "Convert to PDF",
    }
    return render_template("upload_tool.html", **form_settings)
|
||
|
||
|
||
@bp.route("/pptx-to-pdf", methods=["POST"])
def pptx_to_pdf():
    """Convert an uploaded .pptx, .ppt or .odp presentation to PDF.

    The conversion is delegated to LibreOffice through ``soffice_convert``.
    Returns a JSON error with status 400 when LibreOffice is unavailable, no
    file was uploaded, the extension is unsupported, or conversion yields
    nothing.
    """
    from routes._helpers import NO_FILE_SINGLE

    if not SOFFICE:
        return jsonify(error="LibreOffice (soffice) is not installed or not on PATH. Install LibreOffice and restart the server."), 400

    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    upload = uploads[0]
    # Split on the last dot; a name without a dot has no extension and is
    # used whole as the download base name.
    stem, dot, suffix = upload.filename.rpartition(".")
    ext = suffix.lower() if dot else ""
    if ext not in ("pptx", "ppt", "odp"):
        return jsonify(error="Unsupported format. Upload .pptx, .ppt, or .odp."), 400

    pdf_payload = soffice_convert(upload.read(), ext, "pdf", timeout=300)
    if pdf_payload is None:
        return jsonify(error="LibreOffice could not convert this file."), 400

    base = stem if dot else upload.filename
    resp = send_file(
        io.BytesIO(pdf_payload),
        mimetype="application/pdf",
        as_attachment=True,
        download_name=f"{base}.pdf",
    )
    return set_conversion_metadata(resp, "libreoffice", QUALITY_HIGH)
|