Files
2026-06-06 19:05:17 +07:00

2106 lines
88 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import io
import importlib.util
from flask import Blueprint, render_template, request, send_file, jsonify
from PIL import Image, ImageOps
import img2pdf
from docx import Document as DocxDocument
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib import colors
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
from utils.pymupdf import import_pymupdf
fitz = import_pymupdf()
try:
from pdf2docx import Converter as Pdf2DocxConverter
HAS_PDF2DOCX = True
except ImportError:
HAS_PDF2DOCX = False
# Marker is loaded lazily inside the route to avoid heavy model/module work
# on server start. We only check package presence here.
HAS_MARKER = importlib.util.find_spec("marker") is not None
try:
import pytesseract
HAS_TESSERACT = True
except ImportError:
HAS_TESSERACT = False
try:
import pdfplumber
HAS_PDFPLUMBER = True
except ImportError:
HAS_PDFPLUMBER = False
HAS_EZDXF = (
importlib.util.find_spec("ezdxf") is not None
and importlib.util.find_spec("matplotlib") is not None
)
from routes._helpers import safe_int, safe_float, log_error, NO_FILE_SINGLE, NO_FILE_MULTIPLE
from utils.capabilities import (
QUALITY_BASIC,
QUALITY_HIGH,
find_soffice,
set_conversion_metadata,
soffice_convert,
)
import shutil
ODA_CONVERTER = shutil.which("ODAFileConverter") or shutil.which("oda_file_converter")
SOFFICE = find_soffice()
try:
from pptx import Presentation
from pptx.util import Emu
HAS_PPTX = True
except ImportError:
HAS_PPTX = False
bp = Blueprint("convert", __name__)
def _load_cad_modules():
import ezdxf
from ezdxf.addons.drawing import RenderContext, Frontend
from ezdxf.addons.drawing.matplotlib import MatplotlibBackend
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
return ezdxf, RenderContext, Frontend, MatplotlibBackend, plt
# ── LibreOffice availability note (PPT/ODP/DOC conversion) ──────
def _soffice_available_notes():
if SOFFICE:
return (
f'<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
f'<strong>LibreOffice detected:</strong> <code>{SOFFICE}</code></p>'
)
return (
'<p><i class="bi bi-exclamation-triangle-fill" style="color:#ffb703"></i> '
'<strong>LibreOffice was not found on PATH.</strong> '
'This tool will not work until LibreOffice is installed.</p>'
'<details><summary>How to install LibreOffice</summary>'
'<p><strong>Windows:</strong> Download from '
'<a href="https://www.libreoffice.org/download/download-libreoffice/" target="_blank">libreoffice.org</a> '
'and add the installs <code>program</code> folder '
'(usually <code>C:\\Program Files\\LibreOffice\\program</code>) to your PATH, '
'then restart the server.</p>'
'<p><strong>macOS:</strong> <code>brew install --cask libreoffice</code> '
'(the <code>soffice</code> binary lives at '
'<code>/Applications/LibreOffice.app/Contents/MacOS/soffice</code>).</p>'
'<p><strong>Linux:</strong> <code>sudo apt install libreoffice</code> '
'(Debian/Ubuntu) or <code>sudo dnf install libreoffice</code> (Fedora).</p>'
'<p>Restart the server after installing so the new PATH is picked up.</p>'
'</details>'
)
def _soffice_convert(file_data: bytes, source_ext: str, target_ext: str = "pdf",
timeout: int = 180):
"""Compatibility wrapper around the shared hardened LibreOffice converter."""
return soffice_convert(file_data, source_ext, target_ext, timeout)
# ── Page Routes ──────────────────────────────────
@bp.route("/to-pdf")
def to_pdf_page():
if SOFFICE:
notes = (
f'<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
f'<strong>LibreOffice detected at <code>{SOFFICE}</code></strong> — Word documents '
f'will convert with full layout fidelity (fonts, images, tables, columns, '
f'headers/footers all preserved).</p>'
)
else:
notes = (
'<p><i class="bi bi-exclamation-triangle-fill" style="color:#ffb703"></i> '
'<strong>LibreOffice was not found.</strong> Word files (.docx) will use a built-in '
'fallback that only handles paragraphs, tables, basic formatting, and inline images. '
'It will <strong>NOT</strong> preserve: custom fonts, headers/footers, columns, '
'page breaks, text boxes, frames, SmartArt, or precise positioning.</p>'
'<p><strong>For high-fidelity Word→PDF, install LibreOffice:</strong></p>'
'<ul style="margin:.4rem 0 .6rem 1.2rem">'
'<li><strong>Windows:</strong> Download from '
'<a href="https://www.libreoffice.org/download/download-libreoffice/" target="_blank">libreoffice.org</a> '
'and install with default options. The app auto-detects it at <code>C:\\Program Files\\LibreOffice\\</code> '
'on next start — no PATH editing needed.</li>'
'<li><strong>macOS:</strong> <code>brew install --cask libreoffice</code></li>'
'<li><strong>Linux:</strong> <code>sudo apt install libreoffice</code> (Debian/Ubuntu) '
'or <code>sudo dnf install libreoffice</code> (Fedora)</li>'
'</ul>'
'<p style="font-size:.9em;color:var(--muted)">Restart the server after installing.</p>'
)
return render_template("upload_tool.html",
title="Files to PDF",
description="Convert images, Word documents, and text files to PDF",
notes=notes,
endpoint="/convert/to-pdf",
accept=".jpg,.jpeg,.png,.bmp,.tiff,.webp,.txt,.docx,.doc,.odt",
multiple=True,
options=[
{"type": "checkbox", "name": "use_basic_fallback",
"label": "Fallback",
"check_label": "Allow basic Python fallback if LibreOffice is unavailable or fails",
"default": False},
])
@bp.route("/pdf-to-word")
def pdf_to_word_page():
marker_status = (
'<li><strong>Marker (ML)</strong> — uses an ML model for structure understanding. '
'Best fidelity for academic papers, books, and complex documents. <em>'
+ ('Detected and ready.' if HAS_MARKER else 'Not installed — run <code>pip install marker-pdf</code>. '
'First run downloads ~2 GB of models. Conversion is slow on CPU (3060s/page).')
+ '</em></li>'
)
return render_template("upload_tool.html",
title="PDF to Word",
description="Convert PDF documents to Word (.docx) format",
notes=(
'<p><strong>Four conversion modes — pick the one that fits your document:</strong></p>'
'<ul style="margin:.4rem 0 .6rem 1.2rem">'
'<li><strong>Layout (default)</strong> — uses <code>pdf2docx</code> to preserve tables, columns, and figure positions. '
'Best for forms, reports, invoices.</li>'
'<li><strong>Smart structure</strong> — analyses font sizes to detect headings, lists, and paragraphs, '
'and emits a Word doc with proper heading styles (visible in Word\'s navigation pane). '
'Best for articles, blog posts, books, and documentation. Drops tables and figures.</li>'
'<li><strong>Flowing text</strong> — extracts text in reading order, emits one paragraph per block. '
'No structure detection. Always produces clean output even on awkward PDFs.</li>'
f'{marker_status}'
'</ul>'
'<p style="font-size:.9em;color:var(--muted)">If your PDF is a scan, run it through <a href="/convert/ocr-pdf">OCR PDF</a> first.</p>'
),
endpoint="/convert/pdf-to-word",
accept=".pdf",
multiple=False,
options=[
{"type": "select", "name": "mode", "label": "Mode", "default": "layout",
"choices": [
{"value": "layout", "label": "Layout — preserve tables, columns, figures"},
{"value": "exact", "label": "Exact visual copy — non-editable page images"},
{"value": "structure", "label": "Smart structure — detect headings & lists"},
{"value": "text", "label": "Flowing text — clean paragraphs, no structure"},
{"value": "marker", "label": "Marker (ML) — best fidelity, slow, needs install"},
]},
{"type": "number", "name": "exact_dpi", "label": "Exact visual copy DPI",
"default": 180, "min": 96, "max": 300, "depends_on": {"mode": "exact"}},
{"type": "text", "name": "pages", "label": "Pages (blank = all)",
"placeholder": "e.g. 1-3, 5, 8-10"},
{"type": "checkbox", "name": "extract_tables",
"label": "Layout mode: detect borderless tables",
"check_label": "Try harder to find tables (slower, sometimes invents tables)",
"default": False},
])
@bp.route("/pdf-to-images")
def pdf_to_images_page():
return render_template("upload_tool.html",
title="PDF to Images",
description="Convert each PDF page to an image",
notes=(
'<p><strong>What this does:</strong> renders each PDF page as a raster image. '
'Output is one image per page, bundled as a ZIP if there are multiple pages.</p>'
'<p><strong>DPI guide:</strong> 72 = screen quality, 150 = good for slides, '
'200 = good for print preview, 300 = print quality, 600 = archival. '
'Higher DPI = larger files (a 10-page PDF at 600 DPI can be 50+ MB).</p>'
'<p><strong>Format:</strong> PNG is lossless and best for diagrams / text-heavy pages. '
'JPG is smaller but lossy — best for photo-heavy pages.</p>'
'<p style="font-size:.9em;color:var(--muted)"><strong>No external dependencies.</strong></p>'
),
endpoint="/convert/pdf-to-images",
accept=".pdf",
multiple=False,
options=[
{"type": "select", "name": "format", "label": "Image Format",
"choices": [
{"value": "png", "label": "PNG"},
{"value": "jpg", "label": "JPG"},
]},
{"type": "number", "name": "dpi", "label": "Resolution (DPI)", "default": 200, "min": 72, "max": 600},
])
@bp.route("/pdf-to-text")
def pdf_to_text_page():
return render_template("upload_tool.html",
title="PDF to Text",
description="Extract all text content from a PDF document",
notes=(
'<p><strong>What this does:</strong> pulls all extractable text out of the PDF, '
'page by page, in reading order.</p>'
'<p><strong>Important:</strong> this only works on PDFs that <em>contain</em> '
'real text. If your PDF is a scan (photographed/scanned pages stored as images), '
'no text will be extracted — run it through '
'<a href="/convert/ocr-pdf">OCR PDF</a> first to recognise the text, then come back here.</p>'
'<p style="font-size:.9em;color:var(--muted)"><strong>No external dependencies.</strong></p>'
),
endpoint="/convert/pdf-to-text",
accept=".pdf",
multiple=False,
options=[])
@bp.route("/md-to-pdf")
def md_to_pdf_page():
return render_template("tools/md_to_pdf.html")
@bp.route("/md-to-docx")
def md_to_docx_page():
return render_template("tools/md_to_docx.html")
@bp.route("/pdf-to-excel")
def pdf_to_excel_page():
return render_template("upload_tool.html",
title="PDF to Excel",
description="Extract tables from a PDF into an .xlsx workbook",
notes=(
"<p><strong>How table detection works:</strong> we try both detection strategies in "
"order of accuracy:</p>"
"<ul style='margin:.4rem 0 .6rem 1.2rem'>"
"<li><strong>Auto (recommended)</strong> — tries ruled-line detection first; if a "
"page has no visible table borders, falls back to text-alignment detection (catches "
"borderless tables in financial reports, invoices, schedules).</li>"
"<li><strong>Lines only</strong> — only tables with visible borders. Most accurate "
"but misses borderless tables.</li>"
"<li><strong>Text alignment only</strong> — finds tables by detecting columns of "
"aligned text. Catches borderless tables but can occasionally false-positive on "
"multi-column body text.</li>"
"</ul>"
"<p style='font-size:.9em;color:var(--muted)'><strong>Still get \"no tables found\"?</strong> "
"Try our <a href='/convert/pdf-to-word'>PDF to Word</a> tool in <em>Layout</em> mode "
"instead — it uses <code>pdf2docx</code> which is more aggressive about table "
"detection. If your PDF is scanned, run it through "
"<a href='/convert/ocr-pdf'>OCR PDF</a> first.</p>"
),
endpoint="/convert/pdf-to-excel",
accept=".pdf",
multiple=False,
options=[
{"type": "text", "name": "pages", "label": "Pages (leave empty for all)",
"placeholder": "e.g. 1-3, 5"},
{"type": "select", "name": "strategy", "label": "Table detection strategy", "default": "auto",
"choices": [
{"value": "auto", "label": "Auto — lines first, fall back to text alignment"},
{"value": "lines", "label": "Lines only (ruled tables)"},
{"value": "text", "label": "Text alignment only (borderless tables)"},
]},
{"type": "select", "name": "table_engine", "label": "Table engine", "default": "auto",
"choices": [
{"value": "auto", "label": "Auto — pdfplumber if installed, then PyMuPDF"},
{"value": "pymupdf", "label": "PyMuPDF built-in"},
{"value": "pdfplumber", "label": "pdfplumber (optional, often better on borderless tables)"},
]},
{"type": "select", "name": "mode", "label": "Extraction mode", "default": "tables",
"choices": [
{"value": "tables", "label": "Tables only (recommended)"},
{"value": "tables_text", "label": "Tables, fall back to text rows when none are found"},
{"value": "text", "label": "Text only — every line becomes a row"},
]},
{"type": "select", "name": "organize", "label": "Sheet organization", "default": "per_table",
"choices": [
{"value": "per_table", "label": "One sheet per detected table"},
{"value": "per_page", "label": "One sheet per page (tables stacked)"},
{"value": "combined", "label": "Everything on one sheet"},
]},
])
OCR_LANGS = [
{"value": "eng", "label": "English"},
{"value": "ind", "label": "Indonesian"},
{"value": "fra", "label": "French"},
{"value": "deu", "label": "German"},
{"value": "spa", "label": "Spanish"},
{"value": "ita", "label": "Italian"},
{"value": "por", "label": "Portuguese"},
{"value": "rus", "label": "Russian"},
{"value": "chi_sim", "label": "Chinese (Simplified)"},
{"value": "chi_tra", "label": "Chinese (Traditional)"},
{"value": "jpn", "label": "Japanese"},
{"value": "kor", "label": "Korean"},
{"value": "ara", "label": "Arabic"},
{"value": "hin", "label": "Hindi"},
]
@bp.route("/ocr-pdf")
def ocr_pdf_page():
if HAS_TESSERACT:
status = (
'<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
'<strong>OCR is ready.</strong> Tesseract Python bindings detected. '
'Make sure the language pack you select is installed in your Tesseract '
'<code>tessdata</code> directory — you\'ll get a clear error if it isn\'t.</p>'
)
else:
status = (
'<p><i class="bi bi-exclamation-triangle-fill" style="color:#ffb703"></i> '
'<strong>OCR is unavailable.</strong> Two things to install:</p>'
'<ol style="margin:.4rem 0 .6rem 1.2rem">'
'<li>The <code>pytesseract</code> Python package: <code>pip install pytesseract</code></li>'
'<li>The Tesseract binary itself: '
'<a href="https://github.com/tesseract-ocr/tesseract" target="_blank">github.com/tesseract-ocr/tesseract</a> '
'(Windows installers, <code>brew install tesseract</code> on macOS, '
'<code>apt install tesseract-ocr</code> on Linux)</li>'
'</ol>'
'<p>Then for non-English OCR, download the matching <code>*.traineddata</code> '
'file from <a href="https://github.com/tesseract-ocr/tessdata" target="_blank">tessdata</a> '
'into your Tesseract install\'s <code>tessdata</code> folder.</p>'
)
return render_template("upload_tool.html",
title="OCR PDF",
description="Extract text from scanned PDFs or create a searchable PDF with a hidden text layer",
notes=(
f'{status}'
'<p><strong>Two output modes:</strong></p>'
'<ul style="margin:.4rem 0 .6rem 1.2rem">'
'<li><strong>Searchable PDF</strong> — keeps the original page images and adds an '
'invisible text layer underneath, so you can copy-paste and search. The PDF still '
'<em>looks</em> identical to the scan.</li>'
'<li><strong>Extracted text</strong> — just the recognised text, plain.</li>'
'</ul>'
'<p style="font-size:.9em;color:var(--muted)">Higher DPI = better OCR accuracy '
'but slower. 200 DPI is the sweet spot for most scans; bump to 300+ for small '
'fonts or low-quality scans.</p>'
),
endpoint="/convert/ocr-pdf",
accept=".pdf",
multiple=False,
options=[
{"type": "select", "name": "mode", "label": "Output",
"choices": [
{"value": "searchable", "label": "Searchable PDF (image + text layer)"},
{"value": "text", "label": "Extracted text only"},
]},
{"type": "select", "name": "lang", "label": "Language",
"choices": OCR_LANGS},
{"type": "number", "name": "dpi", "label": "OCR Resolution (DPI)",
"default": 200, "min": 100, "max": 400},
])
@bp.route("/cad-to-pdf")
def cad_to_pdf_page():
if ODA_CONVERTER:
notes = (
'<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
'<strong>DWG support is enabled.</strong> ODA File Converter was detected at '
f'<code>{ODA_CONVERTER}</code>.</p>'
'<p>DXF files are rendered directly. DWG files are auto-converted to DXF first.</p>'
)
else:
notes = (
'<p><strong>DXF works out of the box.</strong> DWG files need the free '
'<a href="https://www.opendesign.com/guestfiles/oda_file_converter" target="_blank" rel="noopener">'
'ODA File Converter</a> installed and available on your system <code>PATH</code>.</p>'
'<details>'
'<summary>How to install ODA File Converter</summary>'
'<ol>'
'<li>Download the installer for your OS from '
'<a href="https://www.opendesign.com/guestfiles/oda_file_converter" target="_blank" rel="noopener">opendesign.com</a> '
'(free, guest download — no account required).</li>'
'<li>Run the installer. Defaults are fine.</li>'
'<li><strong>Add it to your PATH so this app can find it:</strong>'
'<ul>'
'<li><strong>Windows:</strong> add <code>C:\\Program Files\\ODA\\ODAFileConverter_title_version</code> '
'(the folder containing <code>ODAFileConverter.exe</code>) to your <em>System Environment Variables</em> &rarr; <code>Path</code>.</li>'
'<li><strong>macOS:</strong> <code>ln -s /Applications/ODAFileConverter.app/Contents/MacOS/ODAFileConverter /usr/local/bin/ODAFileConverter</code></li>'
'<li><strong>Linux:</strong> the <code>.deb</code>/<code>.rpm</code> package installs <code>ODAFileConverter</code> on PATH automatically. Otherwise symlink the binary into <code>/usr/local/bin</code>.</li>'
'</ul></li>'
'<li>Open a new terminal and verify: <code>ODAFileConverter</code> (should launch the tool GUI, or exit silently).</li>'
'<li><strong>Restart this Flask server</strong> so it picks up the updated PATH.</li>'
'</ol>'
'<p style="margin-top:.4rem">Alternative: open your DWG in free tools like <a href="https://www.autodesk.com/viewers" target="_blank" rel="noopener">Autodesk Viewer</a>, LibreCAD, or QCAD and export it as DXF, then upload the DXF here.</p>'
'</details>'
)
return render_template("upload_tool.html",
title="CAD to PDF/Image",
description="Convert DXF drawings to PDF or PNG. DWG is supported when ODA File Converter is installed.",
notes=notes,
endpoint="/convert/cad-to-pdf",
accept=".dxf,.dwg",
multiple=False,
options=[
{"type": "select", "name": "format", "label": "Output Format",
"choices": [
{"value": "pdf", "label": "PDF"},
{"value": "png", "label": "PNG"},
]},
{"type": "number", "name": "dpi", "label": "PNG Resolution (DPI)",
"default": 150, "min": 72, "max": 600,
"depends_on": {"format": "png"}},
])
@bp.route("/html-to-pdf")
def html_to_pdf_page():
if SOFFICE:
notes = (
f'<p><i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
f'<strong>LibreOffice detected</strong> — full CSS support, tables, lists, and inline styles render correctly.</p>'
)
else:
notes = (
'<p><i class="bi bi-info-circle-fill" style="color:#4361ee"></i> '
'<strong>Tip:</strong> install LibreOffice for far better CSS / table / image fidelity. '
'Without it, PDF rendering uses PyMuPDF\'s minimal HTML parser (basic text and simple tables only).</p>'
)
return render_template("upload_tool.html",
title="HTML to PDF",
description="Convert HTML content to a PDF document",
notes=notes,
endpoint="/convert/html-to-pdf",
text_input=True,
text_label="HTML Content",
text_placeholder="<h1>Hello World</h1>\n<p>Paste your HTML here...</p>",
accept="",
multiple=False,
options=[
{"type": "checkbox", "name": "use_basic_fallback",
"label": "Fallback",
"check_label": "Allow basic PyMuPDF fallback if LibreOffice is unavailable or fails",
"default": False},
],
button_text="Convert to PDF")
# ── Helpers ──────────────────────────────────────
def _docx_to_pdf(data: bytes) -> bytes:
"""Best-effort .docx → PDF conversion using python-docx + reportlab.
This is the fallback path used when LibreOffice is not available.
It preserves document order (paragraphs and tables interleaved correctly),
inline images, and basic heading/paragraph styling. It does NOT preserve:
headers/footers, columns, custom fonts, page breaks, text boxes, frames,
SmartArt, equations, or precise layout. For those, install LibreOffice.
"""
from docx.oxml.ns import qn
from docx.text.paragraph import Paragraph as DocxParagraph
from docx.table import Table as DocxTable
from reportlab.platypus import Image as RLImage
from PIL import Image as PILImage
doc = DocxDocument(io.BytesIO(data))
buf = io.BytesIO()
styles = getSampleStyleSheet()
normal = styles["Normal"]
normal.fontName = "Helvetica"
normal.fontSize = 11
normal.leading = 14
heading_styles = {}
for level in range(1, 4):
size = {1: 18, 2: 15, 3: 13}[level]
heading_styles[level] = ParagraphStyle(
f"Heading{level}", parent=normal,
fontName="Helvetica-Bold", fontSize=size, leading=size + 4,
spaceBefore=12, spaceAfter=6,
)
pdf = SimpleDocTemplate(buf, pagesize=A4,
leftMargin=inch, rightMargin=inch,
topMargin=inch, bottomMargin=inch)
story = []
# Map of relationship-id → raw image bytes, used to look up images
# referenced by <a:blip r:embed="rId123" /> elements in paragraphs.
image_parts: dict[str, bytes] = {}
try:
for rel_id, rel in doc.part.rels.items():
if "image" in (rel.reltype or ""):
image_parts[rel_id] = rel.target_part.blob
except Exception:
pass
# Page content area for image scaling (A4 minus 1in margins)
max_img_w = 6.5 * inch
max_img_h = 4.0 * inch # cap height so images don't dominate
def _emit_image(rel_id: str) -> None:
img_bytes = image_parts.get(rel_id)
if not img_bytes:
return
try:
# Pillow may not handle EMF/WMF; skip those gracefully
with PILImage.open(io.BytesIO(img_bytes)) as pil:
w, h = pil.size
# Convert to PNG if needed for reportlab compatibility
if pil.format not in ("PNG", "JPEG", "GIF"):
out = io.BytesIO()
if pil.mode in ("RGBA", "LA"):
pil.save(out, format="PNG")
else:
pil.convert("RGB").save(out, format="JPEG", quality=90)
out.seek(0)
img_data = out.getvalue()
else:
img_data = img_bytes
except Exception:
return
if w <= 0 or h <= 0:
return
scale = min(max_img_w / w, max_img_h / h, 1.0)
story.append(Spacer(1, 6))
story.append(RLImage(io.BytesIO(img_data),
width=w * scale, height=h * scale))
story.append(Spacer(1, 6))
def _emit_paragraph(child) -> None:
para = DocxParagraph(child, doc)
# Emit any inline images first (in their paragraph)
for blip in child.findall(".//" + qn("a:blip")):
rel_id = blip.get(qn("r:embed"))
if rel_id:
_emit_image(rel_id)
text = para.text.strip()
if not text:
return # already-emitted image, or genuinely empty
style_name = para.style.name.lower() if para.style else ""
if "heading 1" in style_name:
story.append(Paragraph(text, heading_styles[1]))
elif "heading 2" in style_name:
story.append(Paragraph(text, heading_styles[2]))
elif "heading 3" in style_name:
story.append(Paragraph(text, heading_styles[3]))
else:
rich = _build_rich_text(para)
story.append(Paragraph(rich, normal))
def _emit_table(child) -> None:
table = DocxTable(child, doc)
tdata = []
for row in table.rows:
tdata.append([cell.text for cell in row.cells])
if not tdata:
return
t = Table(tdata, repeatRows=1)
t.setStyle(TableStyle([
("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
("BACKGROUND", (0, 0), (-1, 0), colors.Color(0.9, 0.9, 0.95)),
("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
("FONTSIZE", (0, 0), (-1, -1), 10),
("TOPPADDING", (0, 0), (-1, -1), 4),
("BOTTOMPADDING", (0, 0), (-1, -1), 4),
("LEFTPADDING", (0, 0), (-1, -1), 6),
("RIGHTPADDING", (0, 0), (-1, -1), 6),
]))
story.append(Spacer(1, 8))
story.append(t)
story.append(Spacer(1, 8))
# Walk the document body in order so paragraphs and tables appear in their
# original positions, not all paragraphs first then all tables.
for child in doc.element.body.iterchildren():
tag = child.tag.split("}", 1)[-1]
if tag == "p":
_emit_paragraph(child)
elif tag == "tbl":
_emit_table(child)
# Section-properties (sectPr) and other elements are ignored
if not story:
story.append(Paragraph("(empty document)", normal))
pdf.build(story)
return buf.getvalue()
def _build_rich_text(para) -> str:
"""Convert a python-docx paragraph's runs into reportlab-compatible rich text."""
parts = []
for run in para.runs:
text = run.text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
if not text:
continue
if run.bold and run.italic:
parts.append(f"<b><i>{text}</i></b>")
elif run.bold:
parts.append(f"<b>{text}</b>")
elif run.italic:
parts.append(f"<i>{text}</i>")
elif run.underline:
parts.append(f"<u>{text}</u>")
else:
parts.append(text)
return "".join(parts) or para.text
# ── Processing Routes ────────────────────────────
@bp.route("/to-pdf", methods=["POST"])
def to_pdf():
files = request.files.getlist("files")
if not files or not files[0].filename:
return jsonify(error=NO_FILE_MULTIPLE), 400
allow_basic_fallback = request.form.get("use_basic_fallback") == "on"
pdf_doc = fitz.open()
# Track which engine ran on Word docs so the response can advertise it
# (helps users diagnose "why is my output low-fidelity" without log access).
engine_used = "pymupdf"
quality = QUALITY_HIGH
warnings: list[str] = []
for f in files:
name = f.filename.lower()
data = f.read()
if name.endswith((".docx", ".doc", ".odt")):
# Word document → PDF pages.
# Prefer LibreOffice for high-fidelity layout. Fall back to the
# python-docx + reportlab rebuilder if soffice is unavailable.
ext = name.rsplit(".", 1)[-1]
try:
pdf_bytes = _soffice_convert(data, ext, "pdf")
if pdf_bytes is not None:
engine_used = "libreoffice"
quality = QUALITY_HIGH
else:
if not allow_basic_fallback:
return jsonify(error=(
f"High-fidelity conversion for '{f.filename}' requires LibreOffice. "
"Tick 'Allow basic Python fallback' to continue with lower layout fidelity."
)), 400
if ext != "docx":
return jsonify(error=(
f"'{f.filename}' requires LibreOffice (soffice) on PATH. "
"Only .docx is supported by the built-in fallback. "
"Install LibreOffice for full layout fidelity."
)), 400
pdf_bytes = _docx_to_pdf(data)
engine_used = "python-docx/reportlab"
quality = QUALITY_BASIC
warnings.append(
"Word document used basic fallback; headers, custom layout, and precise positioning may differ."
)
with fitz.open(stream=pdf_bytes, filetype="pdf") as docx_pdf:
pdf_doc.insert_pdf(docx_pdf)
except Exception as e:
log_error(e, f"to-pdf docx: {f.filename}")
return jsonify(error=f"Could not convert '{f.filename}' (Word file may be corrupted)."), 400
elif name.endswith(".txt"):
# Text file → PDF page
text = data.decode("utf-8", errors="replace")
page = pdf_doc.new_page(width=595, height=842) # A4
rect = fitz.Rect(50, 50, 545, 792)
page.insert_textbox(rect, text, fontsize=11, fontname="helv")
else:
# Image → PDF page
try:
with Image.open(io.BytesIO(data)) as pil_img:
pil_img = ImageOps.exif_transpose(pil_img)
if pil_img.mode in ("RGBA", "P"):
pil_img = pil_img.convert("RGB")
buf = io.BytesIO()
pil_img.save(buf, format="JPEG", quality=95)
img_data = buf.getvalue()
with fitz.open(stream=img_data, filetype="jpeg") as img_doc:
rect = img_doc[0].rect
pdf_page = pdf_doc.new_page(width=rect.width, height=rect.height)
pdf_page.insert_image(rect, stream=img_data)
except Exception as e:
log_error(e, f"to-pdf image: {f.filename}")
return jsonify(error=f"Could not convert '{f.filename}' (image may be corrupted or unsupported)."), 400
output = io.BytesIO()
pdf_doc.save(output)
pdf_doc.close()
output.seek(0)
resp = send_file(output, mimetype="application/pdf",
as_attachment=True, download_name="converted.pdf")
return set_conversion_metadata(resp, engine_used, quality, warnings)
@bp.route("/pdf-to-word", methods=["POST"])
def pdf_to_word():
files = request.files.getlist("files")
if not files or not files[0].filename:
return jsonify(error=NO_FILE_SINGLE), 400
mode = request.form.get("mode", "layout")
pdf_data = files[0].read()
pages_spec = (request.form.get("pages") or "").strip()
extract_borderless_tables = request.form.get("extract_tables") == "on"
base = files[0].filename.rsplit(".", 1)[0]
docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
# Pre-resolve page range against the PDF (used by all modes)
try:
with fitz.open(stream=pdf_data, filetype="pdf") as probe:
total_pages = len(probe)
if pages_spec:
from routes.pdf_tools import parse_page_ranges
target_pages = parse_page_ranges(pages_spec, total_pages)
if not target_pages:
return jsonify(error="No valid pages selected."), 400
else:
target_pages = list(range(total_pages))
except (ValueError, IndexError):
return jsonify(error="Invalid page range. Use e.g. '1-3, 5, 8-10'."), 400
except Exception as e:
log_error(e, "pdf-to-word probe")
return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400
# ── Mode dispatch ──────────────────────────────────────
if mode == "text":
try:
buf = _pdf_to_docx_flowing_text(pdf_data, target_pages)
except Exception as e:
log_error(e, "pdf-to-word text")
return jsonify(error="Could not extract text from the PDF (it may be a scan — try OCR PDF first)."), 400
resp = send_file(io.BytesIO(buf), mimetype=docx_mime,
as_attachment=True, download_name=f"{base}.docx")
return set_conversion_metadata(resp, "pymupdf/python-docx", QUALITY_BASIC,
"Flowing text prioritizes clean editable text over visual layout.")
if mode == "structure":
try:
buf = _pdf_to_docx_smart_structure(pdf_data, target_pages)
except ValueError as e:
return jsonify(error=str(e)), 400
except Exception as e:
log_error(e, "pdf-to-word structure")
return jsonify(error="Smart-structure analysis failed. Try Flowing text mode instead."), 400
resp = send_file(io.BytesIO(buf), mimetype=docx_mime,
as_attachment=True, download_name=f"{base}.docx")
return set_conversion_metadata(resp, "pymupdf/python-docx", "medium",
"Smart structure is editable but drops precise layout, figures, and tables.")
if mode == "exact":
dpi = safe_int(request.form.get("exact_dpi"), 180, min_val=96, max_val=300)
try:
buf = _pdf_to_docx_exact_visual(pdf_data, target_pages, dpi)
except Exception as e:
log_error(e, "pdf-to-word exact")
return jsonify(error="Exact visual copy failed. The PDF may be corrupted or password-protected."), 400
resp = send_file(io.BytesIO(buf), mimetype=docx_mime,
as_attachment=True, download_name=f"{base}.docx")
return set_conversion_metadata(resp, "pymupdf/python-docx", QUALITY_HIGH,
"Exact visual copy preserves appearance by embedding page images; text is not editable.")
if mode == "marker":
if not HAS_MARKER:
return jsonify(error=(
"Marker mode requires the 'marker-pdf' package. Run: "
"pip install marker-pdf — first run will download ~2 GB of models."
)), 400
try:
buf = _pdf_to_docx_via_marker(pdf_data, target_pages)
except Exception as e:
log_error(e, "pdf-to-word marker")
return jsonify(error="Marker conversion failed. Check the server log; "
"first run downloads ~2 GB and may need extra time."), 400
resp = send_file(io.BytesIO(buf), mimetype=docx_mime,
as_attachment=True, download_name=f"{base}.docx")
return set_conversion_metadata(resp, "marker-pdf/python-docx", QUALITY_HIGH,
"Marker output is editable structured content, not pixel-perfect layout.")
# ── Layout mode (default) ──────────────────────────────
if not HAS_PDF2DOCX:
return jsonify(error="Layout mode requires pdf2docx. Run: pip install pdf2docx — or switch to 'Flowing text' or 'Smart structure' mode."), 400
import tempfile, os
with tempfile.TemporaryDirectory() as tmpdir:
pdf_path = os.path.join(tmpdir, "input.pdf")
docx_path = os.path.join(tmpdir, "output.docx")
with open(pdf_path, "wb") as f:
f.write(pdf_data)
# Translate target_pages (set of 0-based) to start/end if contiguous.
# pdf2docx supports a `pages` list arg directly, which is cleaner.
cv_kwargs = {"multi_processing": False}
if pages_spec:
cv_kwargs["pages"] = target_pages
if extract_borderless_tables:
cv_kwargs["parse_stream_table"] = True
try:
cv = Pdf2DocxConverter(pdf_path)
try:
cv.convert(docx_path, **cv_kwargs)
finally:
cv.close()
except Exception as e:
log_error(e, "pdf-to-word layout")
return jsonify(error="Layout conversion failed. Try Smart structure or Flowing text mode instead, or check that the PDF isn't password-protected."), 400
with open(docx_path, "rb") as f:
result = io.BytesIO(f.read())
result.seek(0)
name = files[0].filename.rsplit(".", 1)[0] + ".docx"
resp = send_file(result, mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
as_attachment=True, download_name=name)
return set_conversion_metadata(resp, "pdf2docx", "medium",
"Layout mode is editable but PDF-to-Word conversion is inherently lossy.")
# ── PDF → Word helpers (one per non-pdf2docx mode) ─────────
def _pdf_to_docx_flowing_text(pdf_data: bytes, target_pages: list[int]) -> bytes:
"""Reading-order text extraction → one paragraph per block. No structure."""
from docx import Document as DocxDocument
with fitz.open(stream=pdf_data, filetype="pdf") as src:
doc = DocxDocument()
for idx, pno in enumerate(target_pages):
if idx > 0:
doc.add_page_break()
page = src[pno]
blocks = page.get_text("blocks") or []
blocks.sort(key=lambda b: (round(b[1], 1), round(b[0], 1)))
for b in blocks:
text = (b[4] if len(b) > 4 else "").strip()
if not text:
continue
text = "\n".join(p.strip() for p in text.split("\n") if p.strip())
for para in text.split("\n\n"):
para = para.replace("\n", " ").strip()
if para:
doc.add_paragraph(para)
buf = io.BytesIO()
doc.save(buf)
return buf.getvalue()
def _pdf_to_docx_exact_visual(pdf_data: bytes, target_pages: list[int], dpi: int) -> bytes:
"""Render PDF pages into a DOCX as full-page images.
This mode is intentionally non-editable. It exists for users who care more
about visual fidelity than editable Word content.
"""
from docx import Document as DocxDocument
from docx.shared import Inches
with fitz.open(stream=pdf_data, filetype="pdf") as src:
doc = DocxDocument()
section = doc.sections[0]
section.top_margin = Inches(0)
section.bottom_margin = Inches(0)
section.left_margin = Inches(0)
section.right_margin = Inches(0)
mat = fitz.Matrix(dpi / 72, dpi / 72)
first = True
for pno in target_pages:
page = src[pno]
page_w_in = page.rect.width / 72
page_h_in = page.rect.height / 72
if first:
section.page_width = Inches(page_w_in)
section.page_height = Inches(page_h_in)
first = False
else:
doc.add_page_break()
pix = page.get_pixmap(matrix=mat, alpha=False)
png_bytes = pix.tobytes("png")
paragraph = doc.add_paragraph()
paragraph.paragraph_format.space_before = 0
paragraph.paragraph_format.space_after = 0
paragraph.add_run().add_picture(io.BytesIO(png_bytes), width=Inches(page_w_in))
buf = io.BytesIO()
doc.save(buf)
return buf.getvalue()
def _pdf_to_docx_smart_structure(pdf_data: bytes, target_pages: list[int]) -> bytes:
"""Detect headings (by font size), bullet/numbered lists (by line prefix),
and paragraphs. Emit a .docx with proper Word heading and list styles.
Drops tables and figures (those need Layout or Marker mode).
"""
import re
from collections import Counter
from docx import Document as DocxDocument
BULLET_RE = re.compile(r"^[•▪●·\-\*]\s+")
NUMBER_RE = re.compile(r"^(\d+|[a-zA-Z])[\.\)]\s+")
with fitz.open(stream=pdf_data, filetype="pdf") as src:
# Pass 1: collect font sizes to determine the body baseline.
sizes: list[float] = []
for pno in target_pages:
page = src[pno]
for block in page.get_text("dict")["blocks"]:
if "lines" not in block:
continue
for line in block["lines"]:
for span in line["spans"]:
sizes.append(round(span["size"], 1))
if not sizes:
raise ValueError("No text found in the selected pages. If the PDF is a scan, run OCR PDF first.")
body_size = Counter(sizes).most_common(1)[0][0]
# Pass 2: build the document.
doc = DocxDocument()
for idx, pno in enumerate(target_pages):
if idx > 0:
doc.add_page_break()
page = src[pno]
blocks = [b for b in page.get_text("dict")["blocks"] if "lines" in b]
# Reading order: top-to-bottom, then left-to-right within tolerance
blocks.sort(key=lambda b: (round(b["bbox"][1] / 5) * 5, round(b["bbox"][0])))
for block in blocks:
lines = []
spans_meta: list[tuple[float, bool]] = []
for line in block["lines"]:
line_text = "".join(s["text"] for s in line["spans"])
if line_text.strip():
lines.append(line_text)
for s in line["spans"]:
# PyMuPDF flag bit 4 (0x10) = bold
spans_meta.append((s["size"], bool(s["flags"] & 16)))
if not lines:
continue
avg_size = sum(s for s, _ in spans_meta) / len(spans_meta)
bold_ratio = sum(1 for _, b in spans_meta if b) / len(spans_meta)
full_text = " ".join(line.strip() for line in lines).strip()
# Heading detection by relative font size
if avg_size >= body_size * 1.6:
doc.add_heading(full_text, level=1)
elif avg_size >= body_size * 1.3:
doc.add_heading(full_text, level=2)
elif avg_size >= body_size * 1.15 or (
avg_size >= body_size * 1.05 and bold_ratio > 0.6 and len(full_text) < 120
):
doc.add_heading(full_text, level=3)
# List detection by line prefix
elif BULLET_RE.match(full_text):
doc.add_paragraph(BULLET_RE.sub("", full_text), style="List Bullet")
elif NUMBER_RE.match(full_text):
doc.add_paragraph(NUMBER_RE.sub("", full_text), style="List Number")
else:
doc.add_paragraph(full_text)
buf = io.BytesIO()
doc.save(buf)
return buf.getvalue()
def _pdf_to_docx_via_marker(pdf_data: bytes, target_pages: list[int]) -> bytes:
"""Use Marker (ML) to extract structured Markdown, then convert to .docx."""
import os
import tempfile
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
# If specific pages requested, build a subset PDF first so Marker only
# processes what's needed (it's slow per page).
if len(target_pages) != _count_pages(pdf_data):
pdf_data = _extract_pages(pdf_data, target_pages)
with tempfile.TemporaryDirectory() as tmp:
pdf_path = os.path.join(tmp, "input.pdf")
with open(pdf_path, "wb") as f:
f.write(pdf_data)
converter = PdfConverter(artifact_dict=create_model_dict())
rendered = converter(pdf_path)
markdown_text, _, _ = text_from_rendered(rendered)
# Convert the markdown to docx via a reusable HTML→docx walker
import markdown as md_lib
html = md_lib.markdown(
markdown_text,
extensions=["extra", "sane_lists", "nl2br", "fenced_code", "tables"],
)
return _md_html_to_docx_bytes(html)
def _count_pages(pdf_data: bytes) -> int:
with fitz.open(stream=pdf_data, filetype="pdf") as d:
return len(d)
def _extract_pages(pdf_data: bytes, page_indices: list[int]) -> bytes:
"""Build a new PDF containing only the listed page indices (0-based)."""
with fitz.open(stream=pdf_data, filetype="pdf") as src:
with fitz.open() as out:
for idx in page_indices:
out.insert_pdf(src, from_page=idx, to_page=idx)
buf = io.BytesIO()
out.save(buf)
return buf.getvalue()
def _md_html_to_docx_bytes(html: str) -> bytes:
"""Use the same HTML-walking parser md_to_docx uses, but as a callable
helper so the marker mode can reuse it. Returns docx bytes.
"""
# We avoid circular imports — import lazily.
from html.parser import HTMLParser as _HP
from docx import Document as DocxDocument
doc = DocxDocument()
class _P(_HP):
def __init__(self):
super().__init__()
self.cur_para = None
self.list_stack = []
self.in_pre = False
def handle_starttag(self, tag, attrs):
if tag in ("h1", "h2", "h3", "h4"):
self.cur_para = doc.add_heading("", level=int(tag[1]))
elif tag == "p":
self.cur_para = doc.add_paragraph()
elif tag == "li":
style = "List Number" if (self.list_stack and self.list_stack[-1] == "ol") else "List Bullet"
self.cur_para = doc.add_paragraph(style=style)
elif tag in ("ul", "ol"):
self.list_stack.append(tag)
elif tag in ("strong", "b", "em", "i", "code"):
pass # handled in handle_data
elif tag == "pre":
self.in_pre = True
self.cur_para = doc.add_paragraph(style="Intense Quote")
elif tag == "hr":
doc.add_paragraph("" * 40)
def handle_endtag(self, tag):
if tag in ("ul", "ol") and self.list_stack:
self.list_stack.pop()
if tag == "pre":
self.in_pre = False
self.cur_para = None
if tag in ("h1", "h2", "h3", "h4", "p", "li"):
self.cur_para = None
def handle_data(self, data):
if self.cur_para is None:
if data.strip():
self.cur_para = doc.add_paragraph()
else:
return
self.cur_para.add_run(data)
parser = _P()
parser.feed(html)
buf = io.BytesIO()
doc.save(buf)
return buf.getvalue()
@bp.route("/pdf-to-images", methods=["POST"])
def pdf_to_images():
files = request.files.getlist("files")
if not files or not files[0].filename:
return jsonify(error=NO_FILE_SINGLE), 400
fmt = request.form.get("format", "png")
dpi = safe_int(request.form.get("dpi"), 200, min_val=72, max_val=600)
pdf_data = files[0].read()
try:
doc = fitz.open(stream=pdf_data, filetype="pdf")
except Exception as e:
log_error(e, "pdf-to-images open")
return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400
from utils.file_utils import make_zip
images = []
mat = fitz.Matrix(dpi / 72, dpi / 72)
try:
for i, page in enumerate(doc):
pix = page.get_pixmap(matrix=mat)
if fmt == "jpg":
img_bytes = pix.tobytes("jpeg")
ext = "jpg"
else:
img_bytes = pix.tobytes("png")
ext = "png"
images.append((f"page_{i + 1}.{ext}", img_bytes))
finally:
doc.close()
if len(images) == 1:
mime = "image/png" if fmt == "png" else "image/jpeg"
return send_file(io.BytesIO(images[0][1]), mimetype=mime,
as_attachment=True, download_name=images[0][0])
zip_buf = make_zip(images)
name = files[0].filename.rsplit(".", 1)[0] + "_images.zip"
return send_file(zip_buf, mimetype="application/zip",
as_attachment=True, download_name=name)
@bp.route("/pdf-to-text", methods=["POST"])
def pdf_to_text():
files = request.files.getlist("files")
if not files or not files[0].filename:
return jsonify(error=NO_FILE_SINGLE), 400
pdf_data = files[0].read()
try:
doc = fitz.open(stream=pdf_data, filetype="pdf")
except Exception as e:
log_error(e, "pdf-to-text open")
return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400
text_parts = []
try:
for i, page in enumerate(doc):
text_parts.append(f"--- Page {i + 1} ---")
text_parts.append(page.get_text())
finally:
doc.close()
return jsonify(text="\n".join(text_parts))
@bp.route("/pdf-to-excel", methods=["POST"])
def pdf_to_excel():
import re
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.utils import get_column_letter
from routes.pdf_tools import parse_page_ranges
files = request.files.getlist("files")
if not files or not files[0].filename:
return jsonify(error=NO_FILE_SINGLE), 400
mode = request.form.get("mode", "tables")
organize = request.form.get("organize", "per_table")
strategy = request.form.get("strategy", "auto")
if strategy not in ("auto", "lines", "text"):
strategy = "auto"
table_engine = request.form.get("table_engine", "auto")
if table_engine not in ("auto", "pymupdf", "pdfplumber"):
table_engine = "auto"
if table_engine == "pdfplumber" and not HAS_PDFPLUMBER:
return jsonify(error="pdfplumber is not installed. Install it or choose Auto/PyMuPDF."), 400
pages_spec = request.form.get("pages", "").strip()
pdf_data = files[0].read()
try:
doc = fitz.open(stream=pdf_data, filetype="pdf")
except Exception as e:
log_error(e, "pdf-to-excel open")
return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400
plumber_doc = None
if table_engine in ("auto", "pdfplumber") and HAS_PDFPLUMBER:
try:
plumber_doc = pdfplumber.open(io.BytesIO(pdf_data))
except Exception as e:
log_error(e, "pdf-to-excel pdfplumber open")
if table_engine == "pdfplumber":
doc.close()
return jsonify(error="pdfplumber could not open this PDF. Try Auto or PyMuPDF."), 400
try:
target_pages = parse_page_ranges(pages_spec, len(doc))
except (ValueError, IndexError):
doc.close()
if plumber_doc:
plumber_doc.close()
return jsonify(error="Invalid page range. Use e.g. '1-3, 5, 8-10'."), 400
if not target_pages:
doc.close()
if plumber_doc:
plumber_doc.close()
return jsonify(error="No valid pages selected."), 400
wb = Workbook()
wb.remove(wb.active)
used_names: set[str] = set()
total_tables = 0
total_text_pages = 0
warnings: list[str] = []
table_engines_used: set[str] = set()
def _safe_name(base: str) -> str:
name = re.sub(r"[\[\]\*\?\/\\:]", "_", base)[:31] or "Sheet"
candidate = name
i = 2
while candidate in used_names:
suffix = f"_{i}"
candidate = (name[: 31 - len(suffix)] + suffix)
i += 1
used_names.add(candidate)
return candidate
def _write_rows(ws, rows: list[list], start_row: int = 1, header: bool = True) -> int:
for r_idx, row in enumerate(rows, start=start_row):
for c_idx, cell in enumerate(row, start=1):
ws.cell(row=r_idx, column=c_idx, value="" if cell is None else str(cell))
if header and r_idx == start_row:
for c_idx in range(1, len(row) + 1):
ws.cell(row=r_idx, column=c_idx).font = Font(bold=True)
return start_row + len(rows)
def _text_rows(page) -> list[list[str]]:
lines = page.get_text().splitlines()
rows = []
for line in lines:
line = line.strip()
if not line:
continue
parts = re.split(r"\s{2,}|\t+", line)
rows.append(parts if parts else [line])
return rows
def _find_tables_robust(page) -> list:
"""Detect tables on a page according to the user's chosen strategy.
PyMuPDF's default `find_tables()` only catches ruled (visible-border)
tables. Many real-world PDFs use borderless tables where columns are
aligned by whitespace — those need `strategy="text"`. The "auto" mode
tries lines first and only falls back to text-based when nothing is
found, which avoids the false-positive risk of text-detection picking
up multi-column body text as a "table".
"""
try:
if strategy == "lines":
return list(page.find_tables(strategy="lines"))
if strategy == "text":
return list(page.find_tables(
strategy="text",
vertical_strategy="text",
horizontal_strategy="text",
))
# auto: lines, then text fallback
tables = list(page.find_tables(strategy="lines"))
if tables:
return tables
return list(page.find_tables(
strategy="text",
vertical_strategy="text",
horizontal_strategy="text",
))
except Exception as e:
log_error(e, f"find_tables strategy={strategy}")
return []
def _clean_table_rows(rows) -> list[list[str]]:
cleaned = []
for row in rows or []:
normalized = ["" if cell is None else str(cell) for cell in row]
if any(cell.strip() for cell in normalized):
cleaned.append(normalized)
return cleaned
def _pymupdf_table_rows(page) -> list[list[list[str]]]:
rows_list = []
for table in _find_tables_robust(page):
try:
rows = _clean_table_rows(table.extract())
except Exception as e:
log_error(e, "pdf-to-excel table extract")
continue
if rows:
rows_list.append(rows)
if rows_list:
table_engines_used.add("pymupdf")
return rows_list
def _table_rows_for_page(page, pno: int) -> list[list[list[str]]]:
if plumber_doc is not None:
try:
rows_list = [
rows for rows in
(_clean_table_rows(rows) for rows in (plumber_doc.pages[pno].extract_tables() or []))
if rows
]
except Exception as e:
log_error(e, f"pdfplumber extract page {pno + 1}")
rows_list = []
if rows_list:
table_engines_used.add("pdfplumber")
return rows_list
if table_engine == "pdfplumber":
return []
rows_list = _pymupdf_table_rows(page)
if rows_list and plumber_doc is not None and table_engine == "auto":
warnings.append("pdfplumber found no table on at least one page; PyMuPDF fallback was used.")
return rows_list
# ── "combined" — stream everything into a single sheet ────────────
if organize == "combined":
ws = wb.create_sheet(_safe_name("Extracted"))
next_row = 1
for pno in target_pages:
page = doc[pno]
page_had_content = False
if mode in ("tables", "tables_text"):
tables = _table_rows_for_page(page, pno)
for rows in tables:
if not rows:
continue
ws.cell(row=next_row, column=1, value=f"Page {pno + 1} table").font = Font(bold=True, italic=True)
next_row += 1
next_row = _write_rows(ws, rows, start_row=next_row)
next_row += 1
total_tables += 1
page_had_content = True
if mode == "text" or (mode == "tables_text" and not page_had_content):
text_rows = _text_rows(page)
if text_rows:
table_engines_used.add("pymupdf")
ws.cell(row=next_row, column=1, value=f"Page {pno + 1} text").font = Font(bold=True, italic=True)
next_row += 1
next_row = _write_rows(ws, text_rows, start_row=next_row, header=False)
next_row += 1
total_text_pages += 1
# ── "per_page" and "per_table" ────────────────────────────────────
else:
for pno in target_pages:
page = doc[pno]
tables_rows = [] # list of (label, rows)
if mode in ("tables", "tables_text"):
for tidx, rows in enumerate(_table_rows_for_page(page, pno), start=1):
if rows:
tables_rows.append((f"Table {tidx}", rows))
total_tables += 1
if mode == "text" or (mode == "tables_text" and not tables_rows):
text_rows = _text_rows(page)
if text_rows:
table_engines_used.add("pymupdf")
tables_rows.append(("Text", text_rows))
total_text_pages += 1
if not tables_rows:
continue
if organize == "per_table":
for label, rows in tables_rows:
is_text = label == "Text"
sheet = wb.create_sheet(_safe_name(f"Page{pno + 1}_{label.replace(' ', '')}"))
_write_rows(sheet, rows, header=not is_text)
else: # per_page
sheet = wb.create_sheet(_safe_name(f"Page_{pno + 1}"))
next_row = 1
for label, rows in tables_rows:
is_text = label == "Text"
sheet.cell(row=next_row, column=1, value=label).font = Font(bold=True, italic=True)
next_row += 1
next_row = _write_rows(sheet, rows, start_row=next_row, header=not is_text)
next_row += 1
doc.close()
if plumber_doc:
plumber_doc.close()
if not wb.sheetnames:
msg = "No tables found on the selected pages."
if strategy == "lines":
msg += " Try the 'Text alignment' or 'Auto' strategy — your PDF may use borderless tables."
elif mode == "tables":
msg += " Try the 'Tables, fall back to text rows' mode, or use PDF to Word in Layout mode."
else:
msg += " If this is a scanned PDF, run it through OCR PDF first; otherwise try PDF to Word in Layout mode."
return jsonify(error=msg), 400
# Auto-size columns on every sheet (cap at 60 chars to avoid absurd widths)
for ws in wb.worksheets:
for col_idx in range(1, ws.max_column + 1):
max_len = 0
for row_idx in range(1, ws.max_row + 1):
v = ws.cell(row=row_idx, column=col_idx).value
if v is not None:
max_len = max(max_len, len(str(v)))
ws.column_dimensions[get_column_letter(col_idx)].width = min(max_len + 2, 60)
output = io.BytesIO()
wb.save(output)
output.seek(0)
name = files[0].filename.rsplit(".", 1)[0] + ".xlsx"
engine = "+".join(sorted(table_engines_used)) or "pymupdf"
quality = "medium" if total_tables else QUALITY_BASIC
resp = send_file(
output,
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
as_attachment=True,
download_name=name,
)
return set_conversion_metadata(resp, engine, quality, warnings)
@bp.route("/md-to-pdf", methods=["POST"])
def md_to_pdf():
import markdown as md_lib
md_text = request.form.get("markdown", "").strip()
if not md_text:
return jsonify(error="Please enter some Markdown."), 400
page_size = request.form.get("page_size", "A4").lower()
font_size = safe_int(request.form.get("font_size"), 11, min_val=8, max_val=18)
sizes_map = {
"a4": (595.28, 841.89),
"letter": (612, 792),
"legal": (612, 1008),
"a5": (419.53, 595.28),
}
page_w, page_h = sizes_map.get(page_size, sizes_map["a4"])
html = md_lib.markdown(
md_text,
extensions=["extra", "sane_lists", "nl2br", "fenced_code", "tables"],
)
margin = 54 # 0.75 inch
content_rect = fitz.Rect(margin, margin, page_w - margin, page_h - margin)
css = (
f"* {{ font-family: sans-serif; font-size: {font_size}pt; line-height: 1.45; }}"
"h1 { font-size: 1.8em; margin: 0.4em 0 0.3em; }"
"h2 { font-size: 1.5em; margin: 0.4em 0 0.3em; }"
"h3 { font-size: 1.2em; margin: 0.4em 0 0.3em; }"
"p { margin: 0.35em 0; }"
"ul, ol { margin: 0.3em 0 0.3em 1.2em; }"
"li { margin: 0.15em 0; }"
"code { font-family: monospace; background: #f2f2f2; padding: 1px 3px; }"
"pre { font-family: monospace; background: #f5f5f5; padding: 0.5em; white-space: pre-wrap; }"
"blockquote { margin: 0.5em 0; padding-left: 0.8em; border-left: 3px solid #bbb; color: #555; }"
"table { border-collapse: collapse; margin: 0.4em 0; }"
"th, td { border: 1px solid #999; padding: 0.2em 0.5em; }"
"hr { border: none; border-top: 1px solid #ccc; margin: 0.6em 0; }"
)
# Use PyMuPDF's Story + DocumentWriter for reliable multi-page HTML rendering
output = io.BytesIO()
mediabox = fitz.Rect(0, 0, page_w, page_h)
writer = fitz.DocumentWriter(output)
story = fitz.Story(html=html, user_css=css)
more = 1
safety = 0
while more and safety < 500:
dev = writer.begin_page(mediabox)
more, _ = story.place(content_rect)
story.draw(dev)
writer.end_page()
safety += 1
writer.close()
output.seek(0)
name = (request.form.get("file_name") or "document").strip() + ".pdf"
return send_file(output, mimetype="application/pdf",
as_attachment=True, download_name=name)
@bp.route("/md-to-docx", methods=["POST"])
def md_to_docx():
"""Markdown → .docx by walking an HTML tree built from Markdown."""
import markdown as md_lib
import re
from html.parser import HTMLParser
from docx.shared import Pt, RGBColor
md_text = request.form.get("markdown", "").strip()
if not md_text:
return jsonify(error="Please enter some Markdown."), 400
html = md_lib.markdown(
md_text,
extensions=["extra", "sane_lists", "fenced_code", "tables"],
)
docx = DocxDocument()
class MdHTMLParser(HTMLParser):
def __init__(self):
super().__init__()
self.stack: list[str] = []
self.current_para = None
self.list_stack: list[str] = [] # "ul" or "ol"
self.in_pre = False
self.pending_href: str | None = None
self._run_formats: list[dict] = []
def _new_paragraph(self, style=None):
self.current_para = docx.add_paragraph(style=style) if style else docx.add_paragraph()
return self.current_para
def _add_run(self, text):
if not text:
return
p = self.current_para or self._new_paragraph()
run = p.add_run(text)
fmt = {}
for f in self._run_formats:
fmt.update(f)
if fmt.get("bold"): run.bold = True
if fmt.get("italic"): run.italic = True
if fmt.get("code") or self.in_pre:
run.font.name = "Consolas"
run.font.size = Pt(10)
if self.pending_href:
run.font.color.rgb = RGBColor(0x1A, 0x0D, 0xAB)
run.underline = True
def handle_starttag(self, tag, attrs):
self.stack.append(tag)
if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
level = min(int(tag[1]), 4)
self._new_paragraph(style=f"Heading {level}")
elif tag == "p":
if not (self.list_stack or self.in_pre):
self._new_paragraph()
elif tag in ("ul", "ol"):
self.list_stack.append(tag)
elif tag == "li":
style = "List Number" if self.list_stack and self.list_stack[-1] == "ol" else "List Bullet"
try:
self._new_paragraph(style=style)
except KeyError:
self._new_paragraph()
elif tag in ("strong", "b"):
self._run_formats.append({"bold": True})
elif tag in ("em", "i"):
self._run_formats.append({"italic": True})
elif tag == "code":
self._run_formats.append({"code": True})
elif tag == "pre":
self.in_pre = True
self._new_paragraph()
elif tag == "blockquote":
try:
self._new_paragraph(style="Intense Quote")
except KeyError:
self._new_paragraph()
elif tag == "a":
href = dict(attrs).get("href", "")
self.pending_href = href
elif tag == "hr":
docx.add_paragraph("" * 40)
elif tag == "br":
if self.current_para is not None:
self.current_para.add_run().add_break()
def handle_endtag(self, tag):
if self.stack and self.stack[-1] == tag:
self.stack.pop()
if tag in ("ul", "ol") and self.list_stack:
self.list_stack.pop()
elif tag in ("strong", "b", "em", "i", "code"):
if self._run_formats:
self._run_formats.pop()
elif tag == "pre":
self.in_pre = False
elif tag == "a":
if self.pending_href:
self._add_run(f" ({self.pending_href})")
self.pending_href = None
def handle_data(self, data):
if not data:
return
if self.in_pre:
for line in data.splitlines():
if self.current_para is None:
self._new_paragraph()
r = self.current_para.add_run(line)
r.font.name = "Consolas"
r.font.size = Pt(10)
self.current_para.add_run().add_break()
else:
# Collapse whitespace like HTML does
text = re.sub(r"\s+", " ", data)
self._add_run(text)
parser = MdHTMLParser()
parser.feed(html)
output = io.BytesIO()
docx.save(output)
output.seek(0)
name = (request.form.get("file_name") or "document").strip() + ".docx"
return send_file(
output,
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
as_attachment=True, download_name=name,
)
@bp.route("/html-to-pdf", methods=["POST"])
def html_to_pdf():
html = request.form.get("text", "").strip()
if not html:
return jsonify(error="Please enter some HTML content."), 400
allow_basic_fallback = request.form.get("use_basic_fallback") == "on"
# Wrap in basic structure if no <html> tag present
if "<html" not in html.lower():
html = f"<!DOCTYPE html><html><body>{html}</body></html>"
# Prefer LibreOffice for proper CSS rendering
pdf_bytes = _soffice_convert(html.encode("utf-8"), "html", "pdf")
engine = "libreoffice"
quality = QUALITY_HIGH
warnings: list[str] = []
if pdf_bytes is None:
if not allow_basic_fallback:
return jsonify(error=(
"High-fidelity HTML to PDF requires LibreOffice. Tick "
"'Allow basic PyMuPDF fallback' to continue with limited CSS/layout support."
)), 400
engine = "pymupdf"
quality = QUALITY_BASIC
warnings.append("Basic HTML fallback supports only simple markup and may not preserve CSS/layout.")
# Fallback: PyMuPDF's minimal HTML rendering
doc = fitz.open()
try:
page = doc.new_page(width=595, height=842) # A4
try:
page.insert_htmlbox(fitz.Rect(50, 50, 545, 792), html)
except Exception as e:
log_error(e, "html-to-pdf insert_htmlbox")
return jsonify(error="HTML rendering failed (the markup may be invalid or use unsupported features)."), 400
output = io.BytesIO()
doc.save(output)
output.seek(0)
pdf_bytes = output.getvalue()
finally:
doc.close()
output = io.BytesIO(pdf_bytes)
output.seek(0)
resp = send_file(output, mimetype="application/pdf",
as_attachment=True, download_name="converted.pdf")
return set_conversion_metadata(resp, engine, quality, warnings)
@bp.route("/ocr-pdf", methods=["POST"])
def ocr_pdf():
if not HAS_TESSERACT:
return jsonify(error="OCR requires 'pytesseract' and the Tesseract binary. Install: pip install pytesseract, plus Tesseract from https://github.com/tesseract-ocr/tesseract"), 400
files = request.files.getlist("files")
if not files or not files[0].filename:
return jsonify(error=NO_FILE_SINGLE), 400
mode = request.form.get("mode", "searchable")
lang = request.form.get("lang", "eng")
dpi = safe_int(request.form.get("dpi"), 200, min_val=72, max_val=400)
pdf_data = files[0].read()
try:
src = fitz.open(stream=pdf_data, filetype="pdf")
except Exception as e:
log_error(e, "ocr-pdf open")
return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400
zoom = dpi / 72
try:
if mode == "text":
text_parts = []
for i, page in enumerate(src):
pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
text = pytesseract.image_to_string(img, lang=lang)
text_parts.append(f"--- Page {i + 1} ---\n{text.strip()}")
combined = "\n\n".join(text_parts).strip()
return jsonify(text=combined or "(No text detected)")
output = fitz.open()
try:
for page in src:
pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
page_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
img, extension="pdf", lang=lang)
with fitz.open(stream=page_pdf_bytes, filetype="pdf") as sub:
output.insert_pdf(sub)
buf = io.BytesIO()
output.save(buf)
buf.seek(0)
finally:
output.close()
name = files[0].filename.rsplit(".", 1)[0] + "_ocr.pdf"
return send_file(buf, mimetype="application/pdf",
as_attachment=True, download_name=name)
except pytesseract.TesseractNotFoundError:
return jsonify(error="Tesseract binary not found. Install from https://github.com/tesseract-ocr/tesseract and ensure it is on PATH."), 400
except Exception as e:
msg = str(e)
log_error(e, f"ocr-pdf lang={lang}")
if "language" in msg.lower() or "traineddata" in msg.lower():
return jsonify(error=f"Language pack '{lang}' not installed. Download its .traineddata file into your Tesseract tessdata directory."), 400
return jsonify(error="OCR failed (the PDF may be image-only or unreadable)."), 400
finally:
src.close()
@bp.route("/cad-to-pdf", methods=["POST"])
def cad_to_pdf():
if not HAS_EZDXF:
return jsonify(error="CAD conversion requires 'ezdxf' and 'matplotlib'. Install: pip install ezdxf matplotlib"), 400
files = request.files.getlist("files")
if not files or not files[0].filename:
return jsonify(error=NO_FILE_SINGLE), 400
target = request.form.get("format", "pdf")
dpi = safe_int(request.form.get("dpi"), 150, min_val=72, max_val=600)
ezdxf, RenderContext, Frontend, MatplotlibBackend, plt = _load_cad_modules()
filename = files[0].filename
ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
file_data = files[0].read()
engine_used = "ezdxf/matplotlib"
import tempfile, os, subprocess
with tempfile.TemporaryDirectory() as tmpdir:
if ext == "dwg":
if not ODA_CONVERTER:
return jsonify(error="DWG support requires ODA File Converter. Download it free from https://www.opendesign.com/guestfiles/oda_file_converter and ensure it is on your PATH. Or convert your DWG to DXF first."), 400
engine_used = "oda/ezdxf/matplotlib"
in_dir = os.path.join(tmpdir, "in")
out_dir = os.path.join(tmpdir, "out")
os.makedirs(in_dir)
os.makedirs(out_dir)
dwg_path = os.path.join(in_dir, "input.dwg")
with open(dwg_path, "wb") as f:
f.write(file_data)
try:
subprocess.run(
[ODA_CONVERTER, in_dir, out_dir, "ACAD2018", "DXF", "0", "1", "*.DWG"],
check=True, capture_output=True, timeout=60,
)
except subprocess.CalledProcessError as e:
log_error(e, "cad-to-pdf ODA")
return jsonify(error="DWG to DXF conversion failed (file may be corrupted or use an unsupported version)."), 400
except subprocess.TimeoutExpired:
return jsonify(error="DWG conversion timed out."), 400
dxf_path = os.path.join(out_dir, "input.dxf")
if not os.path.exists(dxf_path):
return jsonify(error="DWG to DXF conversion produced no output."), 400
doc = ezdxf.readfile(dxf_path)
elif ext == "dxf":
dxf_path = os.path.join(tmpdir, "input.dxf")
with open(dxf_path, "wb") as f:
f.write(file_data)
try:
doc = ezdxf.readfile(dxf_path)
except Exception as e:
log_error(e, "cad-to-pdf dxf parse")
return jsonify(error="Invalid DXF file (the file may be corrupted or use an unsupported feature)."), 400
else:
return jsonify(error="Upload a .dxf or .dwg file."), 400
msp = doc.modelspace()
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.set_aspect("equal")
ax.set_axis_off()
try:
ctx = RenderContext(doc)
backend = MatplotlibBackend(ax)
Frontend(ctx, backend).draw_layout(msp, finalize=True)
except Exception as e:
plt.close(fig)
log_error(e, "cad-to-pdf render")
return jsonify(error="CAD rendering failed (the drawing may use unsupported entities)."), 400
buf = io.BytesIO()
base_name = filename.rsplit(".", 1)[0]
if target == "pdf":
fig.savefig(buf, format="pdf", bbox_inches="tight", pad_inches=0.2)
plt.close(fig)
buf.seek(0)
resp = send_file(buf, mimetype="application/pdf",
as_attachment=True, download_name=base_name + ".pdf")
return set_conversion_metadata(
resp,
engine_used,
"medium",
"CAD rendering may omit or simplify unsupported entities, fonts, and line styles.",
)
else:
fig.savefig(buf, format="png", dpi=dpi, bbox_inches="tight", pad_inches=0.2)
plt.close(fig)
buf.seek(0)
resp = send_file(buf, mimetype="image/png",
as_attachment=True, download_name=base_name + ".png")
return set_conversion_metadata(
resp,
engine_used,
"medium",
"CAD rendering may omit or simplify unsupported entities, fonts, and line styles.",
)
# ── PDF → PowerPoint ─────────────────────────────────────
PPTX_MIME = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
SLIDE_SIZES_EMU = {
# python-pptx uses English Metric Units (914400 EMU per inch).
"16:9": (12192000, 6858000), # 13.333 × 7.5 in (default widescreen)
"4:3": (9144000, 6858000), # 10 × 7.5 in
"a4": (10692000, 7560000), # ~11.69 × 8.27 in (landscape A4)
}
@bp.route("/pdf-to-pptx")
def pdf_to_pptx_page():
# Default to Editable when LibreOffice is on PATH; otherwise Image,
# since editable mode would just error otherwise.
default_mode = "editable" if SOFFICE else "image"
if SOFFICE:
editable_status = (
'<i class="bi bi-check-circle-fill" style="color:#2ec4b6"></i> '
'<strong>LibreOffice detected</strong> — Editable mode will produce a real .pptx '
'with text and shapes you can click and edit in PowerPoint.'
)
else:
editable_status = (
'<i class="bi bi-exclamation-triangle-fill" style="color:#ffb703"></i> '
'<strong>LibreOffice not found</strong> — Editable mode is unavailable. '
'Install LibreOffice (see <a href="/convert/pptx-to-pdf">PowerPoint to PDF</a>) '
'and restart the server, or use Image mode below.'
)
return render_template("upload_tool.html",
title="PDF to PowerPoint",
description="Convert a PDF into a .pptx — either as editable text/shapes, or as page images",
notes=(
f"<p>{editable_status}</p>"
"<p><strong>Two conversion modes:</strong></p>"
"<ul style='margin:.4rem 0 .6rem 1.2rem'>"
"<li><strong>Editable</strong> — uses LibreOffice to convert each PDF page into native PowerPoint "
"elements (text frames, lines, shapes, images). You can click on text to edit it, change fonts, "
"rearrange shapes. Layout fidelity is good but not pixel-perfect — complex PDFs may show small "
"shifts. Slide size matches the PDF's page dimensions.</li>"
"<li><strong>Image</strong> — renders each PDF page as a single picture and centers it on a slide. "
"Visually identical to the PDF, but nothing is editable. Best for archival or when you want to "
"guarantee the slides look exactly like the source.</li>"
"</ul>"
),
endpoint="/convert/pdf-to-pptx",
accept=".pdf",
multiple=False,
options=[
{"type": "select", "name": "mode", "label": "Conversion mode", "default": default_mode,
"choices": [
{"value": "editable", "label": "Editable — text and shapes can be edited (LibreOffice)"},
{"value": "image", "label": "Image — slides look identical to PDF, nothing editable"},
]},
{"type": "select", "name": "slide_size", "label": "Slide size (Image mode only)", "default": "16:9",
"choices": [
{"value": "16:9", "label": "Widescreen 16:9"},
{"value": "4:3", "label": "Standard 4:3"},
{"value": "a4", "label": "A4 landscape"},
]},
{"type": "number", "name": "dpi", "label": "Render DPI (Image mode only)",
"default": 150, "min": 72, "max": 300},
{"type": "text", "name": "pages", "label": "Pages (blank = all)",
"placeholder": "e.g. 1-3, 5, 8-10"},
],
button_text="Convert to PPTX")
@bp.route("/pdf-to-pptx", methods=["POST"])
def pdf_to_pptx():
from routes._helpers import safe_int, log_error, NO_FILE_SINGLE
from routes.pdf_tools import parse_page_ranges
files = request.files.getlist("files")
if not files or not files[0].filename:
return jsonify(error=NO_FILE_SINGLE), 400
mode = request.form.get("mode", "editable" if SOFFICE else "image")
if mode not in ("editable", "image"):
mode = "image"
pages_spec = (request.form.get("pages") or "").strip()
pdf_data = files[0].read()
# Pre-resolve page range against the PDF (used by both modes)
try:
with fitz.open(stream=pdf_data, filetype="pdf") as probe:
total_pages = len(probe)
try:
target_pages = parse_page_ranges(pages_spec, total_pages)
except (ValueError, IndexError):
return jsonify(error="Invalid page range. Use e.g. '1-3, 5, 8-10'."), 400
if not target_pages:
return jsonify(error="No pages selected."), 400
except Exception as e:
log_error(e, "pdf-to-pptx probe")
return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400
base = files[0].filename.rsplit(".", 1)[0]
# ── Editable mode (LibreOffice) ───────────────────────
if mode == "editable":
if not SOFFICE:
return jsonify(error=(
"Editable mode requires LibreOffice (soffice) on PATH. "
"Install LibreOffice and restart the server, or switch to Image mode."
)), 400
# If a page range was specified, build a sub-PDF first so LibreOffice
# only converts the requested pages.
source_pdf = pdf_data
if pages_spec and len(target_pages) != total_pages:
source_pdf = _extract_pages(pdf_data, target_pages)
pptx_bytes = _soffice_convert(source_pdf, "pdf", "pptx", timeout=300)
if pptx_bytes is None:
return jsonify(error=(
"LibreOffice could not convert this PDF. The file may be password-protected or "
"use features LibreOffice's PDF importer can't handle. Try Image mode instead."
)), 400
resp = send_file(io.BytesIO(pptx_bytes), mimetype=PPTX_MIME,
as_attachment=True, download_name=f"{base}.pptx")
return set_conversion_metadata(resp, "libreoffice", QUALITY_HIGH)
# ── Image mode (page-image-per-slide) ─────────────────
if not HAS_PPTX:
return jsonify(error="Image mode requires python-pptx. Run: pip install python-pptx"), 400
dpi = safe_int(request.form.get("dpi"), 150, min_val=72, max_val=300)
slide_size = request.form.get("slide_size", "16:9")
if slide_size not in SLIDE_SIZES_EMU:
slide_size = "16:9"
try:
doc = fitz.open(stream=pdf_data, filetype="pdf")
except Exception as e:
log_error(e, "pdf-to-pptx open")
return jsonify(error="Could not open PDF (the file may be corrupted or password-protected)."), 400
try:
prs = Presentation()
slide_w, slide_h = SLIDE_SIZES_EMU[slide_size]
prs.slide_width = slide_w
prs.slide_height = slide_h
blank_layout = prs.slide_layouts[6] # 'Blank'
mat = fitz.Matrix(dpi / 72, dpi / 72)
for idx in target_pages:
page = doc[idx]
pix = page.get_pixmap(matrix=mat, alpha=False)
png_bytes = pix.tobytes("png")
img_w, img_h = pix.width, pix.height
# Aspect-fit: scale to slide while preserving aspect ratio, then center.
slide_ratio = slide_w / slide_h
img_ratio = img_w / img_h
if img_ratio > slide_ratio:
draw_w = slide_w
draw_h = int(slide_w / img_ratio)
else:
draw_h = slide_h
draw_w = int(slide_h * img_ratio)
left = (slide_w - draw_w) // 2
top = (slide_h - draw_h) // 2
slide = prs.slides.add_slide(blank_layout)
slide.shapes.add_picture(io.BytesIO(png_bytes),
Emu(left), Emu(top),
width=Emu(draw_w), height=Emu(draw_h))
output = io.BytesIO()
prs.save(output)
output.seek(0)
finally:
doc.close()
resp = send_file(output, mimetype=PPTX_MIME,
as_attachment=True, download_name=f"{base}.pptx")
return set_conversion_metadata(
resp,
"pymupdf/python-pptx",
QUALITY_HIGH,
"Image mode preserves appearance but slides are not editable.",
)
# ── PowerPoint → PDF ─────────────────────────────────────
@bp.route("/pptx-to-pdf")
def pptx_to_pdf_page():
return render_template("upload_tool.html",
title="PowerPoint to PDF",
description="Convert PowerPoint or OpenDocument presentations to PDF",
notes=_soffice_available_notes(),
endpoint="/convert/pptx-to-pdf",
accept=".pptx,.ppt,.odp",
multiple=False,
options=[],
button_text="Convert to PDF")
@bp.route("/pptx-to-pdf", methods=["POST"])
def pptx_to_pdf():
from routes._helpers import NO_FILE_SINGLE
if not SOFFICE:
return jsonify(error="LibreOffice (soffice) is not installed or not on PATH. Install LibreOffice and restart the server."), 400
files = request.files.getlist("files")
if not files or not files[0].filename:
return jsonify(error=NO_FILE_SINGLE), 400
f = files[0]
ext = f.filename.rsplit(".", 1)[-1].lower() if "." in f.filename else ""
if ext not in ("pptx", "ppt", "odp"):
return jsonify(error="Unsupported format. Upload .pptx, .ppt, or .odp."), 400
data = soffice_convert(f.read(), ext, "pdf", timeout=300)
if data is None:
return jsonify(error="LibreOffice could not convert this file."), 400
base = f.filename.rsplit(".", 1)[0]
resp = send_file(io.BytesIO(data), mimetype="application/pdf",
as_attachment=True, download_name=f"{base}.pdf")
return set_conversion_metadata(resp, "libreoffice", QUALITY_HIGH)