Files
your-everyday-tools/routes/convert_tools.py
T
2026-04-19 10:53:59 +07:00

583 lines
20 KiB
Python

import io
import fitz # PyMuPDF
from flask import Blueprint, render_template, request, send_file, jsonify
from PIL import Image
import img2pdf
from docx import Document as DocxDocument
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib import colors
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
try:
from pdf2docx import Converter as Pdf2DocxConverter
HAS_PDF2DOCX = True
except ImportError:
HAS_PDF2DOCX = False
try:
import pytesseract
HAS_TESSERACT = True
except ImportError:
HAS_TESSERACT = False
try:
import ezdxf
from ezdxf.addons.drawing import RenderContext, Frontend
from ezdxf.addons.drawing.matplotlib import MatplotlibBackend
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
HAS_EZDXF = True
except ImportError:
HAS_EZDXF = False
import shutil
# Path of the ODA File Converter executable when one is found on PATH (both
# spellings of the binary name are tried); None when it is not installed, in
# which case the CAD route rejects .dwg uploads.
ODA_CONVERTER = shutil.which("ODAFileConverter") or shutil.which("oda_file_converter")
# Blueprint that every page route and processing route in this module attaches to.
bp = Blueprint("convert", __name__)
# ── Page Routes ──────────────────────────────────
@bp.route("/to-pdf")
def to_pdf_page():
    """Render the shared upload form configured for the "Files to PDF" tool."""
    page = {
        "title": "Files to PDF",
        "description": "Convert images, Word documents, and text files to PDF",
        "endpoint": "/convert/to-pdf",
        "accept": ".jpg,.jpeg,.png,.bmp,.tiff,.webp,.txt,.docx",
        "multiple": True,
        "options": [],
    }
    return render_template("upload_tool.html", **page)
@bp.route("/pdf-to-word")
def pdf_to_word_page():
    """Render the shared upload form configured for the "PDF to Word" tool."""
    page = {
        "title": "PDF to Word",
        "description": "Convert PDF documents to Word (.docx) format",
        "endpoint": "/convert/pdf-to-word",
        "accept": ".pdf",
        "multiple": False,
        "options": [],
    }
    return render_template("upload_tool.html", **page)
@bp.route("/pdf-to-images")
def pdf_to_images_page():
    """Render the upload form for "PDF to Images" with format and DPI options."""
    format_option = {
        "type": "select",
        "name": "format",
        "label": "Image Format",
        "choices": [
            {"value": "png", "label": "PNG"},
            {"value": "jpg", "label": "JPG"},
        ],
    }
    dpi_option = {
        "type": "number",
        "name": "dpi",
        "label": "Resolution (DPI)",
        "default": 200,
        "min": 72,
        "max": 600,
    }
    page = {
        "title": "PDF to Images",
        "description": "Convert each PDF page to an image",
        "endpoint": "/convert/pdf-to-images",
        "accept": ".pdf",
        "multiple": False,
        "options": [format_option, dpi_option],
    }
    return render_template("upload_tool.html", **page)
@bp.route("/pdf-to-text")
def pdf_to_text_page():
    """Render the shared upload form configured for the "PDF to Text" tool."""
    page = {
        "title": "PDF to Text",
        "description": "Extract all text content from a PDF document",
        "endpoint": "/convert/pdf-to-text",
        "accept": ".pdf",
        "multiple": False,
        "options": [],
    }
    return render_template("upload_tool.html", **page)
# Language choices offered by the OCR form: each entry pairs the code sent as
# the "lang" form value with the label shown to the user.
OCR_LANGS = [
    {"value": code, "label": label}
    for code, label in (
        ("eng", "English"),
        ("ind", "Indonesian"),
        ("fra", "French"),
        ("deu", "German"),
        ("spa", "Spanish"),
        ("ita", "Italian"),
        ("por", "Portuguese"),
        ("rus", "Russian"),
        ("chi_sim", "Chinese (Simplified)"),
        ("chi_tra", "Chinese (Traditional)"),
        ("jpn", "Japanese"),
        ("kor", "Korean"),
        ("ara", "Arabic"),
        ("hin", "Hindi"),
    )
]
@bp.route("/ocr-pdf")
def ocr_pdf_page():
    """Render the upload form for "OCR PDF" with output, language and DPI options."""
    mode_option = {
        "type": "select",
        "name": "mode",
        "label": "Output",
        "choices": [
            {"value": "searchable", "label": "Searchable PDF (image + text layer)"},
            {"value": "text", "label": "Extracted text only"},
        ],
    }
    lang_option = {
        "type": "select",
        "name": "lang",
        "label": "Language",
        "choices": OCR_LANGS,
    }
    dpi_option = {
        "type": "number",
        "name": "dpi",
        "label": "OCR Resolution (DPI)",
        "default": 200,
        "min": 100,
        "max": 400,
    }
    page = {
        "title": "OCR PDF",
        "description": "Extract text from scanned PDFs or create a searchable PDF with a hidden text layer",
        "endpoint": "/convert/ocr-pdf",
        "accept": ".pdf",
        "multiple": False,
        "options": [mode_option, lang_option, dpi_option],
    }
    return render_template("upload_tool.html", **page)
@bp.route("/cad-to-pdf")
def cad_to_pdf_page():
    """Render the upload form for the CAD converter.

    The description gains a hint about the missing DWG prerequisite when the
    ODA File Converter executable was not located at import time.
    """
    description_parts = ["Convert DXF (or DWG with ODA File Converter) drawings to PDF or PNG"]
    if not ODA_CONVERTER:
        description_parts.append(" — DWG requires ODA File Converter on PATH")

    format_option = {
        "type": "select",
        "name": "format",
        "label": "Output Format",
        "choices": [
            {"value": "pdf", "label": "PDF"},
            {"value": "png", "label": "PNG"},
        ],
    }
    dpi_option = {
        "type": "number",
        "name": "dpi",
        "label": "PNG Resolution (DPI)",
        "default": 150,
        "min": 72,
        "max": 600,
        "depends_on": {"format": "png"},
    }
    page = {
        "title": "CAD to PDF/Image",
        "description": "".join(description_parts),
        "endpoint": "/convert/cad-to-pdf",
        "accept": ".dxf,.dwg",
        "multiple": False,
        "options": [format_option, dpi_option],
    }
    return render_template("upload_tool.html", **page)
@bp.route("/html-to-pdf")
def html_to_pdf_page():
    """Render the shared tool form in text-input mode for "HTML to PDF"."""
    page = {
        "title": "HTML to PDF",
        "description": "Convert HTML content to a PDF document",
        "endpoint": "/convert/html-to-pdf",
        "text_input": True,
        "text_label": "HTML Content",
        "text_placeholder": "<h1>Hello World</h1>\n<p>Paste your HTML here...</p>",
        "accept": "",
        "multiple": False,
        "options": [],
        "button_text": "Convert to PDF",
    }
    return render_template("upload_tool.html", **page)
# ── Helpers ──────────────────────────────────────
def _docx_to_pdf(data: bytes) -> bytes:
    """Convert a .docx file (as bytes) to PDF bytes using python-docx + reportlab.

    Paragraphs are emitted in document order: "Heading 1/2/3" styles map to
    bold heading styles, everything else goes through ``_build_rich_text``.
    Empty paragraphs become small vertical spacers.

    Limitation: tables are appended after all paragraphs rather than at their
    original position, because paragraphs and tables are read from two
    separate python-docx collections.

    Args:
        data: Raw contents of the .docx file.

    Returns:
        The rendered A4 PDF as bytes.
    """
    # Function-scope stdlib import so this helper does not depend on the
    # module's import block.
    from xml.sax.saxutils import escape

    doc = DocxDocument(io.BytesIO(data))
    buf = io.BytesIO()

    styles = getSampleStyleSheet()
    normal = styles["Normal"]
    normal.fontName = "Helvetica"
    normal.fontSize = 11
    normal.leading = 14

    # Insertion order (1, 2, 3) is also the order in which style names are matched.
    heading_sizes = {1: 18, 2: 15, 3: 13}
    heading_styles = {
        level: ParagraphStyle(
            f"Heading{level}", parent=normal,
            fontName="Helvetica-Bold", fontSize=size, leading=size + 4,
            spaceBefore=12, spaceAfter=6,
        )
        for level, size in heading_sizes.items()
    }

    pdf = SimpleDocTemplate(buf, pagesize=A4,
                            leftMargin=inch, rightMargin=inch,
                            topMargin=inch, bottomMargin=inch)
    story = []

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            story.append(Spacer(1, 6))
            continue

        style_name = (para.style.name or "").lower() if para.style else ""
        level = next(
            (lvl for lvl in heading_styles if f"heading {lvl}" in style_name),
            None,
        )
        if level is not None:
            # Paragraph() parses its text as XML-like markup, so literal
            # '&', '<' and '>' in a heading must be escaped first.
            story.append(Paragraph(escape(text), heading_styles[level]))
        else:
            # Preserve basic inline formatting
            story.append(Paragraph(_build_rich_text(para), normal))

    # Handle tables
    table_style = TableStyle([
        ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
        ("BACKGROUND", (0, 0), (-1, 0), colors.Color(0.9, 0.9, 0.95)),
        ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
        ("FONTSIZE", (0, 0), (-1, -1), 10),
        ("TOPPADDING", (0, 0), (-1, -1), 4),
        ("BOTTOMPADDING", (0, 0), (-1, -1), 4),
        ("LEFTPADDING", (0, 0), (-1, -1), 6),
        ("RIGHTPADDING", (0, 0), (-1, -1), 6),
    ])
    for table in doc.tables:
        tdata = [[cell.text for cell in row.cells] for row in table.rows]
        if not tdata:
            continue
        t = Table(tdata, repeatRows=1)
        t.setStyle(table_style)
        story.append(Spacer(1, 8))
        story.append(t)
        story.append(Spacer(1, 8))

    if not story:
        story.append(Paragraph("(empty document)", normal))

    pdf.build(story)
    return buf.getvalue()
def _build_rich_text(para) -> str:
    """Convert a python-docx paragraph's runs into reportlab-compatible rich text.

    Each run's text is XML-escaped and wrapped in ``<b>``, ``<i>`` and ``<u>``
    tags according to the run's flags. The flags combine, so a bold,
    underlined run yields ``<b><u>...</u></b>``.

    Args:
        para: Object exposing ``runs`` (items with ``text``, ``bold``,
            ``italic`` and ``underline``) and ``text``.

    Returns:
        Markup string safe to hand to reportlab's ``Paragraph``. When the
        runs contribute no text, the escaped ``para.text`` is returned.
    """

    def escape_markup(raw: str) -> str:
        # '&' must be replaced first so the entities added below survive.
        return raw.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    parts = []
    for run in para.runs:
        text = escape_markup(run.text)
        if not text:
            continue
        # Wrap innermost-first so nesting is always <b><i><u>…</u></i></b>.
        if run.underline:
            text = f"<u>{text}</u>"
        if run.italic:
            text = f"<i>{text}</i>"
        if run.bold:
            text = f"<b>{text}</b>"
        parts.append(text)

    # The fallback must be escaped too: it ends up in the same markup parser.
    return "".join(parts) or escape_markup(para.text)
# ── Processing Routes ────────────────────────────
def _wrap_text_line(line, max_width, fontname="helv", fontsize=11):
    """Greedily word-wrap one line so that no piece is wider than *max_width* points."""

    def width(s):
        return fitz.get_text_length(s, fontname=fontname, fontsize=fontsize)

    if width(line) <= max_width:
        return [line]

    wrapped = []
    current = ""
    for word in line.split(" "):
        candidate = f"{current} {word}" if current else word
        if width(candidate) <= max_width:
            current = candidate
            continue
        if current:
            wrapped.append(current)
        current = ""
        # The word starts a fresh line. If it is itself wider than the box it
        # is split character by character; widths are accumulated so the cost
        # stays linear in the length of the token.
        used = 0.0
        for ch in word:
            ch_width = width(ch)
            if current and used + ch_width > max_width:
                wrapped.append(current)
                current, used = "", 0.0
            current += ch
            used += ch_width
    wrapped.append(current)
    return wrapped


def _append_text_pages(pdf_doc, text, fontname="helv", fontsize=11, leading=14):
    """Append *text* to *pdf_doc*, spreading it over as many A4 pages as needed."""
    box = fitz.Rect(50, 50, 545, 792)
    lines = []
    for raw in text.expandtabs(4).splitlines() or [""]:
        lines.extend(_wrap_text_line(raw.rstrip(), box.width, fontname, fontsize))

    per_page = max(1, int(box.height // leading))
    for first in range(0, len(lines), per_page):
        page = pdf_doc.new_page(width=595, height=842)  # A4
        baseline = box.y0 + fontsize
        for line in lines[first:first + per_page]:
            if line.strip():
                page.insert_text(fitz.Point(box.x0, baseline), line,
                                 fontsize=fontsize, fontname=fontname)
            baseline += leading


def _append_image_page(pdf_doc, data):
    """Append one page sized to the image in *data*, showing it as a JPEG."""
    pil_img = Image.open(io.BytesIO(data))
    # The JPEG encoder cannot write alpha, palette or high-bit-depth modes
    # (RGBA, P, LA, I;16, ...), so anything it does not accept becomes RGB.
    if pil_img.mode not in ("1", "L", "RGB", "CMYK"):
        pil_img = pil_img.convert("RGB")
    buf = io.BytesIO()
    pil_img.save(buf, format="JPEG", quality=95)
    img_data = buf.getvalue()

    img_doc = fitz.open(stream=img_data, filetype="jpeg")
    try:
        rect = img_doc[0].rect
    finally:
        img_doc.close()
    pdf_page = pdf_doc.new_page(width=rect.width, height=rect.height)
    pdf_page.insert_image(rect, stream=img_data)


@bp.route("/to-pdf", methods=["POST"])
def to_pdf():
    """Merge the uploaded images, .txt and .docx files into a single PDF.

    Files are appended in upload order. ``.docx`` goes through
    ``_docx_to_pdf``, ``.txt`` is laid out on A4 pages, and every other
    upload is treated as an image.

    Returns:
        The PDF as an attachment named ``converted.pdf``, or a JSON error
        with status 400 when nothing was uploaded or a file cannot be
        converted.
    """
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error="No files uploaded."), 400

    pdf_doc = fitz.open()
    try:
        for f in files:
            name = f.filename.lower()
            data = f.read()

            if name.endswith(".docx"):
                # Word document → PDF pages
                try:
                    docx_pdf = fitz.open(stream=_docx_to_pdf(data), filetype="pdf")
                    try:
                        pdf_doc.insert_pdf(docx_pdf)
                    finally:
                        docx_pdf.close()
                except Exception as e:
                    return jsonify(error=f"Error converting {f.filename}: {str(e)}"), 400
            elif name.endswith(".txt"):
                # Text file → PDF pages. insert_textbox() writes nothing at
                # all when the text does not fit its rectangle, so the text
                # is paginated explicitly instead. "utf-8-sig" drops a
                # leading BOM that would otherwise be drawn as a character.
                try:
                    text = data.decode("utf-8-sig", errors="replace")
                    _append_text_pages(pdf_doc, text)
                except Exception as e:
                    return jsonify(error=f"Error processing {f.filename}: {str(e)}"), 400
            else:
                # Image → PDF page
                try:
                    _append_image_page(pdf_doc, data)
                except Exception as e:
                    return jsonify(error=f"Error processing {f.filename}: {str(e)}"), 400

        output = io.BytesIO()
        pdf_doc.save(output)
    finally:
        # Runs on the early error returns as well, so the document is never leaked.
        pdf_doc.close()

    output.seek(0)
    return send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name="converted.pdf")
@bp.route("/pdf-to-word", methods=["POST"])
def pdf_to_word():
    """Convert the uploaded PDF to a .docx file with pdf2docx.

    The upload is written to a temporary directory because the converter is
    driven with file paths; the result is read back into memory before the
    directory is removed.

    Returns:
        The .docx as an attachment named after the upload, or a JSON error
        with status 400 when pdf2docx is missing, nothing was uploaded, or
        the conversion fails.
    """
    if not HAS_PDF2DOCX:
        return jsonify(error="pdf2docx package not installed. Run: pip install pdf2docx"), 400
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error="No file uploaded."), 400

    import os
    import tempfile

    pdf_data = files[0].read()
    with tempfile.TemporaryDirectory() as tmpdir:
        pdf_path = os.path.join(tmpdir, "input.pdf")
        docx_path = os.path.join(tmpdir, "output.docx")
        with open(pdf_path, "wb") as f:
            f.write(pdf_data)
        try:
            cv = Pdf2DocxConverter(pdf_path)
            try:
                cv.convert(docx_path)
            finally:
                # Always release the converter's handle on input.pdf; an open
                # handle can stop the temporary directory from being deleted.
                cv.close()
        except Exception as e:
            return jsonify(error=f"Conversion failed: {str(e)}"), 400
        with open(docx_path, "rb") as f:
            result = io.BytesIO(f.read())

    result.seek(0)
    name = files[0].filename.rsplit(".", 1)[0] + ".docx"
    return send_file(result, mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                     as_attachment=True, download_name=name)
@bp.route("/pdf-to-images", methods=["POST"])
def pdf_to_images():
    """Render every page of the uploaded PDF to a PNG or JPG image.

    Form fields:
        format: "jpg" for JPEG output; any other value yields PNG.
        dpi: Render resolution, clamped to the 72-600 range the form offers.
            A blank value falls back to 200.

    Returns:
        The single image for a one-page PDF, otherwise a zip of all pages;
        a JSON error with status 400 for a missing upload, a non-integer
        DPI, or a PDF that cannot be read or rendered.
    """
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error="No file uploaded."), 400

    # Normalise once so the extension, encoder and MIME type always agree.
    fmt = "jpg" if request.form.get("format", "png") == "jpg" else "png"
    codec, mime = ("jpeg", "image/jpeg") if fmt == "jpg" else ("png", "image/png")

    try:
        dpi = int(request.form.get("dpi") or 200)
    except ValueError:
        return jsonify(error="DPI must be a whole number."), 400
    # Pixmap memory grows with the square of the DPI, so the form's limits
    # are enforced on the server as well.
    dpi = max(72, min(dpi, 600))

    try:
        doc = fitz.open(stream=files[0].read(), filetype="pdf")
    except Exception as e:
        return jsonify(error=f"Could not read PDF: {str(e)}"), 400

    mat = fitz.Matrix(dpi / 72, dpi / 72)
    try:
        images = [
            (f"page_{number}.{fmt}", page.get_pixmap(matrix=mat).tobytes(codec))
            for number, page in enumerate(doc, start=1)
        ]
    except Exception as e:
        return jsonify(error=f"Could not render PDF: {str(e)}"), 400
    finally:
        doc.close()

    if len(images) == 1:
        filename, payload = images[0]
        return send_file(io.BytesIO(payload), mimetype=mime,
                         as_attachment=True, download_name=filename)

    # Imported only when an archive is actually needed.
    from utils.file_utils import make_zip

    zip_buf = make_zip(images)
    name = files[0].filename.rsplit(".", 1)[0] + "_images.zip"
    return send_file(zip_buf, mimetype="application/zip",
                     as_attachment=True, download_name=name)
@bp.route("/pdf-to-text", methods=["POST"])
def pdf_to_text():
    """Extract the text of every page of the uploaded PDF.

    Returns:
        JSON ``{"text": ...}`` in which each page's text is preceded by a
        ``--- Page N ---`` marker line, or a JSON error with status 400 when
        nothing was uploaded or the PDF cannot be read.
    """
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error="No file uploaded."), 400

    try:
        doc = fitz.open(stream=files[0].read(), filetype="pdf")
    except Exception as e:
        # A malformed upload is a client error, reported like the other routes do.
        return jsonify(error=f"Could not read PDF: {str(e)}"), 400

    text_parts = []
    try:
        for number, page in enumerate(doc, start=1):
            text_parts.append(f"--- Page {number} ---")
            text_parts.append(page.get_text())
    except Exception as e:
        return jsonify(error=f"Text extraction failed: {str(e)}"), 400
    finally:
        doc.close()

    return jsonify(text="\n".join(text_parts))
@bp.route("/html-to-pdf", methods=["POST"])
def html_to_pdf():
    """Render the HTML posted in the "text" form field onto one A4 PDF page.

    Returns:
        The PDF as an attachment named ``converted.pdf``, or a JSON error
        with status 400 when the field is empty or rendering raises.
    """
    markup = request.form.get("text", "").strip()
    if not markup:
        return jsonify(error="Please enter some HTML content."), 400

    # Wrap in basic structure if no <html> tag present
    already_wrapped = "<html" in markup.lower()
    if not already_wrapped:
        markup = f"<html><body>{markup}</body></html>"

    pdf = fitz.open()
    a4_page = pdf.new_page(width=595, height=842)  # A4
    content_box = fitz.Rect(50, 50, 545, 792)
    try:
        a4_page.insert_htmlbox(content_box, markup)
    except Exception as exc:
        return jsonify(error=f"HTML rendering failed: {str(exc)}"), 400

    stream = io.BytesIO()
    pdf.save(stream)
    pdf.close()
    stream.seek(0)
    return send_file(stream, mimetype="application/pdf",
                     as_attachment=True, download_name="converted.pdf")
@bp.route("/ocr-pdf", methods=["POST"])
def ocr_pdf():
    """Run Tesseract OCR over every page of the uploaded PDF.

    Form fields:
        mode: "text" returns the recognised text as JSON; any other value
            builds a searchable PDF from Tesseract's per-page PDF output.
        lang: Tesseract language code (default "eng").
        dpi: Rasterisation resolution, clamped to the 100-400 range the form
            offers. A blank value falls back to 200.

    Returns:
        JSON text or a ``*_ocr.pdf`` attachment; a JSON error with status
        400 when pytesseract or the Tesseract binary is missing, nothing was
        uploaded, the DPI is not an integer, or OCR fails.
    """
    if not HAS_TESSERACT:
        return jsonify(error="OCR requires 'pytesseract' and the Tesseract binary. Install: pip install pytesseract, plus Tesseract from https://github.com/tesseract-ocr/tesseract"), 400
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error="No file uploaded."), 400

    mode = request.form.get("mode", "searchable")
    lang = request.form.get("lang", "eng")
    try:
        dpi = int(request.form.get("dpi") or 200)
    except ValueError:
        return jsonify(error="DPI must be a whole number."), 400
    # Page bitmaps grow with the square of the DPI; enforce the form's limits.
    dpi = max(100, min(dpi, 400))
    zoom = dpi / 72

    def page_image(page):
        """Rasterise *page* at the requested zoom and load it as a PIL image."""
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return Image.open(io.BytesIO(pix.tobytes("png")))

    pdf_data = files[0].read()
    src = None
    output = None
    try:
        # Opened inside the try so an unreadable upload is reported as a
        # JSON error like every other OCR failure.
        src = fitz.open(stream=pdf_data, filetype="pdf")

        if mode == "text":
            text_parts = []
            for number, page in enumerate(src, start=1):
                text = pytesseract.image_to_string(page_image(page), lang=lang)
                text_parts.append(f"--- Page {number} ---\n{text.strip()}")
            combined = "\n\n".join(text_parts).strip()
            return jsonify(text=combined or "(No text detected)")

        output = fitz.open()
        for page in src:
            page_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
                page_image(page), extension="pdf", lang=lang)
            sub = fitz.open(stream=page_pdf_bytes, filetype="pdf")
            try:
                output.insert_pdf(sub)
            finally:
                sub.close()

        buf = io.BytesIO()
        output.save(buf)
        buf.seek(0)
        name = files[0].filename.rsplit(".", 1)[0] + "_ocr.pdf"
        return send_file(buf, mimetype="application/pdf",
                         as_attachment=True, download_name=name)
    except pytesseract.TesseractNotFoundError:
        return jsonify(error="Tesseract binary not found. Install from https://github.com/tesseract-ocr/tesseract and ensure it is on PATH."), 400
    except Exception as e:
        msg = str(e)
        if "language" in msg.lower() or "traineddata" in msg.lower():
            return jsonify(error=f"Language pack '{lang}' not installed. Download its .traineddata file into your Tesseract tessdata directory."), 400
        return jsonify(error=f"OCR failed: {msg}"), 400
    finally:
        # Release both documents on success and on every error path.
        if output is not None:
            output.close()
        if src is not None:
            src.close()
@bp.route("/cad-to-pdf", methods=["POST"])
def cad_to_pdf():
    """Render the modelspace of an uploaded DXF/DWG drawing to PDF or PNG.

    DWG uploads are first converted to DXF by running the ODA File Converter
    in a temporary directory; the DXF is then loaded with ezdxf and drawn
    through its matplotlib backend.

    Form fields:
        format: "pdf" for PDF output; any other value yields PNG.
        dpi: PNG resolution, clamped to the 72-600 range the form offers.
            A blank value falls back to 150.

    Returns:
        The rendered file as an attachment named after the upload, or a JSON
        error with status 400 for missing dependencies, a missing or
        unsupported upload, a non-integer DPI, or a conversion/render failure.
    """
    if not HAS_EZDXF:
        return jsonify(error="CAD conversion requires 'ezdxf' and 'matplotlib'. Install: pip install ezdxf matplotlib"), 400
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error="No file uploaded."), 400

    target = request.form.get("format", "pdf")
    try:
        dpi = int(request.form.get("dpi") or 150)
    except ValueError:
        return jsonify(error="DPI must be a whole number."), 400
    # Raster size grows with the square of the DPI; enforce the form's limits.
    dpi = max(72, min(dpi, 600))

    filename = files[0].filename
    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
    if ext not in ("dxf", "dwg"):
        return jsonify(error="Upload a .dxf or .dwg file."), 400
    if ext == "dwg" and not ODA_CONVERTER:
        return jsonify(error="DWG support requires ODA File Converter. Download it free from https://www.opendesign.com/guestfiles/oda_file_converter and ensure it is on your PATH. Or convert your DWG to DXF first."), 400

    file_data = files[0].read()

    import os
    import subprocess
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        if ext == "dwg":
            in_dir = os.path.join(tmpdir, "in")
            out_dir = os.path.join(tmpdir, "out")
            os.makedirs(in_dir)
            os.makedirs(out_dir)
            with open(os.path.join(in_dir, "input.dwg"), "wb") as f:
                f.write(file_data)
            try:
                subprocess.run(
                    [ODA_CONVERTER, in_dir, out_dir, "ACAD2018", "DXF", "0", "1", "*.DWG"],
                    check=True, capture_output=True, timeout=60,
                )
            except subprocess.CalledProcessError as e:
                return jsonify(error=f"DWG to DXF conversion failed: {e.stderr.decode(errors='replace')[:200]}"), 400
            except subprocess.TimeoutExpired:
                return jsonify(error="DWG conversion timed out."), 400
            dxf_path = os.path.join(out_dir, "input.dxf")
            if not os.path.exists(dxf_path):
                return jsonify(error="DWG to DXF conversion produced no output."), 400
            read_error = "Converted DWG could not be read"
        else:
            dxf_path = os.path.join(tmpdir, "input.dxf")
            with open(dxf_path, "wb") as f:
                f.write(file_data)
            read_error = "Invalid DXF file"

        # Both branches load through the same guarded call, so a converter
        # output that ezdxf rejects is reported instead of raising.
        try:
            doc = ezdxf.readfile(dxf_path)
        except Exception as e:
            return jsonify(error=f"{read_error}: {str(e)[:200]}"), 400

    msp = doc.modelspace()
    fig = plt.figure()
    try:
        ax = fig.add_axes([0, 0, 1, 1])
        ax.set_aspect("equal")
        ax.set_axis_off()
        try:
            ctx = RenderContext(doc)
            backend = MatplotlibBackend(ax)
            Frontend(ctx, backend).draw_layout(msp, finalize=True)
        except Exception as e:
            return jsonify(error=f"Rendering failed: {str(e)[:200]}"), 400

        buf = io.BytesIO()
        if target == "pdf":
            fig.savefig(buf, format="pdf", bbox_inches="tight", pad_inches=0.2)
            mimetype, suffix = "application/pdf", ".pdf"
        else:
            fig.savefig(buf, format="png", dpi=dpi, bbox_inches="tight", pad_inches=0.2)
            mimetype, suffix = "image/png", ".png"
    finally:
        # pyplot keeps figures alive until closed; release it on every path.
        plt.close(fig)

    buf.seek(0)
    base_name = filename.rsplit(".", 1)[0]
    return send_file(buf, mimetype=mimetype,
                     as_attachment=True, download_name=base_name + suffix)