Files
2026-06-06 19:05:17 +07:00

1165 lines
44 KiB
Python

import io
from flask import Blueprint, render_template, request, send_file, jsonify
from utils.file_utils import make_zip
from utils.pymupdf import import_pymupdf
from routes._helpers import safe_int, safe_float, log_error, NO_FILE_SINGLE, NO_FILE_MULTIPLE
# PyMuPDF module handle, obtained through the project's import helper.
fitz = import_pymupdf()
# Flask blueprint that the routes defined below register on.
bp = Blueprint("pdf", __name__)
def _open_pdf(data: bytes):
    """Open a PDF held in memory.

    Args:
        data: Raw bytes of the uploaded file.

    Returns:
        The document object returned by ``fitz.open``.

    Raises:
        ValueError: If ``fitz.open`` raises for any reason. The underlying
            exception is logged through ``log_error`` and attached as the
            cause so it stays visible in tracebacks.
    """
    try:
        return fitz.open(stream=data, filetype="pdf")
    except Exception as e:
        log_error(e, "fitz.open")
        raise ValueError(
            "Could not open PDF (the file may be corrupted, encrypted, or not a PDF)."
        ) from e
# ── Page Routes ──────────────────────────────────
@bp.route("/merge")
def merge_page():
    """Render the generic upload form configured for the merge tool."""
    form_config = {
        "title": "Merge PDFs",
        "description": "Combine multiple PDF files into one document",
        "endpoint": "/pdf/merge",
        "accept": ".pdf",
        "multiple": True,
        "options": [],
    }
    return render_template("upload_tool.html", **form_config)
@bp.route("/split")
def split_page():
    """Render the generic upload form configured for the split tool."""
    range_field = {
        "type": "text",
        "name": "pages",
        "label": "Page ranges (leave empty for all pages)",
        "placeholder": "e.g. 1-3, 5, 7-10",
    }
    return render_template(
        "upload_tool.html",
        title="Split PDF",
        description="Split a PDF into individual pages or custom ranges",
        endpoint="/pdf/split",
        accept=".pdf",
        multiple=False,
        options=[range_field],
    )
@bp.route("/compress")
def compress_page():
    """Render the upload form for the compress tool, with its help notes."""
    # The fragments are joined without a separator, exactly as adjacent
    # string literals would be.
    note_fragments = [
        '<p><strong>How compression works:</strong> embedded images are re-encoded as JPEG ',
        'at a lower quality, downscaled if larger than a per-level cap (1200/1800/2400 px), ',
        'and the PDF\'s internal cross-reference table is cleaned up.</p>',
        '<p><strong>Best results on:</strong> photo-heavy PDFs (scanned reports, brochures, ',
        'photo books). <strong>Minimal savings on:</strong> text-only PDFs — they\'re already ',
        'tiny because text compresses well in PDF natively.</p>',
        '<ul style="margin:.4rem 0 .6rem 1.2rem">',
        '<li><strong>Maximum compression</strong> — JPEG quality 40, max image edge 1200px. ',
        'Best size, visible image quality loss.</li>',
        '<li><strong>Medium</strong> — JPEG quality 65, max 1800px. Good balance for most use.</li>',
        '<li><strong>Minimal</strong> — JPEG quality 85, max 2400px. Slightly smaller, hardly any loss.</li>',
        '</ul>',
        '<p style="font-size:.9em;color:var(--muted)">Image positions, sizes, and rotation ',
        'are preserved exactly — we replace each image in-place rather than re-flowing the page.</p>',
    ]
    level_choices = [
        {"value": value, "label": label}
        for value, label in (
            ("medium", "Medium (good balance)"),
            ("low", "Maximum compression"),
            ("high", "Minimal compression"),
        )
    ]
    quality_field = {
        "type": "select",
        "name": "quality",
        "label": "Compression Level",
        "choices": level_choices,
    }
    return render_template(
        "upload_tool.html",
        title="Compress PDF",
        description="Reduce PDF file size by compressing images and cleaning up",
        notes="".join(note_fragments),
        endpoint="/pdf/compress",
        accept=".pdf",
        multiple=False,
        options=[quality_field],
    )
@bp.route("/rotate")
def rotate_page():
    """Render the generic upload form configured for the rotate tool."""
    angle_field = {
        "type": "select",
        "name": "angle",
        "label": "Rotation Angle",
        "choices": [
            {"value": value, "label": label}
            for value, label in (
                ("90", "90° Clockwise"),
                ("180", "180°"),
                ("270", "90° Counter-clockwise"),
            )
        ],
    }
    pages_field = {
        "type": "text",
        "name": "pages",
        "label": "Pages to rotate (leave empty for all)",
        "placeholder": "e.g. 1, 3, 5-7",
    }
    return render_template(
        "upload_tool.html",
        title="Rotate PDF",
        description="Rotate all or specific pages of a PDF",
        endpoint="/pdf/rotate",
        accept=".pdf",
        multiple=False,
        options=[angle_field, pages_field],
    )
@bp.route("/resize")
def resize_page():
    """Render the generic upload form configured for the resize tool."""
    mode_field = {
        "type": "select",
        "name": "mode",
        "label": "Resize Mode",
        "choices": [
            {"value": "scale", "label": "Scale by percentage"},
            {"value": "paper", "label": "Standard paper size"},
        ],
    }
    # Only relevant when the mode selector is set to "scale".
    scale_field = {
        "type": "number",
        "name": "scale",
        "label": "Scale (%)",
        "default": 100,
        "min": 10,
        "max": 500,
        "depends_on": {"mode": "scale"},
    }
    # Only relevant when the mode selector is set to "paper".
    paper_field = {
        "type": "select",
        "name": "paper",
        "label": "Paper Size",
        "choices": [
            {"value": value, "label": label}
            for value, label in (
                ("a4", "A4 (210 x 297 mm)"),
                ("letter", "Letter (8.5 x 11 in)"),
                ("a3", "A3 (297 x 420 mm)"),
                ("a5", "A5 (148 x 210 mm)"),
                ("legal", "Legal (8.5 x 14 in)"),
            )
        ],
        "depends_on": {"mode": "paper"},
    }
    return render_template(
        "upload_tool.html",
        title="Resize PDF",
        description="Change the page dimensions of a PDF",
        endpoint="/pdf/resize",
        accept=".pdf",
        multiple=False,
        options=[mode_field, scale_field, paper_field],
    )
@bp.route("/page-numbers")
def page_numbers_page():
    """Render the generic upload form configured for the page-number tool."""
    position_choices = [
        {"value": value, "label": label}
        for value, label in (
            ("bottom-center", "Bottom Center"),
            ("bottom-right", "Bottom Right"),
            ("bottom-left", "Bottom Left"),
            ("top-center", "Top Center"),
            ("top-right", "Top Right"),
            ("top-left", "Top Left"),
        )
    ]
    fields = [
        {
            "type": "select",
            "name": "position",
            "label": "Position",
            "choices": position_choices,
        },
        {"type": "number", "name": "start", "label": "Start number", "default": 1, "min": 0},
        {"type": "number", "name": "fontsize", "label": "Font size", "default": 11, "min": 6, "max": 30},
    ]
    return render_template(
        "upload_tool.html",
        title="Add Page Numbers",
        description="Add page numbers to each page of a PDF",
        endpoint="/pdf/page-numbers",
        accept=".pdf",
        multiple=False,
        options=fields,
    )
@bp.route("/extract-images")
def extract_images_page():
    """Render the upload form for the image-extraction tool, with help notes."""
    # Joined without a separator, exactly as adjacent string literals would be.
    note_fragments = [
        '<p><strong>What you get:</strong> every embedded raster image (PNG / JPEG / TIFF) ',
        'found in the PDF, downloaded as a ZIP. Vector graphics (lines, paths, drawn shapes) ',
        'are <strong>not</strong> exported as images — they\'re part of the page itself, not ',
        'separate image objects.</p>',
        '<p style="font-size:.9em;color:var(--muted)">For scanned PDFs you usually get one ',
        'large image per page. For rendered text PDFs with figures, you get just the figures. ',
        'If you need a screenshot of the whole page, use <a href="/convert/pdf-to-images">PDF to Images</a>.</p>',
    ]
    return render_template(
        "upload_tool.html",
        title="Extract Images",
        description="Extract all images embedded in a PDF file",
        notes="".join(note_fragments),
        endpoint="/pdf/extract-images",
        accept=".pdf",
        multiple=False,
        options=[],
    )
@bp.route("/protect")
def protect_page():
    """Render the upload form for the password-protection tool, with help notes."""
    # Joined without a separator, exactly as adjacent string literals would be.
    note_fragments = [
        '<p><strong>Encryption:</strong> AES-256, the strongest standard PDF encryption. ',
        'Required to open and to print/copy.</p>',
        '<p><strong>User vs Owner password:</strong></p>',
        '<ul style="margin:.4rem 0 .6rem 1.2rem">',
        '<li><strong>User password</strong> — required to open the PDF. Without it, the PDF cannot be viewed.</li>',
        '<li><strong>Owner password</strong> — controls editing/printing/copying restrictions. Leave blank to use the same as the user password.</li>',
        '</ul>',
        '<p style="font-size:.9em;color:var(--muted)"><strong>Important:</strong> there is ',
        'no recovery — if you forget the password, the PDF stays locked. We allow printing ',
        'and copying for password-holders by default.</p>',
    ]
    user_field = {
        "type": "password",
        "name": "user_password",
        "label": "User Password (to open)",
        "placeholder": "Enter password",
    }
    owner_field = {
        "type": "password",
        "name": "owner_password",
        "label": "Owner Password (optional, for editing)",
        "placeholder": "Leave empty to use same password",
    }
    return render_template(
        "upload_tool.html",
        title="Protect PDF",
        description="Add password protection to a PDF file",
        notes="".join(note_fragments),
        endpoint="/pdf/protect",
        accept=".pdf",
        multiple=False,
        options=[user_field, owner_field],
    )
@bp.route("/sign")
def sign_page():
    """Render the upload form for the visible-signature stamping tool."""
    # Joined without a separator, exactly as adjacent string literals would be.
    note_fragments = [
        "<p><strong>Tip:</strong> upload a transparent PNG of your signature for best results. ",
        "A white-background JPG will look like a sticker on the page.</p>",
        "<p>This tool stamps a visible signature — it does <em>not</em> apply a cryptographic digital signature.</p>",
    ]
    position_choices = [
        {"value": value, "label": label}
        for value, label in (
            ("bottom-right", "Bottom Right"),
            ("bottom-center", "Bottom Center"),
            ("bottom-left", "Bottom Left"),
            ("top-right", "Top Right"),
            ("top-center", "Top Center"),
            ("top-left", "Top Left"),
        )
    ]
    fields = [
        {
            "type": "file",
            "name": "signature",
            "label": "Signature image (PNG / JPG)",
            "accept": "image/png,image/jpeg",
            "required": True,
        },
        {
            "type": "text",
            "name": "pages",
            "label": "Pages to sign (leave empty for all)",
            "placeholder": "e.g. 1, 3, 5-7",
        },
        {
            "type": "select",
            "name": "position",
            "label": "Position",
            "default": "bottom-right",
            "choices": position_choices,
        },
        {"type": "number", "name": "width", "label": "Signature width (points)", "default": 140, "min": 30, "max": 400},
        {"type": "number", "name": "margin", "label": "Margin from edge (points)", "default": 36, "min": 0, "max": 200},
        {"type": "number", "name": "opacity", "label": "Opacity (%)", "default": 100, "min": 10, "max": 100},
    ]
    return render_template(
        "upload_tool.html",
        title="Sign PDF",
        description="Stamp a signature image onto one or more pages of a PDF",
        notes="".join(note_fragments),
        endpoint="/pdf/sign",
        accept=".pdf",
        multiple=False,
        options=fields,
    )
@bp.route("/unlock")
def unlock_page():
    """Render the upload form for the password-removal tool, with help notes."""
    # Joined without a separator, exactly as adjacent string literals would be.
    note_fragments = [
        '<p><strong>You need to know the password.</strong> This tool removes password ',
        'protection from a PDF you can already open — it is not a password cracker. ',
        'You\'ll get a clear "incorrect password" error if you enter the wrong one.</p>',
        '<p style="font-size:.9em;color:var(--muted)">Use case: you have a PDF protected ',
        'by a password you know, and want to share an unprotected copy or pass it through ',
        'tools that don\'t handle encrypted PDFs.</p>',
    ]
    password_field = {
        "type": "password",
        "name": "password",
        "label": "PDF Password",
        "placeholder": "Enter the current password",
    }
    return render_template(
        "upload_tool.html",
        title="Unlock PDF",
        description="Remove password protection from a PDF",
        notes="".join(note_fragments),
        endpoint="/pdf/unlock",
        accept=".pdf",
        multiple=False,
        options=[password_field],
    )
@bp.route("/form-fill")
def form_fill_page():
    """Render the dedicated form-filler page (it has its own template)."""
    template_name = "tools/form_fill.html"
    return render_template(template_name)
@bp.route("/redact")
def redact_page():
    """Render the upload form for the redaction tool, with help notes."""
    # Joined without a separator, exactly as adjacent string literals would be.
    note_fragments = [
        '<p><strong>How it works:</strong> enter one or more search terms or regex patterns ',
        '(one per line). Every occurrence on every page is found, then permanently ',
        'overlaid with a solid black rectangle. The underlying text is also stripped ',
        'from the PDF\'s content stream so it cannot be recovered with copy-paste.</p>',
        '<p style="font-size:.9em;color:var(--muted)"><strong>Common patterns:</strong> ',
        '<code>\\b\\d{16}\\b</code> (credit-card numbers), ',
        '<code>[\\w.-]+@[\\w.-]+\\.[\\w]+</code> (emails), ',
        '<code>\\b\\d{3}-\\d{2}-\\d{4}\\b</code> (US SSN-like). ',
        'Plain text is matched literally unless you tick &ldquo;Treat as regex&rdquo;.</p>',
    ]
    fields = [
        {
            "type": "text",
            "name": "patterns",
            "label": "Patterns (one per line)",
            "placeholder": "e.g. john@example.com / 4111-?\\d{4}-?\\d{4}-?\\d{4}",
        },
        {
            "type": "checkbox",
            "name": "is_regex",
            "label": "Pattern type",
            "check_label": "Treat each line as a regular expression",
            "default": False,
        },
        {
            "type": "checkbox",
            "name": "case_sensitive",
            "label": "Case sensitivity",
            "check_label": "Match case exactly",
            "default": False,
        },
        {
            "type": "text",
            "name": "pages",
            "label": "Pages (blank = all)",
            "placeholder": "e.g. 1-3, 5",
        },
    ]
    return render_template(
        "upload_tool.html",
        title="Redact PDF",
        description="Permanently black-out sensitive text in a PDF",
        notes="".join(note_fragments),
        endpoint="/pdf/redact",
        accept=".pdf",
        multiple=False,
        options=fields,
    )
# ── Processing Routes ────────────────────────────
def parse_page_ranges(spec: str, total: int) -> list[int]:
    """Parse a spec such as '1-3, 5, 7-10' into sorted 0-based page indices.

    Args:
        spec: Comma-separated 1-based page numbers and inclusive ranges.
            A blank spec selects every page.
        total: Number of pages in the document.

    Returns:
        Sorted, de-duplicated 0-based indices. Single pages outside
        ``1..total`` are dropped and ranges are clamped to ``1..total``, so
        the result may be empty.

    Raises:
        ValueError: If a non-empty token is not an integer or an
            ``int-int`` range.
    """
    if not spec.strip():
        return list(range(total))
    pages: set[int] = set()
    for part in spec.split(","):
        part = part.strip()
        if not part:
            # Tolerate stray separators ("1-3, 5," or "1,,2") instead of
            # rejecting the whole spec on int("").
            continue
        if "-" in part:
            start, end = part.split("-", 1)
            first = max(1, int(start.strip()))
            last = min(total, int(end.strip()))
            pages.update(range(first - 1, last))
        else:
            index = int(part) - 1
            if 0 <= index < total:
                pages.add(index)
    return sorted(pages)
# Target page sizes as (width, height), keyed by the "paper" form value.
# resize() uses these as new_page() width/height, swapping them when the
# source page is landscape. Values are presumably PDF points (1/72 inch),
# e.g. Letter 8.5 x 11 in -> 612 x 792 — TODO confirm.
PAPER_SIZES = {
    "a4": (595.28, 841.89),
    "letter": (612, 792),
    "a3": (841.89, 1190.55),
    "a5": (419.53, 595.28),
    "legal": (612, 1008),
}
@bp.route("/merge", methods=["POST"])
def merge():
    """Concatenate the uploaded PDFs, in upload order, into ``merged.pdf``.

    Responds with a 400 JSON error when fewer than two files are sent or when
    any one of them cannot be opened as a PDF.
    """
    uploads = request.files.getlist("files")
    if len(uploads) < 2:
        return jsonify(error="Please upload at least 2 PDF files."), 400

    # The context manager closes the combined document on every exit path.
    with fitz.open() as combined:
        for upload in uploads:
            try:
                with fitz.open(stream=upload.read(), filetype="pdf") as source:
                    combined.insert_pdf(source)
            except Exception as exc:
                log_error(exc, f"merge: {upload.filename}")
                message = f"Could not read '{upload.filename}' (corrupted or not a PDF)."
                return jsonify(error=message), 400
        buffer = io.BytesIO()
        combined.save(buffer)

    buffer.seek(0)
    return send_file(
        buffer,
        mimetype="application/pdf",
        as_attachment=True,
        download_name="merged.pdf",
    )
@bp.route("/split", methods=["POST"])
def split():
    """Split the uploaded PDF into one single-page PDF per selected page.

    A single selected page is returned directly as a PDF; several pages are
    returned together as ``split_pages.zip``. Bad input yields a 400 JSON
    error.
    """
    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400
    page_spec = request.form.get("pages", "").strip()

    try:
        doc = _open_pdf(uploads[0].read())
    except ValueError as exc:
        return jsonify(error=str(exc)), 400

    def one_page_pdf(index):
        # Copy a single page of `doc` into a fresh document and serialise it.
        with fitz.open() as piece:
            piece.insert_pdf(doc, from_page=index, to_page=index)
            sink = io.BytesIO()
            piece.save(sink)
        return sink.getvalue()

    try:
        try:
            selected = parse_page_ranges(page_spec, len(doc))
        except (ValueError, IndexError):
            return jsonify(error="Invalid page range. Use e.g. '1-3, 5, 7-10'."), 400
        if not selected:
            return jsonify(error="No valid pages selected."), 400

        if len(selected) == 1:
            only = selected[0]
            return send_file(
                io.BytesIO(one_page_pdf(only)),
                mimetype="application/pdf",
                as_attachment=True,
                download_name=f"page_{only+1}.pdf",
            )

        parts = [(f"page_{p + 1}.pdf", one_page_pdf(p)) for p in selected]
    finally:
        doc.close()

    archive = make_zip(parts)
    return send_file(
        archive,
        mimetype="application/zip",
        as_attachment=True,
        download_name="split_pages.zip",
    )
@bp.route("/compress", methods=["POST"])
def compress():
    """Shrink the uploaded PDF by re-encoding its embedded images as JPEG.

    The ``quality`` form field ("low" / "medium" / "high"; unknown values
    fall back to medium) selects the JPEG quality and the maximum image edge.
    Each distinct image xref is processed once. An image is left untouched
    when it carries a soft mask or when re-encoding would not make it
    smaller. Per-image failures are logged and skipped so one bad image does
    not abort the whole job.

    Returns the compressed PDF as an attachment, or a 400 JSON error when no
    file was uploaded or the upload cannot be opened.
    """
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    quality = request.form.get("quality", "medium")
    image_quality = {"low": 40, "medium": 65, "high": 85}.get(quality, 65)
    max_dim = {"low": 1200, "medium": 1800, "high": 2400}.get(quality, 1800)

    from PIL import Image

    try:
        doc = _open_pdf(files[0].read())
    except ValueError as e:
        return jsonify(error=str(e)), 400

    try:
        processed_xrefs = set()
        for page in doc:
            for img_info in page.get_images(full=True):
                # get_images(full=True) items start with (xref, smask, ...).
                xref, smask = img_info[0], img_info[1]
                if xref in processed_xrefs:
                    continue
                processed_xrefs.add(xref)

                if smask:
                    # JPEG has no alpha channel; swapping in a JPEG would
                    # discard this image's transparency mask, so keep it.
                    continue

                try:
                    base_image = doc.extract_image(xref)
                    if not base_image:
                        continue
                    original_bytes = base_image["image"]

                    with Image.open(io.BytesIO(original_bytes)) as opened:
                        pil_img = opened if opened.mode == "RGB" else opened.convert("RGB")
                        if max(pil_img.size) > max_dim:
                            pil_img.thumbnail((max_dim, max_dim), Image.LANCZOS)
                        buf = io.BytesIO()
                        pil_img.save(buf, format="JPEG",
                                     quality=image_quality, optimize=True)

                    new_bytes = buf.getvalue()
                    if len(new_bytes) >= len(original_bytes):
                        # Re-encoding did not help (e.g. tiny or 1-bit images);
                        # never make the file bigger.
                        continue

                    # Replace image in-place — preserves original placement & size.
                    page.replace_image(xref, stream=new_bytes)
                except Exception as e:
                    # Best effort: log and move on to the next image.
                    log_error(e, f"compress xref={xref}")
                    continue

        output = io.BytesIO()
        doc.save(output, garbage=4, deflate=True, clean=True)
        output.seek(0)
    finally:
        doc.close()

    name = files[0].filename.rsplit(".", 1)[0] + "_compressed.pdf"
    return send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=name)
@bp.route("/rotate", methods=["POST"])
def rotate():
    """Rotate all or selected pages of the uploaded PDF.

    Form fields: ``angle`` (90, 180 or 270; added to each page's current
    rotation) and ``pages`` (range spec; blank means every page).

    Returns the rotated PDF as an attachment, or a 400 JSON error for a
    missing file, an unsupported angle, an unreadable PDF, an unparsable
    page spec, or a spec that selects no existing page.
    """
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    angle = safe_int(request.form.get("angle"), 90)
    if angle not in (90, 180, 270):
        return jsonify(error="Rotation must be 90, 180, or 270."), 400
    page_spec = request.form.get("pages", "").strip()

    try:
        doc = _open_pdf(files[0].read())
    except ValueError as e:
        return jsonify(error=str(e)), 400

    try:
        try:
            pages = parse_page_ranges(page_spec, len(doc))
        except (ValueError, IndexError):
            return jsonify(error="Invalid page range. Use e.g. '1-3, 5, 7-10'."), 400
        if not pages:
            # Same guard as split/sign/redact: don't hand back an unchanged
            # file when the spec matched nothing.
            return jsonify(error="No valid pages selected."), 400

        for p in pages:
            page = doc[p]
            page.set_rotation((page.rotation + angle) % 360)

        output = io.BytesIO()
        doc.save(output)
        output.seek(0)
    finally:
        doc.close()

    name = files[0].filename.rsplit(".", 1)[0] + "_rotated.pdf"
    return send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=name)
@bp.route("/resize", methods=["POST"])
def resize():
    """Re-page the uploaded PDF by a scale factor or onto a paper size.

    ``mode=scale`` multiplies each page's width and height by ``scale`` / 100.
    ``mode=paper`` centres each page on the chosen paper size, keeping the
    aspect ratio and matching the paper's orientation to the source page.
    Any other mode yields a 400 JSON error.
    """
    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400
    mode = request.form.get("mode", "scale")

    try:
        doc = _open_pdf(uploads[0].read())
    except ValueError as exc:
        return jsonify(error=str(exc)), 400

    resized = fitz.open()
    try:
        if mode not in ("scale", "paper"):
            return jsonify(error="Unknown resize mode."), 400

        if mode == "scale":
            scale_pct = safe_float(request.form.get("scale"), 100.0,
                                   min_val=10.0, max_val=500.0)
            scale = scale_pct / 100.0
            for page in doc:
                bounds = page.rect
                target = resized.new_page(width=bounds.width * scale,
                                          height=bounds.height * scale)
                target.show_pdf_page(target.rect, doc, page.number,
                                     rotate=page.rotation)
        else:
            paper = request.form.get("paper", "a4")
            target_w, target_h = PAPER_SIZES.get(paper, PAPER_SIZES["a4"])
            for page in doc:
                bounds = page.rect
                src_w, src_h = bounds.width, bounds.height

                # Flip the paper when its orientation differs from the source's.
                flip = (src_w > src_h) != (target_w > target_h)
                page_w, page_h = (target_h, target_w) if flip else (target_w, target_h)

                # Largest uniform scale that still fits, then centre the content.
                fit = min(page_w / src_w, page_h / src_h)
                content_w = src_w * fit
                content_h = src_h * fit
                x0 = (page_w - content_w) / 2
                y0 = (page_h - content_h) / 2

                target = resized.new_page(width=page_w, height=page_h)
                placement = fitz.Rect(x0, y0, x0 + content_w, y0 + content_h)
                target.show_pdf_page(placement, doc, page.number,
                                     rotate=page.rotation)

        output = io.BytesIO()
        resized.save(output, garbage=4, deflate=True)
        output.seek(0)
    finally:
        resized.close()
        doc.close()

    download = uploads[0].filename.rsplit(".", 1)[0] + "_resized.pdf"
    return send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=download)
@bp.route("/page-numbers", methods=["POST"])
def page_numbers():
    """Stamp a running page number onto every page of the uploaded PDF.

    Form fields: ``position`` (e.g. "bottom-center"; unknown values fall back
    to bottom-center), ``start`` (number printed on the first page) and
    ``fontsize``.

    The number is aligned using its measured width: left positions start at
    the margin, center positions are centred on the page, and right positions
    end at the right margin.

    Returns the numbered PDF as an attachment, or a 400 JSON error when no
    file was uploaded or the upload cannot be opened.
    """
    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    position = request.form.get("position", "bottom-center")
    start = safe_int(request.form.get("start"), 1, min_val=0, max_val=100000)
    fontsize = safe_int(request.form.get("fontsize"), 11, min_val=6, max_val=72)

    known_positions = {
        "bottom-center", "bottom-right", "bottom-left",
        "top-center", "top-right", "top-left",
    }
    if position not in known_positions:
        position = "bottom-center"
    vertical, horizontal = position.split("-")
    margin = 36  # 0.5 inch

    try:
        doc = _open_pdf(files[0].read())
    except ValueError as e:
        return jsonify(error=str(e)), 400

    try:
        for i, page in enumerate(doc):
            label = str(start + i)
            r = page.rect
            # insert_text anchors the START of the text at the given point,
            # so offset by the rendered width to centre / right-align it.
            text_w = fitz.get_text_length(label, fontname="helv", fontsize=fontsize)
            if horizontal == "left":
                x = margin
            elif horizontal == "right":
                x = r.width - margin - text_w
            else:
                x = (r.width - text_w) / 2
            y = r.height - margin if vertical == "bottom" else margin + fontsize

            page.insert_text(fitz.Point(x, y), label, fontsize=fontsize,
                             fontname="helv", color=(0.3, 0.3, 0.3))

        output = io.BytesIO()
        doc.save(output)
        output.seek(0)
    finally:
        doc.close()

    name = files[0].filename.rsplit(".", 1)[0] + "_numbered.pdf"
    return send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=name)
@bp.route("/extract-images", methods=["POST"])
def extract_images():
    """Pull the embedded images out of the uploaded PDF.

    One image is returned directly with an ``image/<ext>`` MIME type; several
    are returned as ``<name>_images.zip``. Images that fail to extract are
    logged and skipped. Responds with a 400 JSON error when nothing could be
    extracted.
    """
    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    try:
        doc = _open_pdf(uploads[0].read())
    except ValueError as exc:
        return jsonify(error=str(exc)), 400

    collected = []
    try:
        for page_no, page in enumerate(doc, start=1):
            for img_no, info in enumerate(page.get_images(full=True), start=1):
                xref = info[0]
                try:
                    extracted = doc.extract_image(xref)
                    if not extracted:
                        continue
                    suffix = extracted.get("ext", "png")
                    entry_name = f"page{page_no}_img{img_no}.{suffix}"
                    collected.append((entry_name, extracted["image"]))
                except Exception as exc:
                    log_error(exc, f"extract_images xref={xref}")
                    continue
    finally:
        doc.close()

    if not collected:
        return jsonify(error="No images found in the PDF."), 400

    if len(collected) == 1:
        only_name, only_bytes = collected[0]
        ext = only_name.rsplit(".", 1)[1]
        subtype = "jpeg" if ext in ("jpg", "jpeg") else ext
        return send_file(io.BytesIO(only_bytes), mimetype=f"image/{subtype}",
                         as_attachment=True, download_name=only_name)

    archive = make_zip(collected)
    download = uploads[0].filename.rsplit(".", 1)[0] + "_images.zip"
    return send_file(archive, mimetype="application/zip",
                     as_attachment=True, download_name=download)
@bp.route("/protect", methods=["POST"])
def protect():
    """Encrypt the uploaded PDF with AES-256 using the supplied passwords.

    ``user_password`` is mandatory; ``owner_password`` falls back to the user
    password when blank. The saved file grants print and copy permissions.
    """
    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    user_pw = request.form.get("user_password", "")
    if not user_pw:
        return jsonify(error="Please enter a password."), 400
    owner_pw = request.form.get("owner_password", "") or user_pw

    try:
        doc = _open_pdf(uploads[0].read())
    except ValueError as exc:
        return jsonify(error=str(exc)), 400

    try:
        save_options = {
            "encryption": fitz.PDF_ENCRYPT_AES_256,
            "user_pw": user_pw,
            "owner_pw": owner_pw,
            "permissions": fitz.PDF_PERM_PRINT | fitz.PDF_PERM_COPY,
        }
        output = io.BytesIO()
        doc.save(output, **save_options)
        output.seek(0)
    finally:
        doc.close()

    download = uploads[0].filename.rsplit(".", 1)[0] + "_protected.pdf"
    return send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=download)
@bp.route("/sign", methods=["POST"])
def sign():
    """Stamp an uploaded signature image onto selected pages of a PDF.

    The image is converted to RGBA, optionally faded by scaling its alpha
    channel, re-encoded as PNG and inserted at the requested corner or
    centre, with its height derived from the requested width and the image's
    aspect ratio. This draws a visible image only.
    """
    from PIL import Image

    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error="Please upload a PDF."), 400
    stamp_upload = request.files.get("signature")
    if not stamp_upload or not stamp_upload.filename:
        return jsonify(error="Please upload a signature image (PNG or JPG)."), 400

    form = request.form
    position = form.get("position", "bottom-right")
    sig_width = safe_float(form.get("width"), 140.0, min_val=30.0, max_val=600.0)
    margin = safe_float(form.get("margin"), 36.0, min_val=0.0, max_val=300.0)
    opacity_pct = safe_int(form.get("opacity"), 100, min_val=10, max_val=100)
    opacity = opacity_pct / 100.0
    page_spec = form.get("pages", "").strip()

    try:
        with Image.open(stamp_upload) as raw:
            sig_img = raw.convert("RGBA")
    except Exception as exc:
        log_error(exc, "sign: signature image")
        return jsonify(error="Could not read signature image (file may be corrupted or not an image)."), 400

    if opacity < 1.0:
        # Fade the stamp by scaling only the alpha channel.
        red, green, blue, alpha = sig_img.split()
        alpha = alpha.point(lambda v: int(v * opacity))
        sig_img = Image.merge("RGBA", (red, green, blue, alpha))

    png_sink = io.BytesIO()
    sig_img.save(png_sink, format="PNG")
    sig_bytes = png_sink.getvalue()
    sig_ratio = sig_img.height / sig_img.width if sig_img.width else 1.0
    sig_h = sig_width * sig_ratio
    sig_img.close()

    def stamp_rect(bounds):
        # Rectangle for the stamp on a page with the given bounds.
        if "right" in position:
            left = bounds.width - margin - sig_width
        elif "center" in position:
            left = (bounds.width - sig_width) / 2
        else:
            left = margin
        top = bounds.height - margin - sig_h if "bottom" in position else margin
        return fitz.Rect(left, top, left + sig_width, top + sig_h)

    try:
        doc = _open_pdf(uploads[0].read())
    except ValueError as exc:
        return jsonify(error=str(exc)), 400

    try:
        try:
            target = parse_page_ranges(page_spec, len(doc))
        except (ValueError, IndexError):
            return jsonify(error="Invalid page range. Use e.g. '1, 3, 5-7'."), 400
        if not target:
            return jsonify(error="No valid pages selected."), 400

        for pno in target:
            page = doc[pno]
            page.insert_image(
                stamp_rect(page.rect),
                stream=sig_bytes, keep_proportion=True, overlay=True,
            )

        output = io.BytesIO()
        doc.save(output, garbage=4, deflate=True)
        output.seek(0)
    finally:
        doc.close()

    download = uploads[0].filename.rsplit(".", 1)[0] + "_signed.pdf"
    return send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=download)
@bp.route("/redact", methods=["POST"])
def redact():
    """Black out every occurrence of the given terms or regex matches.

    Form fields: ``patterns`` (one per line), ``is_regex`` and
    ``case_sensitive`` (checkboxes, "on" when ticked) and ``pages`` (range
    spec; blank means every page).

    In regex mode each pattern is run over the page text and every distinct
    matched string is then located on the page with ``search_for``. In
    literal mode each term is passed to ``search_for`` directly. Hits are
    de-duplicated, covered with a black redaction annotation, and applied.

    Returns the redacted PDF with an ``X-Redactions-Applied`` header, or a
    400 JSON error for bad input or when nothing matched.
    """
    import re

    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    patterns_raw = request.form.get("patterns", "").strip()
    if not patterns_raw:
        return jsonify(error="Enter at least one search term or pattern."), 400
    is_regex = request.form.get("is_regex") == "on"
    case_sensitive = request.form.get("case_sensitive") == "on"
    page_spec = request.form.get("pages", "").strip()

    patterns = [p for p in patterns_raw.splitlines() if p.strip()]
    if not patterns:
        return jsonify(error="Enter at least one search term or pattern."), 400

    # Validate regex patterns up-front so the user gets a clean error message.
    # NOTE(review): patterns are user-supplied and run without a timeout; a
    # pathological regex can take a very long time on large pages.
    flags = 0 if case_sensitive else re.IGNORECASE
    compiled: list[re.Pattern] = []
    if is_regex:
        for p in patterns:
            try:
                compiled.append(re.compile(p, flags))
            except re.error as e:
                return jsonify(error=f"Invalid regex {p!r}: {e}"), 400

    try:
        doc = _open_pdf(files[0].read())
    except ValueError as e:
        return jsonify(error=str(e)), 400

    try:
        try:
            target = parse_page_ranges(page_spec, len(doc))
        except (ValueError, IndexError):
            return jsonify(error="Invalid page range. Use e.g. '1-3, 5, 7-10'."), 400
        if not target:
            return jsonify(error="No valid pages selected."), 400

        total_redactions = 0
        for pno in target:
            page = doc[pno]
            rects: list[fitz.Rect] = []

            if is_regex:
                # Collect each distinct matched string once (order kept), so
                # a value that occurs many times is searched a single time.
                page_text = page.get_text()
                snippets = dict.fromkeys(
                    m.group(0)
                    for pat in compiled
                    for m in pat.finditer(page_text)
                )
                for snippet in snippets:
                    if not snippet.strip():
                        continue
                    # search_for returns one rect per visual hit on the page.
                    rects.extend(page.search_for(snippet, quads=False))
            else:
                # NOTE(review): `case_sensitive` only affects regex
                # compilation above. search_for is described upstream as
                # case-insensitive, so literal terms (and the regex
                # location step) may cover other-case occurrences too —
                # this errs on the side of over-redacting.
                for term in patterns:
                    rects.extend(page.search_for(term))

            # De-duplicate near-identical rectangles
            uniq: list[fitz.Rect] = []
            for r in rects:
                if not any(abs(r.x0 - u.x0) < 0.5 and abs(r.y0 - u.y0) < 0.5
                           and abs(r.x1 - u.x1) < 0.5 and abs(r.y1 - u.y1) < 0.5
                           for u in uniq):
                    uniq.append(r)

            for r in uniq:
                page.add_redact_annot(r, fill=(0, 0, 0))
            total_redactions += len(uniq)

            # apply_redactions removes the underlying text.
            # PDF_REDACT_IMAGE_NONE tells it to leave images on the page
            # untouched.
            page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)

        if total_redactions == 0:
            return jsonify(error=(
                "No matches found. Check spelling, toggle case-sensitivity, "
                "or try a different pattern."
            )), 400

        output = io.BytesIO()
        doc.save(output, garbage=4, deflate=True, clean=True)
        output.seek(0)
    finally:
        doc.close()

    name = files[0].filename.rsplit(".", 1)[0] + "_redacted.pdf"
    resp = send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=name)
    resp.headers["X-Redactions-Applied"] = str(total_redactions)
    return resp
@bp.route("/unlock", methods=["POST"])
def unlock():
    """Re-save the uploaded PDF after authenticating with the given password.

    Authentication is attempted only when the document reports that it needs
    a password; a failed attempt yields a 400 "Incorrect password." error.
    """
    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400
    password = request.form.get("password", "")

    try:
        doc = _open_pdf(uploads[0].read())
    except ValueError as exc:
        return jsonify(error=str(exc)), 400

    try:
        rejected = doc.needs_pass and not doc.authenticate(password)
        if rejected:
            return jsonify(error="Incorrect password."), 400
        output = io.BytesIO()
        doc.save(output)
        output.seek(0)
    finally:
        doc.close()

    download = uploads[0].filename.rsplit(".", 1)[0] + "_unlocked.pdf"
    return send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=download)
# ── PDF Form Filler (AcroForm) ─────────────────────────────
# PyMuPDF widget type constants → string labels we expose to the UI
# Lookup used by _serialize_widgets; widget types missing from this table
# are reported there as "unknown".
_WIDGET_TYPE_NAMES = {
    fitz.PDF_WIDGET_TYPE_TEXT: "text",
    fitz.PDF_WIDGET_TYPE_CHECKBOX: "checkbox",
    fitz.PDF_WIDGET_TYPE_RADIOBUTTON: "radio",
    fitz.PDF_WIDGET_TYPE_LISTBOX: "listbox",
    fitz.PDF_WIDGET_TYPE_COMBOBOX: "combobox",
    fitz.PDF_WIDGET_TYPE_BUTTON: "button",
    fitz.PDF_WIDGET_TYPE_SIGNATURE: "signature",
}
def _label_near_widget(page, rect: fitz.Rect, max_dist: float = 250) -> str:
    """Guess the static text label printed beside a widget.

    Radio button and checkbox captions (e.g. "Male", "Female") are drawn on
    the page as ordinary text rather than stored on the widget. This looks at
    the page words whose vertical centre lies on the widget's line and takes
    the contiguous run nearest the widget. A horizontal gap of more than 25
    units between neighbouring words ends the run, which keeps the next
    option's caption out of layouts like "[ ] Male [ ] Female".

    The right side is tried first; the left side is the fallback. The result
    has trailing ":;,." stripped and is capped at 80 characters. Returns ""
    when there is no rect, no words, or no usable label.
    """
    if not rect:
        return ""

    line_height = max(rect.y1 - rect.y0, 8)
    centre_y = (rect.y0 + rect.y1) / 2

    # get_text("words") -> list of (x0, y0, x1, y1, "text", block, line, word)
    words = page.get_text("words")
    if not words:
        return ""

    max_gap = 25.0  # largest horizontal gap allowed between label words
    tolerance = line_height * 0.7
    on_line = [w for w in words
               if abs((w[1] + w[3]) / 2 - centre_y) <= tolerance]

    def tidy(tokens):
        return " ".join(tokens).strip().rstrip(":;,.")

    # ── Right-side run: nearest word first, walking away from the widget ──
    right_side = sorted(
        (w for w in on_line
         if w[0] >= rect.x1 - 1 and w[0] - rect.x1 < max_dist),
        key=lambda w: w[0],
    )
    if right_side:
        run = []
        last_right_edge = None
        for w in right_side:
            if last_right_edge is not None and w[0] - last_right_edge > max_gap:
                break
            run.append(w[4])
            last_right_edge = w[2]
        label = tidy(run)
        if label:
            return label[:80]

    # ── Left-side fallback: closest word first, then restore reading order ──
    left_side = sorted(
        (w for w in on_line
         if w[2] <= rect.x0 + 1 and rect.x0 - w[2] < max_dist),
        key=lambda w: -w[2],
    )
    if left_side:
        run = []
        last_left_edge = None
        for w in left_side:
            if last_left_edge is not None and last_left_edge - w[2] > max_gap:
                break
            run.append(w[4])
            last_left_edge = w[0]
        label = tidy(run[::-1])
        if label:
            return label[:80]

    return ""
def _serialize_widgets(doc) -> list[dict]:
    """Describe every form widget in ``doc`` as a JSON-friendly dict.

    Pages are numbered from 1. Each entry carries the field's name, label,
    type string, current value, rectangle (rounded to 2 decimals), flag-derived
    booleans, choice list, checkbox/radio on-state names, a label sniffed from
    nearby page text for checkboxes and radios, and the text max length.
    """
    # Bit masks read from the widget's field_flags value.
    readonly_mask = 1          # bit 1 = read-only
    required_mask = 2          # bit 2 = required
    multiline_mask = 1 << 12   # bit 13 = multiline (text only)
    combo_edit_mask = 1 << 18  # bit 19 = combobox accepts typed values

    described: list[dict] = []
    for page_num, page in enumerate(doc, start=1):
        for w in page.widgets() or []:
            kind = _WIDGET_TYPE_NAMES.get(w.field_type, "unknown")
            flags = getattr(w, "field_flags", 0) or 0
            is_toggle = kind in ("checkbox", "radio")

            # The "on" state name varies per PDF ("Yes", "On", "1", or an
            # arbitrary identifier), so collect every non-"Off" state once.
            on_states = []
            if is_toggle:
                for vals in (w.button_states() or {}).values():
                    if not vals:
                        continue
                    for state in vals:
                        if state and state != "Off" and state not in on_states:
                            on_states.append(state)

            # Captions for toggles are painted on the page, not stored on
            # the widget, so read them from the surrounding text.
            option_label = _label_near_widget(page, w.rect) if is_toggle else ""

            # The identifier this particular radio stands for when selected.
            if kind == "radio" and on_states:
                option_value = on_states[0]
            else:
                option_value = ""

            entry = {
                "name": w.field_name or "",
                "label": w.field_label or w.field_name or "",
                "type": kind,
                "value": w.field_value if w.field_value is not None else "",
                "page": page_num,
                "rect": [round(c, 2) for c in (w.rect or fitz.Rect())],
                "option_label": option_label,
                "option_value": option_value,
                "editable": (kind == "combobox") and bool(flags & combo_edit_mask),
                "required": bool(flags & required_mask),
                "readonly": bool(flags & readonly_mask),
                "multiline": bool(flags & multiline_mask),
                "choices": list(getattr(w, "choice_values", None) or []),
                "on_states": on_states,
                "max_length": getattr(w, "text_maxlen", 0),
            }
            described.append(entry)
    return described
@bp.route("/form-inspect", methods=["POST"])
def form_inspect():
    """Return a JSON description of the form fields in the uploaded PDF."""
    uploads = request.files.getlist("files")
    if not uploads or not uploads[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    try:
        doc = _open_pdf(uploads[0].read())
    except ValueError as exc:
        return jsonify(error=str(exc)), 400

    try:
        # Build the whole payload while the document is still open.
        fields = _serialize_widgets(doc)
        summary = {
            "filename": uploads[0].filename,
            "page_count": len(doc),
            "field_count": len(fields),
            "fields": fields,
            "has_form": len(fields) > 0,
        }
    finally:
        doc.close()

    return jsonify(summary)
@bp.route("/form-fill", methods=["POST"])
def form_fill():
    """Apply field values to the uploaded PDF and return the filled file.

    Form values are passed as JSON in the `values` field of the multipart body:
    `{"<field_name>": "<value>", ...}`. Values are matched against
    `widget.field_name`. Unknown names are silently ignored.

    Read-only widgets and widgets whose update raises are skipped. A checkbox
    is ticked when its value is one of the generic truthy markers (True,
    "true", "on", "1", 1, "Yes", "yes") or equals one of that checkbox's own
    on-state names; anything else unticks it.

    The response carries ``X-Fields-Applied`` and ``X-Fields-Skipped``
    headers. Returns a 400 JSON error for a missing file, invalid ``values``
    JSON, or an unreadable PDF.
    """
    import json

    files = request.files.getlist("files")
    if not files or not files[0].filename:
        return jsonify(error=NO_FILE_SINGLE), 400

    raw_values = request.form.get("values", "{}")
    try:
        values_map = json.loads(raw_values)
        if not isinstance(values_map, dict):
            raise ValueError("values must be an object")
    except (ValueError, TypeError) as e:
        return jsonify(error=f"Invalid form values JSON: {e}"), 400

    # NOTE(review): a `flatten` form field may be sent by the client, but
    # flattening is not implemented — the result always stays an editable
    # form. Users who need a flat copy can re-print to PDF.

    try:
        doc = _open_pdf(files[0].read())
    except ValueError as e:
        return jsonify(error=str(e)), 400

    truthy_markers = (True, "true", "on", "1", 1, "Yes", "yes")
    applied = 0
    skipped: list[str] = []
    try:
        for page in doc:
            for w in page.widgets() or []:
                if not w.field_name or w.field_name not in values_map:
                    continue
                if w.field_flags and (w.field_flags & 1):  # read-only
                    skipped.append(w.field_name)
                    continue

                new_val = values_map[w.field_name]
                ftype = w.field_type
                try:
                    if ftype == fitz.PDF_WIDGET_TYPE_CHECKBOX:
                        want_on = new_val in truthy_markers
                        named_state = (isinstance(new_val, str)
                                       and new_val not in ("", "Off"))
                        on_vals = []
                        if want_on or named_state:
                            # The "on" state name differs between PDFs.
                            for vals in (w.button_states() or {}).values():
                                if not vals:
                                    continue
                                for v in vals:
                                    if v and v != "Off":
                                        on_vals.append(v)
                        if named_state and new_val in on_vals:
                            # Client sent this checkbox's own on-state name.
                            w.field_value = new_val
                        elif want_on:
                            w.field_value = on_vals[0] if on_vals else "Yes"
                        else:
                            w.field_value = "Off"
                    elif ftype == fitz.PDF_WIDGET_TYPE_RADIOBUTTON:
                        # value should match one of the radio's on-states
                        w.field_value = str(new_val) if new_val else "Off"
                    else:
                        # list box, combo box, text and other text-like fields
                        w.field_value = str(new_val) if new_val is not None else ""
                    w.update()
                    applied += 1
                except Exception as e:
                    # Best effort: one bad field must not abort the others.
                    log_error(e, f"form-fill: {w.field_name}")
                    skipped.append(w.field_name)

        output = io.BytesIO()
        doc.save(output, garbage=4, deflate=True, clean=True)
        output.seek(0)
    finally:
        doc.close()

    base = files[0].filename.rsplit(".", 1)[0]
    resp = send_file(output, mimetype="application/pdf",
                     as_attachment=True, download_name=f"{base}_filled.pdf")
    resp.headers["X-Fields-Applied"] = str(applied)
    resp.headers["X-Fields-Skipped"] = str(len(skipped))
    return resp