added pdf to excel

2026-07-01 23:17:37 +08:00 · 2026-04-20 15:07:32 +07:00
parent f23ba454f2
commit 56fa04aa3a
4 changed files with 201 additions and 1 deletions
@@ -2,6 +2,11 @@

 All notable changes to **Your Everyday Tools** are documented here. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project loosely follows [Semantic Versioning](https://semver.org/).

+## [0.4.3] — 2026-04-20
+
+### Added
+- **PDF to Excel** — extract tables from a PDF into an `.xlsx` workbook. Three modes: tables-only, tables-with-text-fallback, or text-only. Three sheet organizations: one sheet per table, one per page, or all combined. Powered by PyMuPDF's native `find_tables()` — no new dependencies. For scanned PDFs, run **OCR PDF** first.
+
 ## [0.4.2] — 2026-04-20

 ### Added
@@ -1,6 +1,6 @@
 # Your Everyday Tools

-A lightweight, self-hosted web app that bundles 77 everyday utilities into a single interface. Built with Python + Flask, zero JavaScript frameworks, and minimal CSS — no bloat, just tools.
+A lightweight, self-hosted web app that bundles 78 everyday utilities into a single interface. Built with Python + Flask, zero JavaScript frameworks, and minimal CSS — no bloat, just tools.

 ![Python](https://img.shields.io/badge/Python-3.10+-blue)
 ![Flask](https://img.shields.io/badge/Flask-3.x-green)
@@ -32,6 +32,7 @@ See [CHANGELOG.md](CHANGELOG.md) for release history and recent fixes.
 | **PDF to Word** | Convert PDF documents to `.docx` format |
 | **PDF to Images** | Export each PDF page as PNG or JPG (configurable DPI) |
 | **PDF to Text** | Extract all text content from a PDF |
+| **PDF to Excel** | Extract tables from a PDF into an `.xlsx` workbook — one sheet per table, per page, or all combined. Can optionally fall back to line-by-line text on pages where no tables are detected, or export text only. Uses PyMuPDF's native `find_tables()` (no extra dependencies). |
 | **HTML to PDF** | Convert HTML content to a PDF document |
 | **OCR PDF** | Make scanned PDFs searchable (image + hidden text layer) or extract text — 14 languages supported |
 | **CAD to PDF/Image** | Convert DXF drawings to PDF or PNG (DWG via optional ODA File Converter) |
@@ -14,6 +14,7 @@ TOOL_CATEGORIES = [
            {"id": "pdf-to-word", "name": "PDF to Word", "desc": "Convert PDF to Word document", "icon": "bi-file-word-fill"},
            {"id": "pdf-to-images", "name": "PDF to Images", "desc": "Convert PDF pages to images", "icon": "bi-file-image-fill"},
            {"id": "pdf-to-text", "name": "PDF to Text", "desc": "Extract text content from PDF", "icon": "bi-file-text-fill"},
+            {"id": "pdf-to-excel", "name": "PDF to Excel", "desc": "Extract tables from PDF into an .xlsx", "icon": "bi-file-earmark-spreadsheet-fill"},
            {"id": "html-to-pdf", "name": "HTML to PDF", "desc": "Convert HTML content to PDF", "icon": "bi-filetype-html"},
            {"id": "ocr-pdf", "name": "OCR PDF", "desc": "Make scanned PDFs searchable or extract text", "icon": "bi-file-earmark-text-fill"},
            {"id": "cad-to-pdf", "name": "CAD to PDF/Image", "desc": "Convert DXF/DWG drawings to PDF or PNG", "icon": "bi-rulers"},
@@ -93,6 +93,37 @@ def pdf_to_text_page():
        options=[])


+@bp.route("/pdf-to-excel")
+def pdf_to_excel_page():
+    return render_template("upload_tool.html",
+        title="PDF to Excel",
+        description="Extract tables from a PDF into an .xlsx workbook",
+        notes=(
+            "<p><strong>Tip:</strong> works best on PDFs with clearly ruled tables. "
+            "For scanned PDFs (images of tables), run them through "
+            "<a href=\"/convert/ocr-pdf\">OCR PDF</a> first so the tool has text to work with.</p>"
+        ),
+        endpoint="/convert/pdf-to-excel",
+        accept=".pdf",
+        multiple=False,
+        options=[
+            {"type": "text", "name": "pages", "label": "Pages (leave empty for all)",
+             "placeholder": "e.g. 1-3, 5"},
+            {"type": "select", "name": "mode", "label": "Extraction mode", "default": "tables",
+             "choices": [
+                 {"value": "tables", "label": "Tables only (recommended)"},
+                 {"value": "tables_text", "label": "Tables, fall back to text rows when none are found"},
+                 {"value": "text", "label": "Text only — every line becomes a row"},
+             ]},
+            {"type": "select", "name": "organize", "label": "Sheet organization", "default": "per_table",
+             "choices": [
+                 {"value": "per_table", "label": "One sheet per detected table"},
+                 {"value": "per_page", "label": "One sheet per page (tables stacked)"},
+                 {"value": "combined", "label": "Everything on one sheet"},
+             ]},
+        ])
+
+
 OCR_LANGS = [
    {"value": "eng", "label": "English"},
    {"value": "ind", "label": "Indonesian"},
@@ -445,6 +476,168 @@ def pdf_to_text():
    return jsonify(text="\n".join(text_parts))


+@bp.route("/pdf-to-excel", methods=["POST"])
+def pdf_to_excel():
+    import re
+    from openpyxl import Workbook
+    from openpyxl.styles import Font
+    from openpyxl.utils import get_column_letter
+    from routes.pdf_tools import parse_page_ranges
+
+    files = request.files.getlist("files")
+    if not files or not files[0].filename:
+        return jsonify(error="No file uploaded."), 400
+
+    mode = request.form.get("mode", "tables")
+    organize = request.form.get("organize", "per_table")
+    pages_spec = request.form.get("pages", "").strip()
+
+    doc = fitz.open(stream=files[0].read(), filetype="pdf")
+
+    try:
+        target_pages = parse_page_ranges(pages_spec, len(doc))
+    except ValueError:
+        doc.close()
+        return jsonify(error="Invalid page range format."), 400
+    if not target_pages:
+        doc.close()
+        return jsonify(error="No valid pages selected."), 400
+
+    wb = Workbook()
+    wb.remove(wb.active)
+    used_names: set[str] = set()
+    total_tables = 0
+    total_text_pages = 0
+
+    def _safe_name(base: str) -> str:
+        name = re.sub(r"[\[\]\*\?\/\\:]", "_", base)[:31] or "Sheet"
+        candidate = name
+        i = 2
+        while candidate in used_names:
+            suffix = f"_{i}"
+            candidate = (name[: 31 - len(suffix)] + suffix)
+            i += 1
+        used_names.add(candidate)
+        return candidate
+
+    def _write_rows(ws, rows: list[list], start_row: int = 1, header: bool = True) -> int:
+        for r_idx, row in enumerate(rows, start=start_row):
+            for c_idx, cell in enumerate(row, start=1):
+                ws.cell(row=r_idx, column=c_idx, value="" if cell is None else str(cell))
+            if header and r_idx == start_row:
+                for c_idx in range(1, len(row) + 1):
+                    ws.cell(row=r_idx, column=c_idx).font = Font(bold=True)
+        return start_row + len(rows)
+
+    def _text_rows(page) -> list[list[str]]:
+        lines = page.get_text().splitlines()
+        rows = []
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
+            parts = re.split(r"\s{2,}|\t+", line)
+            rows.append(parts if parts else [line])
+        return rows
+
+    # ── "combined" — stream everything into a single sheet ────────────
+    if organize == "combined":
+        ws = wb.create_sheet(_safe_name("Extracted"))
+        next_row = 1
+        for pno in target_pages:
+            page = doc[pno]
+            page_had_content = False
+
+            if mode in ("tables", "tables_text"):
+                tables = list(page.find_tables())
+                for t in tables:
+                    rows = t.extract()
+                    if not rows:
+                        continue
+                    ws.cell(row=next_row, column=1, value=f"Page {pno + 1} – table").font = Font(bold=True, italic=True)
+                    next_row += 1
+                    next_row = _write_rows(ws, rows, start_row=next_row)
+                    next_row += 1
+                    total_tables += 1
+                    page_had_content = True
+
+            if mode == "text" or (mode == "tables_text" and not page_had_content):
+                text_rows = _text_rows(page)
+                if text_rows:
+                    ws.cell(row=next_row, column=1, value=f"Page {pno + 1} – text").font = Font(bold=True, italic=True)
+                    next_row += 1
+                    next_row = _write_rows(ws, text_rows, start_row=next_row, header=False)
+                    next_row += 1
+                    total_text_pages += 1
+
+    # ── "per_page" and "per_table" ────────────────────────────────────
+    else:
+        for pno in target_pages:
+            page = doc[pno]
+            tables_rows = []  # list of (label, rows)
+
+            if mode in ("tables", "tables_text"):
+                for tidx, t in enumerate(page.find_tables(), start=1):
+                    rows = t.extract()
+                    if rows:
+                        tables_rows.append((f"Table {tidx}", rows))
+                        total_tables += 1
+
+            if mode == "text" or (mode == "tables_text" and not tables_rows):
+                text_rows = _text_rows(page)
+                if text_rows:
+                    tables_rows.append(("Text", text_rows))
+                    total_text_pages += 1
+
+            if not tables_rows:
+                continue
+
+            if organize == "per_table":
+                for label, rows in tables_rows:
+                    is_text = label == "Text"
+                    sheet = wb.create_sheet(_safe_name(f"Page{pno + 1}_{label.replace(' ', '')}"))
+                    _write_rows(sheet, rows, header=not is_text)
+            else:  # per_page
+                sheet = wb.create_sheet(_safe_name(f"Page_{pno + 1}"))
+                next_row = 1
+                for label, rows in tables_rows:
+                    is_text = label == "Text"
+                    sheet.cell(row=next_row, column=1, value=label).font = Font(bold=True, italic=True)
+                    next_row += 1
+                    next_row = _write_rows(sheet, rows, start_row=next_row, header=not is_text)
+                    next_row += 1
+
+    doc.close()
+
+    if not wb.sheetnames:
+        return jsonify(error=(
+            "No tables or text found on the selected pages. "
+            "If this is a scanned PDF, run it through OCR PDF first."
+        )), 400
+
+    # Auto-size columns on every sheet (cap at 60 chars to avoid absurd widths)
+    for ws in wb.worksheets:
+        for col_idx in range(1, ws.max_column + 1):
+            max_len = 0
+            for row_idx in range(1, ws.max_row + 1):
+                v = ws.cell(row=row_idx, column=col_idx).value
+                if v is not None:
+                    max_len = max(max_len, len(str(v)))
+            ws.column_dimensions[get_column_letter(col_idx)].width = min(max_len + 2, 60)
+
+    output = io.BytesIO()
+    wb.save(output)
+    output.seek(0)
+
+    name = files[0].filename.rsplit(".", 1)[0] + ".xlsx"
+    return send_file(
+        output,
+        mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        as_attachment=True,
+        download_name=name,
+    )
+
+
@bp.route("/html-to-pdf", methods=["POST"])
 def html_to_pdf():
    html = request.form.get("text", "").strip()