From 3187980ead46e433080a9224adfa36bfdb22abf9 Mon Sep 17 00:00:00 2001 From: "hubert.lenoir" Date: Fri, 3 Apr 2026 18:23:27 +0200 Subject: [PATCH] feat: add embeds metadata --- .bruno/Chromium/Convert/HTML to PDF.bru | 5 +- .bruno/Chromium/Convert/Markdown to PDF.bru | 3 + .bruno/Chromium/Convert/URL to PDF.bru | 3 + .bruno/LibreOffice/Convert to PDF.bru | 5 +- .bruno/PDF Engines/Embed/Embed Files.bru | 6 +- Makefile | 3 +- compose.yaml | 1 + pkg/gotenberg/mocks.go | 33 +- pkg/gotenberg/pdfengine.go | 6 + pkg/modules/api/formdata.go | 33 ++ pkg/modules/chromium/routes.go | 16 +- pkg/modules/exiftool/exiftool.go | 5 + .../libreoffice/pdfengine/pdfengine.go | 5 + pkg/modules/libreoffice/routes.go | 6 + pkg/modules/pdfcpu/pdfcpu.go | 5 + pkg/modules/pdfengines/multi.go | 40 +++ pkg/modules/pdfengines/pdfengines.go | 11 + pkg/modules/pdfengines/routes.go | 42 +++ pkg/modules/pdftk/pdftk.go | 5 + pkg/modules/qpdf/qpdf.go | 287 ++++++++++++++++++ pkg/modules/qpdf/qpdf_test.go | 271 +++++++++++++++++ .../features/pdfengines_embed.feature | 15 + test/integration/scenario/containers.go | 3 +- test/integration/scenario/scenario.go | 63 ++++ 24 files changed, 848 insertions(+), 24 deletions(-) create mode 100644 pkg/modules/qpdf/qpdf_test.go diff --git a/.bruno/Chromium/Convert/HTML to PDF.bru b/.bruno/Chromium/Convert/HTML to PDF.bru index 7d219d7..34f201c 100644 --- a/.bruno/Chromium/Convert/HTML to PDF.bru +++ b/.bruno/Chromium/Convert/HTML to PDF.bru @@ -11,7 +11,7 @@ post { } body:multipart-form { - files: @file(../../test/integration/testdata/page-1-html/index.html) + files: @file(../test/integration/testdata/page-1-html/index.html) ~landscape: false ~printBackground: false ~scale: 1.0 @@ -50,6 +50,9 @@ body:multipart-form { ~metadata: {"Author":"Bruno","Title":"Test"} ~userPassword: ~ownerPassword: + ~embeds: @file(../test/integration/testdata/embed_1.xml) + ~embeds: @file(../test/integration/testdata/embed_2.xml) + ~embedsMetadata: {"embed_1.xml":{"mimeType":"text/xml","relationship":"Data"}, "embed_2.xml":{"mimeType":"text/xml","relationship":"Data"}} ~watermarkSource: text ~watermarkExpression: CONFIDENTIAL ~watermarkPages: diff --git a/.bruno/Chromium/Convert/Markdown to PDF.bru b/.bruno/Chromium/Convert/Markdown to PDF.bru index ac850fc..3b49f6a 100644 --- a/.bruno/Chromium/Convert/Markdown to PDF.bru +++ b/.bruno/Chromium/Convert/Markdown to PDF.bru @@ -51,6 +51,9 @@ body:multipart-form { ~metadata: {"Author":"Bruno","Title":"Test"} ~userPassword: ~ownerPassword: + ~embeds: @file(../test/integration/testdata/embed_1.xml) + ~embeds: @file(../test/integration/testdata/embed_2.xml) + ~embedsMetadata: {"embed_1.xml":{"mimeType":"text/xml","relationship":"Data"}, "embed_2.xml":{"mimeType":"text/xml","relationship":"Data"}} ~watermarkSource: text ~watermarkExpression: CONFIDENTIAL ~watermarkPages: diff --git a/.bruno/Chromium/Convert/URL to PDF.bru b/.bruno/Chromium/Convert/URL to PDF.bru index a40f75d..d8fd198 100644 --- a/.bruno/Chromium/Convert/URL to PDF.bru +++ b/.bruno/Chromium/Convert/URL to PDF.bru @@ -50,6 +50,9 @@ body:multipart-form { ~metadata: {"Author":"Bruno","Title":"Test"} ~userPassword: ~ownerPassword: + ~embeds: @file(../test/integration/testdata/embed_1.xml) + ~embeds: @file(../test/integration/testdata/embed_2.xml) + ~embedsMetadata: {"embed_1.xml":{"mimeType":"text/xml","relationship":"Data"}, "embed_2.xml":{"mimeType":"text/xml","relationship":"Data"}} ~watermarkSource: text ~watermarkExpression: CONFIDENTIAL ~watermarkPages: diff --git a/.bruno/LibreOffice/Convert to PDF.bru b/.bruno/LibreOffice/Convert to PDF.bru index 9081658..105e7a8 100644 --- a/.bruno/LibreOffice/Convert to PDF.bru +++ b/.bruno/LibreOffice/Convert to PDF.bru @@ -11,7 +11,7 @@ post { } body:multipart-form { - files: @file(../../test/integration/testdata/page_1.docx) + files: @file(../test/integration/testdata/page_1.docx) ~password: ~landscape: false ~nativePageRanges: @@ -67,6 +67,9 @@ body:multipart-form { ~metadata: {"Author":"Bruno","Title":"Test"} ~userPassword: ~ownerPassword: + ~embeds: @file(../test/integration/testdata/embed_1.xml) + ~embeds: @file(../test/integration/testdata/embed_2.xml) + ~embedsMetadata: {"embed_1.xml":{"mimeType":"text/xml","relationship":"Data"}, "embed_2.xml":{"mimeType":"text/xml","relationship":"Data"}} ~watermarkSource: text ~watermarkExpression: CONFIDENTIAL ~watermarkPages: diff --git a/.bruno/PDF Engines/Embed/Embed Files.bru b/.bruno/PDF Engines/Embed/Embed Files.bru index 30bf349..6b45070 100644 --- a/.bruno/PDF Engines/Embed/Embed Files.bru +++ b/.bruno/PDF Engines/Embed/Embed Files.bru @@ -11,8 +11,10 @@ post { } body:multipart-form { - files: @file(../../test/integration/testdata/page_1.pdf) - embeds: @file(../../test/integration/testdata/page_1.pdf) + files: @file(../test/integration/testdata/page_1.pdf) + embeds: @file(../test/integration/testdata/embed_1.xml) + embeds: @file(../test/integration/testdata/embed_2.xml) + embedsMetadata: {"embed_1.xml":{"mimeType":"text/xml","relationship":"Data"}, "embed_2.xml":{"mimeType":"text/xml","relationship":"Data"}} ~downloadFrom: [{"url":"https://example.com/attachment.xml","embedded":true}] } diff --git a/Makefile b/Makefile index cef5aee..de1c9f5 100644 --- a/Makefile +++ b/Makefile @@ -76,7 +76,8 @@ PDFENGINES_WATERMARK_ENGINES=pdfcpu,pdftk PDFENGINES_STAMP_ENGINES=pdfcpu,pdftk PDFENGINES_ENCRYPT_ENGINES=qpdf,pdfcpu,pdftk PDFENGINES_ROTATE_ENGINES=pdfcpu,pdftk -PDFENGINES_EMBED_ENGINES=pdfcpu +PDFENGINES_EMBED_ENGINES=qpdf,pdfcpu +PDFENGINES_EMBED_METADATA_ENGINES=qpdf PROMETHEUS_NAMESPACE=gotenberg PROMETHEUS_COLLECT_INTERVAL=1s PROMETHEUS_DISABLE_ROUTE_TELEMETRY=true diff --git a/compose.yaml b/compose.yaml index 728818b..4c1698f 100644 --- a/compose.yaml +++ b/compose.yaml @@ -77,6 +77,7 @@ services: - "--pdfengines-encrypt-engines=${PDFENGINES_ENCRYPT_ENGINES}" - "--pdfengines-rotate-engines=${PDFENGINES_ROTATE_ENGINES}" - "--pdfengines-embed-engines=${PDFENGINES_EMBED_ENGINES}" + - "--pdfengines-embed-metadata-engines=${PDFENGINES_EMBED_METADATA_ENGINES}" - "--pdfengines-disable-routes=${PDFENGINES_DISABLE_ROUTES}" - "--prometheus-namespace=${PROMETHEUS_NAMESPACE}" - "--prometheus-collect-interval=${PROMETHEUS_COLLECT_INTERVAL}" diff --git a/pkg/gotenberg/mocks.go b/pkg/gotenberg/mocks.go index ad87dfc..2b413dd 100644 --- a/pkg/gotenberg/mocks.go +++ b/pkg/gotenberg/mocks.go @@ -45,20 +45,21 @@ func (mod *DebuggableMock) Debug() map[string]any { // //nolint:dupl type PdfEngineMock struct { - MergeMock func(ctx context.Context, logger *slog.Logger, inputPaths []string, outputPath string) error - SplitMock func(ctx context.Context, logger *slog.Logger, mode SplitMode, inputPath, outputDirPath string) ([]string, error) - FlattenMock func(ctx context.Context, logger *slog.Logger, inputPath string) error - ConvertMock func(ctx context.Context, logger *slog.Logger, formats PdfFormats, inputPath, outputPath string) error - ReadMetadataMock func(ctx context.Context, logger *slog.Logger, inputPath string) (map[string]any, error) - PageCountMock func(ctx context.Context, logger *slog.Logger, inputPath string) (int, error) - WriteMetadataMock func(ctx context.Context, logger *slog.Logger, metadata map[string]any, inputPath string) error - ReadBookmarksMock func(ctx context.Context, logger *slog.Logger, inputPath string) ([]Bookmark, error) - EncryptMock func(ctx context.Context, logger *slog.Logger, inputPath, userPassword, ownerPassword string) error - EmbedFilesMock func(ctx context.Context, logger *slog.Logger, filePaths []string, inputPath string) error - WriteBookmarksMock func(ctx context.Context, logger *slog.Logger, inputPath string, bookmarks []Bookmark) error - WatermarkMock func(ctx context.Context, logger *slog.Logger, inputPath string, stamp Stamp) error - StampMock func(ctx context.Context, logger *slog.Logger, inputPath string, stamp Stamp) error - RotateMock func(ctx context.Context, logger *slog.Logger, inputPath string, angle int, pages string) error + MergeMock func(ctx context.Context, logger *slog.Logger, inputPaths []string, outputPath string) error + SplitMock func(ctx context.Context, logger *slog.Logger, mode SplitMode, inputPath, outputDirPath string) ([]string, error) + FlattenMock func(ctx context.Context, logger *slog.Logger, inputPath string) error + ConvertMock func(ctx context.Context, logger *slog.Logger, formats PdfFormats, inputPath, outputPath string) error + ReadMetadataMock func(ctx context.Context, logger *slog.Logger, inputPath string) (map[string]any, error) + PageCountMock func(ctx context.Context, logger *slog.Logger, inputPath string) (int, error) + WriteMetadataMock func(ctx context.Context, logger *slog.Logger, metadata map[string]any, inputPath string) error + ReadBookmarksMock func(ctx context.Context, logger *slog.Logger, inputPath string) ([]Bookmark, error) + EncryptMock func(ctx context.Context, logger *slog.Logger, inputPath, userPassword, ownerPassword string) error + EmbedFilesMock func(ctx context.Context, logger *slog.Logger, filePaths []string, inputPath string) error + EmbedFilesMetadataMock func(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error + WriteBookmarksMock func(ctx context.Context, logger *slog.Logger, inputPath string, bookmarks []Bookmark) error + WatermarkMock func(ctx context.Context, logger *slog.Logger, inputPath string, stamp Stamp) error + StampMock func(ctx context.Context, logger *slog.Logger, inputPath string, stamp Stamp) error + RotateMock func(ctx context.Context, logger *slog.Logger, inputPath string, angle int, pages string) error } func (engine *PdfEngineMock) Merge(ctx context.Context, logger *slog.Logger, inputPaths []string, outputPath string) error { @@ -101,6 +102,10 @@ func (engine *PdfEngineMock) EmbedFiles(ctx context.Context, logger *slog.Logger return engine.EmbedFilesMock(ctx, logger, filePaths, inputPath) } +func (engine *PdfEngineMock) EmbedFilesMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error { + return engine.EmbedFilesMetadataMock(ctx, logger, metadata, inputPath) +} + func (engine *PdfEngineMock) WriteBookmarks(ctx context.Context, logger *slog.Logger, inputPath string, bookmarks []Bookmark) error { return engine.WriteBookmarksMock(ctx, logger, inputPath, bookmarks) } diff --git a/pkg/gotenberg/pdfengine.go b/pkg/gotenberg/pdfengine.go index 0bb99bd..7f0e9d2 100644 --- a/pkg/gotenberg/pdfengine.go +++ b/pkg/gotenberg/pdfengine.go @@ -201,6 +201,12 @@ type PdfEngine interface { // TODO: attachments instead? Rename the route? EmbedFiles(ctx context.Context, logger *slog.Logger, filePaths []string, inputPath string) error + // EmbedFilesMetadata sets metadata (such as MIME type and AFRelationship) + // on already-embedded files in a PDF. The metadata map is keyed by + // filename, with each value being a map of property names to values + // (e.g., "mimeType" and "relationship"). + EmbedFilesMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error + // Watermark applies a watermark (behind page content) to a PDF file. Watermark(ctx context.Context, logger *slog.Logger, inputPath string, stamp Stamp) error diff --git a/pkg/modules/api/formdata.go b/pkg/modules/api/formdata.go index d920e9f..631f3ab 100644 --- a/pkg/modules/api/formdata.go +++ b/pkg/modules/api/formdata.go @@ -1,6 +1,7 @@ package api import ( + "encoding/json" "errors" "fmt" "math" @@ -391,6 +392,38 @@ func (form *FormData) Embeds(target *[]string) *FormData { return form } +// EmbedsMetadata parses the "embedsMetadata" form field (a JSON string) into +// a map keyed by filename. Each value is a map of property names to values +// (e.g., "mimeType" and "relationship"). +// +// var metadata map[string]map[string]string +// +// ctx.FormData().EmbedsMetadata(&metadata) +func (form *FormData) EmbedsMetadata(target *map[string]map[string]string) *FormData { + if form.errors != nil { + return form + } + + val, ok := form.values["embedsMetadata"] + if !ok || len(val) == 0 || val[0] == "" { + return form + } + + raw := val[0] + parsed := make(map[string]map[string]string) + + err := json.Unmarshal([]byte(raw), &parsed) + if err != nil { + form.append( + fmt.Errorf("form field 'embedsMetadata' is invalid: %w", err), + ) + return form + } + + *target = parsed + return form +} + // MandatoryPaths binds the absolute paths of form data files, according to a // list of file extensions, to a string slice variable. It populates an error // if there is no file for given file extensions. diff --git a/pkg/modules/chromium/routes.go b/pkg/modules/chromium/routes.go index eb8afb8..3a03568 100644 --- a/pkg/modules/chromium/routes.go +++ b/pkg/modules/chromium/routes.go @@ -421,6 +421,7 @@ func convertUrlRoute(chromium Api, engine gotenberg.PdfEngine) api.Route { stamp := pdfengines.FormDataPdfStamp(form, false) stampFile := pdfengines.FormDataPdfStampFile(form) rotateAngle, rotatePages := pdfengines.FormDataPdfRotate(form, false) + embedsMetadata := pdfengines.FormDataPdfEmbedsMetadata(form) var url string err := form. @@ -437,7 +438,7 @@ func convertUrlRoute(chromium Api, engine gotenberg.PdfEngine) api.Route { stamp.Expression = stampFile } - err = convertUrl(ctx, chromium, engine, url, options, mode, pdfFormats, metadata, userPassword, ownerPassword, embedPaths, watermark, stamp, rotateAngle, rotatePages) + err = convertUrl(ctx, chromium, engine, url, options, mode, pdfFormats, metadata, userPassword, ownerPassword, embedPaths, embedsMetadata, watermark, stamp, rotateAngle, rotatePages) if err != nil { return fmt.Errorf("convert URL to PDF: %w", err) } @@ -496,6 +497,7 @@ func convertHtmlRoute(chromium Api, engine gotenberg.PdfEngine) api.Route { stamp := pdfengines.FormDataPdfStamp(form, false) stampFile := pdfengines.FormDataPdfStampFile(form) rotateAngle, rotatePages := pdfengines.FormDataPdfRotate(form, false) + embedsMetadata := pdfengines.FormDataPdfEmbedsMetadata(form) var inputPath string err := form. @@ -514,7 +516,7 @@ func convertHtmlRoute(chromium Api, engine gotenberg.PdfEngine) api.Route { url := fmt.Sprintf("file://%s", inputPath) options.AllowedFilePrefixes = []string{ctx.DirPath()} - err = convertUrl(ctx, chromium, engine, url, options, mode, pdfFormats, metadata, userPassword, ownerPassword, embedPaths, watermark, stamp, rotateAngle, rotatePages) + err = convertUrl(ctx, chromium, engine, url, options, mode, pdfFormats, metadata, userPassword, ownerPassword, embedPaths, embedsMetadata, watermark, stamp, rotateAngle, rotatePages) if err != nil { return fmt.Errorf("convert HTML to PDF: %w", err) } @@ -575,6 +577,7 @@ func convertMarkdownRoute(chromium Api, engine gotenberg.PdfEngine) api.Route { stamp := pdfengines.FormDataPdfStamp(form, false) stampFile := pdfengines.FormDataPdfStampFile(form) rotateAngle, rotatePages := pdfengines.FormDataPdfRotate(form, false) + embedsMetadata := pdfengines.FormDataPdfEmbedsMetadata(form) var ( inputPath string @@ -602,7 +605,7 @@ func convertMarkdownRoute(chromium Api, engine gotenberg.PdfEngine) api.Route { } options.AllowedFilePrefixes = []string{ctx.DirPath()} - err = convertUrl(ctx, chromium, engine, url, options, mode, pdfFormats, metadata, userPassword, ownerPassword, embedPaths, watermark, stamp, rotateAngle, rotatePages) + err = convertUrl(ctx, chromium, engine, url, options, mode, pdfFormats, metadata, userPassword, ownerPassword, embedPaths, embedsMetadata, watermark, stamp, rotateAngle, rotatePages) if err != nil { return fmt.Errorf("convert markdown to PDF: %w", err) } @@ -727,7 +730,7 @@ func markdownToHtml(ctx *api.Context, inputPath string, markdownPaths []string) return fmt.Sprintf("file://%s", inputPath), nil } -func convertUrl(ctx *api.Context, chromium Api, engine gotenberg.PdfEngine, url string, options PdfOptions, mode gotenberg.SplitMode, pdfFormats gotenberg.PdfFormats, metadata map[string]any, userPassword, ownerPassword string, embedPaths []string, watermark, stamp gotenberg.Stamp, rotateAngle int, rotatePages string) error { +func convertUrl(ctx *api.Context, chromium Api, engine gotenberg.PdfEngine, url string, options PdfOptions, mode gotenberg.SplitMode, pdfFormats gotenberg.PdfFormats, metadata map[string]any, userPassword, ownerPassword string, embedPaths []string, embedsMetadata map[string]map[string]string, watermark, stamp gotenberg.Stamp, rotateAngle int, rotatePages string) error { outputPath := ctx.GeneratePath(".pdf") // See https://github.com/gotenberg/gotenberg/issues/1130. filename := ctx.OutputFilename(outputPath) @@ -831,6 +834,11 @@ func convertUrl(ctx *api.Context, chromium Api, engine gotenberg.PdfEngine, url return fmt.Errorf("embed files into PDFs: %w", err) } + err = pdfengines.EmbedFilesMetadataStub(ctx, engine, embedsMetadata, convertOutputPaths) + if err != nil { + return fmt.Errorf("set embeds metadata: %w", err) + } + err = pdfengines.EncryptPdfStub(ctx, engine, userPassword, ownerPassword, convertOutputPaths) if err != nil { return fmt.Errorf("encrypt PDFs: %w", err) diff --git a/pkg/modules/exiftool/exiftool.go b/pkg/modules/exiftool/exiftool.go index cde6f6a..1f71c31 100644 --- a/pkg/modules/exiftool/exiftool.go +++ b/pkg/modules/exiftool/exiftool.go @@ -531,6 +531,11 @@ func (engine *ExifTool) Rotate(ctx context.Context, logger *slog.Logger, inputPa return err } +// EmbedFilesMetadata is not available in this implementation. +func (engine *ExifTool) EmbedFilesMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error { + return fmt.Errorf("set embeds metadata with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported) +} + // Interface guards. var ( _ gotenberg.Module = (*ExifTool)(nil) diff --git a/pkg/modules/libreoffice/pdfengine/pdfengine.go b/pkg/modules/libreoffice/pdfengine/pdfengine.go index c9096a0..d451b5a 100644 --- a/pkg/modules/libreoffice/pdfengine/pdfengine.go +++ b/pkg/modules/libreoffice/pdfengine/pdfengine.go @@ -115,6 +115,11 @@ func (engine *LibreOfficePdfEngine) EmbedFiles(ctx context.Context, logger *slog return fmt.Errorf("embed files with LibreOffice: %w", gotenberg.ErrPdfEngineMethodNotSupported) } +// EmbedFilesMetadata is not available in this implementation. +func (engine *LibreOfficePdfEngine) EmbedFilesMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error { + return fmt.Errorf("set embeds metadata with LibreOffice: %w", gotenberg.ErrPdfEngineMethodNotSupported) +} + // Watermark is not available in this implementation. func (engine *LibreOfficePdfEngine) Watermark(ctx context.Context, logger *slog.Logger, inputPath string, stamp gotenberg.Stamp) error { return fmt.Errorf("watermark PDF with LibreOffice: %w", gotenberg.ErrPdfEngineMethodNotSupported) diff --git a/pkg/modules/libreoffice/routes.go b/pkg/modules/libreoffice/routes.go index f3e996c..896f2a3 100644 --- a/pkg/modules/libreoffice/routes.go +++ b/pkg/modules/libreoffice/routes.go @@ -37,6 +37,7 @@ func convertRoute(libreOffice libreofficeapi.Uno, engine gotenberg.PdfEngine) ap stamp := pdfengines.FormDataPdfStamp(form, false) stampFile := pdfengines.FormDataPdfStampFile(form) angle, rotatePages := pdfengines.FormDataPdfRotate(form, false) + embedsMetadata := pdfengines.FormDataPdfEmbedsMetadata(form) zeroValuedSplitMode := gotenberg.SplitMode{} @@ -495,6 +496,11 @@ func convertRoute(libreOffice libreofficeapi.Uno, engine gotenberg.PdfEngine) ap return fmt.Errorf("embed files into PDFs: %w", err) } + err = pdfengines.EmbedFilesMetadataStub(ctx, engine, embedsMetadata, outputPaths) + if err != nil { + return fmt.Errorf("set embeds metadata: %w", err) + } + err = pdfengines.EncryptPdfStub(ctx, engine, userPassword, ownerPassword, outputPaths) if err != nil { return fmt.Errorf("encrypt PDFs: %w", err) diff --git a/pkg/modules/pdfcpu/pdfcpu.go b/pkg/modules/pdfcpu/pdfcpu.go index 889cb01..3af7fcc 100644 --- a/pkg/modules/pdfcpu/pdfcpu.go +++ b/pkg/modules/pdfcpu/pdfcpu.go @@ -447,6 +447,11 @@ func (engine *PdfCpu) WriteBookmarks(ctx context.Context, logger *slog.Logger, i return nil } +// EmbedFilesMetadata is not available in this implementation. +func (engine *PdfCpu) EmbedFilesMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error { + return fmt.Errorf("set embeds metadata with pdfcpu: %w", gotenberg.ErrPdfEngineMethodNotSupported) +} + // EmbedFiles embeds files into a PDF. All files are embedded as file attachments // without modifying the main PDF content. func (engine *PdfCpu) EmbedFiles(ctx context.Context, logger *slog.Logger, filePaths []string, inputPath string) error { diff --git a/pkg/modules/pdfengines/multi.go b/pkg/modules/pdfengines/multi.go index 0c58e4e..5517c24 100644 --- a/pkg/modules/pdfengines/multi.go +++ b/pkg/modules/pdfengines/multi.go @@ -22,6 +22,7 @@ type multiPdfEngines struct { writeMetadataEngines []gotenberg.PdfEngine passwordEngines []gotenberg.PdfEngine embedEngines []gotenberg.PdfEngine + embedMetadataEngines []gotenberg.PdfEngine readBookmarksEngines []gotenberg.PdfEngine writeBookmarksEngines []gotenberg.PdfEngine watermarkEngines []gotenberg.PdfEngine @@ -38,6 +39,7 @@ func newMultiPdfEngines( writeMetadataEngines, passwordEngines, embedEngines, + embedMetadataEngines, readBookmarksEngines, writeBookmarksEngines, watermarkEngines, @@ -53,6 +55,7 @@ func newMultiPdfEngines( writeMetadataEngines: writeMetadataEngines, passwordEngines: passwordEngines, embedEngines: embedEngines, + embedMetadataEngines: embedMetadataEngines, readBookmarksEngines: readBookmarksEngines, writeBookmarksEngines: writeBookmarksEngines, watermarkEngines: watermarkEngines, @@ -603,6 +606,43 @@ func (multi *multiPdfEngines) Rotate(ctx context.Context, logger *slog.Logger, i return err } +// EmbedFilesMetadata sets metadata on embedded files using the first available +// engine that supports it. +// +//nolint:dupl +func (multi *multiPdfEngines) EmbedFilesMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error { + tracer := gotenberg.Tracer() + ctx, span := tracer.Start(ctx, "pdfengines.EmbedFilesMetadata", trace.WithSpanKind(trace.SpanKindInternal)) + defer span.End() + + var err error + errChan := make(chan error, 1) + + for _, engine := range multi.embedMetadataEngines { + go func(engine gotenberg.PdfEngine) { + errChan <- engine.EmbedFilesMetadata(ctx, logger, metadata, inputPath) + }(engine) + + select { + case setErr := <-errChan: + if setErr != nil { + err = errors.Join(err, setErr) + } else { + span.SetStatus(codes.Ok, "") + return nil + } + case <-ctx.Done(): + return ctx.Err() + } + } + + err = fmt.Errorf("set embeds metadata using multi PDF engines: %w", err) + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + + return err +} + // Interface guards. var ( _ gotenberg.PdfEngine = (*multiPdfEngines)(nil) diff --git a/pkg/modules/pdfengines/pdfengines.go b/pkg/modules/pdfengines/pdfengines.go index 38c3ab0..5f01286 100644 --- a/pkg/modules/pdfengines/pdfengines.go +++ b/pkg/modules/pdfengines/pdfengines.go @@ -36,6 +36,7 @@ type PdfEngines struct { writeMetadataNames []string encryptNames []string embedNames []string + embedMetadataNames []string readBookmarksNames []string writeBookmarksNames []string watermarkNames []string @@ -59,6 +60,7 @@ func (mod *PdfEngines) Descriptor() gotenberg.ModuleDescriptor { fs.StringSlice("pdfengines-write-metadata-engines", []string{"exiftool"}, "Set the PDF engines and their order for the write metadata feature - empty means all") fs.StringSlice("pdfengines-encrypt-engines", []string{"qpdf", "pdftk", "pdfcpu"}, "Set the PDF engines and their order for the password protection feature - empty means all") fs.StringSlice("pdfengines-embed-engines", []string{"pdfcpu"}, "Set the PDF engines and their order for the file embedding feature - empty means all") + fs.StringSlice("pdfengines-embed-metadata-engines", []string{"qpdf"}, "Set the PDF engines and their order for the embed metadata feature - empty means all") fs.StringSlice("pdfengines-read-bookmarks-engines", []string{"pdfcpu"}, "Set the PDF engines and their order for the read bookmarks feature - empty means all") fs.StringSlice("pdfengines-write-bookmarks-engines", []string{"pdfcpu"}, "Set the PDF engines and their order for the write bookmarks feature - empty means all") fs.StringSlice("pdfengines-watermark-engines", []string{"pdfcpu", "pdftk"}, "Set the PDF engines and their order for the watermark feature - empty means all") @@ -91,6 +93,7 @@ func (mod *PdfEngines) Provision(ctx *gotenberg.Context) error { writeMetadataNames := flags.MustStringSlice("pdfengines-write-metadata-engines") encryptNames := flags.MustStringSlice("pdfengines-encrypt-engines") embedNames := flags.MustStringSlice("pdfengines-embed-engines") + embedMetadataNames := flags.MustStringSlice("pdfengines-embed-metadata-engines") readBookmarksNames := flags.MustStringSlice("pdfengines-read-bookmarks-engines") writeBookmarksNames := flags.MustStringSlice("pdfengines-write-bookmarks-engines") watermarkNames := flags.MustStringSlice("pdfengines-watermark-engines") @@ -162,6 +165,11 @@ func (mod *PdfEngines) Provision(ctx *gotenberg.Context) error { mod.embedNames = embedNames } + mod.embedMetadataNames = defaultNames + if len(embedMetadataNames) > 0 { + mod.embedMetadataNames = embedMetadataNames + } + mod.readBookmarksNames = defaultNames if len(readBookmarksNames) > 0 { mod.readBookmarksNames = readBookmarksNames @@ -236,6 +244,7 @@ func (mod *PdfEngines) Validate() error { findNonExistingEngines(mod.writeMetadataNames) findNonExistingEngines(mod.encryptNames) findNonExistingEngines(mod.embedNames) + findNonExistingEngines(mod.embedMetadataNames) findNonExistingEngines(mod.readBookmarksNames) findNonExistingEngines(mod.writeBookmarksNames) findNonExistingEngines(mod.watermarkNames) @@ -261,6 +270,7 @@ func (mod *PdfEngines) SystemMessages() []string { fmt.Sprintf("write metadata engines - %s", strings.Join(mod.writeMetadataNames, " ")), fmt.Sprintf("encrypt engines - %s", strings.Join(mod.encryptNames, " ")), fmt.Sprintf("embed engines - %s", strings.Join(mod.embedNames, " ")), + fmt.Sprintf("embed metadata engines - %s", strings.Join(mod.embedMetadataNames, " ")), fmt.Sprintf("read bookmarks engines - %s", strings.Join(mod.readBookmarksNames, " ")), fmt.Sprintf("write bookmarks engines - %s", strings.Join(mod.writeBookmarksNames, " ")), fmt.Sprintf("watermark engines - %s", strings.Join(mod.watermarkNames, " ")), @@ -294,6 +304,7 @@ func (mod *PdfEngines) PdfEngine() (gotenberg.PdfEngine, error) { engines(mod.writeMetadataNames), engines(mod.encryptNames), engines(mod.embedNames), + engines(mod.embedMetadataNames), engines(mod.readBookmarksNames), engines(mod.writeBookmarksNames), engines(mod.watermarkNames), diff --git a/pkg/modules/pdfengines/routes.go b/pkg/modules/pdfengines/routes.go index b1df965..4365ef7 100644 --- a/pkg/modules/pdfengines/routes.go +++ b/pkg/modules/pdfengines/routes.go @@ -443,6 +443,30 @@ func FormDataPdfEmbeds(form *api.FormData) []string { return embedPaths } +// FormDataPdfEmbedsMetadata extracts embeds metadata from form data. +// The "embedsMetadata" field is a JSON string keyed by filename. +func FormDataPdfEmbedsMetadata(form *api.FormData) map[string]map[string]string { + var metadata map[string]map[string]string + form.EmbedsMetadata(&metadata) + return metadata +} + +// EmbedFilesMetadataStub sets metadata on embedded files in PDFs. +func EmbedFilesMetadataStub(ctx *api.Context, engine gotenberg.PdfEngine, metadata map[string]map[string]string, inputPaths []string) error { + if len(metadata) == 0 { + return nil + } + + for _, inputPath := range inputPaths { + err := engine.EmbedFilesMetadata(ctx, ctx.Log(), metadata, inputPath) + if err != nil { + return fmt.Errorf("set embeds metadata on PDF '%s': %w", inputPath, err) + } + } + + return nil +} + // FormDataPdfEncrypt extracts encryption parameters from form data. func FormDataPdfEncrypt(form *api.FormData) (userPassword, ownerPassword string) { form.String("userPassword", &userPassword, "") @@ -638,6 +662,7 @@ func mergeRoute(engine gotenberg.PdfEngine) api.Route { stamp := FormDataPdfStamp(form, false) stampFile := FormDataPdfStampFile(form) angle, rotatePages := FormDataPdfRotate(form, false) + embedsMetadata := FormDataPdfEmbedsMetadata(form) var inputPaths []string var flatten bool @@ -754,6 +779,11 @@ func mergeRoute(engine gotenberg.PdfEngine) api.Route { return fmt.Errorf("embed files into PDFs: %w", err) } + err = EmbedFilesMetadataStub(ctx, engine, embedsMetadata, outputPaths) + if err != nil { + return fmt.Errorf("set embeds metadata: %w", err) + } + err = EncryptPdfStub(ctx, engine, userPassword, ownerPassword, outputPaths) if err != nil { return fmt.Errorf("encrypt PDFs: %w", err) @@ -789,6 +819,7 @@ func splitRoute(engine gotenberg.PdfEngine) api.Route { stamp := FormDataPdfStamp(form, false) stampFile := FormDataPdfStampFile(form) angle, rotatePages := FormDataPdfRotate(form, false) + embedsMetadata := FormDataPdfEmbedsMetadata(form) var inputPaths []string var flatten bool @@ -856,6 +887,11 @@ func splitRoute(engine gotenberg.PdfEngine) api.Route { return fmt.Errorf("embed files into PDFs: %w", err) } + err = EmbedFilesMetadataStub(ctx, engine, embedsMetadata, convertOutputPaths) + if err != nil { + return fmt.Errorf("set embeds metadata: %w", err) + } + err = EncryptPdfStub(ctx, engine, userPassword, ownerPassword, convertOutputPaths) if err != nil { return fmt.Errorf("encrypt PDFs: %w", err) @@ -1180,6 +1216,7 @@ func embedRoute(engine gotenberg.PdfEngine) api.Route { form := ctx.FormData() embedPaths := FormDataPdfEmbeds(form) + embedsMetadata := FormDataPdfEmbedsMetadata(form) var inputPaths []string err := form. @@ -1193,6 +1230,11 @@ func embedRoute(engine gotenberg.PdfEngine) api.Route { return fmt.Errorf("embed files into PDFs: %w", err) } + err = EmbedFilesMetadataStub(ctx, engine, embedsMetadata, inputPaths) + if err != nil { + return fmt.Errorf("set embeds metadata: %w", err) + } + err = ctx.AddOutputPaths(inputPaths...) if err != nil { return fmt.Errorf("add output paths: %w", err) diff --git a/pkg/modules/pdftk/pdftk.go b/pkg/modules/pdftk/pdftk.go index 87c2e10..fad32cf 100644 --- a/pkg/modules/pdftk/pdftk.go +++ b/pkg/modules/pdftk/pdftk.go @@ -495,6 +495,11 @@ func (engine *PdfTk) Rotate(ctx context.Context, logger *slog.Logger, inputPath return nil } +// EmbedFilesMetadata is not available in this implementation. +func (engine *PdfTk) EmbedFilesMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error { + return fmt.Errorf("set embeds metadata with PDFtk: %w", gotenberg.ErrPdfEngineMethodNotSupported) +} + // Interface guards. var ( _ gotenberg.Module = (*PdfTk)(nil) diff --git a/pkg/modules/qpdf/qpdf.go b/pkg/modules/qpdf/qpdf.go index b5bd67a..8a7f76d 100644 --- a/pkg/modules/qpdf/qpdf.go +++ b/pkg/modules/qpdf/qpdf.go @@ -3,12 +3,14 @@ package qpdf import ( "bytes" "context" + "encoding/json" "errors" "fmt" "log/slog" "os" "os/exec" "path/filepath" + "strings" "syscall" "go.opentelemetry.io/otel/codes" @@ -348,6 +350,291 @@ func (engine *QPdf) EmbedFiles(ctx context.Context, logger *slog.Logger, filePat return err } +// EmbedFilesMetadata sets metadata on already-embedded files in a PDF using +// QPDF's JSON manipulation. It sets /AFRelationship on Filespec objects, +// /Subtype on EmbeddedFile streams, and ensures the Catalog /AF array +// references the Filespec objects. +func (engine *QPdf) EmbedFilesMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error { + ctx, span := gotenberg.Tracer().Start(ctx, "qpdf.EmbedFilesMetadata", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(semconv.ServerAddress(engine.binPath)), + ) + defer span.End() + + if len(metadata) == 0 { + span.SetStatus(codes.Ok, "") + return nil + } + + logger.DebugContext(ctx, fmt.Sprintf("setting embeds metadata on %s with QPDF", inputPath)) + + args := append([]string{inputPath}, engine.globalArgs...) + args = append(args, "--newline-before-endstream", "--json-output") + + output, err := engine.execCaptureOutput(ctx, args...) + if err != nil { + err = fmt.Errorf("get PDF JSON with QPDF: %w", err) + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + return err + } + + objects, err := parsePdfObjects(output) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + return err + } + + catalogRef, catalogValue, filespecRefs, updateObjects := patchFilespecMetadata(logger, objects, metadata) + if len(filespecRefs) == 0 { + span.SetStatus(codes.Ok, "") + return nil + } + + patchCatalogAF(catalogRef, catalogValue, filespecRefs, updateObjects) + + err = engine.writeAndApplyUpdate(ctx, logger, inputPath, updateObjects) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + return err + } + + span.SetStatus(codes.Ok, "") + return nil +} + +// execCaptureOutput runs QPDF and returns its stdout. This uses +// exec.CommandContext directly because gotenberg.Cmd does not support +// capturing stdout (it only pipes to debug logs). +func (engine *QPdf) execCaptureOutput(ctx context.Context, args ...string) ([]byte, error) { + cmd := exec.CommandContext(ctx, engine.binPath, args...) //nolint:gosec + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + return cmd.Output() +} + +// parsePdfObjects parses QPDF JSON v2 output and returns the objects map. +func parsePdfObjects(output []byte) (map[string]json.RawMessage, error) { + var pdfJSON struct { + Qpdf []json.RawMessage `json:"qpdf"` + } + if err := json.Unmarshal(output, &pdfJSON); err != nil { + return nil, fmt.Errorf("parse PDF JSON: %w", err) + } + if len(pdfJSON.Qpdf) < 2 { + return nil, fmt.Errorf("unexpected QPDF JSON structure: expected at least 2 elements") + } + + var objects map[string]json.RawMessage + if err := json.Unmarshal(pdfJSON.Qpdf[1], &objects); err != nil { + return nil, fmt.Errorf("parse QPDF objects: %w", err) + } + + return objects, nil +} + +// patchFilespecMetadata walks QPDF objects to find Filespecs matching the +// metadata keys. It sets /AFRelationship and /Subtype on matching objects +// and returns the catalog reference, catalog value, filespec references, +// and the update objects map. +func patchFilespecMetadata(logger *slog.Logger, objects map[string]json.RawMessage, metadata map[string]map[string]string) (string, map[string]any, []string, map[string]any) { + updateObjects := make(map[string]any) + var catalogRef string + var catalogValue map[string]any + var filespecRefs []string + + for ref, raw := range objects { + var obj map[string]json.RawMessage + if err := json.Unmarshal(raw, &obj); err != nil { + continue + } + + valueRaw, hasValue := obj["value"] + if !hasValue { + continue + } + + var value map[string]any + if err := json.Unmarshal(valueRaw, &value); err != nil { + continue + } + + typeVal, _ := value["/Type"].(string) + + if typeVal == "/Catalog" { + catalogRef = ref + catalogValue = value + } + + if typeVal == "/Filespec" { + uf, _ := value["/UF"].(string) + if uf == "" { + uf, _ = value["/F"].(string) + } + + cleanUf := stripQpdfStringPrefix(uf) + + meta, exists := metadata[cleanUf] + if !exists { + continue + } + + if rel, ok := meta["relationship"]; ok { + value["/AFRelationship"] = "/" + rel + } + + if mimeType, ok := meta["mimeType"]; ok { + if ef, ok := value["/EF"].(map[string]any); ok { + efRef, _ := ef["/F"].(string) + if efRef != "" { + setStreamSubtype(logger, objects, updateObjects, efRef, mimeType) + } + } + } + + filespecRefs = append(filespecRefs, ref) + updateObjects[ref] = map[string]any{"value": value} + } + } + + return catalogRef, catalogValue, filespecRefs, updateObjects +} + +// patchCatalogAF ensures the Catalog /AF array references all filespec objects. +func patchCatalogAF(catalogRef string, catalogValue map[string]any, filespecRefs []string, updateObjects map[string]any) { + if catalogRef == "" || catalogValue == nil { + return + } + + afSet := make(map[string]bool) + existingAF, _ := catalogValue["/AF"].([]any) + for _, r := range existingAF { + if s, ok := r.(string); ok { + afSet[s] = true + } + } + for _, ref := range filespecRefs { + // Object references in values use "9 0 R" format, + // not the "obj:9 0 R" key format. + valRef := strings.TrimPrefix(ref, "obj:") + if !afSet[valRef] { + existingAF = append(existingAF, valRef) + } + } + catalogValue["/AF"] = existingAF + updateObjects[catalogRef] = map[string]any{"value": catalogValue} +} + +// writeAndApplyUpdate marshals the update objects as QPDF JSON v2, writes +// them to a temp file, and applies the update via --update-from-json. +func (engine *QPdf) writeAndApplyUpdate(ctx context.Context, logger *slog.Logger, inputPath string, updateObjects map[string]any) error { + updateJSON := map[string]any{ + "qpdf": []any{ + map[string]any{ + "jsonversion": 2, + "pushedinheritedpageresources": false, + "calledgetallpages": false, + "maxobjectid": 0, + }, + updateObjects, + }, + } + + jsonBytes, err := json.Marshal(updateJSON) + if err != nil { + return fmt.Errorf("marshal update JSON: %w", err) + } + + tmpFile, err := os.CreateTemp(filepath.Dir(inputPath), "qpdf-embeds-metadata-*.json") + if err != nil { + return fmt.Errorf("create temp file for update JSON: %w", err) + } + defer os.Remove(tmpFile.Name()) + + if _, err := tmpFile.Write(jsonBytes); err != nil { + tmpFile.Close() + return fmt.Errorf("write update JSON: %w", err) + } + if err := tmpFile.Close(); err != nil { + return fmt.Errorf("close temp file: %w", err) + } + + updateArgs := make([]string, 0, 5+len(engine.globalArgs)) + updateArgs = append(updateArgs, inputPath) + updateArgs = append(updateArgs, engine.globalArgs...) + updateArgs = append(updateArgs, "--newline-before-endstream") + updateArgs = append(updateArgs, "--update-from-json="+tmpFile.Name()) + updateArgs = append(updateArgs, "--replace-input") + + cmd, err := gotenberg.CommandContext(ctx, logger, engine.binPath, updateArgs...) + if err != nil { + return fmt.Errorf("create command for JSON update: %w", err) + } + + _, err = cmd.Exec() + if err != nil { + return fmt.Errorf("update embeds metadata with QPDF: %w", err) + } + + return nil +} + +// setStreamSubtype finds a stream object by reference and sets the /Subtype +// key in its dict. +func setStreamSubtype(logger *slog.Logger, objects map[string]json.RawMessage, updateObjects map[string]any, ref, mimeType string) { + objKey := ref + if !strings.HasPrefix(objKey, "obj:") { + objKey = "obj:" + objKey + } + raw, ok := objects[objKey] + if !ok { + logger.Warn(fmt.Sprintf("set stream subtype on %s: object not found", ref)) + return + } + + var obj map[string]json.RawMessage + if err := json.Unmarshal(raw, &obj); err != nil { + logger.Warn(fmt.Sprintf("set stream subtype on %s: unmarshal object: %s", ref, err)) + return + } + + streamRaw, ok := obj["stream"] + if !ok { + logger.Warn(fmt.Sprintf("set stream subtype on %s: no stream key", ref)) + return + } + + var stream map[string]any + if err := json.Unmarshal(streamRaw, &stream); err != nil { + logger.Warn(fmt.Sprintf("set stream subtype on %s: unmarshal stream: %s", ref, err)) + return + } + + dict, ok := stream["dict"].(map[string]any) + if !ok { + logger.Warn(fmt.Sprintf("set stream subtype on %s: stream dict is not a map", ref)) + return + } + + // QPDF JSON uses literal name syntax; it handles PDF name + // encoding internally when writing the binary PDF. + dict["/Subtype"] = "/" + mimeType + stream["dict"] = dict + updateObjects[objKey] = map[string]any{"stream": stream} +} + +// stripQpdfStringPrefix removes the type prefix that QPDF adds to JSON +// string values. Known prefixes: "u:" (Unicode), "b:" (binary), "e:" (encoded). +func stripQpdfStringPrefix(s string) string { + for _, prefix := range []string{"u:", "b:", "e:"} { + if strings.HasPrefix(s, prefix) { + return s[len(prefix):] + } + } + return s +} + // Watermark is not available in this implementation. func (engine *QPdf) Watermark(ctx context.Context, logger *slog.Logger, inputPath string, stamp gotenberg.Stamp) error { _, span := gotenberg.Tracer().Start(ctx, "qpdf.Watermark", diff --git a/pkg/modules/qpdf/qpdf_test.go b/pkg/modules/qpdf/qpdf_test.go new file mode 100644 index 0000000..1c61569 --- /dev/null +++ b/pkg/modules/qpdf/qpdf_test.go @@ -0,0 +1,271 @@ +package qpdf + +import ( + "encoding/json" + "log/slog" + "os" + "testing" +) + +func TestStripQpdfStringPrefix(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + {"unicode prefix", "u:factur-x.xml", "factur-x.xml"}, + {"binary prefix", "b:binary.bin", "binary.bin"}, + {"encoded prefix", "e:encoded.txt", "encoded.txt"}, + {"no prefix", "plain.xml", "plain.xml"}, + {"empty string", "", ""}, + {"prefix only", "u:", ""}, + {"colon in value", "u:file:name.xml", "file:name.xml"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := stripQpdfStringPrefix(tt.input) + if got != tt.expected { + t.Errorf("stripQpdfStringPrefix(%q) = %q, want %q", tt.input, got, tt.expected) + } + }) + } +} + +func TestParsePdfObjects(t *testing.T) { + tests := []struct { + name string + input string + wantKeys []string + wantError bool + }{ + { + name: "valid QPDF JSON v2", + input: `{"qpdf":[{"jsonversion":2},{"obj:1 0 R":{"value":{"/Type":"/Catalog"}}}]}`, + wantKeys: []string{"obj:1 0 R"}, + }, + { + name: "invalid JSON", + input: `not json`, + wantError: true, + }, + { + name: "empty qpdf array", + input: `{"qpdf":[]}`, + wantError: true, + }, + { + name: "only header element", + input: `{"qpdf":[{"jsonversion":2}]}`, + wantError: true, + }, + { + name: "multiple objects", + input: `{"qpdf":[{},{"obj:1 0 R":{"value":{}},"obj:2 0 R":{"value":{}}}]}`, + wantKeys: []string{"obj:1 0 R", "obj:2 0 R"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + objects, err := parsePdfObjects([]byte(tt.input)) + if tt.wantError { + if err == nil { + t.Error("expected error, got nil") + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + for _, key := range tt.wantKeys { + if _, ok := objects[key]; !ok { + t.Errorf("expected key %q in objects", key) + } + } + }) + } +} + +func TestPatchFilespecMetadata(t *testing.T) { + logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError})) + + t.Run("sets AFRelationship on matching Filespec", func(t *testing.T) { + objects := map[string]json.RawMessage{ + "obj:1 0 R": json.RawMessage(`{"value":{"/Type":"/Catalog"}}`), + "obj:2 0 R": json.RawMessage(`{"value":{"/Type":"/Filespec","/UF":"u:factur-x.xml"}}`), + } + metadata := map[string]map[string]string{ + "factur-x.xml": {"relationship": "Data"}, + } + + catalogRef, _, filespecRefs, updateObjects := patchFilespecMetadata(logger, objects, metadata) + + if catalogRef != "obj:1 0 R" { + t.Errorf("catalogRef = %q, want %q", catalogRef, "obj:1 0 R") + } + if len(filespecRefs) != 1 || filespecRefs[0] != "obj:2 0 R" { + t.Errorf("filespecRefs = %v, want [obj:2 0 R]", filespecRefs) + } + updated, ok := updateObjects["obj:2 0 R"] + if !ok { + t.Fatal("expected obj:2 0 R in updateObjects") + } + value := updated.(map[string]any)["value"].(map[string]any) + if value["/AFRelationship"] != "/Data" { + t.Errorf("/AFRelationship = %v, want /Data", value["/AFRelationship"]) + } + }) + + t.Run("skips Filespec with no matching metadata", func(t *testing.T) { + objects := map[string]json.RawMessage{ + "obj:1 0 R": json.RawMessage(`{"value":{"/Type":"/Filespec","/UF":"u:other.xml"}}`), + } + metadata := map[string]map[string]string{ + "factur-x.xml": {"relationship": "Data"}, + } + + _, _, filespecRefs, _ := patchFilespecMetadata(logger, objects, metadata) + if len(filespecRefs) != 0 { + t.Errorf("filespecRefs = %v, want empty", filespecRefs) + } + }) + + t.Run("falls back to /F when /UF is absent", func(t *testing.T) { + objects := map[string]json.RawMessage{ + "obj:1 0 R": json.RawMessage(`{"value":{"/Type":"/Filespec","/F":"u:factur-x.xml"}}`), + } + metadata := map[string]map[string]string{ + "factur-x.xml": {"relationship": "Alternative"}, + } + + _, _, filespecRefs, updateObjects := patchFilespecMetadata(logger, objects, metadata) + if len(filespecRefs) != 1 { + t.Fatalf("filespecRefs = %v, want 1 entry", filespecRefs) + } + value := updateObjects["obj:1 0 R"].(map[string]any)["value"].(map[string]any) + if value["/AFRelationship"] != "/Alternative" { + t.Errorf("/AFRelationship = %v, want /Alternative", value["/AFRelationship"]) + } + }) + + t.Run("sets stream Subtype via EF reference", func(t *testing.T) { + objects := map[string]json.RawMessage{ + "obj:2 0 R": json.RawMessage(`{"value":{"/Type":"/Filespec","/UF":"u:factur-x.xml","/EF":{"/F":"3 0 R"}}}`), + "obj:3 0 R": json.RawMessage(`{"stream":{"dict":{"/Type":"/EmbeddedFile"}}}`), + } + metadata := map[string]map[string]string{ + "factur-x.xml": {"mimeType": "text/xml"}, + } + + _, _, _, updateObjects := patchFilespecMetadata(logger, objects, metadata) + streamObj, ok := updateObjects["obj:3 0 R"] + if !ok { + t.Fatal("expected obj:3 0 R in updateObjects") + } + stream := streamObj.(map[string]any)["stream"].(map[string]any) + dict := stream["dict"].(map[string]any) + if dict["/Subtype"] != "/text/xml" { + t.Errorf("/Subtype = %v, want /text/xml", dict["/Subtype"]) + } + }) +} + +func TestPatchCatalogAF(t *testing.T) { + t.Run("adds filespec refs to AF array", func(t *testing.T) { + catalogValue := map[string]any{"/Type": "/Catalog"} + updateObjects := make(map[string]any) + + patchCatalogAF("obj:1 0 R", catalogValue, []string{"obj:2 0 R", "obj:3 0 R"}, updateObjects) + + af, ok := catalogValue["/AF"].([]any) + if !ok { + t.Fatal("expected /AF to be []any") + } + if len(af) != 2 { + t.Fatalf("/AF has %d entries, want 2", len(af)) + } + if af[0] != "2 0 R" || af[1] != "3 0 R" { + t.Errorf("/AF = %v, want [2 0 R, 3 0 R]", af) + } + }) + + t.Run("does not duplicate existing refs", func(t *testing.T) { + catalogValue := map[string]any{ + "/Type": "/Catalog", + "/AF": []any{"2 0 R"}, + } + updateObjects := make(map[string]any) + + patchCatalogAF("obj:1 0 R", catalogValue, []string{"obj:2 0 R", "obj:3 0 R"}, updateObjects) + + af := catalogValue["/AF"].([]any) + if len(af) != 2 { + t.Fatalf("/AF has %d entries, want 2", len(af)) + } + }) + + t.Run("no-op when catalogRef is empty", func(t *testing.T) { + updateObjects := make(map[string]any) + patchCatalogAF("", nil, []string{"obj:2 0 R"}, updateObjects) + if len(updateObjects) != 0 { + t.Error("expected no updates for empty catalogRef") + } + }) +} + +func TestSetStreamSubtype(t *testing.T) { + logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError})) + + t.Run("sets Subtype in stream dict", func(t *testing.T) { + objects := map[string]json.RawMessage{ + "obj:3 0 R": json.RawMessage(`{"stream":{"dict":{"/Type":"/EmbeddedFile"}}}`), + } + updateObjects := make(map[string]any) + + setStreamSubtype(logger, objects, updateObjects, "obj:3 0 R", "text/xml") + + streamObj := updateObjects["obj:3 0 R"].(map[string]any)["stream"].(map[string]any) + dict := streamObj["dict"].(map[string]any) + if dict["/Subtype"] != "/text/xml" { + t.Errorf("/Subtype = %v, want /text/xml", dict["/Subtype"]) + } + }) + + t.Run("auto-adds obj: prefix to ref", func(t *testing.T) { + objects := map[string]json.RawMessage{ + "obj:5 0 R": json.RawMessage(`{"stream":{"dict":{}}}`), + } + updateObjects := make(map[string]any) + + setStreamSubtype(logger, objects, updateObjects, "5 0 R", "application/pdf") + + if _, ok := updateObjects["obj:5 0 R"]; !ok { + t.Error("expected obj:5 0 R in updateObjects") + } + }) + + t.Run("warns on missing object", func(t *testing.T) { + objects := map[string]json.RawMessage{} + updateObjects := make(map[string]any) + + setStreamSubtype(logger, objects, updateObjects, "obj:99 0 R", "text/xml") + + if len(updateObjects) != 0 { + t.Error("expected no updates for missing object") + } + }) + + t.Run("warns on object without stream key", func(t *testing.T) { + objects := map[string]json.RawMessage{ + "obj:3 0 R": json.RawMessage(`{"value":{"/Type":"/Page"}}`), + } + updateObjects := make(map[string]any) + + setStreamSubtype(logger, objects, updateObjects, "obj:3 0 R", "text/xml") + + if len(updateObjects) != 0 { + t.Error("expected no updates for non-stream object") + } + }) +} diff --git a/test/integration/features/pdfengines_embed.feature b/test/integration/features/pdfengines_embed.feature index 4bf9ca7..18543e2 100644 --- a/test/integration/features/pdfengines_embed.feature +++ b/test/integration/features/pdfengines_embed.feature @@ -17,6 +17,21 @@ Feature: /forms/pdfengines/embed Then the response PDF(s) should have the "embed_1.xml" file embedded Then the response PDF(s) should have the "embed_2.xml" file embedded + Scenario: POST /forms/pdfengines/embed with metadata + Given I have a default Gotenberg container + When I make a "POST" request to Gotenberg at the "/forms/pdfengines/embed" endpoint with the following form data and header(s): + | files | testdata/page_1.pdf | file | + | embeds | testdata/embed_1.xml | file | + | embeds | testdata/embed_2.xml | file | + | embedsMetadata | {"embed_1.xml":{"mimeType":"text/xml","relationship":"Data"},"embed_2.xml":{"mimeType":"text/xml","relationship":"Alternative"}} | field | + Then the response status code should be 200 + And the response header "Content-Type" should be "application/pdf" + And there should be 1 PDF(s) in the response + And the response PDF(s) should have the "embed_1.xml" file embedded + And the response PDF(s) should have the "embed_1.xml" file embedded with relationship "Data" + And the response PDF(s) should have the "embed_2.xml" file embedded + And the response PDF(s) should have the "embed_2.xml" file embedded with relationship "Alternative" + @download-from Scenario: POST /forms/pdfengines/embed with (Download From) Given I have a default Gotenberg container diff --git a/test/integration/scenario/containers.go b/test/integration/scenario/containers.go index 90abab4..6ee0ac1 100644 --- a/test/integration/scenario/containers.go +++ b/test/integration/scenario/containers.go @@ -9,6 +9,7 @@ import ( "github.com/moby/moby/api/types/container" "github.com/testcontainers/testcontainers-go" + "github.com/testcontainers/testcontainers-go/exec" "github.com/testcontainers/testcontainers-go/network" "github.com/testcontainers/testcontainers-go/wait" ) @@ -138,7 +139,7 @@ func execCommandInIntegrationToolsContainer(ctx context.Context, cmd []string, p } }(c, ctx) - _, output, err := c.Exec(ctx, cmd) + _, output, err := c.Exec(ctx, cmd, exec.Multiplexed()) if err != nil { return "", fmt.Errorf("exec %q: %w", cmd, err) } diff --git a/test/integration/scenario/scenario.go b/test/integration/scenario/scenario.go index 95dd136..419f77c 100644 --- a/test/integration/scenario/scenario.go +++ b/test/integration/scenario/scenario.go @@ -1262,6 +1262,68 @@ func (s *scenario) thePdfsShouldHaveEmbeddedFile(ctx context.Context, kind, shou return nil } +func (s *scenario) thePdfsShouldHaveEmbeddedFileWithRelationship(ctx context.Context, kind, embed, relationship string) error { + dirPath := s.teststoreDir + + _, err := os.Stat(dirPath) + if os.IsNotExist(err) { + return fmt.Errorf("directory %q does not exist", dirPath) + } + + var paths []string + err = filepath.Walk(dirPath, func(path string, info os.FileInfo, pathErr error) error { + if pathErr != nil { + return pathErr + } + if strings.EqualFold(filepath.Ext(info.Name()), ".pdf") { + paths = append(paths, path) + } + return nil + }) + if err != nil { + return fmt.Errorf("walk %q: %w", dirPath, err) + } + + for _, path := range paths { + cmd := []string{ + "verapdf", + "--off", + "--loglevel", + "0", + "--extract", + "embeddedFile", + filepath.Base(path), + } + + output, err := execCommandInIntegrationToolsContainer(ctx, cmd, path) + if err != nil { + return fmt.Errorf("exec %q: %w", cmd, err) + } + + fileNameTag := fmt.Sprintf("%s", embed) + relationshipTag := fmt.Sprintf("%s", relationship) + + blocks := strings.Split(output, "") + found := false + for _, block := range blocks { + if !strings.Contains(block, fileNameTag) { + continue + } + if !strings.Contains(block, relationshipTag) { + return fmt.Errorf("embedded file %q missing afRelationship %q", embed, relationship) + } + found = true + break + } + + if !found { + return fmt.Errorf("embedded file %q not found in verapdf output", embed) + } + } + + return nil +} + func InitializeScenario(ctx *godog.ScenarioContext) { s := &scenario{} ctx.Before(func(ctx context.Context, sc *godog.Scenario) (context.Context, error) { @@ -1300,6 +1362,7 @@ func InitializeScenario(ctx *godog.ScenarioContext) { ctx.Then(`^the (response|webhook request) PDF\(s\) (should|should NOT) be flatten$`, s.thePdfsShouldBeFlatten) ctx.Then(`^the (response|webhook request) PDF\(s\) (should|should NOT) be encrypted`, s.thePdfsShouldBeEncrypted) ctx.Then(`^the (response|webhook request) PDF\(s\) (should|should NOT) have the "([^"]*)" file embedded$`, s.thePdfsShouldHaveEmbeddedFile) + ctx.Then(`^the (response|webhook request) PDF\(s\) should have the "([^"]*)" file embedded with relationship "([^"]*)"$`, s.thePdfsShouldHaveEmbeddedFileWithRelationship) ctx.Then(`^the "([^"]*)" PDF should have (\d+) page\(s\)$`, s.thePdfShouldHavePages) ctx.Then(`^the "([^"]*)" PDF (should|should NOT) be set to landscape orientation$`, s.thePdfShouldBeSetToLandscapeOrientation) ctx.Then(`^the "([^"]*)" PDF (should|should NOT) have the following content at page (\d+):$`, s.thePdfShouldHaveTheFollowingContentAtPage)