Files
gotenberg/pkg/modules/exiftool/exiftool.go
T

576 lines
20 KiB
Go

package exiftool
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"os"
	"os/exec"
	"regexp"
	"slices"
	"strconv"
	"strings"
	"sync"
	"syscall"

	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	semconv "go.opentelemetry.io/otel/semconv/v1.40.0"
	"go.opentelemetry.io/otel/trace"

	"github.com/gotenberg/gotenberg/v8/pkg/gotenberg"
)
func init() {
gotenberg.MustRegisterModule(new(ExifTool))
}
// safeKeyPattern is the allow-list for metadata keys. A key must be at
// least one character long and may only contain ASCII letters, digits,
// underscores, periods, colons, and hyphens. The first character may not
// be a hyphen, otherwise exiftool would treat the argv entry as a flag
// rather than a tag assignment. Anything outside the ASCII class,
// including control characters and the empty string, fails to match.
var safeKeyPattern = regexp.MustCompile(`^[a-zA-Z0-9_.:][a-zA-Z0-9\-_.:]*$`)
// validateMetadataValue checks a single metadata value before it is placed
// on the exiftool command line. A value is refused when it contains a NUL
// byte, a line feed, or a carriage return: NUL cannot be carried in an argv
// entry, and line breaks are refused as defense in depth against exiftool
// parsing quirks.
//
// key is only used to make the error message actionable. The returned error
// wraps [gotenberg.ErrPdfEngineMetadataValueNotSupported] so the API layer
// can surface it as HTTP 400. A nil error means the value is acceptable.
func validateMetadataValue(key, value string) error {
	if !strings.ContainsAny(value, "\n\r\x00") {
		return nil
	}

	return fmt.Errorf(
		"write PDF metadata with ExifTool: invalid metadata value for key %q (contains control character): %w",
		key,
		gotenberg.ErrPdfEngineMetadataValueNotSupported,
	)
}
// systemTags lists ExifTool tags that reflect internal filesystem state
// or tool identity rather than actual PDF metadata. ReadMetadata deletes
// every one of these keys from the parsed result before returning it, so
// callers never see temp paths, file attributes, or ExifTool diagnostics.
var systemTags = []string{
"SourceFile", // Full path exiftool -j always emits first
"FileName", // Reflects UUID-based disk name, not original filename
"Directory", // Leaks internal temp path
"FileSize", // System attribute
"FileModifyDate", // System attribute
"FileAccessDate", // System attribute
"FileInodeChangeDate", // System attribute
"FilePermissions", // System attribute
"ExifToolVersion", // Tool metadata
"Error", // Extraction error messages
"Warning", // Extraction warning messages
}
// dangerousTags enumerates the ExifTool pseudo-tags whose assignment has a
// filesystem side effect (rename, move, link creation, permission change)
// instead of editing metadata. Keys matching one of these names are dropped
// before the argv is handed to exiftool. Matching ignores any group prefix
// (e.g. "System:FileName" is treated as "FileName") because exiftool treats
// the prefixed and bare forms identically.
//
// See https://exiftool.org/TagNames/Extra.html.
var dangerousTags = []string{
	"FileName",        // Writing this triggers a file rename in ExifTool
	"Directory",       // Writing this triggers a file move in ExifTool
	"HardLink",        // Writing this creates a hard link in ExifTool
	"SymLink",         // Writing this creates a symbolic link in ExifTool
	"FilePermissions", // Writing this changes the file's permissions
}

// isDangerousTag reports whether key names one of the [dangerousTags].
// Everything up to and including the last colon is discarded first, and the
// remaining tag name is compared case-insensitively.
func isDangerousTag(key string) bool {
	// LastIndex yields -1 when there is no colon, which makes the slice
	// below start at 0 and keep the whole key.
	name := key[strings.LastIndex(key, ":")+1:]

	for i := range dangerousTags {
		if strings.EqualFold(dangerousTags[i], name) {
			return true
		}
	}
	return false
}
// buildExifToolWriteArgs builds the variadic argv tail for
//
//	exiftool -overwrite_original <args> <path>
//
// from a user-supplied metadata map.
//
// Keys are processed in sorted order so the resulting argv (and, when the
// map holds several invalid entries, the error that is reported) is
// reproducible instead of depending on Go's randomized map iteration.
//
// Dangerous pseudo-tags are silently dropped. Invalid keys (empty, leading
// dash, control characters) and values containing NUL or newlines return an
// error wrapping [gotenberg.ErrPdfEngineMetadataValueNotSupported] so the
// API layer replies with HTTP 400. Supported value kinds: string, []string,
// []any of strings, bool, int, int64, float32, float64. Any other kind
// yields the same wrapped error.
//
// Floats are rendered in plain decimal notation with the shortest
// representation that round-trips, never with an exponent, so a number such
// as 20240101 is written as "20240101" rather than "2.0240101e+07".
//
// The returned slice is nil when no argument was produced.
func buildExifToolWriteArgs(metadata map[string]any) ([]string, error) {
	keys := make([]string, 0, len(metadata))
	for key := range metadata {
		keys = append(keys, key)
	}
	slices.Sort(keys)

	var args []string
	for _, key := range keys {
		// The dangerous-tag filter runs before key validation on purpose:
		// such keys are dropped, not reported as errors.
		if isDangerousTag(key) {
			continue
		}
		if !safeKeyPattern.MatchString(key) {
			return nil, fmt.Errorf("write PDF metadata with ExifTool: invalid metadata key %q: %w", key, gotenberg.ErrPdfEngineMetadataValueNotSupported)
		}

		value := metadata[key]
		switch val := value.(type) {
		case string:
			if err := validateMetadataValue(key, val); err != nil {
				return nil, err
			}
			args = append(args, fmt.Sprintf("-%s=%s", key, val))
		case []string:
			// One assignment per element.
			for _, s := range val {
				if err := validateMetadataValue(key, s); err != nil {
					return nil, err
				}
				args = append(args, fmt.Sprintf("-%s=%s", key, s))
			}
		case []any:
			// See https://github.com/gotenberg/gotenberg/issues/1048.
			for _, entry := range val {
				s, ok := entry.(string)
				if !ok {
					return nil, fmt.Errorf("write PDF metadata with ExifTool: unsupported element type %T in []any for key %q: %w", entry, key, gotenberg.ErrPdfEngineMetadataValueNotSupported)
				}
				if err := validateMetadataValue(key, s); err != nil {
					return nil, err
				}
				args = append(args, fmt.Sprintf("-%s=%s", key, s))
			}
		case bool:
			args = append(args, fmt.Sprintf("-%s=%t", key, val))
		case int:
			args = append(args, fmt.Sprintf("-%s=%d", key, val))
		case int64:
			args = append(args, fmt.Sprintf("-%s=%d", key, val))
		case float32:
			// bitSize 32 keeps the shortest digits that identify the
			// float32, avoiding float64 widening noise.
			args = append(args, fmt.Sprintf("-%s=%s", key, strconv.FormatFloat(float64(val), 'f', -1, 32)))
		case float64:
			args = append(args, fmt.Sprintf("-%s=%s", key, strconv.FormatFloat(val, 'f', -1, 64)))
		default:
			return nil, fmt.Errorf("write PDF metadata with ExifTool: unsupported type %T for key %q: %w", value, key, gotenberg.ErrPdfEngineMetadataValueNotSupported)
		}
	}
	return args, nil
}
// ExifTool abstracts the CLI tool ExifTool and implements the
// [gotenberg.PdfEngine] interface.
type ExifTool struct {
// binPath is the path of the exiftool binary. Provision fills it from
// the EXIFTOOL_BIN_PATH environment variable.
binPath string
// version caches the version string resolved by detectVersion.
version string
// versionOnce makes detectVersion resolve version only once.
versionOnce sync.Once
}
// Descriptor returns [ExifTool]'s module descriptor: the module ID
// "exiftool" and a constructor producing a new, unprovisioned instance.
func (engine *ExifTool) Descriptor() gotenberg.ModuleDescriptor {
	var descriptor gotenberg.ModuleDescriptor
	descriptor.ID = "exiftool"
	descriptor.New = func() gotenberg.Module {
		return new(ExifTool)
	}
	return descriptor
}
// Provision sets the module properties. It reads the path of the exiftool
// binary from the EXIFTOOL_BIN_PATH environment variable and returns an
// error when that variable is not set.
func (engine *ExifTool) Provision(ctx *gotenberg.Context) error {
	if path, found := os.LookupEnv("EXIFTOOL_BIN_PATH"); found {
		engine.binPath = path
		return nil
	}

	return errors.New("EXIFTOOL_BIN_PATH environment variable is not set")
}
// Validate validates the module properties by checking that the configured
// ExifTool binary path can be stat'ed.
//
// A missing path is reported with a dedicated message. Any other stat
// failure (for example a permission error on a parent directory) is also
// returned, instead of being ignored and only surfacing later when the
// binary is executed.
func (engine *ExifTool) Validate() error {
	_, err := os.Stat(engine.binPath)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, os.ErrNotExist):
		return fmt.Errorf("ExifTool binary path does not exist: %w", err)
	default:
		return fmt.Errorf("stat ExifTool binary path: %w", err)
	}
}
// Debug returns additional debug data: a single "version" entry holding
// the string resolved by detectVersion.
func (engine *ExifTool) Debug() map[string]any {
	info := make(map[string]any, 1)
	info["version"] = engine.detectVersion()
	return info
}
// detectVersion returns the ExifTool version, resolving it on the first
// call only and serving the cached string afterwards.
//
// The value reported by gotenberg.BuildVersion("exiftool") is preferred
// (captured at image build time, so no process is spawned at runtime). When
// none is available, as with local or non-Docker builds, "exiftool -ver" is
// executed and its trimmed output is used. If that command fails, the
// error text is stored as the version instead.
func (engine *ExifTool) detectVersion() string {
	engine.versionOnce.Do(func() {
		baked, found := gotenberg.BuildVersion("exiftool")
		if found {
			engine.version = baked
			return
		}

		probe := exec.Command(engine.binPath, "-ver") //nolint:gosec
		probe.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

		raw, runErr := probe.Output()
		if runErr == nil {
			engine.version = strings.TrimSpace(string(raw))
			return
		}
		engine.version = runErr.Error()
	})

	return engine.version
}
// spanAttrs assembles the attributes attached to every ExifTool client
// span. The list starts with the server address (the binary path), then the
// ExifTool version when detectVersion yields a non-empty string, and ends
// with the caller-supplied extra attributes in their original order.
// Carrying the version on each span lets a trace record which ExifTool ran
// the operation.
func (engine *ExifTool) spanAttrs(extra ...attribute.KeyValue) []attribute.KeyValue {
	version := engine.detectVersion()

	result := make([]attribute.KeyValue, 0, 2+len(extra))
	result = append(result, semconv.ServerAddress(engine.binPath))
	if version != "" {
		result = append(result, attribute.String("gotenberg.exiftool.version", version))
	}
	result = append(result, extra...)
	return result
}
// Merge is not available in this implementation. It opens a client span,
// marks it as failed, and always returns an error wrapping
// [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) Merge(ctx context.Context, logger *slog.Logger, inputPaths []string, outputPath string) error {
	unsupported := fmt.Errorf("merge PDFs with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.Merge",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return unsupported
}
// Split is not available in this implementation. It opens a client span,
// marks it as failed, and always returns a nil slice together with an
// error wrapping [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) Split(ctx context.Context, logger *slog.Logger, mode gotenberg.SplitMode, inputPath, outputDirPath string) ([]string, error) {
	unsupported := fmt.Errorf("split PDF with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.Split",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return nil, unsupported
}
// Flatten is not available in this implementation. It opens a client span,
// marks it as failed, and always returns an error wrapping
// [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) Flatten(ctx context.Context, logger *slog.Logger, inputPath string) error {
	unsupported := fmt.Errorf("flatten PDF with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.Flatten",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return unsupported
}
// Convert is not available in this implementation. It opens a client span,
// marks it as failed, and always returns an error that names the requested
// formats and wraps [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) Convert(ctx context.Context, logger *slog.Logger, formats gotenberg.PdfFormats, inputPath, outputPath string) error {
	unsupported := fmt.Errorf("convert PDF to '%+v' with ExifTool: %w", formats, gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.Convert",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return unsupported
}
// ReadMetadata extracts the metadata of a given PDF file by running the
// exiftool binary with "-j" (JSON output) and decoding what it prints.
//
// The first file entry of the JSON array is returned after the keys listed
// in systemTags have been removed. An error is returned when the command
// fails, when its output is not valid JSON, when the array is empty, or
// when the entry carries a non-empty "Error" string. Every outcome is
// recorded on the "exiftool.ReadMetadata" client span.
func (engine *ExifTool) ReadMetadata(ctx context.Context, logger *slog.Logger, inputPath string) (map[string]any, error) {
	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.ReadMetadata",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	// fail flags the span as errored and hands the error back to the caller.
	fail := func(cause error) (map[string]any, error) {
		span.RecordError(cause)
		span.SetStatus(codes.Error, cause.Error())
		return nil, cause
	}

	cmd := exec.CommandContext(ctx, engine.binPath, "-j", inputPath) //nolint:gosec
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

	stdout, runErr := cmd.Output()
	if runErr != nil {
		return fail(fmt.Errorf("read metadata with ExifTool: %w", runErr))
	}

	var entries []map[string]any
	if decodeErr := json.Unmarshal(stdout, &entries); decodeErr != nil {
		return fail(fmt.Errorf("parse ExifTool JSON output: %w", decodeErr))
	}
	if len(entries) < 1 {
		return fail(errors.New("ExifTool returned no file entries"))
	}

	result := entries[0]

	// ExifTool can report an extraction failure through an "Error" key on
	// the file entry. Turn it into a Go error before the key is stripped
	// below so callers see the real cause.
	if reported, isString := result["Error"].(string); isString && reported != "" {
		return fail(fmt.Errorf("read metadata with ExifTool: %s", reported))
	}

	for i := range systemTags {
		delete(result, systemTags[i])
	}

	span.SetStatus(codes.Ok, "")
	return result, nil
}
// WriteMetadata writes the metadata into a given PDF file by running the
// exiftool binary as "-overwrite_original -TAG=VALUE ... path". ExifTool
// preserves tags that are not mentioned in the argv, so the write is a
// merge rather than a rewrite.
//
// The -TAG=VALUE arguments come from buildExifToolWriteArgs; its validation
// errors are returned unchanged. When no argument remains after filtering,
// nothing is executed and the call succeeds. Every outcome is recorded on
// the "exiftool.WriteMetadata" client span.
func (engine *ExifTool) WriteMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]any, inputPath string) error {
	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.WriteMetadata",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	// fail flags the span as errored and hands the error back to the caller.
	fail := func(cause error) error {
		span.RecordError(cause)
		span.SetStatus(codes.Error, cause.Error())
		return cause
	}

	tagArgs, buildErr := buildExifToolWriteArgs(metadata)
	if buildErr != nil {
		return fail(buildErr)
	}

	if len(tagArgs) == 0 {
		// Nothing to write after filtering. Treat as success so the
		// caller can move on without a dedicated zero-tag branch.
		span.SetStatus(codes.Ok, "")
		return nil
	}

	// Final argv: the overwrite flag, every tag assignment, then the file.
	argv := make([]string, 0, len(tagArgs)+2)
	argv = append(argv, "-overwrite_original")
	argv = append(argv, tagArgs...)
	argv = append(argv, inputPath)

	cmd, cmdErr := gotenberg.CommandContext(ctx, logger, engine.binPath, argv...)
	if cmdErr != nil {
		return fail(fmt.Errorf("create ExifTool command: %w", cmdErr))
	}

	exitCode, execErr := cmd.Exec()
	if execErr != nil {
		return fail(fmt.Errorf("write PDF metadata with ExifTool (exit %d): %w", exitCode, execErr))
	}

	span.SetStatus(codes.Ok, "")
	return nil
}
// PageCount returns the number of pages in a PDF file using ExifTool.
//
// It reads the file's metadata through ReadMetadata and converts the
// "PageCount" entry to an int. The entry may be an int, an int64, a float64
// (truncated toward zero), or a string starting with a decimal integer.
// An error is returned when reading the metadata fails, when the entry is
// missing, when a string value cannot be parsed, or when the value has any
// other type. Every outcome is recorded on the "exiftool.PageCount" client
// span.
func (engine *ExifTool) PageCount(ctx context.Context, logger *slog.Logger, inputPath string) (int, error) {
	// Keep the context returned by Start: it carries this span, so the span
	// opened by ReadMetadata is recorded as its child rather than as an
	// unrelated sibling (assuming gotenberg.Tracer() follows the standard
	// OpenTelemetry tracer contract).
	ctx, span := gotenberg.Tracer().Start(ctx, "exiftool.PageCount",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	// fail flags the span as errored and hands the error back to the caller.
	fail := func(cause error) (int, error) {
		span.RecordError(cause)
		span.SetStatus(codes.Error, cause.Error())
		return 0, cause
	}

	metadata, err := engine.ReadMetadata(ctx, logger, inputPath)
	if err != nil {
		// ReadMetadata already prefixes its errors with "read metadata with
		// ExifTool"; name this operation instead of repeating that prefix.
		return fail(fmt.Errorf("get page count with ExifTool: %w", err))
	}

	pageCountValue, ok := metadata["PageCount"]
	if !ok {
		return fail(errors.New("PageCount not found in metadata"))
	}

	var count int
	switch val := pageCountValue.(type) {
	case int:
		count = val
	case int64:
		count = int(val)
	case float64:
		// Numbers decoded from JSON into map[string]any arrive as float64.
		count = int(val)
	case string:
		if _, scanErr := fmt.Sscanf(val, "%d", &count); scanErr != nil {
			return fail(fmt.Errorf("parse PageCount string '%s': %w", val, scanErr))
		}
	default:
		return fail(fmt.Errorf("unexpected PageCount type '%T'", pageCountValue))
	}

	span.SetStatus(codes.Ok, "")
	return count, nil
}
// WriteBookmarks is not available in this implementation. It opens a
// client span, marks it as failed, and always returns an error wrapping
// [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) WriteBookmarks(ctx context.Context, logger *slog.Logger, inputPath string, bookmarks []gotenberg.Bookmark) error {
	unsupported := fmt.Errorf("write PDF bookmarks with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.WriteBookmarks",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return unsupported
}
// ReadBookmarks is not available in this implementation. It opens a client
// span, marks it as failed, and always returns a nil slice together with
// an error wrapping [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) ReadBookmarks(ctx context.Context, logger *slog.Logger, inputPath string) ([]gotenberg.Bookmark, error) {
	unsupported := fmt.Errorf("read PDF bookmarks with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.ReadBookmarks",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return nil, unsupported
}
// Encrypt is not available in this implementation. It opens a client span,
// marks it as failed, and always returns an error wrapping
// [gotenberg.ErrPdfEncryptionNotSupported].
func (engine *ExifTool) Encrypt(ctx context.Context, logger *slog.Logger, inputPath string, opts gotenberg.EncryptOptions) error {
	unsupported := fmt.Errorf("encrypt PDF using ExifTool: %w", gotenberg.ErrPdfEncryptionNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.Encrypt",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return unsupported
}
// EmbedFiles is not available in this implementation. It opens a client
// span, marks it as failed, and always returns an error wrapping
// [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) EmbedFiles(ctx context.Context, logger *slog.Logger, filePaths []string, inputPath string) error {
	unsupported := fmt.Errorf("embed files with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.EmbedFiles",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return unsupported
}
// Watermark is not available in this implementation. It opens a client
// span, marks it as failed, and always returns an error wrapping
// [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) Watermark(ctx context.Context, logger *slog.Logger, inputPath string, stamp gotenberg.Stamp) error {
	unsupported := fmt.Errorf("watermark PDF with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.Watermark",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return unsupported
}
// Stamp is not available in this implementation. It opens a client span,
// marks it as failed, and always returns an error wrapping
// [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) Stamp(ctx context.Context, logger *slog.Logger, inputPath string, stamp gotenberg.Stamp) error {
	unsupported := fmt.Errorf("stamp PDF with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.Stamp",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return unsupported
}
// Rotate is not available in this implementation. It opens a client span,
// marks it as failed, and always returns an error wrapping
// [gotenberg.ErrPdfEngineMethodNotSupported].
func (engine *ExifTool) Rotate(ctx context.Context, logger *slog.Logger, inputPath string, angle int, pages string) error {
	unsupported := fmt.Errorf("rotate PDF with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)

	_, span := gotenberg.Tracer().Start(
		ctx,
		"exiftool.Rotate",
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(engine.spanAttrs()...),
	)
	defer span.End()

	span.RecordError(unsupported)
	span.SetStatus(codes.Error, unsupported.Error())
	return unsupported
}
// EmbedFilesMetadata is not available in this implementation. It always
// returns an error wrapping [gotenberg.ErrPdfEngineMethodNotSupported] and
// does not open a tracing span.
func (engine *ExifTool) EmbedFilesMetadata(ctx context.Context, logger *slog.Logger, metadata map[string]map[string]string, inputPath string) error {
	unsupported := fmt.Errorf("set embeds metadata with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)
	return unsupported
}
// InjectFacturXXMP is not available in this implementation. It always
// returns an error wrapping [gotenberg.ErrPdfEngineMethodNotSupported] and
// does not open a tracing span.
func (engine *ExifTool) InjectFacturXXMP(ctx context.Context, logger *slog.Logger, facturX gotenberg.FacturX, inputPath string) error {
	unsupported := fmt.Errorf("inject Factur-X XMP with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)
	return unsupported
}
// ReadPdfAConformance is not available in this implementation. It always
// returns two empty strings together with an error wrapping
// [gotenberg.ErrPdfEngineMethodNotSupported], and does not open a tracing
// span.
func (engine *ExifTool) ReadPdfAConformance(ctx context.Context, logger *slog.Logger, inputPath string) (string, string, error) {
	unsupported := fmt.Errorf("read PDF/A conformance with ExifTool: %w", gotenberg.ErrPdfEngineMethodNotSupported)
	return "", "", unsupported
}
// Interface guards: compile-time assertions that *ExifTool satisfies each
// of the Gotenberg interfaces listed below. The build fails if a required
// method is missing or has the wrong signature.
var (
_ gotenberg.Module = (*ExifTool)(nil)
_ gotenberg.Provisioner = (*ExifTool)(nil)
_ gotenberg.Validator = (*ExifTool)(nil)
_ gotenberg.Debuggable = (*ExifTool)(nil)
_ gotenberg.PdfEngine = (*ExifTool)(nil)
)