fix(pdfcpu): use custom sort to retrieve the split PDFs

2026-07-02 08:27:41 +08:00 · 2025-04-12 19:40:16 +02:00
parent 7ca1d118a1
commit 9701f88ae3
3 changed files with 98 additions and 1 deletions
@@ -146,7 +146,7 @@ func (engine *PdfCpu) Split(ctx context.Context, logger *zap.Logger, mode gotenb
 		return nil, fmt.Errorf("walk directory to find resulting PDFs from split with pdfcpu: %w", err)
 	}

-	sort.Sort(gotenberg.AlphanumericSort(outputPaths))
+	sort.Sort(digitSuffixSort(outputPaths))

 	return outputPaths, nil
 }
@@ -0,0 +1,68 @@
+package pdfcpu
+
+import (
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strconv"
+)
+
+// digitSuffixSort implements sort.Interface for a slice of file paths. Paths
+// whose base name has a number right before the file extension are ordered
+// by that number, so that, for example, "a_2.pdf" sorts before "a_10.pdf".
+type digitSuffixSort []string
+
+// Len returns the number of paths in the slice.
+func (s digitSuffixSort) Len() int {
+	return len(s)
+}
+
+// Swap exchanges the paths at indexes i and j.
+func (s digitSuffixSort) Swap(i, j int) {
+	s[i], s[j] = s[j], s[i]
+}
+
+// Less reports whether the path at index i must sort before the path at
+// index j. Paths with a numeric suffix are ordered by that number first and
+// by the remainder of their base name second, and they always come before
+// paths without a numeric suffix. Paths without a numeric suffix are compared
+// as plain strings.
+func (s digitSuffixSort) Less(i, j int) bool {
+	// extractNumber returns -1 as the number when no numeric suffix is found.
+	numI, restI := extractNumber(s[i])
+	numJ, restJ := extractNumber(s[j])
+
+	// If both strings contain a number, compare them numerically.
+	if numI != -1 && numJ != -1 {
+		if numI != numJ {
+			return numI < numJ
+		}
+		// If the numbers are equal, compare the "rest" strings.
+		return restI < restJ
+	}
+
+	// If one contains a number and the other doesn't, the one with the number
+	// comes first.
+	if numI != -1 {
+		return true
+	}
+	if numJ != -1 {
+		return false
+	}
+
+	// Neither has a number; fall back to lexicographical order. Note that this
+	// compares the full strings as stored in the slice, not just their base
+	// names.
+	return s[i] < s[j]
+}
+
+// extractNumber looks at the base name of str for a run of digits located
+// immediately before the file extension. On success, it returns that number
+// and the base name with the digits removed (the extension is kept). If there
+// is no such run of digits, or if strconv.Atoi cannot parse it (e.g., the
+// value does not fit in an int), it returns -1 and the unmodified base name.
+func extractNumber(str string) (int, string) {
+	// Only the last path element is considered; directories are ignored.
+	str = filepath.Base(str)
+
+	// Check for a number immediately before an extension. A successful match
+	// yields four entries: the full match followed by the three capture groups.
+	if matches := extensionSuffixRegexp.FindStringSubmatch(str); len(matches) > 3 {
+		if num, err := strconv.Atoi(matches[2]); err == nil {
+			// Remove the numeric block but keep the extension.
+			return num, matches[1] + matches[3]
+		}
+	}
+
+	// No numeric portion found.
+	return -1, str
+}
+
+// Regular expressions used by extractNumber.
+var (
+	// Matches a numeric block immediately before a file extension. Capture
+	// groups: 1 is the text before the digits, 2 is the digits, and 3 is the
+	// extension including its leading dot. The first group is lazy so that
+	// the second one captures the whole run of digits before the extension.
+	extensionSuffixRegexp = regexp.MustCompile(`^(.*?)(\d+)(\.[^.]+)$`)
+)
+
+// Interface guard: compile-time check that *digitSuffixSort satisfies
+// sort.Interface.
+var _ sort.Interface = (*digitSuffixSort)(nil)
@@ -0,0 +1,29 @@
+package pdfcpu
+
+import (
+	"reflect"
+	"sort"
+	"testing"
+)
+
+// TestDigitSuffixSort checks that sorting with digitSuffixSort orders file
+// names by their numeric suffix (1, 3, 10, 12) rather than lexicographically.
+func TestDigitSuffixSort(t *testing.T) {
+	for _, tc := range []struct {
+		scenario     string
+		values       []string
+		expectedSort []string
+	}{
+		{
+			scenario:     "UUIDs with digit suffixes",
+			values:       []string{"2521a33d-1fb4-4279-80fe-8a945285b8f4_12.pdf", "2521a33d-1fb4-4279-80fe-8a945285b8f4_1.pdf", "2521a33d-1fb4-4279-80fe-8a945285b8f4_10.pdf", "2521a33d-1fb4-4279-80fe-8a945285b8f4_3.pdf"},
+			expectedSort: []string{"2521a33d-1fb4-4279-80fe-8a945285b8f4_1.pdf", "2521a33d-1fb4-4279-80fe-8a945285b8f4_3.pdf", "2521a33d-1fb4-4279-80fe-8a945285b8f4_10.pdf", "2521a33d-1fb4-4279-80fe-8a945285b8f4_12.pdf"},
+		},
+	} {
+		t.Run(tc.scenario, func(t *testing.T) {
+			// The conversion does not copy the slice, so tc.values itself is
+			// sorted in place.
+			sort.Sort(digitSuffixSort(tc.values))
+
+			if !reflect.DeepEqual(tc.values, tc.expectedSort) {
+				t.Fatalf("expected %+v but got: %+v", tc.expectedSort, tc.values)
+			}
+		})
+	}
+}