feat(libreoffice): cap ErrCoreDumped retries and make them observable

This commit is contained in:
Julien Neuhart
2026-06-02 19:50:12 +02:00
parent a82dd9f031
commit 4ebd977d97
2 changed files with 154 additions and 46 deletions
+70 -46
View File
@@ -58,6 +58,7 @@ type Api struct {
conversionDurationCounter metric.Float64Histogram
queueWaitDurationCounter metric.Float64Histogram
pdfOutputSizeCounter metric.Int64Histogram
coreDumpedRetriesCounter metric.Int64Counter
}
// Options gathers available options when converting a document to PDF.
@@ -468,6 +469,15 @@ func (a *Api) Provision(ctx *gotenberg.Context) error {
return fmt.Errorf("create libreoffice.pdf.output.size histogram: %w", err)
}
a.coreDumpedRetriesCounter, err = meter.Int64Counter(
"libreoffice.conversion.retries.total",
metric.WithDescription("Total number of LibreOffice conversion retries after a core dump"),
metric.WithUnit("{retry}"),
)
if err != nil {
return fmt.Errorf("create libreoffice.conversion.retries.total counter: %w", err)
}
return nil
}
@@ -628,60 +638,74 @@ func (a *Api) Pdf(ctx context.Context, logger *slog.Logger, inputPath, outputPat
)
span.SetAttributes(conversionRequestAttributes(inputPath, options)...)
start := time.Now()
var conversionStart time.Time
// ErrCoreDumped happens randomly (https://github.com/gotenberg/gotenberg/issues/639);
// retry the conversion, but cap the retries so a permanently failing
// document cannot loop forever. Each attempt records its own metrics.
const maxCoreDumpedRetries = 10
err := a.supervisor.Run(ctx, logger, func() error {
conversionStart = time.Now()
return a.libreOffice.pdf(ctx, logger, inputPath, outputPath, options)
})
var err error
var reason string
for attempt := 0; ; attempt++ {
start := time.Now()
var conversionStart time.Time
// Determine status and error reason.
status := "success"
reason := ""
err = a.supervisor.Run(ctx, logger, func() error {
conversionStart = time.Now()
return a.libreOffice.pdf(ctx, logger, inputPath, outputPath, options)
})
if err != nil {
status = "error"
if errors.Is(err, context.DeadlineExceeded) {
status = "timeout"
}
reason = libreofficeErrorType(err)
}
// Determine status and error reason.
status := "success"
reason = ""
// Record metrics.
attrs := metric.WithAttributes(attribute.String("status", status))
a.reqsCounter.Add(ctx, 1, attrs)
if reason != "" {
a.errsCounter.Add(ctx, 1, metric.WithAttributes(attribute.String("reason", reason)))
gotenberg.SpanErrorType(span, reason)
}
if !conversionStart.IsZero() {
queueWait := conversionStart.Sub(start).Seconds()
a.queueWaitDurationCounter.Record(ctx, queueWait, attrs)
conversionDuration := time.Since(conversionStart).Seconds()
a.conversionDurationCounter.Record(ctx, conversionDuration, attrs)
}
if err == nil {
stat, statErr := os.Stat(outputPath)
if statErr == nil {
a.pdfOutputSizeCounter.Record(ctx, stat.Size(), attrs)
span.SetAttributes(attribute.Int64("gotenberg.conversion.output.bytes", stat.Size()))
if err != nil {
status = "error"
if errors.Is(err, context.DeadlineExceeded) {
status = "timeout"
}
reason = libreofficeErrorType(err)
}
span.SetStatus(codes.Ok, "")
return nil
}
// See https://github.com/gotenberg/gotenberg/issues/639.
if errors.Is(err, ErrCoreDumped) {
logger.DebugContext(ctx, fmt.Sprintf("got a '%s' error, retry conversion", err))
return a.Pdf(ctx, logger, inputPath, outputPath, options)
// Record metrics for this attempt.
attrs := metric.WithAttributes(attribute.String("status", status))
a.reqsCounter.Add(ctx, 1, attrs)
if reason != "" {
a.errsCounter.Add(ctx, 1, metric.WithAttributes(attribute.String("reason", reason)))
}
if !conversionStart.IsZero() {
queueWait := conversionStart.Sub(start).Seconds()
a.queueWaitDurationCounter.Record(ctx, queueWait, attrs)
conversionDuration := time.Since(conversionStart).Seconds()
a.conversionDurationCounter.Record(ctx, conversionDuration, attrs)
}
if err == nil {
stat, statErr := os.Stat(outputPath)
if statErr == nil {
a.pdfOutputSizeCounter.Record(ctx, stat.Size(), attrs)
span.SetAttributes(attribute.Int64("gotenberg.conversion.output.bytes", stat.Size()))
}
span.SetStatus(codes.Ok, "")
return nil
}
if errors.Is(err, ErrCoreDumped) && attempt < maxCoreDumpedRetries {
logger.DebugContext(ctx, fmt.Sprintf("got a '%s' error, retry conversion (attempt %d)", err, attempt+1))
span.AddEvent("conversion.retry", trace.WithAttributes(
attribute.Int("attempt", attempt+1),
))
a.coreDumpedRetriesCounter.Add(ctx, 1)
continue
}
break
}
gotenberg.SpanErrorType(span, reason)
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
return fmt.Errorf("supervisor run task: %w", err)
@@ -0,0 +1,84 @@
package api
import (
"context"
"errors"
"log/slog"
"testing"
"go.opentelemetry.io/otel"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/metric/metricdata"
"github.com/gotenberg/gotenberg/v8/pkg/gotenberg"
)
// sumInt64Counter adds up the values of all int64 sum data points recorded
// under the metric called name, across every scope in rm. A metric with a
// matching name whose data is not a metricdata.Sum[int64] contributes
// nothing, so the result is 0 when no matching int64 sum is present.
func sumInt64Counter(rm metricdata.ResourceMetrics, name string) int64 {
	var sum int64
	for i := range rm.ScopeMetrics {
		for _, candidate := range rm.ScopeMetrics[i].Metrics {
			if candidate.Name != name {
				continue
			}
			data, isInt64Sum := candidate.Data.(metricdata.Sum[int64])
			if !isInt64Sum {
				// Same name but a different aggregation or number type: ignore.
				continue
			}
			for _, point := range data.DataPoints {
				sum += point.Value
			}
		}
	}
	return sum
}
// TestApi_Pdf_CoreDumpedRetryCap checks that Api.Pdf gives up when the
// supervisor fails with ErrCoreDumped on every attempt: the error is surfaced
// to the caller, the supervisor is invoked a bounded number of times, and both
// the retry counter and the per-attempt request counter reflect each attempt.
//
// The test swaps the process-global OpenTelemetry meter provider for one
// backed by a manual reader (restored on cleanup), so it must not run in
// parallel with other tests that rely on the global provider.
func TestApi_Pdf_CoreDumpedRetryCap(t *testing.T) {
	reader := sdkmetric.NewManualReader()
	provider := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
	previous := otel.GetMeterProvider()
	otel.SetMeterProvider(provider)
	t.Cleanup(func() { otel.SetMeterProvider(previous) })

	// The mock never invokes the task it is given; it only counts the calls
	// and always reports a core dump.
	var runCalls int
	supervisor := &gotenberg.ProcessSupervisorMock{
		RunMock: func(_ context.Context, _ *slog.Logger, _ func() error) error {
			runCalls++
			return ErrCoreDumped
		},
		ReqQueueSizeMock:            func() int64 { return 0 },
		ConversionsSinceRestartMock: func() int64 { return 0 },
	}

	a := &Api{supervisor: supervisor}

	// Presumably gotenberg.Meter() resolves through the global provider
	// installed above — confirm against its implementation. Instrument
	// creation errors fail the test immediately instead of being discarded,
	// so a broken setup cannot masquerade as a retry-logic failure.
	meter := gotenberg.Meter()
	var err error
	if a.reqsCounter, err = meter.Int64Counter("libreoffice.requests.total"); err != nil {
		t.Fatalf("create libreoffice.requests.total counter: %v", err)
	}
	if a.errsCounter, err = meter.Int64Counter("libreoffice.errors.total"); err != nil {
		t.Fatalf("create libreoffice.errors.total counter: %v", err)
	}
	if a.conversionDurationCounter, err = meter.Float64Histogram("libreoffice.conversion.duration"); err != nil {
		t.Fatalf("create libreoffice.conversion.duration histogram: %v", err)
	}
	if a.queueWaitDurationCounter, err = meter.Float64Histogram("libreoffice.queue.wait.duration"); err != nil {
		t.Fatalf("create libreoffice.queue.wait.duration histogram: %v", err)
	}
	if a.pdfOutputSizeCounter, err = meter.Int64Histogram("libreoffice.pdf.output.size"); err != nil {
		t.Fatalf("create libreoffice.pdf.output.size histogram: %v", err)
	}
	if a.coreDumpedRetriesCounter, err = meter.Int64Counter("libreoffice.conversion.retries.total"); err != nil {
		t.Fatalf("create libreoffice.conversion.retries.total counter: %v", err)
	}

	err = a.Pdf(context.Background(), slog.New(slog.DiscardHandler), "/nonexistent/in.docx", "/tmp/out.pdf", Options{})
	if err == nil {
		t.Fatal("expected an error after exhausting the retries")
	}
	if !errors.Is(err, ErrCoreDumped) {
		t.Errorf("expected ErrCoreDumped, got %v", err)
	}

	// 1 initial attempt + 10 retries.
	if runCalls != 11 {
		t.Errorf("supervisor.Run called %d times, want 11", runCalls)
	}

	var rm metricdata.ResourceMetrics
	if err := reader.Collect(context.Background(), &rm); err != nil {
		t.Fatalf("collect: %v", err)
	}

	// The final failed attempt is not followed by a retry, so the retry
	// counter is one less than the number of supervisor runs.
	if retries := sumInt64Counter(rm, "libreoffice.conversion.retries.total"); retries != 10 {
		t.Errorf("retries counter = %d, want 10", retries)
	}

	// Per-attempt request metric must be preserved: one per attempt.
	if reqs := sumInt64Counter(rm, "libreoffice.requests.total"); reqs != 11 {
		t.Errorf("requests counter = %d, want 11", reqs)
	}
}