mirror of
https://github.com/gotenberg/gotenberg.git
synced 2026-07-02 00:17:40 +08:00
513 lines
16 KiB
Go
513 lines
16 KiB
Go
package chromium
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"net/http"
|
|
"net/url"
|
|
"slices"
|
|
"strings"
|
|
"sync"
|
|
|
|
"github.com/chromedp/cdproto/cdp"
|
|
"github.com/chromedp/cdproto/fetch"
|
|
"github.com/chromedp/cdproto/network"
|
|
"github.com/chromedp/cdproto/page"
|
|
"github.com/chromedp/cdproto/runtime"
|
|
"github.com/chromedp/chromedp"
|
|
"github.com/dlclark/regexp2"
|
|
"go.uber.org/multierr"
|
|
"golang.org/x/sync/errgroup"
|
|
|
|
"github.com/gotenberg/gotenberg/v8/pkg/gotenberg"
|
|
)
|
|
|
|
type eventRequestPausedOptions struct {
|
|
allowList, denyList []*regexp2.Regexp
|
|
allowedFilePrefixes []string
|
|
extraHttpHeaders []ExtraHttpHeader
|
|
}
|
|
|
|
// listenForEventRequestPaused listens for requests to check if they are
|
|
// allowed or not. It also set the extra HTTP headers, if any.
|
|
// See https://github.com/gotenberg/gotenberg/issues/1011.
|
|
// TODO: https://chromedevtools.github.io/devtools-protocol/tot/Network/#method-setBlockedURLs (experimental for now).
|
|
func listenForEventRequestPaused(ctx context.Context, logger *slog.Logger, options eventRequestPausedOptions) {
|
|
if len(options.extraHttpHeaders) == 0 {
|
|
logger.DebugContext(ctx, "no extra HTTP headers")
|
|
} else {
|
|
logger.DebugContext(ctx, fmt.Sprintf("extra HTTP headers: %+v", options.extraHttpHeaders))
|
|
}
|
|
|
|
chromedp.ListenTarget(ctx, func(ev any) {
|
|
if e, ok := ev.(*fetch.EventRequestPaused); ok {
|
|
go func() {
|
|
logger.DebugContext(ctx, fmt.Sprintf("event EventRequestPaused fired for '%s'", e.Request.URL))
|
|
allow := true
|
|
|
|
deadline, ok := ctx.Deadline()
|
|
if !ok {
|
|
logger.ErrorContext(ctx, "context has no deadline, cannot filter URL")
|
|
return
|
|
}
|
|
|
|
err := gotenberg.FilterDeadline(options.allowList, options.denyList, e.Request.URL, deadline)
|
|
if err != nil {
|
|
logger.WarnContext(ctx, err.Error())
|
|
allow = false
|
|
}
|
|
|
|
// Additional restriction: if the sub-resource is a file:// URL
|
|
// and we have allowed file prefixes, restrict access to only
|
|
// those directories. This prevents cross-request file access
|
|
// in /tmp.
|
|
if allow && strings.HasPrefix(e.Request.URL, "file://") && len(options.allowedFilePrefixes) > 0 {
|
|
prefixMatch := false
|
|
for _, prefix := range options.allowedFilePrefixes {
|
|
if strings.HasPrefix(e.Request.URL, "file://"+prefix) {
|
|
prefixMatch = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if !prefixMatch {
|
|
logger.WarnContext(ctx, fmt.Sprintf("'%s' is not within any allowed file prefix", e.Request.URL))
|
|
allow = false
|
|
}
|
|
}
|
|
|
|
cctx := chromedp.FromContext(ctx)
|
|
executorCtx := cdp.WithExecutor(ctx, cctx.Target)
|
|
|
|
if !allow {
|
|
req := fetch.FailRequest(e.RequestID, network.ErrorReasonAccessDenied)
|
|
err = req.Do(executorCtx)
|
|
if err != nil {
|
|
logger.ErrorContext(ctx, fmt.Sprintf("fail request: %s", err))
|
|
}
|
|
return
|
|
}
|
|
|
|
req := fetch.ContinueRequest(e.RequestID)
|
|
|
|
var extraHttpHeadersToSet []ExtraHttpHeader
|
|
if len(options.extraHttpHeaders) > 0 {
|
|
// The user wants to set extra HTTP headers.
|
|
|
|
// First, we have to check if at least one header has to be
|
|
// set for the current request.
|
|
for _, header := range options.extraHttpHeaders {
|
|
if header.Scope == nil {
|
|
// Non-scoped header.
|
|
logger.DebugContext(ctx, fmt.Sprintf("extra HTTP header '%s' will be set for request URL '%s'", header.Name, e.Request.URL))
|
|
extraHttpHeadersToSet = append(extraHttpHeadersToSet, header)
|
|
continue
|
|
}
|
|
|
|
ok, err := header.Scope.MatchString(e.Request.URL)
|
|
switch {
|
|
case err != nil:
|
|
logger.ErrorContext(ctx, fmt.Sprintf("fail to match extra HTTP header '%s' scope with URL '%s': %s", header.Name, e.Request.URL, err))
|
|
case ok:
|
|
logger.DebugContext(ctx, fmt.Sprintf("extra HTTP header '%s' (scoped) will be set for request URL '%s'", header.Name, e.Request.URL))
|
|
extraHttpHeadersToSet = append(extraHttpHeadersToSet, header)
|
|
default:
|
|
logger.DebugContext(ctx, fmt.Sprintf("scoped extra HTTP header '%s' (scoped) will not be set for request URL '%s'", header.Name, e.Request.URL))
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(extraHttpHeadersToSet) > 0 {
|
|
logger.DebugContext(ctx, fmt.Sprintf("setting extra HTTP headers for request URL '%s': %+v", e.Request.URL, extraHttpHeadersToSet))
|
|
|
|
originalHeaders := e.Request.Headers
|
|
headers := make(map[string]string)
|
|
|
|
for key, value := range originalHeaders {
|
|
strValue, ok := value.(string)
|
|
if ok {
|
|
headers[key] = strValue
|
|
} else {
|
|
logger.ErrorContext(ctx, fmt.Sprintf("ignoring header '%s' for URL '%s' since it cannot be cast to a string", key, e.Request.URL))
|
|
}
|
|
}
|
|
|
|
var headersEntries []*fetch.HeaderEntry
|
|
for key, value := range headers {
|
|
headersEntries = append(headersEntries, &fetch.HeaderEntry{
|
|
Name: key,
|
|
Value: value,
|
|
})
|
|
}
|
|
for _, header := range extraHttpHeadersToSet {
|
|
headersEntries = append(headersEntries, &fetch.HeaderEntry{
|
|
Name: header.Name,
|
|
Value: header.Value,
|
|
})
|
|
}
|
|
|
|
req.Headers = headersEntries
|
|
}
|
|
|
|
err = req.Do(executorCtx)
|
|
if err != nil {
|
|
logger.ErrorContext(ctx, fmt.Sprintf("continue request: %s", err))
|
|
}
|
|
}()
|
|
}
|
|
})
|
|
}
|
|
|
|
// eventResponseReceivedOptions gathers the settings and the result holders
// used by listenForEventResponseReceived.
type eventResponseReceivedOptions struct {
	// mainPageUrl is the URL whose response is treated as the main page;
	// every other response is treated as a resource.
	mainPageUrl string

	// failOnHttpStatusCodes lists the main page status codes reported as
	// invalid. The error is written to invalidHttpStatusCode while holding
	// invalidHttpStatusCodeMu.
	failOnHttpStatusCodes   []int64
	invalidHttpStatusCode   *error
	invalidHttpStatusCodeMu *sync.RWMutex

	// failOnResourceOnHttpStatusCode lists the resource status codes reported
	// as invalid, except for resources served from one of
	// ignoreResourceHttpStatusDomains. The errors are accumulated in
	// invalidResourceHttpStatusCode while holding
	// invalidResourceHttpStatusCodeMu.
	failOnResourceOnHttpStatusCode  []int64
	ignoreResourceHttpStatusDomains []string
	invalidResourceHttpStatusCode   *error
	invalidResourceHttpStatusCodeMu *sync.RWMutex
}
|
|
|
|
// listenForEventResponseReceived listens for an invalid HTTP status code
|
|
// returned by the main page or by one or more resources.
|
|
// See:
|
|
// https://github.com/gotenberg/gotenberg/issues/613.
|
|
// https://github.com/gotenberg/gotenberg/issues/1021.
|
|
func listenForEventResponseReceived(
|
|
ctx context.Context,
|
|
logger *slog.Logger,
|
|
options eventResponseReceivedOptions,
|
|
) {
|
|
normalizedIgnoreDomains := normalizeDomains(options.ignoreResourceHttpStatusDomains)
|
|
|
|
for _, code := range []int64{199, 299, 399, 499, 599} {
|
|
if slices.Contains(options.failOnHttpStatusCodes, code) {
|
|
for i := code - 99; i <= code; i++ {
|
|
options.failOnHttpStatusCodes = append(options.failOnHttpStatusCodes, i)
|
|
}
|
|
}
|
|
|
|
if slices.Contains(options.failOnResourceOnHttpStatusCode, code) {
|
|
for i := code - 99; i <= code; i++ {
|
|
options.failOnResourceOnHttpStatusCode = append(options.failOnResourceOnHttpStatusCode, i)
|
|
}
|
|
}
|
|
}
|
|
|
|
chromedp.ListenTarget(ctx, func(ev any) {
|
|
if ev, ok := ev.(*network.EventResponseReceived); ok {
|
|
if ev.Response.URL == options.mainPageUrl {
|
|
logger.DebugContext(ctx, fmt.Sprintf("event EventResponseReceived fired for main page: %+v", ev.Response))
|
|
|
|
if slices.Contains(options.failOnHttpStatusCodes, ev.Response.Status) {
|
|
options.invalidHttpStatusCodeMu.Lock()
|
|
defer options.invalidHttpStatusCodeMu.Unlock()
|
|
|
|
*options.invalidHttpStatusCode = fmt.Errorf("%d: %s", ev.Response.Status, ev.Response.StatusText)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
logger.DebugContext(ctx, fmt.Sprintf("event EventResponseReceived fired for a resource: %+v", ev.Response))
|
|
|
|
if slices.Contains(options.failOnResourceOnHttpStatusCode, ev.Response.Status) {
|
|
if !shouldCheckResourceHttpStatusCode(ev.Response.URL, normalizedIgnoreDomains) {
|
|
logger.DebugContext(ctx, fmt.Sprintf("skip resource HTTP status code check for '%s' due to domain filtering", ev.Response.URL))
|
|
return
|
|
}
|
|
|
|
options.invalidResourceHttpStatusCodeMu.Lock()
|
|
defer options.invalidResourceHttpStatusCodeMu.Unlock()
|
|
|
|
*options.invalidResourceHttpStatusCode = multierr.Append(
|
|
*options.invalidResourceHttpStatusCode,
|
|
fmt.Errorf("%s - %d: %s", ev.Response.URL, ev.Response.Status, http.StatusText(int(ev.Response.Status))),
|
|
)
|
|
}
|
|
}
|
|
})
|
|
}
|
|
|
|
func shouldCheckResourceHttpStatusCode(rawURL string, ignoreDomains []string) bool {
|
|
host := hostnameFromURL(rawURL)
|
|
|
|
if len(ignoreDomains) > 0 && matchesAnyDomain(host, ignoreDomains) {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// hostnameFromURL returns the lowercased hostname of rawURL, without any
// port. It returns an empty string if rawURL cannot be parsed.
func hostnameFromURL(rawURL string) string {
	if parsed, err := url.Parse(rawURL); err == nil {
		return strings.ToLower(parsed.Hostname())
	}

	return ""
}
|
|
|
|
func normalizeDomains(domains []string) []string {
|
|
normalized := make([]string, 0, len(domains))
|
|
|
|
for _, domain := range domains {
|
|
d := normalizeDomain(domain)
|
|
if d == "" {
|
|
continue
|
|
}
|
|
normalized = append(normalized, d)
|
|
}
|
|
|
|
return normalized
|
|
}
|
|
|
|
// normalizeDomain reduces a user-provided domain to a bare, lowercased
// hostname. It accepts "example.com", "*.example.com", ".example.com",
// "https://example.com/path", or "example.com:443".
//
// A blank input yields an empty string. If no hostname can be extracted, the
// trimmed, lowercased input is used as is (minus any leading "*." or ".").
func normalizeDomain(domain string) string {
	d := strings.TrimSpace(domain)
	d = strings.ToLower(d)
	if d == "" {
		return ""
	}

	// Make the value parseable as a URL, so that the hostname can be
	// extracted and any port/path dropped: values without a scheme (nor a
	// leading "//") get one.
	candidate := d
	if !strings.Contains(d, "://") && !strings.HasPrefix(d, "//") {
		candidate = "https://" + d
	}

	if parsed, err := url.Parse(candidate); err == nil {
		if hostname := parsed.Hostname(); hostname != "" {
			d = strings.ToLower(hostname)
		}
	}

	// Drop the wildcard or leading dot notation.
	return strings.TrimPrefix(strings.TrimPrefix(d, "*."), ".")
}
|
|
|
|
// matchesAnyDomain reports whether host is equal to one of domains or is a
// subdomain of one of them. An empty host never matches.
func matchesAnyDomain(host string, domains []string) bool {
	if host == "" {
		return false
	}

	for i := range domains {
		candidate := domains[i]

		switch {
		case host == candidate:
			// Exact match.
			return true
		case strings.HasSuffix(host, "."+candidate):
			// Subdomain match; the leading dot prevents "badexample.com"
			// from matching "example.com".
			return true
		}
	}

	return false
}
|
|
|
|
// eventLoadingFailedOptions gathers the result holders used by
// listenForEventLoadingFailed.
type eventLoadingFailedOptions struct {
	// loadingFailed receives the error of a failed Document load; it is
	// written while holding loadingFailedMu.
	loadingFailed   *error
	loadingFailedMu *sync.RWMutex

	// resourceLoadingFailed accumulates the errors of the other failed
	// loads; it is written while holding resourceLoadingFailedMu.
	resourceLoadingFailed   *error
	resourceLoadingFailedMu *sync.RWMutex
}
|
|
|
|
// listenForEventLoadingFailed listens for an event indicating that the main
|
|
// page or one or more resources failed to load.
|
|
// See:
|
|
// https://github.com/gotenberg/gotenberg/issues/913.
|
|
// https://github.com/gotenberg/gotenberg/issues/959.
|
|
// https://github.com/gotenberg/gotenberg/issues/1021.
|
|
func listenForEventLoadingFailed(ctx context.Context, logger *slog.Logger, options eventLoadingFailedOptions) {
|
|
chromedp.ListenTarget(ctx, func(ev any) {
|
|
if ev, ok := ev.(*network.EventLoadingFailed); ok {
|
|
logger.DebugContext(ctx, fmt.Sprintf("event EventLoadingFailed fired: %+v", ev.ErrorText))
|
|
|
|
// We are looking for common errors.
|
|
// TODO: sufficient?
|
|
errors := []string{
|
|
"net::ERR_CONNECTION_CLOSED",
|
|
"net::ERR_CONNECTION_RESET",
|
|
"net::ERR_CONNECTION_REFUSED",
|
|
"net::ERR_CONNECTION_ABORTED",
|
|
"net::ERR_CONNECTION_FAILED",
|
|
"net::ERR_NAME_NOT_RESOLVED",
|
|
"net::ERR_INTERNET_DISCONNECTED",
|
|
"net::ERR_ADDRESS_UNREACHABLE",
|
|
"net::ERR_BLOCKED_BY_CLIENT",
|
|
"net::ERR_BLOCKED_BY_RESPONSE",
|
|
"net::ERR_FILE_NOT_FOUND",
|
|
"net::ERR_HTTP2_PROTOCOL_ERROR",
|
|
}
|
|
if !slices.Contains(errors, ev.ErrorText) {
|
|
logger.DebugContext(ctx, fmt.Sprintf("skip EventLoadingFailed: '%s' is not part of %+v", ev.ErrorText, errors))
|
|
return
|
|
}
|
|
|
|
if ev.Type == network.ResourceTypeDocument {
|
|
// Supposition: except iframe, an event loading failed with a
|
|
// resource type Document is about the main page.
|
|
logger.DebugContext(ctx, "event EventLoadingFailed fired for main page")
|
|
|
|
options.loadingFailedMu.Lock()
|
|
defer options.loadingFailedMu.Unlock()
|
|
|
|
*options.loadingFailed = fmt.Errorf("%s", ev.ErrorText)
|
|
|
|
return
|
|
}
|
|
|
|
logger.DebugContext(ctx, "event EventLoadingFailed fired for a resource")
|
|
|
|
options.resourceLoadingFailedMu.Lock()
|
|
defer options.resourceLoadingFailedMu.Unlock()
|
|
|
|
*options.resourceLoadingFailed = multierr.Append(
|
|
*options.resourceLoadingFailed,
|
|
fmt.Errorf("resource %s: %s", ev.Type, ev.ErrorText),
|
|
)
|
|
}
|
|
})
|
|
}
|
|
|
|
// listenForEventExceptionThrown listens for exceptions in the console and
|
|
// appends those exceptions to the given error pointer.
|
|
// See https://github.com/gotenberg/gotenberg/issues/262.
|
|
func listenForEventExceptionThrown(ctx context.Context, logger *slog.Logger, consoleExceptions *error, consoleExceptionsMu *sync.RWMutex) {
|
|
chromedp.ListenTarget(ctx, func(ev any) {
|
|
if ev, ok := ev.(*runtime.EventExceptionThrown); ok {
|
|
logger.DebugContext(ctx, fmt.Sprintf("event EventExceptionThrown fired: %+v", ev.ExceptionDetails))
|
|
|
|
consoleExceptionsMu.Lock()
|
|
defer consoleExceptionsMu.Unlock()
|
|
|
|
*consoleExceptions = multierr.Append(*consoleExceptions, fmt.Errorf("\n%+v", ev.ExceptionDetails))
|
|
}
|
|
})
|
|
}
|
|
|
|
// waitForEventDomContentEventFired waits until the event DomContentEventFired
|
|
// is fired or the context timeout.
|
|
func waitForEventDomContentEventFired(ctx context.Context, logger *slog.Logger) func() error {
|
|
return func() error {
|
|
ch := make(chan struct{})
|
|
cctx, cancel := context.WithCancel(ctx)
|
|
chromedp.ListenTarget(cctx, func(ev any) {
|
|
if _, ok := ev.(*page.EventDomContentEventFired); ok {
|
|
cancel()
|
|
close(ch)
|
|
}
|
|
})
|
|
|
|
select {
|
|
case <-ch:
|
|
logger.DebugContext(ctx, "event DomContentEventFired fired")
|
|
return nil
|
|
case <-ctx.Done():
|
|
return fmt.Errorf("wait for event DomContentEventFired: %w", ctx.Err())
|
|
}
|
|
}
|
|
}
|
|
|
|
// waitForEventLoadEventFired waits until the event LoadEventFired is fired or
|
|
// the context timeout.
|
|
func waitForEventLoadEventFired(ctx context.Context, logger *slog.Logger) func() error {
|
|
return func() error {
|
|
ch := make(chan struct{})
|
|
cctx, cancel := context.WithCancel(ctx)
|
|
chromedp.ListenTarget(cctx, func(ev any) {
|
|
if _, ok := ev.(*page.EventLoadEventFired); ok {
|
|
cancel()
|
|
close(ch)
|
|
}
|
|
})
|
|
|
|
select {
|
|
case <-ch:
|
|
logger.DebugContext(ctx, "event LoadEventFired fired")
|
|
return nil
|
|
case <-ctx.Done():
|
|
return fmt.Errorf("wait for event LoadEventFired: %w", ctx.Err())
|
|
}
|
|
}
|
|
}
|
|
|
|
// waitForEventNetworkIdle waits until the event networkIdle is fired or the
|
|
// context timeout.
|
|
func waitForEventNetworkIdle(ctx context.Context, logger *slog.Logger) func() error {
|
|
return func() error {
|
|
ch := make(chan struct{})
|
|
cctx, cancel := context.WithCancel(ctx)
|
|
chromedp.ListenTarget(cctx, func(ev any) {
|
|
if e, ok := ev.(*page.EventLifecycleEvent); ok && e.Name == "networkIdle" {
|
|
cancel()
|
|
close(ch)
|
|
}
|
|
})
|
|
|
|
select {
|
|
case <-ch:
|
|
logger.DebugContext(ctx, "event networkIdle fired")
|
|
return nil
|
|
case <-ctx.Done():
|
|
return fmt.Errorf("wait for event networkIdle: %w", ctx.Err())
|
|
}
|
|
}
|
|
}
|
|
|
|
// waitForEventNetworkAlmostIdle waits until the event networkIdle2 is fired
|
|
// or the context timeout.
|
|
func waitForEventNetworkAlmostIdle(ctx context.Context, logger *slog.Logger) func() error {
|
|
return func() error {
|
|
ch := make(chan struct{})
|
|
cctx, cancel := context.WithCancel(ctx)
|
|
chromedp.ListenTarget(cctx, func(ev any) {
|
|
if e, ok := ev.(*page.EventLifecycleEvent); ok && e.Name == "networkIdle2" {
|
|
cancel()
|
|
close(ch)
|
|
}
|
|
})
|
|
|
|
select {
|
|
case <-ch:
|
|
logger.DebugContext(ctx, "event networkAlmostIdle fired")
|
|
return nil
|
|
case <-ctx.Done():
|
|
return fmt.Errorf("wait for event networkAlmostIdle: %w", ctx.Err())
|
|
}
|
|
}
|
|
}
|
|
|
|
// waitForEventLoadingFinished waits until the event LoadingFinished is fired
|
|
// or the context timeout.
|
|
func waitForEventLoadingFinished(ctx context.Context, logger *slog.Logger) func() error {
|
|
return func() error {
|
|
ch := make(chan struct{})
|
|
cctx, cancel := context.WithCancel(ctx)
|
|
chromedp.ListenTarget(cctx, func(ev any) {
|
|
if _, ok := ev.(*network.EventLoadingFinished); ok {
|
|
cancel()
|
|
close(ch)
|
|
}
|
|
})
|
|
|
|
select {
|
|
case <-ch:
|
|
logger.DebugContext(ctx, "event LoadingFinished fired")
|
|
return nil
|
|
case <-ctx.Done():
|
|
return fmt.Errorf("wait for event LoadingFinished: %w", ctx.Err())
|
|
}
|
|
}
|
|
}
|
|
|
|
// runBatch runs all functions simultaneously and waits until all of them are
|
|
// completed or an error is encountered.
|
|
func runBatch(ctx context.Context, fn ...func() error) error {
|
|
eg, _ := errgroup.WithContext(ctx)
|
|
for _, f := range fn {
|
|
eg.Go(f)
|
|
}
|
|
|
|
return eg.Wait()
|
|
}
|