From 07f05726caf9bb0d3518f3b2b1739c2c41e6a52a Mon Sep 17 00:00:00 2001
From: Yagiz Nizipli
Date: Tue, 28 Nov 2023 17:11:36 -0500
Subject: [PATCH] perf: improve URL parsing performance

parseUrl now tries the WHATWG URL API first when a global `URL`
constructor exists, and falls back to the previous regex when the global
is missing or `new URL()` rejects the input (for example URLs without a
protocol). The regex is hoisted to a module-level constant.

The parsed `URL` instance is carried on PartialURL as `urlInstance`, and
getSanitizedUrlString builds the sanitized string from it when present.

The availability guard compares `typeof URL` against the string
'undefined'; `typeof` always yields a string, so comparing it against
the value `undefined` would always be true.

---
 .../src/utils/parseSpanDescription.ts |  1 +
 packages/utils/src/url.ts             | 86 ++++++++++++++++---
 2 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/packages/opentelemetry/src/utils/parseSpanDescription.ts b/packages/opentelemetry/src/utils/parseSpanDescription.ts
index 784b268cc4f1..15a6f7f1de60 100644
--- a/packages/opentelemetry/src/utils/parseSpanDescription.ts
+++ b/packages/opentelemetry/src/utils/parseSpanDescription.ts
@@ -140,6 +140,7 @@ export function getSanitizedUrl(
   // This is the normalized route name - may not always be available!
   const httpRoute = attributes[SemanticAttributes.HTTP_ROUTE];
 
+  // TODO(@anonrig): Use WHATWG URL API when we drop Node.js v10 support.
   const parsedUrl = typeof httpUrl === 'string' ? parseUrl(httpUrl) : undefined;
   const url = parsedUrl ? getSanitizedUrlString(parsedUrl) : undefined;
   const query = parsedUrl && parsedUrl.search ? parsedUrl.search : undefined;
diff --git a/packages/utils/src/url.ts b/packages/utils/src/url.ts
index d5d773e27389..a73846765775 100644
--- a/packages/utils/src/url.ts
+++ b/packages/utils/src/url.ts
@@ -1,11 +1,14 @@
-type PartialURL = {
-  host?: string;
-  path?: string;
-  protocol?: string;
-  relative?: string;
-  search?: string;
-  hash?: string;
-};
+type PartialURL = Partial<{
+  host: string;
+  path: string;
+  protocol: string;
+  relative: string;
+  search: string;
+  hash: string;
+  urlInstance: URL;
+}>;
+
+const urlRegex = /^(([^:/?#]+):)?(\/\/([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/;
 
 /**
  * Parses string form of URL into an object
@@ -19,7 +22,32 @@ export function parseUrl(url: string): PartialURL {
     return {};
   }
 
-  const match = url.match(/^(([^:/?#]+):)?(\/\/([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/);
+  // Node.js v10 and above supports WHATWG URL API. We can use it when available.
+  // TODO(@anonrig): Remove this check when we drop support for Node v10.
+  if (typeof URL !== 'undefined') {
+    try {
+      const parsed = new URL(url);
+      const pathname = parsed.pathname;
+
+      return {
+        host: parsed.host,
+        // WHATWG URL API includes the leading slash in the pathname
+        // Example: Returns `/` for `https://sentry.io`
+        path: pathname.length === 1 ? '' : pathname,
+        // WHATWG URL API includes the trailing colon in the protocol
+        // Example: Returns `https:` for `https://sentry.io`
+        protocol: parsed.protocol.slice(0, -1),
+        search: parsed.search,
+        hash: parsed.hash,
+        relative: parsed.pathname + parsed.search + parsed.hash,
+        urlInstance: parsed,
+      };
+    } catch {
+      // If URL is invalid, fallback to regex parsing to support URLs without protocols.
+    }
+  }
+
+  const match = url.match(urlRegex);
 
   if (!match) {
     return {};
@@ -62,7 +90,44 @@ export function getNumberOfUrlSegments(url: string): number {
  * see: https://develop.sentry.dev/sdk/data-handling/#structuring-data
  */
 export function getSanitizedUrlString(url: PartialURL): string {
-  const { protocol, host, path } = url;
+  const { protocol, host, path, urlInstance } = url;
+
+  // This means that the environment supports WHATWG URL API.
+  // This case will not be executed if URL does not have a protocol
+  // since WHATWG URL specification requires protocol to be present.
+  if (urlInstance !== undefined) {
+    const { port, username, password, hostname, pathname, protocol } = urlInstance;
+    const hasAuthority = username.length > 0 || password.length > 0;
+    let output = `${protocol}//`;
+
+    if (hasAuthority) {
+      if (username) {
+        output += '[filtered]';
+      }
+
+      output += ':';
+
+      if (password) {
+        output += '[filtered]';
+      }
+
+      output += '@';
+    }
+
+    output += hostname;
+
+    if (port && port !== '80' && port !== '443') {
+      output += `:${port}`;
+    }
+
+    // Do not append pathname if it is empty.
+    // For example: Pathname is `/` for `https://sentry.io`
+    if (pathname.length > 1) {
+      output += pathname;
+    }
+
+    return output;
+  }
 
   const filteredHost =
     (host &&
@@ -70,7 +135,6 @@ export function getSanitizedUrlString(url: PartialURL): string {
         // Always filter out authority
         .replace(/^.*@/, '[filtered]:[filtered]@')
         // Don't show standard :80 (http) and :443 (https) ports to reduce the noise
-        // TODO: Use new URL global if it exists
         .replace(/(:80)$/, '')
         .replace(/(:443)$/, '')) ||
     '';