From 7f0063e2815e5213b29ec726a7dd1f65a6a83a98 Mon Sep 17 00:00:00 2001 From: Fuwn Date: Thu, 12 Feb 2026 04:37:39 -0800 Subject: feat: strip tracking parameters from entry URLs and filter tracking pixel images Strip UTM, fbclid, gclid, and other tracking query parameters from entry URLs at parse time in the Go worker. Filter out tracking pixel images whose width or height is 3px or smaller from sanitized HTML content via exclusiveFilter. --- apps/web/lib/sanitize.test.ts | 46 +++++++++++++++++++++++++++++++ apps/web/lib/sanitize.ts | 15 ++++++++++ services/worker/internal/parser/parser.go | 33 ++++++++++++++++++++-- 3 files changed, 92 insertions(+), 2 deletions(-) diff --git a/apps/web/lib/sanitize.test.ts b/apps/web/lib/sanitize.test.ts index 266b8af..f47db80 100644 --- a/apps/web/lib/sanitize.test.ts +++ b/apps/web/lib/sanitize.test.ts @@ -44,4 +44,50 @@ describe("sanitizeEntryContent", () => { const input = "
const x = 1
" expect(sanitizeEntryContent(input)).toBe(input) }) + + it("strips 1x1 tracking pixel images", () => { + const input = '

content

' + expect(sanitizeEntryContent(input)).toBe("

content

") + }) + + it("strips images with zero dimensions", () => { + const input = '' + expect(sanitizeEntryContent(input)).toBe("") + }) + + it("strips images where only width is a tracking dimension", () => { + const input = '' + expect(sanitizeEntryContent(input)).toBe("") + }) + + it("strips images where only height is a tracking dimension", () => { + const input = '' + expect(sanitizeEntryContent(input)).toBe("") + }) + + it("preserves normal-sized images", () => { + const input = '' + const result = sanitizeEntryContent(input) + expect(result).toContain("src=") + expect(result).toContain("width=") + expect(result).toContain("height=") + }) + + it("preserves images without dimension attributes", () => { + const input = 'a photo' + const result = sanitizeEntryContent(input) + expect(result).toContain("src=") + expect(result).toContain("alt=") + }) + + it("strips 3x3 tracking pixel at threshold boundary", () => { + const input = '' + expect(sanitizeEntryContent(input)).toBe("") + }) + + it("preserves images just above tracking pixel threshold", () => { + const input = '' + const result = sanitizeEntryContent(input) + expect(result).toContain("src=") + }) }) diff --git a/apps/web/lib/sanitize.ts b/apps/web/lib/sanitize.ts index b63cee1..3a85016 100644 --- a/apps/web/lib/sanitize.ts +++ b/apps/web/lib/sanitize.ts @@ -1,5 +1,19 @@ import sanitizeHtml from "sanitize-html" +const TRACKING_PIXEL_DIMENSION_THRESHOLD = 3 + +function isTrackingPixel(tagName: string, attributes: Record<string, string>): boolean { + if (tagName !== "img") return false + + const width = parseInt(attributes.width, 10) + const height = parseInt(attributes.height, 10) + + if (!isNaN(width) && width <= TRACKING_PIXEL_DIMENSION_THRESHOLD) return true + if (!isNaN(height) && height <= TRACKING_PIXEL_DIMENSION_THRESHOLD) return true + + return false +} + const SANITIZE_OPTIONS: sanitizeHtml.IOptions = { allowedTags: [ "h1", @@ -36,6 +50,7 @@ const SANITIZE_OPTIONS: sanitizeHtml.IOptions = { img: ["src", "alt", "title", 
"width", "height"], }, allowedSchemes: ["http", "https"], + exclusiveFilter: (frame) => isTrackingPixel(frame.tag, frame.attribs), } export function sanitizeEntryContent(htmlContent: string): string { diff --git a/services/worker/internal/parser/parser.go b/services/worker/internal/parser/parser.go index 203a943..b5307e8 100644 --- a/services/worker/internal/parser/parser.go +++ b/services/worker/internal/parser/parser.go @@ -174,13 +174,42 @@ func resolveGloballyUniqueIdentifier(feedItem *gofeed.Item) string { return fmt.Sprintf("sha256:%x", hashBytes) } +func stripTrackingParameters(rawURL string) string { + trimmedURL := strings.TrimSpace(rawURL) + + if !strings.HasPrefix(trimmedURL, "http://") && !strings.HasPrefix(trimmedURL, "https://") { + return trimmedURL + } + + parsedURL, parseError := url.Parse(trimmedURL) + + if parseError != nil { + return trimmedURL + } + + queryParameters := parsedURL.Query() + filteredParameters := url.Values{} + + for parameterName, parameterValues := range queryParameters { + loweredName := strings.ToLower(parameterName) + + if !trackingQueryParameters[loweredName] { + filteredParameters[parameterName] = parameterValues + } + } + + parsedURL.RawQuery = canonicalizeQueryString(filteredParameters) + + return parsedURL.String() +} + func resolveEntryURL(feedItem *gofeed.Item) string { if feedItem.Link != "" { - return feedItem.Link + return stripTrackingParameters(feedItem.Link) } if feedItem.GUID != "" && strings.HasPrefix(feedItem.GUID, "http") { - return feedItem.GUID + return stripTrackingParameters(feedItem.GUID) } return "" -- cgit v1.2.3