diff options
| author | Fuwn <[email protected]> | 2026-02-12 04:37:39 -0800 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2026-02-12 04:37:39 -0800 |
| commit | 7f0063e2815e5213b29ec726a7dd1f65a6a83a98 (patch) | |
| tree | 088d5b2f3121c33bec66eab817fd7f1bfbfa0ed8 | |
| parent | fix: keep scroll position stable when entry list re-sorts after read state ch... (diff) | |
| download | asa.news-7f0063e2815e5213b29ec726a7dd1f65a6a83a98.tar.xz asa.news-7f0063e2815e5213b29ec726a7dd1f65a6a83a98.zip | |
feat: strip tracking parameters from entry URLs and filter tracking pixel images
Strip UTM, fbclid, gclid, and other tracking query parameters from entry URLs
at parse time in the Go worker. Filter out tracking pixel images (any image whose
width or height is 3px or less) from sanitized HTML content via exclusiveFilter.
| -rw-r--r-- | apps/web/lib/sanitize.test.ts | 46 | ||||
| -rw-r--r-- | apps/web/lib/sanitize.ts | 15 | ||||
| -rw-r--r-- | services/worker/internal/parser/parser.go | 33 |
3 files changed, 92 insertions, 2 deletions
diff --git a/apps/web/lib/sanitize.test.ts b/apps/web/lib/sanitize.test.ts index 266b8af..f47db80 100644 --- a/apps/web/lib/sanitize.test.ts +++ b/apps/web/lib/sanitize.test.ts @@ -44,4 +44,50 @@ describe("sanitizeEntryContent", () => { const input = "<pre><code>const x = 1</code></pre>" expect(sanitizeEntryContent(input)).toBe(input) }) + + it("strips 1x1 tracking pixel images", () => { + const input = '<p>content</p><img src="https://tracker.example.com/pixel.gif" width="1" height="1">' + expect(sanitizeEntryContent(input)).toBe("<p>content</p>") + }) + + it("strips images with zero dimensions", () => { + const input = '<img src="https://tracker.example.com/pixel.gif" width="0" height="0">' + expect(sanitizeEntryContent(input)).toBe("") + }) + + it("strips images where only width is a tracking dimension", () => { + const input = '<img src="https://tracker.example.com/pixel.gif" width="1">' + expect(sanitizeEntryContent(input)).toBe("") + }) + + it("strips images where only height is a tracking dimension", () => { + const input = '<img src="https://tracker.example.com/pixel.gif" height="2">' + expect(sanitizeEntryContent(input)).toBe("") + }) + + it("preserves normal-sized images", () => { + const input = '<img src="https://example.com/photo.jpg" width="800" height="600">' + const result = sanitizeEntryContent(input) + expect(result).toContain("src=") + expect(result).toContain("width=") + expect(result).toContain("height=") + }) + + it("preserves images without dimension attributes", () => { + const input = '<img src="https://example.com/photo.jpg" alt="a photo">' + const result = sanitizeEntryContent(input) + expect(result).toContain("src=") + expect(result).toContain("alt=") + }) + + it("strips 3x3 tracking pixel at threshold boundary", () => { + const input = '<img src="https://tracker.example.com/pixel.gif" width="3" height="3">' + expect(sanitizeEntryContent(input)).toBe("") + }) + + it("preserves images just above tracking pixel threshold", () => { + const 
input = '<img src="https://example.com/icon.png" width="4" height="4">' + const result = sanitizeEntryContent(input) + expect(result).toContain("src=") + }) }) diff --git a/apps/web/lib/sanitize.ts b/apps/web/lib/sanitize.ts index b63cee1..3a85016 100644 --- a/apps/web/lib/sanitize.ts +++ b/apps/web/lib/sanitize.ts @@ -1,5 +1,19 @@ import sanitizeHtml from "sanitize-html" +const TRACKING_PIXEL_DIMENSION_THRESHOLD = 3 + +function isTrackingPixel(tagName: string, attributes: Record<string, string>): boolean { + if (tagName !== "img") return false + + const width = parseInt(attributes.width, 10) + const height = parseInt(attributes.height, 10) + + if (!isNaN(width) && width <= TRACKING_PIXEL_DIMENSION_THRESHOLD) return true + if (!isNaN(height) && height <= TRACKING_PIXEL_DIMENSION_THRESHOLD) return true + + return false +} + const SANITIZE_OPTIONS: sanitizeHtml.IOptions = { allowedTags: [ "h1", @@ -36,6 +50,7 @@ const SANITIZE_OPTIONS: sanitizeHtml.IOptions = { img: ["src", "alt", "title", "width", "height"], }, allowedSchemes: ["http", "https"], + exclusiveFilter: (frame) => isTrackingPixel(frame.tag, frame.attribs), } export function sanitizeEntryContent(htmlContent: string): string { diff --git a/services/worker/internal/parser/parser.go b/services/worker/internal/parser/parser.go index 203a943..b5307e8 100644 --- a/services/worker/internal/parser/parser.go +++ b/services/worker/internal/parser/parser.go @@ -174,13 +174,42 @@ func resolveGloballyUniqueIdentifier(feedItem *gofeed.Item) string { return fmt.Sprintf("sha256:%x", hashBytes) } +func stripTrackingParameters(rawURL string) string { + trimmedURL := strings.TrimSpace(rawURL) + + if !strings.HasPrefix(trimmedURL, "http://") && !strings.HasPrefix(trimmedURL, "https://") { + return trimmedURL + } + + parsedURL, parseError := url.Parse(trimmedURL) + + if parseError != nil { + return trimmedURL + } + + queryParameters := parsedURL.Query() + filteredParameters := url.Values{} + + for parameterName, 
parameterValues := range queryParameters { + loweredName := strings.ToLower(parameterName) + + if !trackingQueryParameters[loweredName] { + filteredParameters[parameterName] = parameterValues + } + } + + parsedURL.RawQuery = canonicalizeQueryString(filteredParameters) + + return parsedURL.String() +} + func resolveEntryURL(feedItem *gofeed.Item) string { if feedItem.Link != "" { - return feedItem.Link + return stripTrackingParameters(feedItem.Link) } if feedItem.GUID != "" && strings.HasPrefix(feedItem.GUID, "http") { - return feedItem.GUID + return stripTrackingParameters(feedItem.GUID) } return "" |