summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFuwn <[email protected]>2026-02-12 04:37:39 -0800
committerFuwn <[email protected]>2026-02-12 04:37:39 -0800
commit7f0063e2815e5213b29ec726a7dd1f65a6a83a98 (patch)
tree088d5b2f3121c33bec66eab817fd7f1bfbfa0ed8
parentfix: keep scroll position stable when entry list re-sorts after read state ch... (diff)
downloadasa.news-7f0063e2815e5213b29ec726a7dd1f65a6a83a98.tar.xz
asa.news-7f0063e2815e5213b29ec726a7dd1f65a6a83a98.zip
feat: strip tracking parameters from entry URLs and filter tracking pixel images
Strip UTM, fbclid, gclid, and other tracking query parameters from entry URLs at parse time in the Go worker. Filter out tracking pixel images whose declared width or height is 3px or smaller from sanitized HTML content via exclusiveFilter.
-rw-r--r--apps/web/lib/sanitize.test.ts46
-rw-r--r--apps/web/lib/sanitize.ts15
-rw-r--r--services/worker/internal/parser/parser.go33
3 files changed, 92 insertions, 2 deletions
diff --git a/apps/web/lib/sanitize.test.ts b/apps/web/lib/sanitize.test.ts
index 266b8af..f47db80 100644
--- a/apps/web/lib/sanitize.test.ts
+++ b/apps/web/lib/sanitize.test.ts
@@ -44,4 +44,50 @@ describe("sanitizeEntryContent", () => {
const input = "<pre><code>const x = 1</code></pre>"
expect(sanitizeEntryContent(input)).toBe(input)
})
+
+ it("strips 1x1 tracking pixel images", () => {
+ const input = '<p>content</p><img src="https://tracker.example.com/pixel.gif" width="1" height="1">'
+ expect(sanitizeEntryContent(input)).toBe("<p>content</p>")
+ })
+
+ it("strips images with zero dimensions", () => {
+ const input = '<img src="https://tracker.example.com/pixel.gif" width="0" height="0">'
+ expect(sanitizeEntryContent(input)).toBe("")
+ })
+
+ it("strips images where only width is a tracking dimension", () => {
+ const input = '<img src="https://tracker.example.com/pixel.gif" width="1">'
+ expect(sanitizeEntryContent(input)).toBe("")
+ })
+
+ it("strips images where only height is a tracking dimension", () => {
+ const input = '<img src="https://tracker.example.com/pixel.gif" height="2">'
+ expect(sanitizeEntryContent(input)).toBe("")
+ })
+
+ it("preserves normal-sized images", () => {
+ const input = '<img src="https://example.com/photo.jpg" width="800" height="600">'
+ const result = sanitizeEntryContent(input)
+ expect(result).toContain("src=")
+ expect(result).toContain("width=")
+ expect(result).toContain("height=")
+ })
+
+ it("preserves images without dimension attributes", () => {
+ const input = '<img src="https://example.com/photo.jpg" alt="a photo">'
+ const result = sanitizeEntryContent(input)
+ expect(result).toContain("src=")
+ expect(result).toContain("alt=")
+ })
+
+ it("strips 3x3 tracking pixel at threshold boundary", () => {
+ const input = '<img src="https://tracker.example.com/pixel.gif" width="3" height="3">'
+ expect(sanitizeEntryContent(input)).toBe("")
+ })
+
+ it("preserves images just above tracking pixel threshold", () => {
+ const input = '<img src="https://example.com/icon.png" width="4" height="4">'
+ const result = sanitizeEntryContent(input)
+ expect(result).toContain("src=")
+ })
})
diff --git a/apps/web/lib/sanitize.ts b/apps/web/lib/sanitize.ts
index b63cee1..3a85016 100644
--- a/apps/web/lib/sanitize.ts
+++ b/apps/web/lib/sanitize.ts
@@ -1,5 +1,19 @@
import sanitizeHtml from "sanitize-html"
+const TRACKING_PIXEL_DIMENSION_THRESHOLD = 3
+
+function isTrackingPixel(tagName: string, attributes: Record<string, string>): boolean { // Reports whether an element is an <img> whose declared width or height is at or below TRACKING_PIXEL_DIMENSION_THRESHOLD.
+ if (tagName !== "img") return false // only <img> elements are candidates; every other tag is kept
+
+ const width = parseInt(attributes.width, 10) // NaN when the attribute is missing or does not begin with an integer
+ const height = parseInt(attributes.height, 10) // parsed with the same rule as width
+
+ if (!isNaN(width) && width <= TRACKING_PIXEL_DIMENSION_THRESHOLD) return true // a small width alone is enough; height is not consulted
+ if (!isNaN(height) && height <= TRACKING_PIXEL_DIMENSION_THRESHOLD) return true // a small height alone is enough, whatever the width was
+
+ return false // no parseable dimension at or below the threshold, including images with no width/height at all
+}
+
const SANITIZE_OPTIONS: sanitizeHtml.IOptions = {
allowedTags: [
"h1",
@@ -36,6 +50,7 @@ const SANITIZE_OPTIONS: sanitizeHtml.IOptions = {
img: ["src", "alt", "title", "width", "height"],
},
allowedSchemes: ["http", "https"],
+ exclusiveFilter: (frame) => isTrackingPixel(frame.tag, frame.attribs),
}
export function sanitizeEntryContent(htmlContent: string): string {
diff --git a/services/worker/internal/parser/parser.go b/services/worker/internal/parser/parser.go
index 203a943..b5307e8 100644
--- a/services/worker/internal/parser/parser.go
+++ b/services/worker/internal/parser/parser.go
@@ -174,13 +174,42 @@ func resolveGloballyUniqueIdentifier(feedItem *gofeed.Item) string {
return fmt.Sprintf("sha256:%x", hashBytes)
}
+func stripTrackingParameters(rawURL string) string { // stripTrackingParameters returns the whitespace-trimmed rawURL with every query parameter whose lowercased name is set in trackingQueryParameters removed.
+ trimmedURL := strings.TrimSpace(rawURL)
+
+ if !strings.HasPrefix(trimmedURL, "http://") && !strings.HasPrefix(trimmedURL, "https://") { // case-sensitive scheme check; anything else is returned trimmed but otherwise untouched
+ return trimmedURL
+ }
+
+ parsedURL, parseError := url.Parse(trimmedURL)
+
+ if parseError != nil { // an unparseable URL is passed through as-is rather than dropped
+ return trimmedURL
+ }
+
+ queryParameters := parsedURL.Query() // NOTE: per net/url docs, Query silently discards malformed value pairs
+ filteredParameters := url.Values{}
+
+ for parameterName, parameterValues := range queryParameters {
+ loweredName := strings.ToLower(parameterName) // lookup ignores the parameter's case; assumes trackingQueryParameters keys are lowercase — TODO confirm
+
+ if !trackingQueryParameters[loweredName] {
+ filteredParameters[parameterName] = parameterValues // keep the original-case name together with all of its values
+ }
+ }
+
+ parsedURL.RawQuery = canonicalizeQueryString(filteredParameters) // the query is rebuilt by canonicalizeQueryString (defined outside this hunk) even when nothing was removed
+
+ return parsedURL.String()
+}
+
func resolveEntryURL(feedItem *gofeed.Item) string {
if feedItem.Link != "" {
- return feedItem.Link
+ return stripTrackingParameters(feedItem.Link)
}
if feedItem.GUID != "" && strings.HasPrefix(feedItem.GUID, "http") {
- return feedItem.GUID
+ return stripTrackingParameters(feedItem.GUID)
}
return ""