diff options
| author | Fuwn <[email protected]> | 2026-02-12 00:49:03 -0800 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2026-02-12 00:49:42 -0800 |
| commit | 2911927a2d9fdd5616c2eda5643143f601068888 (patch) | |
| tree | 79726e2f2babc4a1da58c30b59d22981fbbdfe26 /services/worker/internal/parser/parser.go | |
| parent | Redump latest Supabase schema (diff) | |
| download | asa.news-2911927a2d9fdd5616c2eda5643143f601068888.tar.xz asa.news-2911927a2d9fdd5616c2eda5643143f601068888.zip | |
fix: prevent read entries from reverting to unread on re-fetch
Root cause: cleanup_stale_entries deleted read-but-unsaved entries from
active feeds, then the Go worker re-inserted them with new UUIDs,
orphaning the user_entry_states rows and making entries appear unread.
- cleanup_stale_entries: skip feeds with active subscribers and preserve
entries that have been read (not just saved)
- Go parser: normalize GUIDs by trimming whitespace and stripping
tracking query parameters from URL-based identifiers
- Go writer: preserve original published_at on upsert instead of
overwriting, preventing old entries from jumping to timeline top
- get_unread_counts: apply same time boundary as get_timeline so
ancient re-inserted entries don't inflate counts
- Realtime listener: ignore INSERT events for entries older than 48h
to suppress misleading "new entries" notifications from re-inserts
Diffstat (limited to 'services/worker/internal/parser/parser.go')
| -rw-r--r-- | services/worker/internal/parser/parser.go | 67 |
1 file changed, 62 insertions, 5 deletions
// trackingQueryParameters lists query-string keys that carry analytics or
// click-tracking state rather than content identity. They are stripped from
// URL-based GUIDs so the same entry fetched with different tracking tags
// still normalizes to a single identifier (preventing duplicate re-inserts
// that orphan user_entry_states rows — see commit message).
//
// NOTE(review): "source" and "dest" are generic parameter names that some
// publishers use for meaningful routing, not tracking — confirm stripping
// them never changes the target resource for the feeds we ingest.
var trackingQueryParameters = map[string]bool{
	"utm_source":   true,
	"utm_medium":   true,
	"utm_campaign": true,
	"utm_term":     true,
	"utm_content":  true,
	"utm_id":       true,
	"ref":          true,
	"fbclid":       true,
	"gclid":        true,
	"mc_cid":       true,
	"mc_eid":       true,
	"_hsenc":       true,
	"_hsmi":        true,
	"source":       true,
	"dest":         true,
}

// normalizeGloballyUniqueIdentifier canonicalizes a feed item GUID so that
// re-fetches of the same entry produce a stable identifier. Whitespace is
// trimmed; for http(s) URLs, tracking query parameters (matched
// case-insensitively) and the fragment are removed and the remaining query
// is re-encoded in deterministic sorted-key order. Non-URL identifiers and
// unparseable URLs are returned trimmed but otherwise unchanged.
func normalizeGloballyUniqueIdentifier(rawIdentifier string) string {
	normalized := strings.TrimSpace(rawIdentifier)

	// Only URL-shaped GUIDs are rewritten; opaque GUIDs (tag: URIs, hashes,
	// publisher-specific IDs) must round-trip untouched.
	if !strings.HasPrefix(normalized, "http://") && !strings.HasPrefix(normalized, "https://") {
		return normalized
	}

	parsedURL, parseError := url.Parse(normalized)
	if parseError != nil {
		// An unparseable URL is still usable as an opaque identifier.
		return normalized
	}

	filteredParameters := url.Values{}

	for parameterName, parameterValues := range parsedURL.Query() {
		// Lowercase before lookup so variants like "UTM_Source" are caught;
		// kept parameters retain their original casing.
		if !trackingQueryParameters[strings.ToLower(parameterName)] {
			filteredParameters[parameterName] = parameterValues
		}
	}

	parsedURL.RawQuery = canonicalizeQueryString(filteredParameters)
	// Fragments never reach the server and never distinguish entries.
	parsedURL.Fragment = ""

	return parsedURL.String()
}

// canonicalizeQueryString renders query parameters in a deterministic
// (sorted-key, percent-encoded) form. url.Values.Encode already returns ""
// for an empty or nil set, so no empty-set special case is needed.
func canonicalizeQueryString(parameters url.Values) string {
	return parameters.Encode()
}