summaryrefslogtreecommitdiff
path: root/services/worker/internal/parser/parser.go
diff options
context:
space:
mode:
authorFuwn <[email protected]>2026-02-12 00:49:03 -0800
committerFuwn <[email protected]>2026-02-12 00:49:42 -0800
commit2911927a2d9fdd5616c2eda5643143f601068888 (patch)
tree79726e2f2babc4a1da58c30b59d22981fbbdfe26 /services/worker/internal/parser/parser.go
parentRedump latest Supabase schema (diff)
downloadasa.news-2911927a2d9fdd5616c2eda5643143f601068888.tar.xz
asa.news-2911927a2d9fdd5616c2eda5643143f601068888.zip
fix: prevent read entries from reverting to unread on re-fetch
Root cause: cleanup_stale_entries deleted read-but-unsaved entries from active feeds, then the Go worker re-inserted them with new UUIDs, orphaning the user_entry_states rows and making entries appear unread.

- cleanup_stale_entries: skip feeds with active subscribers and preserve entries that have been read (not just saved)
- Go parser: normalize GUIDs by trimming whitespace and stripping tracking query parameters from URL-based identifiers
- Go writer: preserve original published_at on upsert instead of overwriting, preventing old entries from jumping to timeline top
- get_unread_counts: apply same time boundary as get_timeline so ancient re-inserted entries don't inflate counts
- Realtime listener: ignore INSERT events for entries older than 48h to suppress misleading "new entries" notifications from re-inserts
Diffstat (limited to 'services/worker/internal/parser/parser.go')
-rw-r--r--services/worker/internal/parser/parser.go67
1 files changed, 62 insertions, 5 deletions
diff --git a/services/worker/internal/parser/parser.go b/services/worker/internal/parser/parser.go
index 32611e7..203a943 100644
--- a/services/worker/internal/parser/parser.go
+++ b/services/worker/internal/parser/parser.go
@@ -3,12 +3,13 @@ package parser
import (
"crypto/sha256"
"fmt"
- "github.com/Fuwn/asa-news/internal/model"
- "github.com/mmcdole/gofeed"
+ "net/url"
"strconv"
"strings"
"time"
"unicode/utf8"
+ "github.com/Fuwn/asa-news/internal/model"
+ "github.com/mmcdole/gofeed"
)
type Parser struct {
@@ -102,16 +103,72 @@ func stringPointerOrNil(value string) *string {
return &value
}
// trackingQueryParameters lists query-string keys that carry analytics or
// campaign metadata rather than identifying content. They are stripped from
// URL-based GUIDs so the same article yields a stable identifier even when a
// feed rotates its tracking tags between fetches. Lookup is done on the
// lowercased parameter name.
var trackingQueryParameters = map[string]bool{
	"utm_source":   true,
	"utm_medium":   true,
	"utm_campaign": true,
	"utm_term":     true,
	"utm_content":  true,
	"utm_id":       true,
	"ref":          true,
	"fbclid":       true,
	"gclid":        true,
	"mc_cid":       true,
	"mc_eid":       true,
	"_hsenc":       true,
	"_hsmi":        true,
	"source":       true,
	"dest":         true,
}

// normalizeGloballyUniqueIdentifier canonicalizes a feed item identifier so
// that a re-fetched entry maps onto the same database row. Surrounding
// whitespace is always trimmed. If the identifier is an HTTP(S) URL, tracking
// query parameters and the fragment are removed and the remaining query is
// re-encoded deterministically (url.Values.Encode sorts by key). Non-URL
// identifiers and unparseable URLs are returned trimmed but otherwise
// unchanged.
func normalizeGloballyUniqueIdentifier(rawIdentifier string) string {
	normalized := strings.TrimSpace(rawIdentifier)

	// URL schemes are case-insensitive (RFC 3986 §3.1); matching the prefix
	// case-insensitively keeps e.g. "HTTP://…" GUIDs from skipping
	// normalization. url.Parse lowercases the scheme on output.
	loweredPrefix := strings.ToLower(normalized)
	if !strings.HasPrefix(loweredPrefix, "http://") && !strings.HasPrefix(loweredPrefix, "https://") {
		return normalized
	}

	parsedURL, parseError := url.Parse(normalized)
	if parseError != nil {
		// Best effort: an unparseable URL still serves as an identifier.
		return normalized
	}

	filteredParameters := url.Values{}
	for parameterName, parameterValues := range parsedURL.Query() {
		if !trackingQueryParameters[strings.ToLower(parameterName)] {
			filteredParameters[parameterName] = parameterValues
		}
	}

	parsedURL.RawQuery = canonicalizeQueryString(filteredParameters)
	// Fragments are client-side only and cannot distinguish server content.
	parsedURL.Fragment = ""

	return parsedURL.String()
}

// canonicalizeQueryString renders parameters in a deterministic order:
// url.Values.Encode sorts by key, and an empty set yields "" so normalized
// URLs never end in a dangling "?".
func canonicalizeQueryString(parameters url.Values) string {
	if len(parameters) == 0 {
		return ""
	}

	return parameters.Encode()
}
+
func resolveGloballyUniqueIdentifier(feedItem *gofeed.Item) string {
if feedItem.GUID != "" {
- return feedItem.GUID
+ return normalizeGloballyUniqueIdentifier(feedItem.GUID)
}
if feedItem.Link != "" {
- return feedItem.Link
+ return normalizeGloballyUniqueIdentifier(feedItem.Link)
}
- hashInput := feedItem.Title + feedItem.Description
+ hashInput := strings.TrimSpace(feedItem.Title) + strings.TrimSpace(feedItem.Description)
hashBytes := sha256.Sum256([]byte(hashInput))
return fmt.Sprintf("sha256:%x", hashBytes)