diff options
Diffstat (limited to 'services/worker/internal/parser/parser.go')
| -rw-r--r-- | services/worker/internal/parser/parser.go | 67 |
1 files changed, 62 insertions, 5 deletions
diff --git a/services/worker/internal/parser/parser.go b/services/worker/internal/parser/parser.go
index 32611e7..203a943 100644
--- a/services/worker/internal/parser/parser.go
+++ b/services/worker/internal/parser/parser.go
@@ -3,12 +3,13 @@ package parser
 import (
 	"crypto/sha256"
 	"fmt"
-	"github.com/Fuwn/asa-news/internal/model"
-	"github.com/mmcdole/gofeed"
+	"net/url"
 	"strconv"
 	"strings"
 	"time"
 	"unicode/utf8"
+	"github.com/Fuwn/asa-news/internal/model"
+	"github.com/mmcdole/gofeed"
 )
 
 type Parser struct {
@@ -102,16 +103,72 @@ func stringPointerOrNil(value string) *string {
 	return &value
 }
 
+var trackingQueryParameters = map[string]bool{
+	"utm_source": true,
+	"utm_medium": true,
+	"utm_campaign": true,
+	"utm_term": true,
+	"utm_content": true,
+	"utm_id": true,
+	"ref": true,
+	"fbclid": true,
+	"gclid": true,
+	"mc_cid": true,
+	"mc_eid": true,
+	"_hsenc": true,
+	"_hsmi": true,
+	"source": true,
+	"dest": true,
+}
+
+func normalizeGloballyUniqueIdentifier(rawIdentifier string) string {
+	normalized := strings.TrimSpace(rawIdentifier)
+
+	if !strings.HasPrefix(normalized, "http://") && !strings.HasPrefix(normalized, "https://") {
+		return normalized
+	}
+
+	parsedURL, parseError := url.Parse(normalized)
+
+	if parseError != nil {
+		return normalized
+	}
+
+	queryParameters := parsedURL.Query()
+	filteredParameters := url.Values{}
+
+	for parameterName, parameterValues := range queryParameters {
+		loweredName := strings.ToLower(parameterName)
+
+		if !trackingQueryParameters[loweredName] {
+			filteredParameters[parameterName] = parameterValues
+		}
+	}
+
+	parsedURL.RawQuery = canonicalizeQueryString(filteredParameters)
+	parsedURL.Fragment = ""
+
+	return parsedURL.String()
+}
+
+func canonicalizeQueryString(parameters url.Values) string {
+	if len(parameters) == 0 {
+		return ""
+	}
+
+	return parameters.Encode()
+}
+
 func resolveGloballyUniqueIdentifier(feedItem *gofeed.Item) string {
 	if feedItem.GUID != "" {
-		return feedItem.GUID
+		return normalizeGloballyUniqueIdentifier(feedItem.GUID)
 	}
 
 	if feedItem.Link != "" {
-		return feedItem.Link
+		return normalizeGloballyUniqueIdentifier(feedItem.Link)
 	}
 
-	hashInput := feedItem.Title + feedItem.Description
+	hashInput := strings.TrimSpace(feedItem.Title) + strings.TrimSpace(feedItem.Description)
 	hashBytes := sha256.Sum256([]byte(hashInput))
 
 	return fmt.Sprintf("sha256:%x", hashBytes)