summary refs log tree commit diff
path: root/services/worker/internal/parser/parser.go
diff options
context:
space:
mode:
Diffstat (limited to 'services/worker/internal/parser/parser.go')
-rw-r--r-- services/worker/internal/parser/parser.go 67
1 files changed, 62 insertions, 5 deletions
diff --git a/services/worker/internal/parser/parser.go b/services/worker/internal/parser/parser.go
index 32611e7..203a943 100644
--- a/services/worker/internal/parser/parser.go
+++ b/services/worker/internal/parser/parser.go
@@ -3,12 +3,13 @@ package parser
import (
"crypto/sha256"
"fmt"
- "github.com/Fuwn/asa-news/internal/model"
- "github.com/mmcdole/gofeed"
+ "net/url"
"strconv"
"strings"
"time"
"unicode/utf8"
+ "github.com/Fuwn/asa-news/internal/model"
+ "github.com/mmcdole/gofeed"
)
type Parser struct {
@@ -102,16 +103,72 @@ func stringPointerOrNil(value string) *string {
return &value
}
+// trackingQueryParameters is the set of lower-case query parameter names that
+// normalizeGloballyUniqueIdentifier removes from http(s) identifiers. Lookups
+// rely on the map's zero value: a name that is absent yields false.
+//
+// NOTE(review): "dest", "ref" and "source" are generic names. Confirm that no
+// subscribed feed relies on them to tell otherwise identical item URLs apart.
+var trackingQueryParameters = map[string]bool{
+	// utm_* campaign tags.
+	"utm_campaign": true,
+	"utm_content":  true,
+	"utm_id":       true,
+	"utm_medium":   true,
+	"utm_source":   true,
+	"utm_term":     true,
+
+	// Click identifiers.
+	"fbclid": true,
+	"gclid":  true,
+
+	// mc_* and _hs* identifiers.
+	"mc_cid": true,
+	"mc_eid": true,
+	"_hsenc": true,
+	"_hsmi":  true,
+
+	// Generic referral-style names.
+	"dest":   true,
+	"ref":    true,
+	"source": true,
+}
+
+// normalizeGloballyUniqueIdentifier returns a canonical form of rawIdentifier
+// so that the same item is identified identically across fetches.
+//
+// Surrounding whitespace is always trimmed. Identifiers that do not start with
+// an http:// or https:// scheme (matched case-insensitively) are returned
+// otherwise unchanged. For http(s) identifiers, query parameters listed in
+// trackingQueryParameters are removed (names compared case-insensitively), the
+// remaining query is re-encoded through canonicalizeQueryString, and the
+// fragment is dropped.
+//
+// If the identifier or its query string cannot be parsed, the trimmed input is
+// returned as-is rather than a lossy rewrite of it.
+func normalizeGloballyUniqueIdentifier(rawIdentifier string) string {
+	normalized := strings.TrimSpace(rawIdentifier)
+
+	// URL schemes are case-insensitive, so "HTTPS://..." must be recognised too.
+	loweredIdentifier := strings.ToLower(normalized)
+
+	if !strings.HasPrefix(loweredIdentifier, "http://") && !strings.HasPrefix(loweredIdentifier, "https://") {
+		return normalized
+	}
+
+	parsedURL, parseError := url.Parse(normalized)
+
+	if parseError != nil {
+		return normalized
+	}
+
+	// url.URL.Query discards the parse error and silently drops malformed
+	// pairs, which could collapse distinct identifiers into one. Parse
+	// explicitly and leave the identifier untouched when the query is invalid.
+	queryParameters, queryError := url.ParseQuery(parsedURL.RawQuery)
+
+	if queryError != nil {
+		return normalized
+	}
+
+	for parameterName := range queryParameters {
+		if trackingQueryParameters[strings.ToLower(parameterName)] {
+			delete(queryParameters, parameterName)
+		}
+	}
+
+	parsedURL.RawQuery = canonicalizeQueryString(queryParameters)
+	// Without this, an input ending in a bare "?" would keep that "?" and
+	// differ from the same URL written without it.
+	parsedURL.ForceQuery = false
+	parsedURL.Fragment = ""
+
+	return parsedURL.String()
+}
+
+// canonicalizeQueryString serializes parameters into URL-encoded form with
+// keys in sorted order, so equal parameter sets always produce the same
+// string. An empty or nil set produces the empty string.
+func canonicalizeQueryString(parameters url.Values) string {
+	// Encode already returns "" when there is nothing to encode, so no
+	// separate empty check is required.
+	return parameters.Encode()
+}
+
func resolveGloballyUniqueIdentifier(feedItem *gofeed.Item) string {
if feedItem.GUID != "" {
- return feedItem.GUID
+ return normalizeGloballyUniqueIdentifier(feedItem.GUID)
}
if feedItem.Link != "" {
- return feedItem.Link
+ return normalizeGloballyUniqueIdentifier(feedItem.Link)
}
- hashInput := feedItem.Title + feedItem.Description
+ hashInput := strings.TrimSpace(feedItem.Title) + strings.TrimSpace(feedItem.Description)
hashBytes := sha256.Sum256([]byte(hashInput))
return fmt.Sprintf("sha256:%x", hashBytes)