// Package parser turns raw feed documents into normalized model.FeedEntry
// values, using gofeed for the actual format parsing.
package parser

import (
	"bytes"
	"crypto/sha256"
	"fmt"
	"net/url"
	"strconv"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/Fuwn/asa-news/internal/model"
	"github.com/mmcdole/gofeed"
)

// Parser wraps a gofeed parser and normalizes the items it produces.
type Parser struct {
	gofeedParser *gofeed.Parser
}

// NewParser returns a Parser backed by a freshly constructed gofeed parser.
func NewParser() *Parser {
	return &Parser{
		gofeedParser: gofeed.NewParser(),
	}
}

// ParseResult is the outcome of parsing one feed document.
type ParseResult struct {
	// Entries holds one normalized entry per item in the feed, in feed order.
	Entries []model.FeedEntry

	// FeedTitle is the feed's title with surrounding whitespace removed.
	FeedTitle string

	// SiteURL is the feed's link with surrounding whitespace removed.
	SiteURL string

	// AudioEnclosureRatio is the fraction of entries that carry an audio
	// enclosure; it is 0 when the feed has no entries.
	AudioEnclosureRatio float64
}

// Parse parses rawFeedContent and normalizes every item into a
// model.FeedEntry. feedIdentifier and ownerIdentifier are copied onto each
// entry unchanged. It returns a wrapped error when gofeed cannot parse the
// content.
func (feedParser *Parser) Parse(feedIdentifier string, ownerIdentifier *string, rawFeedContent []byte) (*ParseResult, error) {
	// Read directly from the byte slice; converting to a string first would
	// copy the whole payload only for gofeed to wrap it in a reader again.
	parsedFeed, parseError := feedParser.gofeedParser.Parse(bytes.NewReader(rawFeedContent))
	if parseError != nil {
		return nil, fmt.Errorf("failed to parse feed content: %w", parseError)
	}

	feedEntries := make([]model.FeedEntry, 0, len(parsedFeed.Items))
	audioEnclosureCount := 0

	for _, feedItem := range parsedFeed.Items {
		normalizedEntry := normalizeFeedItem(feedIdentifier, ownerIdentifier, feedItem)

		// EnclosureURL is only populated for audio enclosures.
		if normalizedEntry.EnclosureURL != nil {
			audioEnclosureCount++
		}

		feedEntries = append(feedEntries, normalizedEntry)
	}

	audioEnclosureRatio := 0.0
	if len(feedEntries) > 0 {
		audioEnclosureRatio = float64(audioEnclosureCount) / float64(len(feedEntries))
	}

	return &ParseResult{
		Entries:             feedEntries,
		FeedTitle:           strings.TrimSpace(parsedFeed.Title),
		SiteURL:             strings.TrimSpace(parsedFeed.Link),
		AudioEnclosureRatio: audioEnclosureRatio,
	}, nil
}

// normalizeFeedItem builds a model.FeedEntry from a single gofeed item.
// Optional string fields are nil when the resolved value is empty.
func normalizeFeedItem(feedIdentifier string, ownerIdentifier *string, feedItem *gofeed.Item) model.FeedEntry {
	globallyUniqueIdentifier := resolveGloballyUniqueIdentifier(feedItem)
	entryURL := stringPointerOrNil(resolveEntryURL(feedItem))
	entryTitle := stringPointerOrNil(strings.TrimSpace(feedItem.Title))
	entrySummary := stringPointerOrNil(strings.TrimSpace(feedItem.Description))
	entryContentHTML := stringPointerOrNil(resolveContentHTML(feedItem))
	entryContentText := resolveContentText(feedItem)
	authorName := stringPointerOrNil(resolveAuthorName(feedItem))
	publishedAt := resolvePublishedDate(feedItem)
	entryImageURL := stringPointerOrNil(resolveImageURL(feedItem))
	wordCount := countWords(entryContentText)
	enclosureURL, enclosureType, enclosureLength := resolveAudioEnclosure(feedItem)

	return model.FeedEntry{
		FeedIdentifier:  feedIdentifier,
		OwnerIdentifier: ownerIdentifier,
		GUID:            globallyUniqueIdentifier,
		URL:             entryURL,
		Title:           entryTitle,
		Author:          authorName,
		Summary:         entrySummary,
		ContentHTML:     entryContentHTML,
		ContentText:     stringPointerOrNil(entryContentText),
		ImageURL:        entryImageURL,
		PublishedAt:     publishedAt,
		WordCount:       wordCount,
		EnclosureURL:    enclosureURL,
		EnclosureType:   enclosureType,
		EnclosureLength: enclosureLength,
	}
}

// stringPointerOrNil returns nil for the empty string and a pointer to a copy
// of value otherwise.
func stringPointerOrNil(value string) *string {
	if value == "" {
		return nil
	}

	return &value
}

// trackingQueryParameters lists the lower-cased query parameter names that
// are removed from entry URLs and URL-shaped GUIDs.
var trackingQueryParameters = map[string]bool{
	"utm_source":   true,
	"utm_medium":   true,
	"utm_campaign": true,
	"utm_term":     true,
	"utm_content":  true,
	"utm_id":       true,
	"ref":          true,
	"fbclid":       true,
	"gclid":        true,
	"mc_cid":       true,
	"mc_eid":       true,
	"_hsenc":       true,
	"_hsmi":        true,
	"source":       true,
	"dest":         true,
}

// hasHTTPScheme reports whether candidate starts with "http://" or
// "https://". The match is case-sensitive.
func hasHTTPScheme(candidate string) bool {
	return strings.HasPrefix(candidate, "http://") || strings.HasPrefix(candidate, "https://")
}

// sanitizeHTTPURL trims rawURL and, when the result is an http(s) URL that
// parses, removes tracking query parameters and re-encodes the remaining
// query. When dropFragment is true the fragment is removed as well. Values
// that are not http(s) URLs, or that fail to parse, are returned trimmed but
// otherwise untouched.
func sanitizeHTTPURL(rawURL string, dropFragment bool) string {
	trimmedURL := strings.TrimSpace(rawURL)
	if !hasHTTPScheme(trimmedURL) {
		return trimmedURL
	}

	parsedURL, parseError := url.Parse(trimmedURL)
	if parseError != nil {
		return trimmedURL
	}

	filteredParameters := url.Values{}

	for parameterName, parameterValues := range parsedURL.Query() {
		// Parameter names are matched case-insensitively against the list.
		if !trackingQueryParameters[strings.ToLower(parameterName)] {
			filteredParameters[parameterName] = parameterValues
		}
	}

	parsedURL.RawQuery = canonicalizeQueryString(filteredParameters)

	if dropFragment {
		parsedURL.Fragment = ""
	}

	return parsedURL.String()
}

// normalizeGloballyUniqueIdentifier trims rawIdentifier and, when it is an
// http(s) URL, strips tracking parameters and the fragment so that variants
// of the same link map to one identifier.
func normalizeGloballyUniqueIdentifier(rawIdentifier string) string {
	return sanitizeHTTPURL(rawIdentifier, true)
}

// canonicalizeQueryString encodes parameters into a query string, returning
// the empty string when there are none. url.Values.Encode sorts by key, so
// the output does not depend on map iteration order.
func canonicalizeQueryString(parameters url.Values) string {
	if len(parameters) == 0 {
		return ""
	}

	return parameters.Encode()
}

// resolveGloballyUniqueIdentifier picks a stable identifier for feedItem: the
// normalized GUID, else the normalized link, else a SHA-256 hash of the
// trimmed title and description prefixed with "sha256:".
func resolveGloballyUniqueIdentifier(feedItem *gofeed.Item) string {
	// Normalize before testing for emptiness so that a whitespace-only GUID
	// or link falls through to the next candidate instead of producing an
	// empty identifier.
	if normalizedGUID := normalizeGloballyUniqueIdentifier(feedItem.GUID); normalizedGUID != "" {
		return normalizedGUID
	}

	if normalizedLink := normalizeGloballyUniqueIdentifier(feedItem.Link); normalizedLink != "" {
		return normalizedLink
	}

	hashInput := strings.TrimSpace(feedItem.Title) + strings.TrimSpace(feedItem.Description)
	hashBytes := sha256.Sum256([]byte(hashInput))

	return fmt.Sprintf("sha256:%x", hashBytes)
}

// stripTrackingParameters trims rawURL and, when it is an http(s) URL,
// removes tracking query parameters while keeping the fragment.
func stripTrackingParameters(rawURL string) string {
	return sanitizeHTTPURL(rawURL, false)
}

// resolveEntryURL returns the item's link with tracking parameters removed,
// falling back to the GUID when it is itself an http(s) URL. It returns the
// empty string when neither yields a URL.
func resolveEntryURL(feedItem *gofeed.Item) string {
	if entryLink := stripTrackingParameters(feedItem.Link); entryLink != "" {
		return entryLink
	}

	// Require a real http(s) scheme: a bare "http" prefix would also accept
	// opaque identifiers that merely begin with those letters.
	if trimmedGUID := strings.TrimSpace(feedItem.GUID); hasHTTPScheme(trimmedGUID) {
		return stripTrackingParameters(trimmedGUID)
	}

	return ""
}

// resolveContentHTML returns the item's content, or its description when the
// content is empty.
func resolveContentHTML(feedItem *gofeed.Item) string {
	if feedItem.Content != "" {
		return feedItem.Content
	}

	return feedItem.Description
}

// resolveContentText returns the item's content (or description when the
// content is empty) with HTML tags stripped.
func resolveContentText(feedItem *gofeed.Item) string {
	contentSource := feedItem.Content
	if contentSource == "" {
		contentSource = feedItem.Description
	}

	return stripHTMLTags(contentSource)
}

// resolveAuthorName returns the name of the item's primary author, else the
// name of the first entry in its author list, else the empty string.
func resolveAuthorName(feedItem *gofeed.Item) string {
	if feedItem.Author != nil && feedItem.Author.Name != "" {
		return feedItem.Author.Name
	}

	if len(feedItem.Authors) > 0 && feedItem.Authors[0].Name != "" {
		return feedItem.Authors[0].Name
	}

	return ""
}

// resolvePublishedDate returns the item's parsed publication time, falling
// back to its parsed update time. It returns nil when neither is set.
func resolvePublishedDate(feedItem *gofeed.Item) *time.Time {
	if feedItem.PublishedParsed != nil {
		return feedItem.PublishedParsed
	}

	if feedItem.UpdatedParsed != nil {
		return feedItem.UpdatedParsed
	}

	return nil
}

// resolveAudioEnclosure returns the URL, MIME type and byte length of the
// first enclosure whose type starts with "audio/" and whose URL is non-empty.
// The length is nil when it is absent or not a base-10 integer. All three
// results are nil when the item has no such enclosure.
func resolveAudioEnclosure(feedItem *gofeed.Item) (*string, *string, *int64) {
	// Ranging over a nil slice is a no-op, so no explicit nil check is needed.
	for _, enclosure := range feedItem.Enclosures {
		if !strings.HasPrefix(enclosure.Type, "audio/") || enclosure.URL == "" {
			continue
		}

		enclosureURL := enclosure.URL
		enclosureType := enclosure.Type

		var enclosureLength *int64

		if enclosure.Length != "" {
			// An unparsable length is treated as unknown rather than an error.
			if parsedLength, parseError := strconv.ParseInt(enclosure.Length, 10, 64); parseError == nil {
				enclosureLength = &parsedLength
			}
		}

		return &enclosureURL, &enclosureType, enclosureLength
	}

	return nil, nil, nil
}

// resolveImageURL returns the item's image URL, or the empty string when the
// item has no image.
func resolveImageURL(feedItem *gofeed.Item) string {
	if feedItem.Image != nil && feedItem.Image.URL != "" {
		return feedItem.Image.URL
	}

	return ""
}

// countWords returns the number of whitespace-separated fields in plainText,
// or nil when plainText is empty.
func countWords(plainText string) *int {
	if plainText == "" {
		return nil
	}

	count := len(strings.Fields(plainText))

	return &count
}

// stripHTMLTags removes everything between '<' and the next '>' (inclusive)
// and trims surrounding whitespace from what remains.
//
// This is a lightweight scanner, not an HTML parser: character entities are
// left undecoded, a literal '<' in text is treated as the start of a tag, and
// text from adjacent elements is joined without inserting whitespace.
func stripHTMLTags(htmlContent string) string {
	var resultBuilder strings.Builder

	resultBuilder.Grow(len(htmlContent))

	insideTag := false

	for characterIndex := 0; characterIndex < len(htmlContent); {
		currentRune, runeSize := utf8.DecodeRuneInString(htmlContent[characterIndex:])

		switch {
		case currentRune == '<':
			insideTag = true
		case currentRune == '>' && insideTag:
			insideTag = false
		case !insideTag:
			// Also reached for a '>' that does not close a tag, which is
			// ordinary text and must be kept.
			resultBuilder.WriteRune(currentRune)
		}

		characterIndex += runeSize
	}

	return strings.TrimSpace(resultBuilder.String())
}