summaryrefslogtreecommitdiff
path: root/services/worker/internal/parser/parser_test.go
diff options
context:
space:
mode:
authorFuwn <[email protected]>2026-02-10 01:59:01 -0800
committerFuwn <[email protected]>2026-02-10 01:59:01 -0800
commit871985bc9eb42c6a088563e7c34db181f603f407 (patch)
tree31299597a9f246d332b3bf6d5e2bed177648b577 /services/worker/internal/parser/parser_test.go
parentfeat: reorder feature grid by attention-grabbing impact (diff)
downloadasa.news-871985bc9eb42c6a088563e7c34db181f603f407.tar.xz
asa.news-871985bc9eb42c6a088563e7c34db181f603f407.zip
fix: harden CI and close remaining test/security gaps
- Make webhook URL tests deterministic with injectable DNS resolver
- Wire tier parity checker into CI and root scripts
- Add rate_limits cleanup cron job (hourly, >1hr retention)
- Change rate limiter to fail closed on RPC error
- Add Go worker tests: parser, SSRF protection, error classification, authentication, and worker pool (48 test functions)
Diffstat (limited to 'services/worker/internal/parser/parser_test.go')
-rw-r--r-- services/worker/internal/parser/parser_test.go | 287
1 files changed, 287 insertions, 0 deletions
diff --git a/services/worker/internal/parser/parser_test.go b/services/worker/internal/parser/parser_test.go
new file mode 100644
index 0000000..7a28132
--- /dev/null
+++ b/services/worker/internal/parser/parser_test.go
@@ -0,0 +1,287 @@
+package parser
+
+import (
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+ "testing"
+ "time"
+)
+
+// validRSS is an RSS 2.0 fixture with two items. The first item carries a
+// guid, title, link, description, content:encoded, author and pubDate; the
+// second omits content:encoded and author and has no enclosure.
+const validRSS = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Test Feed</title>
+ <link>https://example.com</link>
+ <item>
+ <guid>entry-1</guid>
+ <title>First Entry</title>
+ <link>https://example.com/1</link>
+ <description>Summary of the first entry.</description>
+ <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/">&lt;p&gt;Full content here.&lt;/p&gt;</content:encoded>
+ <author>Alice</author>
+ <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
+ </item>
+ <item>
+ <guid>entry-2</guid>
+ <title>Second Entry</title>
+ <link>https://example.com/2</link>
+ <description>Summary of the second entry.</description>
+ <pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate>
+ </item>
+ </channel>
+</rss>`
+// validAtom is an Atom fixture with a single entry. The entry has an
+// <updated> timestamp and a named author but no <published> element.
+const validAtom = `<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <title>Atom Feed</title>
+ <link href="https://atom.example.com"/>
+ <entry>
+ <id>atom-1</id>
+ <title>Atom Entry</title>
+ <link href="https://atom.example.com/1"/>
+ <summary>An atom summary.</summary>
+ <updated>2024-01-01T12:00:00Z</updated>
+ <author><name>Bob</name></author>
+ </entry>
+</feed>`
+// podcastRSS is an RSS 2.0 fixture with two items, each carrying an
+// audio/mpeg <enclosure> with a url and a length attribute.
+const podcastRSS = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>My Podcast</title>
+ <link>https://podcast.example.com</link>
+ <item>
+ <guid>ep-1</guid>
+ <title>Episode 1</title>
+ <enclosure url="https://cdn.example.com/ep1.mp3" type="audio/mpeg" length="12345678"/>
+ <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
+ </item>
+ <item>
+ <guid>ep-2</guid>
+ <title>Episode 2</title>
+ <enclosure url="https://cdn.example.com/ep2.mp3" type="audio/mpeg" length="87654321"/>
+ <pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate>
+ </item>
+ </channel>
+</rss>`
+// mixedEnclosureRSS is an RSS 2.0 fixture with two items: the first has an
+// audio/mpeg <enclosure>, the second is text-only with no enclosure.
+const mixedEnclosureRSS = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Mixed Feed</title>
+ <link>https://mixed.example.com</link>
+ <item>
+ <guid>item-audio</guid>
+ <title>Audio Item</title>
+ <enclosure url="https://cdn.example.com/audio.mp3" type="audio/mpeg" length="1000"/>
+ </item>
+ <item>
+ <guid>item-text</guid>
+ <title>Text Item</title>
+ <description>Just a text item.</description>
+ </item>
+ </channel>
+</rss>`
+// noGUIDRSS is an RSS 2.0 fixture whose two items both lack a <guid>: the
+// first still has a <link>, the second has only a title and description.
+const noGUIDRSS = `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>No GUID Feed</title>
+ <link>https://example.com</link>
+ <item>
+ <title>No GUID but has link</title>
+ <link>https://example.com/no-guid</link>
+ </item>
+ <item>
+ <title>No GUID no link</title>
+ <description>Only title and description.</description>
+ </item>
+ </channel>
+</rss>`
+
+// TestParseValidRSS parses the two-item validRSS fixture with a nil owner and
+// checks the feed-level metadata plus the fields of the first entry.
+func TestParseValidRSS(test *testing.T) {
+	feedParser := NewParser()
+	result, parseError := feedParser.Parse("feed-123", nil, []byte(validRSS))
+
+	require.NoError(test, parseError)
+	assert.Equal(test, "Test Feed", result.FeedTitle)
+	assert.Equal(test, "https://example.com", result.SiteURL)
+	assert.Equal(test, 0.0, result.AudioEnclosureRatio)
+	// require rather than assert: indexing Entries below would panic, not
+	// fail cleanly, if the parser returned fewer entries than expected.
+	require.Len(test, result.Entries, 2)
+
+	first := result.Entries[0]
+
+	assert.Equal(test, "entry-1", first.GUID)
+	assert.Equal(test, "feed-123", first.FeedIdentifier)
+	require.NotNil(test, first.Title)
+	assert.Equal(test, "First Entry", *first.Title)
+	require.NotNil(test, first.URL)
+	assert.Equal(test, "https://example.com/1", *first.URL)
+	require.NotNil(test, first.Summary)
+	assert.Equal(test, "Summary of the first entry.", *first.Summary)
+	require.NotNil(test, first.ContentHTML)
+	assert.Contains(test, *first.ContentHTML, "Full content here.")
+	require.NotNil(test, first.PublishedAt)
+	// Parse was called with a nil owner, so no owner should be attached.
+	assert.Nil(test, first.OwnerIdentifier)
+}
+
+// TestParseValidAtom parses the single-entry validAtom fixture and checks the
+// feed title along with the entry's GUID, title and author.
+func TestParseValidAtom(test *testing.T) {
+	feedParser := NewParser()
+	result, parseError := feedParser.Parse("atom-feed", nil, []byte(validAtom))
+
+	require.NoError(test, parseError)
+	assert.Equal(test, "Atom Feed", result.FeedTitle)
+	// require rather than assert: Entries[0] below would panic on an empty slice.
+	require.Len(test, result.Entries, 1)
+
+	entry := result.Entries[0]
+
+	assert.Equal(test, "atom-1", entry.GUID)
+	require.NotNil(test, entry.Title)
+	assert.Equal(test, "Atom Entry", *entry.Title)
+	require.NotNil(test, entry.Author)
+	assert.Equal(test, "Bob", *entry.Author)
+}
+
+// TestParsePodcastFeed parses podcastRSS, in which every item has an audio
+// enclosure, and checks the audio ratio and the first episode's enclosure
+// URL, type and length.
+func TestParsePodcastFeed(test *testing.T) {
+	feedParser := NewParser()
+	result, parseError := feedParser.Parse("podcast-1", nil, []byte(podcastRSS))
+
+	require.NoError(test, parseError)
+	assert.Equal(test, "My Podcast", result.FeedTitle)
+	assert.InDelta(test, 1.0, result.AudioEnclosureRatio, 0.01)
+	// require rather than assert: Entries[0] below would panic on an empty slice.
+	require.Len(test, result.Entries, 2)
+
+	firstEpisode := result.Entries[0]
+
+	require.NotNil(test, firstEpisode.EnclosureURL)
+	assert.Equal(test, "https://cdn.example.com/ep1.mp3", *firstEpisode.EnclosureURL)
+	require.NotNil(test, firstEpisode.EnclosureType)
+	assert.Equal(test, "audio/mpeg", *firstEpisode.EnclosureType)
+	require.NotNil(test, firstEpisode.EnclosureLength)
+	assert.Equal(test, int64(12345678), *firstEpisode.EnclosureLength)
+}
+
+// TestParseMixedEnclosureFeed parses mixedEnclosureRSS (one audio item, one
+// text-only item) and checks that the audio ratio is one half and that only
+// the audio item carries enclosure fields.
+func TestParseMixedEnclosureFeed(test *testing.T) {
+	feedParser := NewParser()
+	result, parseError := feedParser.Parse("mixed-1", nil, []byte(mixedEnclosureRSS))
+
+	require.NoError(test, parseError)
+	assert.InDelta(test, 0.5, result.AudioEnclosureRatio, 0.01)
+	// require rather than assert: both Entries[0] and Entries[1] are indexed
+	// below and would panic if the slice were shorter than two.
+	require.Len(test, result.Entries, 2)
+
+	audioItem := result.Entries[0]
+
+	require.NotNil(test, audioItem.EnclosureURL)
+
+	textItem := result.Entries[1]
+
+	assert.Nil(test, textItem.EnclosureURL)
+	assert.Nil(test, textItem.EnclosureType)
+	assert.Nil(test, textItem.EnclosureLength)
+}
+
+// TestParseGUIDFallback parses noGUIDRSS, whose items have no <guid>. It
+// expects the item with a link to use that link as its GUID and the item
+// without a link to receive a GUID containing "sha256:".
+func TestParseGUIDFallback(test *testing.T) {
+	feedParser := NewParser()
+	result, parseError := feedParser.Parse("no-guid-feed", nil, []byte(noGUIDRSS))
+
+	require.NoError(test, parseError)
+	// require rather than assert: both entries are indexed below and a short
+	// slice would panic instead of failing the test.
+	require.Len(test, result.Entries, 2)
+
+	entryWithLink := result.Entries[0]
+
+	assert.Equal(test, "https://example.com/no-guid", entryWithLink.GUID)
+
+	entryWithHash := result.Entries[1]
+
+	// NotEmpty reports the offending value on failure, unlike True(len(x) > 0).
+	assert.NotEmpty(test, entryWithHash.GUID)
+	assert.Contains(test, entryWithHash.GUID, "sha256:")
+}
+
+// TestParseOwnerIdentifier parses validRSS with a non-nil owner and checks
+// that every returned entry carries that owner identifier.
+func TestParseOwnerIdentifier(test *testing.T) {
+	feedParser := NewParser()
+	ownerIdentifier := "user-abc"
+	result, parseError := feedParser.Parse("owned-feed", &ownerIdentifier, []byte(validRSS))
+
+	require.NoError(test, parseError)
+	// Without this guard the loop below would pass vacuously on zero entries.
+	require.NotEmpty(test, result.Entries)
+
+	for _, entry := range result.Entries {
+		require.NotNil(test, entry.OwnerIdentifier)
+		assert.Equal(test, "user-abc", *entry.OwnerIdentifier)
+	}
+}
+
+// TestParseInvalidXML feeds the parser a payload that is not XML and expects
+// Parse to return an error.
+func TestParseInvalidXML(test *testing.T) {
+	malformedPayload := []byte("this is not xml at all")
+
+	_, parseError := NewParser().Parse("bad-feed", nil, malformedPayload)
+	assert.Error(test, parseError)
+}
+
+// TestParsePublishedDateFallsBackToUpdated parses validAtom, whose only entry
+// has an <updated> timestamp and no <published> element, and expects
+// PublishedAt to equal the <updated> value.
+func TestParsePublishedDateFallsBackToUpdated(test *testing.T) {
+	feedParser := NewParser()
+	result, parseError := feedParser.Parse("atom-feed", nil, []byte(validAtom))
+
+	require.NoError(test, parseError)
+	// Guard the index below so an empty result fails the test instead of panicking.
+	require.NotEmpty(test, result.Entries)
+
+	entry := result.Entries[0]
+
+	require.NotNil(test, entry.PublishedAt)
+
+	// Matches the fixture's <updated>2024-01-01T12:00:00Z</updated>.
+	expectedTime := time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC)
+
+	// Compare with Equal (instant equality) and report both values on failure.
+	assert.True(test, entry.PublishedAt.Equal(expectedTime), "expected PublishedAt %v, got %v", expectedTime, entry.PublishedAt)
+}
+
+// TestParseWordCount parses validRSS and checks that the first entry has a
+// non-nil, positive word count.
+func TestParseWordCount(test *testing.T) {
+	feedParser := NewParser()
+	result, parseError := feedParser.Parse("feed-123", nil, []byte(validRSS))
+
+	require.NoError(test, parseError)
+	// Guard the index below so an empty result fails the test instead of panicking.
+	require.NotEmpty(test, result.Entries)
+
+	first := result.Entries[0]
+
+	require.NotNil(test, first.WordCount)
+	assert.Greater(test, *first.WordCount, 0)
+}
+
+func TestStripHTMLTags(test *testing.T) {
+ testCases := []struct {
+ name string
+ input string
+ expected string
+ }{
+ {"plain text passthrough", "hello world", "hello world"},
+ {"strips simple tags", "<p>hello</p>", "hello"},
+ {"strips nested tags", "<div><p>hello <strong>world</strong></p></div>", "hello world"},
+ {"handles unicode", "<p>café résumé naïve</p>", "café résumé naïve"},
+ {"empty string", "", ""},
+ {"only tags", "<br/><hr/>", ""},
+ }
+
+ for _, testCase := range testCases {
+ test.Run(testCase.name, func(test *testing.T) {
+ assert.Equal(test, testCase.expected, stripHTMLTags(testCase.input))
+ })
+ }
+}
+
+// TestCountWords checks that countWords returns nil for the empty string and
+// a pointer to 2 for a two-word input, with or without surrounding spaces.
+func TestCountWords(test *testing.T) {
+	// Empty input yields no count at all.
+	assert.Nil(test, countWords(""))
+
+	// Plain two-word input.
+	plainCount := countWords("hello world")
+	require.NotNil(test, plainCount)
+	assert.Equal(test, 2, *plainCount)
+
+	// Leading and trailing spaces must not change the count.
+	paddedCount := countWords(" hello world ")
+	require.NotNil(test, paddedCount)
+	assert.Equal(test, 2, *paddedCount)
+}
+
+// TestStringPointerOrNil checks that stringPointerOrNil returns nil for the
+// empty string and a pointer to the same text for a non-empty string.
+func TestStringPointerOrNil(test *testing.T) {
+	emptyPointer := stringPointerOrNil("")
+	assert.Nil(test, emptyPointer)
+
+	helloPointer := stringPointerOrNil("hello")
+	require.NotNil(test, helloPointer)
+	assert.Equal(test, "hello", *helloPointer)
+}