diff options
| author | Fuwn <[email protected]> | 2026-02-10 01:59:01 -0800 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2026-02-10 01:59:01 -0800 |
| commit | 871985bc9eb42c6a088563e7c34db181f603f407 (patch) | |
| tree | 31299597a9f246d332b3bf6d5e2bed177648b577 /services/worker/internal/parser/parser_test.go | |
| parent | feat: reorder feature grid by attention-grabbing impact (diff) | |
| download | asa.news-871985bc9eb42c6a088563e7c34db181f603f407.tar.xz asa.news-871985bc9eb42c6a088563e7c34db181f603f407.zip | |
fix: harden CI and close remaining test/security gaps
- Make webhook URL tests deterministic with injectable DNS resolver
- Wire tier parity checker into CI and root scripts
- Add rate_limits cleanup cron job (hourly, >1hr retention)
- Change rate limiter to fail closed on RPC error
- Add Go worker tests: parser, SSRF protection, error classification,
authentication, and worker pool (48 test functions)
Diffstat (limited to 'services/worker/internal/parser/parser_test.go')
| -rw-r--r-- | services/worker/internal/parser/parser_test.go | 287 |
1 file changed, 287 insertions, 0 deletions
diff --git a/services/worker/internal/parser/parser_test.go b/services/worker/internal/parser/parser_test.go new file mode 100644 index 0000000..7a28132 --- /dev/null +++ b/services/worker/internal/parser/parser_test.go @@ -0,0 +1,287 @@ +package parser + +import ( + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "testing" + "time" +) + +const validRSS = `<?xml version="1.0" encoding="UTF-8"?> +<rss version="2.0"> + <channel> + <title>Test Feed</title> + <link>https://example.com</link> + <item> + <guid>entry-1</guid> + <title>First Entry</title> + <link>https://example.com/1</link> + <description>Summary of the first entry.</description> + <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><p>Full content here.</p></content:encoded> + <author>Alice</author> + <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate> + </item> + <item> + <guid>entry-2</guid> + <title>Second Entry</title> + <link>https://example.com/2</link> + <description>Summary of the second entry.</description> + <pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate> + </item> + </channel> +</rss>` +const validAtom = `<?xml version="1.0" encoding="UTF-8"?> +<feed xmlns="http://www.w3.org/2005/Atom"> + <title>Atom Feed</title> + <link href="https://atom.example.com"/> + <entry> + <id>atom-1</id> + <title>Atom Entry</title> + <link href="https://atom.example.com/1"/> + <summary>An atom summary.</summary> + <updated>2024-01-01T12:00:00Z</updated> + <author><name>Bob</name></author> + </entry> +</feed>` +const podcastRSS = `<?xml version="1.0" encoding="UTF-8"?> +<rss version="2.0"> + <channel> + <title>My Podcast</title> + <link>https://podcast.example.com</link> + <item> + <guid>ep-1</guid> + <title>Episode 1</title> + <enclosure url="https://cdn.example.com/ep1.mp3" type="audio/mpeg" length="12345678"/> + <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate> + </item> + <item> + <guid>ep-2</guid> + <title>Episode 2</title> + <enclosure 
url="https://cdn.example.com/ep2.mp3" type="audio/mpeg" length="87654321"/> + <pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate> + </item> + </channel> +</rss>` +const mixedEnclosureRSS = `<?xml version="1.0" encoding="UTF-8"?> +<rss version="2.0"> + <channel> + <title>Mixed Feed</title> + <link>https://mixed.example.com</link> + <item> + <guid>item-audio</guid> + <title>Audio Item</title> + <enclosure url="https://cdn.example.com/audio.mp3" type="audio/mpeg" length="1000"/> + </item> + <item> + <guid>item-text</guid> + <title>Text Item</title> + <description>Just a text item.</description> + </item> + </channel> +</rss>` +const noGUIDRSS = `<?xml version="1.0" encoding="UTF-8"?> +<rss version="2.0"> + <channel> + <title>No GUID Feed</title> + <link>https://example.com</link> + <item> + <title>No GUID but has link</title> + <link>https://example.com/no-guid</link> + </item> + <item> + <title>No GUID no link</title> + <description>Only title and description.</description> + </item> + </channel> +</rss>` + +func TestParseValidRSS(test *testing.T) { + feedParser := NewParser() + result, parseError := feedParser.Parse("feed-123", nil, []byte(validRSS)) + + require.NoError(test, parseError) + assert.Equal(test, "Test Feed", result.FeedTitle) + assert.Equal(test, "https://example.com", result.SiteURL) + assert.Len(test, result.Entries, 2) + assert.Equal(test, 0.0, result.AudioEnclosureRatio) + + first := result.Entries[0] + + assert.Equal(test, "entry-1", first.GUID) + assert.Equal(test, "feed-123", first.FeedIdentifier) + require.NotNil(test, first.Title) + assert.Equal(test, "First Entry", *first.Title) + require.NotNil(test, first.URL) + assert.Equal(test, "https://example.com/1", *first.URL) + require.NotNil(test, first.Summary) + assert.Equal(test, "Summary of the first entry.", *first.Summary) + require.NotNil(test, first.ContentHTML) + assert.Contains(test, *first.ContentHTML, "Full content here.") + require.NotNil(test, first.PublishedAt) + assert.Nil(test, 
first.OwnerIdentifier) +} + +func TestParseValidAtom(test *testing.T) { + feedParser := NewParser() + result, parseError := feedParser.Parse("atom-feed", nil, []byte(validAtom)) + + require.NoError(test, parseError) + assert.Equal(test, "Atom Feed", result.FeedTitle) + assert.Len(test, result.Entries, 1) + + entry := result.Entries[0] + + assert.Equal(test, "atom-1", entry.GUID) + require.NotNil(test, entry.Title) + assert.Equal(test, "Atom Entry", *entry.Title) + require.NotNil(test, entry.Author) + assert.Equal(test, "Bob", *entry.Author) +} + +func TestParsePodcastFeed(test *testing.T) { + feedParser := NewParser() + result, parseError := feedParser.Parse("podcast-1", nil, []byte(podcastRSS)) + + require.NoError(test, parseError) + assert.Equal(test, "My Podcast", result.FeedTitle) + assert.Len(test, result.Entries, 2) + assert.InDelta(test, 1.0, result.AudioEnclosureRatio, 0.01) + + firstEpisode := result.Entries[0] + + require.NotNil(test, firstEpisode.EnclosureURL) + assert.Equal(test, "https://cdn.example.com/ep1.mp3", *firstEpisode.EnclosureURL) + require.NotNil(test, firstEpisode.EnclosureType) + assert.Equal(test, "audio/mpeg", *firstEpisode.EnclosureType) + require.NotNil(test, firstEpisode.EnclosureLength) + assert.Equal(test, int64(12345678), *firstEpisode.EnclosureLength) +} + +func TestParseMixedEnclosureFeed(test *testing.T) { + feedParser := NewParser() + result, parseError := feedParser.Parse("mixed-1", nil, []byte(mixedEnclosureRSS)) + + require.NoError(test, parseError) + assert.Len(test, result.Entries, 2) + assert.InDelta(test, 0.5, result.AudioEnclosureRatio, 0.01) + + audioItem := result.Entries[0] + + require.NotNil(test, audioItem.EnclosureURL) + + textItem := result.Entries[1] + + assert.Nil(test, textItem.EnclosureURL) + assert.Nil(test, textItem.EnclosureType) + assert.Nil(test, textItem.EnclosureLength) +} + +func TestParseGUIDFallback(test *testing.T) { + feedParser := NewParser() + result, parseError := 
feedParser.Parse("no-guid-feed", nil, []byte(noGUIDRSS)) + + require.NoError(test, parseError) + assert.Len(test, result.Entries, 2) + + entryWithLink := result.Entries[0] + + assert.Equal(test, "https://example.com/no-guid", entryWithLink.GUID) + + entryWithHash := result.Entries[1] + + assert.True(test, len(entryWithHash.GUID) > 0) + assert.Contains(test, entryWithHash.GUID, "sha256:") +} + +func TestParseOwnerIdentifier(test *testing.T) { + feedParser := NewParser() + ownerIdentifier := "user-abc" + result, parseError := feedParser.Parse("owned-feed", &ownerIdentifier, []byte(validRSS)) + + require.NoError(test, parseError) + + for _, entry := range result.Entries { + require.NotNil(test, entry.OwnerIdentifier) + assert.Equal(test, "user-abc", *entry.OwnerIdentifier) + } +} + +func TestParseInvalidXML(test *testing.T) { + feedParser := NewParser() + _, parseError := feedParser.Parse("bad-feed", nil, []byte("this is not xml at all")) + + assert.Error(test, parseError) +} + +func TestParsePublishedDateFallsBackToUpdated(test *testing.T) { + feedParser := NewParser() + result, parseError := feedParser.Parse("atom-feed", nil, []byte(validAtom)) + + require.NoError(test, parseError) + + entry := result.Entries[0] + + require.NotNil(test, entry.PublishedAt) + + expectedTime := time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC) + + assert.True(test, entry.PublishedAt.Equal(expectedTime)) +} + +func TestParseWordCount(test *testing.T) { + feedParser := NewParser() + result, parseError := feedParser.Parse("feed-123", nil, []byte(validRSS)) + + require.NoError(test, parseError) + + first := result.Entries[0] + + require.NotNil(test, first.WordCount) + assert.Greater(test, *first.WordCount, 0) +} + +func TestStripHTMLTags(test *testing.T) { + testCases := []struct { + name string + input string + expected string + }{ + {"plain text passthrough", "hello world", "hello world"}, + {"strips simple tags", "<p>hello</p>", "hello"}, + {"strips nested tags", "<div><p>hello 
<strong>world</strong></p></div>", "hello world"}, + {"handles unicode", "<p>café résumé naïve</p>", "café résumé naïve"}, + {"empty string", "", ""}, + {"only tags", "<br/><hr/>", ""}, + } + + for _, testCase := range testCases { + test.Run(testCase.name, func(test *testing.T) { + assert.Equal(test, testCase.expected, stripHTMLTags(testCase.input)) + }) + } +} + +func TestCountWords(test *testing.T) { + emptyResult := countWords("") + + assert.Nil(test, emptyResult) + + twoWords := countWords("hello world") + + require.NotNil(test, twoWords) + assert.Equal(test, 2, *twoWords) + + withExtraSpaces := countWords(" hello world ") + + require.NotNil(test, withExtraSpaces) + assert.Equal(test, 2, *withExtraSpaces) +} + +func TestStringPointerOrNil(test *testing.T) { + assert.Nil(test, stringPointerOrNil("")) + + result := stringPointerOrNil("hello") + + require.NotNil(test, result) + assert.Equal(test, "hello", *result) +} |