diff --git a/tests/mock_server/templates/example.atom b/tests/mock_server/templates/example.atom
new file mode 100644
index 00000000..9d71abb1
--- /dev/null
+++ b/tests/mock_server/templates/example.atom
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed
+  xml:lang="en"
+  xmlns="http://www.w3.org/2005/Atom"
+>
+  <id>http://www.example.com/</id>
+  <title>Example of an Atom feed</title>
+  <link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
+  <link rel="alternate" type="text/html" href="http://www.example.com/" />
+  <author>
+    <name>Jim Winstead</name>
+  </author>
+  <updated>2024-02-26T03:18:26Z</updated>
+  <entry>
+    <title>Example</title>
+    <link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
+    <id>tag:example.com,2024-02-25:3319</id>
+    <updated>2024-02-26T03:18:26Z</updated>
+    <published>2024-02-25T19:18:25-08:00</published>
+    <category term="Tag1" scheme="http://example.com/archive" />
+    <category term="Tag2" scheme="http://example.com/archive" />
+    <content type="html">This is some <b>content</b></content>
+  </entry>
+</feed>
diff --git a/tests/mock_server/templates/example.rss b/tests/mock_server/templates/example.rss
new file mode 100644
index 00000000..d47a5a38
--- /dev/null
+++ b/tests/mock_server/templates/example.rss
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="utf-8"?>
+<rss version="2.0"
+     xmlns:dc="http://purl.org/dc/elements/1.1/"
+     xmlns:admin="http://webns.net/mvcb/"
+     xmlns:content="http://purl.org/rss/1.0/modules/content/"
+     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+<channel>
+    <title>Sample Feed</title>
+    <link>http://example.org/</link>
+    <description>For documentation only</description>
+    <dc:language>en-us</dc:language>
+    <dc:creator>Nobody (nobody@example.org)</dc:creator>
+    <dc:rights>Public domain</dc:rights>
+    <dc:date>2024-02-26T17:28:12-08:00</dc:date>
+    <admin:generatorAgent rdf:resource="http://www.example.org/"/>
+    <admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>
+
+    <item>
+        <title>First!</title>
+        <link>http://127.0.0.1:8080/static/example.com.html</link>
+        <guid isPermaLink="false">just-an@example.org</guid>
+        <description>
+            This has a description.
+        </description>
+        <dc:subject>Tag1 Tag2</dc:subject>
+        <dc:date>2024-02-26T17:28:12-08:00</dc:date>
+        <content:encoded><![CDATA[
+            This has a <b>description</b>.]]>
+        </content:encoded>
+    </item>
+</channel>
+</rss>
diff --git a/tests/test_add.py b/tests/test_add.py
index 331178fe..9675f361 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -91,3 +91,75 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
 
     assert (archived_item_path / "warc").exists()
     assert not (archived_item_path / "singlefile.html").exists()
+
+def test_generic_rss(tmp_path, process, disable_extractors_dict):
+    """Feed example.rss to the rss parser: the item link is indexed, dc:subject stays one tag."""
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
+        subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    try:
+        c = conn.cursor()
+        urls = [row[0] for row in c.execute("SELECT url from core_snapshot")]
+        tags = [row[0] for row in c.execute("SELECT name from core_tag")]
+    finally:
+        # always release the index database, even if a query raises
+        conn.close()
+
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert "http://purl.org/dc/elements/1.1/" not in urls
+
+    assert "Tag1 Tag2" in tags
+
+def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
+    """Feed example.rss to the pinboard_rss parser: dc:subject is split into separate tags."""
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
+        subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    try:
+        c = conn.cursor()
+        tags = [row[0] for row in c.execute("SELECT name from core_tag")]
+    finally:
+        # always release the index database, even if a query raises
+        conn.close()
+
+    assert "Tag1" in tags
+    assert "Tag2" in tags
+
+def test_atom(tmp_path, process, disable_extractors_dict):
+    """Feed example.atom to the rss parser: the entry link is indexed, each category is a tag."""
+    with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
+        subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    try:
+        c = conn.cursor()
+        urls = [row[0] for row in c.execute("SELECT url from core_snapshot")]
+        tags = [row[0] for row in c.execute("SELECT name from core_tag")]
+    finally:
+        # always release the index database, even if a query raises
+        conn.close()
+
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert "http://www.w3.org/2005/Atom" not in urls
+
+    assert "Tag1" in tags
+    assert "Tag2" in tags