From c374d7695e87a7e1c76b7949b20bd846e0bb3793 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 3 Dec 2024 02:13:45 -0800
Subject: [PATCH] allow getting crawl from API as rss feed

---
 archivebox/api/v1_crawls.py                   | 17 +++++++++++++++--
 archivebox/core/statemachines.py              |  4 ++++
 .../abx_plugin_chrome/__init__.py             |  4 ++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py
index 2c8ac63d..a11dd3a4 100644
--- a/archivebox/api/v1_crawls.py
+++ b/archivebox/api/v1_crawls.py
@@ -97,8 +97,8 @@ class CrawlSchema(Schema):
 def get_crawls(request):
     return Crawl.objects.all().distinct()
 
-@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
-def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
+@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
+def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
     """Get a specific Crawl by id or abid."""
 
     crawl = None
@@ -114,5 +114,18 @@ def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archivere
         crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
     except Exception:
         pass
+
+    if crawl and as_rss:
+        # return snapshots as XML rss feed
+        urls = [
+            {'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str}
+            for snapshot in crawl.snapshot_set.all()
+        ]
+        xml = '<rss version="2.0"><channel>'
+        for url in urls:
+            xml += f'<item><link>{url["url"]}</link><title>{url["title"]}</title><pubDate>{url["bookmarked_at"]}</pubDate><category>{url["tags"]}</category></item>'
+        xml += '</channel></rss>'
+        return xml
+
     return crawl
 
diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py
index b850fdf8..e5221d2b 100644
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -204,6 +204,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
             start_ts=timezone.now(),
         )  # lock the obj for the next ~30s to limit racing with other workers
+        # run_subcommand([
+        #     'archivebox', 'extract', self.archiveresult.ABID,
+        # ])
+
         # create the output directory and fork the new extractor job subprocess
         self.archiveresult.create_output_dir()
         # self.archiveresult.extract(background=True)
 
diff --git a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py
index 09896924..40a3a829 100644
--- a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py
+++ b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py
@@ -28,7 +28,11 @@ def ready():
 
 @abx.hookimpl
 def get_EXTRACTORS():
+    """extractors that can be run for each URL, producing one or more ArchiveResults each"""
     from .extractors import PDF_EXTRACTOR, SCREENSHOT_EXTRACTOR, DOM_EXTRACTOR
+    # dom        -> ./output.html    -> ./chrome_dom/index.html
+    # screenshot -> ./screenshot.png -> ./chrome_screenshot/screenshot.png
+    # pdf        -> ./output.pdf     -> ./chrome_pdf/pdf.pdf
     return {
         'pdf': PDF_EXTRACTOR,
         'screenshot': SCREENSHOT_EXTRACTOR,
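
Usage note: a minimal sketch of how a client could exercise the new as_rss
flag from Python. The base URL, router mount point, and crawl id below are
placeholders and assumptions, not something this patch defines:

    import requests

    # assumed: local dev server with the crawls router mounted under /api/v1/crawls
    BASE_URL = 'http://127.0.0.1:8000/api/v1/crawls'
    CRAWL_ID = 'replace-with-a-real-crawl-id-or-abid'   # placeholder

    # default behavior: JSON body matching CrawlSchema
    crawl_json = requests.get(f'{BASE_URL}/crawl/{CRAWL_ID}').json()

    # new behavior: the same crawl's snapshots rendered as an RSS feed string
    as_rss = requests.get(f'{BASE_URL}/crawl/{CRAWL_ID}', params={'as_rss': 'true'})
    print(as_rss.text)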
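
A consumer can then parse the returned feed with the standard library; a
minimal sketch, assuming the body comes back as raw XML and uses the
item/link/title element names from the patch above. Note that the patch does
not XML-escape the interpolated values, so parsing may fail on snapshots
whose titles or tags contain raw &, <, or > characters:

    import xml.etree.ElementTree as ET

    root = ET.fromstring(as_rss.text)   # as_rss from the snippet above
    for item in root.iter('item'):
        # each <item> carries one snapshot's link, title, pubDate, and category
        print(item.findtext('title'), '->', item.findtext('link'))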