From c374d7695e87a7e1c76b7949b20bd846e0bb3793 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 3 Dec 2024 02:13:45 -0800
Subject: [PATCH] allow getting crawl from API as rss feed

---
 archivebox/api/v1_crawls.py                   | 17 +++++++++++++++--
 archivebox/core/statemachines.py              |  4 ++++
 .../abx_plugin_chrome/__init__.py             |  4 ++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py
index 2c8ac63d..a11dd3a4 100644
--- a/archivebox/api/v1_crawls.py
+++ b/archivebox/api/v1_crawls.py
@@ -97,8 +97,8 @@ class CrawlSchema(Schema):
 def get_crawls(request):
     return Crawl.objects.all().distinct()
 
-@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
-def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
+@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
+def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
     """Get a specific Crawl by id or abid."""
 
     crawl = None
@@ -114,5 +114,18 @@ def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archivere
         crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
     except Exception:
         pass
+
+    if crawl and as_rss:
+        # return snapshots as XML rss feed
+        urls = [
+            {'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str}
+            for snapshot in crawl.snapshot_set.all()
+        ]
+        xml = '<rss version="2.0"><channel>'
+        for url in urls:
+            xml += f'<item><link>{url["url"]}</link><title>{url["title"]}</title><pubDate>{url["bookmarked_at"]}</pubDate><category>{url["tags"]}</category></item>'
+        xml += '</channel></rss>'
+        return xml
+
     return crawl
 
diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py
index b850fdf8..e5221d2b 100644
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -204,6 +204,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
             start_ts=timezone.now(),
         )  # lock the obj for the next ~30s to limit racing with other workers
+        # run_subcommand([
+        #     'archivebox', 'extract', self.archiveresult.ABID,
+        # ])
+
         # create the output directory and fork the new extractor job subprocess
         self.archiveresult.create_output_dir()
         # self.archiveresult.extract(background=True)
 
diff --git a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py
index 09896924..40a3a829 100644
--- a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py
+++ b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py
@@ -28,7 +28,11 @@ def ready():
 
 @abx.hookimpl
 def get_EXTRACTORS():
+    """extractors that can be run for each URL, producing one or more ArchiveResults each"""
     from .extractors import PDF_EXTRACTOR, SCREENSHOT_EXTRACTOR, DOM_EXTRACTOR
+    # dom        -> ./output.html    -> ./chrome_dom/index.html
+    # screenshot -> ./screenshot.png -> ./chrome_screenshot/screenshot.png
+    # pdf        -> ./output.pdf     -> ./chrome_pdf/pdf.pdf
     return {
         'pdf': PDF_EXTRACTOR,
         'screenshot': SCREENSHOT_EXTRACTOR,
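
Usage note: a minimal sketch of how a client could exercise the new as_rss
flag from Python. The base URL, router mount point, and crawl id below are
placeholders and assumptions, not something this patch defines:

    import requests

    # assumed: local dev server with the crawls router mounted under /api/v1/crawls
    BASE_URL = 'http://127.0.0.1:8000/api/v1/crawls'
    CRAWL_ID = 'replace-with-a-real-crawl-id-or-abid'   # placeholder

    # default behavior: JSON body matching CrawlSchema
    crawl_json = requests.get(f'{BASE_URL}/crawl/{CRAWL_ID}').json()

    # new behavior: the same crawl's snapshots rendered as an RSS feed string
    as_rss = requests.get(f'{BASE_URL}/crawl/{CRAWL_ID}', params={'as_rss': 'true'})
    print(as_rss.text)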
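
A consumer can then parse the returned feed with the standard library; a
minimal sketch, assuming the body comes back as raw XML and uses the
item/link/title element names from the patch above. Note that the patch does
not XML-escape the interpolated values, so parsing may fail on snapshots
whose titles or tags contain raw &, <, or > characters:

    import xml.etree.ElementTree as ET

    root = ET.fromstring(as_rss.text)   # as_rss from the snippet above
    for item in root.iter('item'):
        # each <item> carries one snapshot's link, title, pubDate, and category
        print(item.findtext('title'), '->', item.findtext('link'))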