mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-18 00:54:26 -04:00)
allow getting crawl from API as rss feed
This commit is contained in:
  parent eae7ed8447
  commit c374d7695e

3 changed files with 23 additions and 2 deletions
@@ -97,8 +97,8 @@ class CrawlSchema(Schema):
 def get_crawls(request):
     return Crawl.objects.all().distinct()
 
-@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
-def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
+@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
+def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
     """Get a specific Crawl by id or abid."""
 
     crawl = None
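The response type widens from CrawlSchema to CrawlSchema | str because the endpoint can now return either the serialized Crawl or a raw RSS XML string; django-ninja accepts a PEP 604 union here. A hypothetical client-side sketch (the /api/v1 mount point and the crawl id are placeholders, not taken from this commit):

import requests

BASE = 'http://127.0.0.1:8000/api/v1'      # assumed mount point for the v1 API router
CRAWL_ID = '01ARZ3NDEKTSV4RRFFQ69G5FAV'    # placeholder ABID

# default behaviour: JSON body matching CrawlSchema
crawl = requests.get(f'{BASE}/crawl/{CRAWL_ID}', params={'with_snapshots': True}).json()

# new in this commit: same endpoint, RSS XML string instead
rss = requests.get(f'{BASE}/crawl/{CRAWL_ID}', params={'as_rss': True}).text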
@@ -114,5 +114,18 @@ def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
         crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
     except Exception:
         pass
 
+
+    if crawl and as_rss:
+        # return snapshots as XML rss feed
+        urls = [
+            {'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str}
+            for snapshot in crawl.snapshot_set.all()
+        ]
+        xml = '<rss version="2.0"><channel>'
+        for url in urls:
+            xml += f'<item><url>{url["url"]}</url><title>{url["title"]}</title><bookmarked_at>{url["bookmarked_at"]}</bookmarked_at><tags>{url["tags"]}</tags></item>'
+        xml += '</channel></rss>'
+        return xml
+
     return crawl
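Two caveats in the hand-rolled feed above: plain string concatenation never XML-escapes snapshot titles or URLs, and <url>/<bookmarked_at>/<tags> are not standard RSS 2.0 item elements (feed readers expect <link>, <pubDate>, <category>). A minimal sketch of the same feed built with the standard library instead, assuming only the snapshot fields already used in the diff (url, title, bookmarked_at, tags_str); the helper name is hypothetical:

import xml.etree.ElementTree as ET

def snapshots_to_rss(snapshots) -> str:
    # ElementTree escapes text content, so titles/URLs can't break the XML
    rss = ET.Element('rss', version='2.0')
    channel = ET.SubElement(rss, 'channel')
    for snapshot in snapshots:
        item = ET.SubElement(channel, 'item')
        ET.SubElement(item, 'title').text = snapshot.title or snapshot.url
        ET.SubElement(item, 'link').text = snapshot.url
        # RFC 822 date format is what RSS 2.0 readers expect in <pubDate>
        ET.SubElement(item, 'pubDate').text = snapshot.bookmarked_at.strftime('%a, %d %b %Y %H:%M:%S %z')
        for tag in (snapshot.tags_str or '').split(','):
            if tag.strip():
                ET.SubElement(item, 'category').text = tag.strip()
    return ET.tostring(rss, encoding='unicode')

Note also that django-ninja's default JSON renderer will serialize a returned str as a quoted JSON string; returning a django.http.HttpResponse with content_type='application/rss+xml' would deliver the feed as raw XML instead.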
@@ -204,6 +204,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
             start_ts=timezone.now(),
         ) # lock the obj for the next ~30s to limit racing with other workers
 
+        # run_subcommand([
+        #     'archivebox', 'extract', self.archiveresult.ABID,
+        # ])
+
         # create the output directory and fork the new extractor job subprocess
         self.archiveresult.create_output_dir()
         # self.archiveresult.extract(background=True)
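The commented-out run_subcommand block sketches the intended direction: fork an `archivebox extract <ABID>` CLI subprocess rather than running the extractor inline in the worker. run_subcommand itself is not shown in this diff, so the following stand-in uses plain subprocess and is an assumption:

import subprocess

def fork_extractor_job(abid: str) -> subprocess.Popen:
    # hypothetical stand-in for the commented-out run_subcommand([...]) above
    return subprocess.Popen(
        ['archivebox', 'extract', abid],   # same argv as the commented-out call
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,            # detach so the worker isn't blocked
    )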
@@ -28,7 +28,11 @@ def ready():
 
 @abx.hookimpl
 def get_EXTRACTORS():
+    """extractors that can be run for each URL, producing one or more ArchiveResults each"""
     from .extractors import PDF_EXTRACTOR, SCREENSHOT_EXTRACTOR, DOM_EXTRACTOR
+    # dom        -> ./output.html    -> ./chrome_dom/index.html
+    # screenshot -> ./screenshot.png -> ./chrome_screenshot/screenshot.png
+    # pdf        -> ./output.pdf     -> ./chrome_pdf/pdf.pdf
     return {
         'pdf': PDF_EXTRACTOR,
         'screenshot': SCREENSHOT_EXTRACTOR,
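get_EXTRACTORS() returns a plain name -> extractor dict, so the host application can merge the registries contributed by each plugin and dispatch by name. A consumer-side sketch (assumed, not from this commit; abx's actual hook-collection API may differ):

from typing import Any, Dict, Iterable

def collect_extractors(hook_results: Iterable[Dict[str, Any]]) -> Dict[str, Any]:
    """Merge the {'pdf': ..., 'screenshot': ..., 'dom': ...} dicts from every plugin."""
    extractors: Dict[str, Any] = {}
    for result in hook_results:
        extractors.update(result)   # later plugins can override earlier ones
    return extractors

# e.g. extractors = collect_extractors(pm.hook.get_EXTRACTORS())   # pluggy-style call, assumed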