load EXTRACTORS dynamically using importlib.import_module

This commit is contained in:
Nick Sweeting 2024-05-11 22:28:59 -07:00
parent c7f55fc3ba
commit 457c42bf84
No known key found for this signature in database
18 changed files with 198 additions and 40 deletions

View file

@ -17,8 +17,6 @@ except AttributeError:
def forwards_func(apps, schema_editor): def forwards_func(apps, schema_editor):
from core.models import EXTRACTORS
Snapshot = apps.get_model("core", "Snapshot") Snapshot = apps.get_model("core", "Snapshot")
ArchiveResult = apps.get_model("core", "ArchiveResult") ArchiveResult = apps.get_model("core", "ArchiveResult")

View file

@ -6,6 +6,7 @@ import json
from pathlib import Path from pathlib import Path
from typing import Optional, List from typing import Optional, List
from importlib import import_module
from django.db import models from django.db import models
from django.utils.functional import cached_property from django.utils.functional import cached_property
@ -20,9 +21,9 @@ from ..system import get_dir_size
from ..util import parse_date, base_url, hashurl from ..util import parse_date, base_url, hashurl
from ..index.schema import Link from ..index.schema import Link
from ..index.html import snapshot_icons from ..index.html import snapshot_icons
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
STATUS_CHOICES = [ STATUS_CHOICES = [
("succeeded", "succeeded"), ("succeeded", "succeeded"),
("failed", "failed"), ("failed", "failed"),
@ -267,11 +268,13 @@ class ArchiveResultManager(models.Manager):
class ArchiveResult(models.Model): class ArchiveResult(models.Model):
EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
uuid = models.UUIDField(default=uuid.uuid4, editable=False) uuid = models.UUIDField(default=uuid.uuid4, editable=False)
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
extractor = models.CharField(choices=EXTRACTORS, max_length=32) extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
cmd = JSONField() cmd = JSONField()
pwd = models.CharField(max_length=256) pwd = models.CharField(max_length=256)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
@ -284,3 +287,34 @@ class ArchiveResult(models.Model):
def __str__(self): def __str__(self):
return self.extractor return self.extractor
@cached_property
def snapshot_dir(self):
    """Return the related snapshot's ``link_dir`` wrapped in a ``Path`` (cached per instance)."""
    link_dir = self.snapshot.link_dir
    return Path(link_dir)
@property
def extractor_module(self):
    """Return the entry of the module-level ``EXTRACTORS`` mapping keyed by ``self.extractor``."""
    extractor_name = self.extractor
    return EXTRACTORS[extractor_name]
def output_path(self) -> str:
    """return the canonical output filename or directory name within the snapshot dir"""
    # delegate to the extractor module, which owns the canonical name
    module = self.extractor_module
    return module.get_output_path()
def embed_path(self) -> str:
    """
    return the path that user-facing iframe embeds of this result should use

    Extractor modules that define ``get_embed_path`` compute it at runtime
    from this result; all others fall back to their ``get_output_path()``.
    """
    module = self.extractor_module
    if not hasattr(module, 'get_embed_path'):
        # no runtime-calculated embed path available for this extractor
        return module.get_output_path()
    return module.get_embed_path(self)
def legacy_output_path(self):
    """Look up ``'<extractor>_path'`` in the ``canonical_outputs()`` of the snapshot's link.

    The lookup goes through ``.get()``, so a missing key is not an error
    (presumably yields None, assuming ``canonical_outputs()`` returns a dict).
    """
    outputs = self.snapshot.as_link().canonical_outputs()
    key = f'{self.extractor}_path'
    return outputs.get(key)
def output_exists(self) -> bool:
    """Return True if this result's canonical output exists inside its snapshot dir.

    ``output_path()`` is only a file or directory name relative to the
    snapshot directory (e.g. ``'output.pdf'`` or ``'git/'``), so it has to be
    joined onto ``snapshot_dir`` before touching the filesystem. Checking the
    bare name would test the process's current working directory instead.
    An absolute ``output_path()`` is still honored, because joining an
    absolute path onto a ``Path`` discards the left-hand side.
    """
    return (Path(self.snapshot_dir) / self.output_path()).exists()

View file

@ -1,11 +1,13 @@
__package__ = 'archivebox.extractors' __package__ = 'archivebox.extractors'
from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast
import os import os
import sys import sys
from pathlib import Path from pathlib import Path
from importlib import import_module
from typing import Callable, Optional, List, Iterable, Union
from datetime import datetime, timezone from datetime import datetime, timezone
from django.db.models import QuerySet from django.db.models import QuerySet
from ..config import ( from ..config import (
@ -240,3 +242,37 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
log_archiving_finished(num_links) log_archiving_finished(num_links)
return all_links return all_links
EXTRACTORS_DIR = Path(__file__).parent
class ExtractorModuleProtocol(Protocol):
    """Structural type describing what an extractor module must expose (WIP).

    At present the only required member is a ``get_output_path`` callable.
    """

    get_output_path: Callable

    # TODO:
    #   get_embed_path: Callable | None
    #   should_extract(Snapshot)
    #   extract(Snapshot)
def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]:
    """Import the extractor modules found in ``dir`` and return them keyed by module name.

    Every ``*.py`` file directly inside ``dir`` whose name does not start with
    ``__`` is imported as a submodule of this package (``.<name>`` relative to
    ``__package__``), so ``dir`` is expected to be this package's directory or
    a mirror of it.

    Args:
        dir: directory to scan for extractor modules (defaults to the
            directory containing this file). The name shadows the builtin but
            is kept for backward compatibility with keyword callers.

    Returns:
        dict mapping each extractor name (the file name without ``.py``) to its
        imported module, in sorted file-name order.

    Raises:
        AttributeError: if an imported module has no usable ``get_output_path``.
    """
    extractors: Dict[str, ExtractorModuleProtocol] = {}

    # sorted() keeps the resulting dict order (and anything derived from it)
    # independent of the filesystem's directory listing order
    for module_file in sorted(dir.glob('*.py')):
        if module_file.name.startswith('__'):
            # skip __init__.py and other dunder files
            continue

        # .stem strips only the trailing suffix, unlike str.replace('.py', '')
        extractor_name = module_file.stem
        extractor_module = cast(
            ExtractorModuleProtocol,
            import_module(f'.{extractor_name}', package=__package__),
        )

        # explicit check instead of `assert`, which is stripped under `python -O`
        if not getattr(extractor_module, 'get_output_path', None):
            raise AttributeError(
                f'extractor module {extractor_name!r} must define get_output_path()'
            )

        extractors[extractor_name] = extractor_module

    return extractors
EXTRACTORS = get_extractors(EXTRACTORS_DIR)

View file

@ -24,6 +24,8 @@ from ..config import (
) )
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the archive.org extractor's output file name, relative to the snapshot dir."""
    output_name = 'archive.org.txt'
    return output_name
@enforce_types @enforce_types
@ -32,7 +34,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'archive.org.txt').exists(): if not overwrite and (out_dir / get_output_path()).exists():
# if open(path, 'r', encoding='utf-8').read().strip() != 'None': # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
return False return False
@ -43,7 +45,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
"""submit site to archive.org for archiving via their service, save returned archive url""" """submit site to archive.org for archiving via their service, save returned archive url"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = 'archive.org.txt' output: ArchiveOutput = get_output_path()
archive_org_url = None archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url) submit_url = 'https://web.archive.org/save/{}'.format(link.url)
# later options take precedence # later options take precedence
@ -88,7 +90,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
archive_org_url = archive_org_url or submit_url archive_org_url = archive_org_url or submit_url
with open(str(out_dir / output), 'w', encoding='utf-8') as f: with open(str(out_dir / output), 'w', encoding='utf-8') as f:
f.write(archive_org_url) f.write(archive_org_url)
chmod_file('archive.org.txt', cwd=str(out_dir)) chmod_file(str(out_dir / output), cwd=str(out_dir))
output = archive_org_url output = archive_org_url
return ArchiveResult( return ArchiveResult(

View file

@ -19,6 +19,9 @@ from ..config import (
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the DOM extractor's output file name, relative to the snapshot dir."""
    output_name = 'output.html'
    return output_name
@enforce_types @enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@ -26,8 +29,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'output.html').exists(): if not overwrite and (out_dir / get_output_path()).exists():
if (out_dir / 'output.html').stat().st_size > 1: if (out_dir / get_output_path()).stat().st_size > 1:
return False return False
return SAVE_DOM return SAVE_DOM
@ -37,7 +40,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
"""print HTML of site to file using chrome --dump-html""" """print HTML of site to file using chrome --dump-html"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = 'output.html' output: ArchiveOutput = get_output_path()
output_path = out_dir / output output_path = out_dir / output
cmd = [ cmd = [
*chrome_args(), *chrome_args(),

View file

@ -33,6 +33,11 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti
return SAVE_FAVICON return SAVE_FAVICON
@enforce_types
def get_output_path():
    """Return the favicon extractor's output file name, relative to the snapshot dir."""
    output_name = 'favicon.ico'
    return output_name
@enforce_types @enforce_types
def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download site favicon from google's favicon api""" """download site favicon from google's favicon api"""

View file

@ -26,6 +26,19 @@ from ..config import (
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the git extractor's output directory name (with trailing slash), relative to the snapshot dir."""
    output_name = 'git/'
    return output_name
def get_embed_path(archiveresult=None):
    """Return the path to embed for a git result.

    Without an ``archiveresult`` this is just the output dir. Otherwise it is
    the first entry globbed inside the output dir under
    ``archiveresult.snapshot_dir``, with a trailing slash. If the glob yields
    nothing, it falls back to the output dir.
    """
    base = get_output_path()
    if not archiveresult:
        return base

    entries = list((archiveresult.snapshot_dir / base).glob('*'))
    if not entries:
        # nothing inside the git output dir, so point at the dir itself
        return base
    return base + entries[0].name + '/'
@enforce_types @enforce_types
def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@ -33,7 +46,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'git').exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
is_clonable_url = ( is_clonable_url = (
@ -51,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
"""download full site using git""" """download full site using git"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = 'git' output: ArchiveOutput = get_output_path()
output_path = out_dir / output output_path = out_dir / output
output_path.mkdir(exist_ok=True) output_path.mkdir(exist_ok=True)
cmd = [ cmd = [

View file

@ -23,10 +23,14 @@ from ..config import (
) )
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the headers extractor's output file name, relative to the snapshot dir."""
    output_name = 'headers.json'
    return output_name
@enforce_types @enforce_types
def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'headers.json').exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
return SAVE_HEADERS return SAVE_HEADERS
@ -38,7 +42,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
out_dir = Path(out_dir or link.link_dir) out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() output_folder = out_dir.absolute()
output: ArchiveOutput = 'headers.json' output: ArchiveOutput = get_output_path()
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
@ -59,7 +63,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
try: try:
json_headers = get_headers(link.url, timeout=timeout) json_headers = get_headers(link.url, timeout=timeout)
output_folder.mkdir(exist_ok=True) output_folder.mkdir(exist_ok=True)
atomic_write(str(output_folder / "headers.json"), json_headers) atomic_write(str(output_folder / get_output_path()), json_headers)
except (Exception, OSError) as err: except (Exception, OSError) as err:
status = 'failed' status = 'failed'
output = err output = err

View file

@ -19,6 +19,12 @@ from ..util import (
) )
from .title import get_html from .title import get_html
def get_output_path():
    """Return the htmltotext extractor's output file name, relative to the snapshot dir."""
    output_name = "htmltotext.txt"
    return output_name
class HTMLTextExtractor(HTMLParser): class HTMLTextExtractor(HTMLParser):
TEXT_ATTRS = [ TEXT_ATTRS = [
"alt", "cite", "href", "label", "alt", "cite", "href", "label",
@ -109,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'htmltotext.txt').exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
return SAVE_HTMLTOTEXT return SAVE_HTMLTOTEXT
@ -120,7 +126,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
"""extract search-indexing-friendly text from an HTML document""" """extract search-indexing-friendly text from an HTML document"""
out_dir = Path(out_dir or link.link_dir) out_dir = Path(out_dir or link.link_dir)
output = "htmltotext.txt" output = get_output_path()
cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html'] cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')

View file

@ -22,13 +22,27 @@ from ..config import (
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the media extractor's output directory name (with trailing slash), relative to the snapshot dir."""
    output_name = 'media/'
    return output_name
def get_embed_path(archiveresult=None):
    """Return the path to embed for a media result.

    Without an ``archiveresult`` this is just the output dir. Otherwise it is
    the first ``*.mp4`` file globbed inside the output dir under
    ``archiveresult.snapshot_dir``. If no mp4 is found, it falls back to the
    output dir.
    """
    base = get_output_path()
    if not archiveresult:
        return base

    mp4_files = list((archiveresult.snapshot_dir / base).glob('*.mp4'))
    if mp4_files:
        return base + mp4_files[0].name
    # no mp4 found, so point at the dir itself
    return base
@enforce_types @enforce_types
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url): if is_static_file(link.url):
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'media').exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
return SAVE_MEDIA return SAVE_MEDIA
@ -38,7 +52,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp""" """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = 'media' output: ArchiveOutput = get_output_path()
output_path = out_dir / output output_path = out_dir / output
output_path.mkdir(exist_ok=True) output_path.mkdir(exist_ok=True)
# later options take precedence # later options take precedence

View file

@ -24,6 +24,12 @@ from ..config import (
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the mercury extractor's output directory name (with trailing slash), relative to the snapshot dir."""
    output_name = 'mercury/'
    return output_name
def get_embed_path(archiveresult=None):
    """Return the embeddable file for mercury: ``content.html`` inside the output dir.

    ``archiveresult`` is accepted for signature parity with other extractors
    and is not used.
    """
    return f'{get_output_path()}content.html'
@enforce_types @enforce_types
def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError: def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
@ -44,7 +50,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'mercury').exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
return SAVE_MERCURY return SAVE_MERCURY
@ -55,8 +61,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
"""download reader friendly version using @postlight/mercury-parser""" """download reader friendly version using @postlight/mercury-parser"""
out_dir = Path(out_dir or link.link_dir) out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / "mercury" output_folder = out_dir.absolute() / get_output_path()
output = "mercury" output = get_output_path()
status = 'succeeded' status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')

View file

@ -19,13 +19,17 @@ from ..config import (
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the PDF extractor's output file name, relative to the snapshot dir."""
    output_name = 'output.pdf'
    return output_name
@enforce_types @enforce_types
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url): if is_static_file(link.url):
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'output.pdf').exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
return SAVE_PDF return SAVE_PDF
@ -36,7 +40,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
"""print PDF of site to file using chrome --headless""" """print PDF of site to file using chrome --headless"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = 'output.pdf' output: ArchiveOutput = get_output_path()
cmd = [ cmd = [
*chrome_args(), *chrome_args(),
'--print-to-pdf', '--print-to-pdf',
@ -51,7 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
hints = (result.stderr or result.stdout).decode() hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save PDF', hints) raise ArchiveError('Failed to save PDF', hints)
chmod_file('output.pdf', cwd=str(out_dir)) chmod_file(get_output_path(), cwd=str(out_dir))
except Exception as err: except Exception as err:
status = 'failed' status = 'failed'
output = err output = err

View file

@ -22,6 +22,12 @@ from ..config import (
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from .title import get_html from .title import get_html
def get_output_path():
    """Return the readability extractor's output directory name (with trailing slash), relative to the snapshot dir."""
    output_name = 'readability/'
    return output_name
def get_embed_path(archiveresult=None):
    """Return the embeddable file for readability: ``content.html`` inside the output dir.

    ``archiveresult`` is accepted for signature parity with other extractors
    and is not used.
    """
    return f'{get_output_path()}content.html'
@enforce_types @enforce_types
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@ -29,7 +35,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'readability').exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
return SAVE_READABILITY return SAVE_READABILITY
@ -40,8 +46,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
"""download reader friendly version using @mozilla/readability""" """download reader friendly version using @mozilla/readability"""
out_dir = Path(out_dir or link.link_dir) out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / "readability" output_folder = out_dir.absolute() / get_output_path()
output = "readability" output = get_output_path()
# Readability Docs: https://github.com/mozilla/readability # Readability Docs: https://github.com/mozilla/readability

View file

@ -19,6 +19,9 @@ from ..config import (
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the screenshot extractor's output file name, relative to the snapshot dir."""
    output_name = 'screenshot.png'
    return output_name
@enforce_types @enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@ -26,7 +29,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'screenshot.png').exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
return SAVE_SCREENSHOT return SAVE_SCREENSHOT
@ -36,7 +39,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
"""take screenshot of site using chrome --headless""" """take screenshot of site using chrome --headless"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = 'screenshot.png' output: ArchiveOutput = get_output_path()
cmd = [ cmd = [
*chrome_args(), *chrome_args(),
'--screenshot', '--screenshot',

View file

@ -26,13 +26,17 @@ from ..config import (
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the singlefile extractor's output file name, relative to the snapshot dir."""
    output_name = 'singlefile.html'
    return output_name
@enforce_types @enforce_types
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
if is_static_file(link.url): if is_static_file(link.url):
return False return False
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'singlefile.html').exists(): if not overwrite and (out_dir / get_output_path()).exists():
return False return False
return SAVE_SINGLEFILE return SAVE_SINGLEFILE
@ -43,7 +47,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
"""download full site using single-file""" """download full site using single-file"""
out_dir = out_dir or Path(link.link_dir) out_dir = out_dir or Path(link.link_dir)
output = "singlefile.html" output = get_output_path()
browser_args = chrome_args(CHROME_TIMEOUT=0) browser_args = chrome_args(CHROME_TIMEOUT=0)

View file

@ -60,6 +60,7 @@ class TitleParser(HTMLParser):
if tag.lower() == "title": if tag.lower() == "title":
self.inside_title_tag = False self.inside_title_tag = False
@enforce_types @enforce_types
def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str: def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
""" """
@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
else: else:
return document return document
def get_output_path():
    """Return the file name reserved for the title extractor's output."""
    # TODO: actually save title to this file
    # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem)
    output_name = 'title.json'
    return output_name
@enforce_types @enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
# if link already has valid title, skip it # if link already has valid title, skip it

View file

@ -35,6 +35,18 @@ from ..config import (
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
def get_output_path():
    """Return the directory name (with trailing slash) reserved for the wget extractor's output."""
    # TODO: actually save output into this folder, instead of do {domain}/**/index.html
    output_name = 'wget/'
    return output_name
def get_embed_path(archiveresult=None):
    """Return the path to embed for a wget result.

    With an ``archiveresult``, this is whatever ``wget_output_path()`` returns
    for the result's snapshot link. Without one, it is the static output dir.
    """
    if archiveresult:
        return wget_output_path(archiveresult.snapshot.as_link())
    return get_output_path()
@enforce_types @enforce_types
def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
output_path = wget_output_path(link) output_path = wget_output_path(link)

View file

@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str:
cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
def calc_snapshot_icons(): def calc_snapshot_icons():
from core.models import EXTRACTORS from core.models import EXTRACTOR_CHOICES
# start = datetime.now(timezone.utc) # start = datetime.now(timezone.utc)
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
# Missing specific entry for WARC # Missing specific entry for WARC
extractor_outputs = defaultdict(lambda: None) extractor_outputs = defaultdict(lambda: None)
for extractor, _ in EXTRACTORS: for extractor, _ in EXTRACTOR_CHOICES:
for result in archive_results: for result in archive_results:
if result.extractor == extractor and result: if result.extractor == extractor and result:
extractor_outputs[extractor] = result extractor_outputs[extractor] = result
for extractor, _ in EXTRACTORS: for extractor, _ in EXTRACTOR_CHOICES:
if extractor not in exclude: if extractor not in exclude:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# Check filesystem to see if anything is actually present (too slow, needs optimization/caching) # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)