Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-14 07:04:27 -04:00)

Commit 457c42bf84 (parent c7f55fc3ba): load EXTRACTORS dynamically using importlib.import_module

18 changed files with 198 additions and 40 deletions
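In short: instead of hard-coding the list of extractors, this commit scans archivebox/extractors/*.py at import time and loads each module with importlib. A minimal self-contained sketch of that discovery pattern (the 'plugins' package name here is hypothetical, not ArchiveBox's actual layout):

    # Hypothetical standalone sketch of importlib-based plugin discovery.
    # Assumes a local importable package ./plugins/ containing wget.py, pdf.py, etc.
    from importlib import import_module
    from pathlib import Path

    PLUGINS_DIR = Path(__file__).parent / 'plugins'

    def load_plugins() -> dict:
        """Import every non-dunder module in plugins/ and map name -> module."""
        modules = {}
        for path in PLUGINS_DIR.glob('*.py'):
            if path.name.startswith('__'):   # skip __init__.py and friends
                continue
            modules[path.stem] = import_module(f'plugins.{path.stem}')
        return modules

    # load_plugins() -> {'wget': <module 'plugins.wget'>, 'pdf': <module 'plugins.pdf'>, ...}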
archivebox/core/migrations/ (ArchiveResult data migration)

@@ -17,8 +17,6 @@ except AttributeError:

 def forwards_func(apps, schema_editor):
-    from core.models import EXTRACTORS
-
     Snapshot = apps.get_model("core", "Snapshot")
     ArchiveResult = apps.get_model("core", "ArchiveResult")

archivebox/core/models.py

@@ -6,6 +6,7 @@ import json

 from pathlib import Path
 from typing import Optional, List
+from importlib import import_module

 from django.db import models
 from django.utils.functional import cached_property

@@ -20,9 +21,9 @@ from ..system import get_dir_size

 from ..util import parse_date, base_url, hashurl
 from ..index.schema import Link
 from ..index.html import snapshot_icons
-from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS

-EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
 STATUS_CHOICES = [
     ("succeeded", "succeeded"),
     ("failed", "failed"),

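The rename reflects what the value now is: Django's `choices` option expects a list of `(stored_value, human_readable_label)` pairs, derived here from the keys of the new `EXTRACTORS` module dict. A toy illustration (the dict contents are stand-ins):

    # stand-in for the real {name: module} dict built by get_extractors()
    EXTRACTORS = {'wget': ..., 'pdf': ..., 'screenshot': ...}

    EXTRACTOR_CHOICES = [(name, name) for name in EXTRACTORS.keys()]
    # -> [('wget', 'wget'), ('pdf', 'pdf'), ('screenshot', 'screenshot')]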
@@ -267,11 +268,13 @@ class ArchiveResultManager(models.Manager):

 class ArchiveResult(models.Model):
+    EXTRACTOR_CHOICES = EXTRACTOR_CHOICES

     id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
     uuid = models.UUIDField(default=uuid.uuid4, editable=False)

     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
-    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
+    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
     cmd = JSONField()
     pwd = models.CharField(max_length=256)
     cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)

@@ -284,3 +287,34 @@ class ArchiveResult(models.Model):

     def __str__(self):
         return self.extractor

+    @cached_property
+    def snapshot_dir(self):
+        return Path(self.snapshot.link_dir)
+
+    @property
+    def extractor_module(self):
+        return EXTRACTORS[self.extractor]
+
+    def output_path(self) -> str:
+        """return the canonical output filename or directory name within the snapshot dir"""
+        return self.extractor_module.get_output_path()
+
+    def embed_path(self) -> str:
+        """
+        return the actual runtime-calculated path to the file on-disk that
+        should be used for user-facing iframe embeds of this result
+        """
+        if hasattr(self.extractor_module, 'get_embed_path'):
+            return self.extractor_module.get_embed_path(self)
+
+        return self.extractor_module.get_output_path()
+
+    def legacy_output_path(self):
+        link = self.snapshot.as_link()
+        return link.canonical_outputs().get(f'{self.extractor}_path')
+
+    def output_exists(self) -> bool:
+        return Path(self.output_path()).exists()

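A hedged usage sketch of the new model helpers, assuming a populated database (the 'wget' value is illustrative):

    # hypothetical Django shell session using the new ArchiveResult helpers
    result = ArchiveResult.objects.filter(extractor='wget', status='succeeded').first()
    if result:
        result.extractor_module    # the loaded archivebox.extractors.wget module
        result.output_path()       # canonical name from the module, e.g. 'wget/'
        result.embed_path()        # module's get_embed_path(self) if defined, else output_path()
        result.output_exists()     # Path(...).exists() check against the filesystem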
archivebox/extractors/__init__.py

@@ -1,11 +1,13 @@

 __package__ = 'archivebox.extractors'

+from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast

 import os
 import sys
 from pathlib import Path
+from importlib import import_module
-from typing import Callable, Optional, List, Iterable, Union
 from datetime import datetime, timezone

 from django.db.models import QuerySet

 from ..config import (

@@ -240,3 +242,37 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa

     log_archiving_finished(num_links)
     return all_links

+
+EXTRACTORS_DIR = Path(__file__).parent
+
+class ExtractorModuleProtocol(Protocol):
+    """Type interface for an Extractor Module (WIP)"""
+
+    get_output_path: Callable
+
+    # TODO:
+    # get_embed_path: Callable | None
+    # should_extract(Snapshot)
+    # extract(Snapshot)
+
+
+def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]:
+    """iterate through archivebox/extractors/*.py and load extractor modules"""
+    EXTRACTORS = {}
+
+    for filename in EXTRACTORS_DIR.glob('*.py'):
+        if filename.name.startswith('__'):
+            continue
+
+        extractor_name = filename.name.replace('.py', '')
+
+        extractor_module = cast(ExtractorModuleProtocol, import_module(f'.{extractor_name}', package=__package__))
+
+        assert getattr(extractor_module, 'get_output_path')
+        EXTRACTORS[extractor_name] = extractor_module
+
+    return EXTRACTORS
+
+EXTRACTORS = get_extractors(EXTRACTORS_DIR)

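Worth noting about the typing above: `Protocol` (PEP 544) describes the expected module shape structurally, and `cast()` is purely a type-checker hint with no runtime effect, which is why the `assert getattr(...)` line still does the real validation. A tiny self-contained illustration, using the stdlib `json` module as a stand-in for an extractor module:

    from typing import Callable, Protocol, cast
    from importlib import import_module

    class HasDumps(Protocol):
        dumps: Callable

    mod = cast(HasDumps, import_module('json'))  # cast(): no runtime check, hint only
    assert getattr(mod, 'dumps')                 # the runtime check, as in get_extractors()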
archivebox/extractors/archive_org.py

@@ -24,6 +24,8 @@ from ..config import (
 )
 from ..logging_util import TimedProgress

+def get_output_path():
+    return 'archive.org.txt'

 @enforce_types

@@ -32,7 +34,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'archive.org.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False

@@ -43,7 +45,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     """submit site to archive.org for archiving via their service, save returned archive url"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'archive.org.txt'
+    output: ArchiveOutput = get_output_path()
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence

@@ -88,7 +90,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     archive_org_url = archive_org_url or submit_url
     with open(str(out_dir / output), 'w', encoding='utf-8') as f:
         f.write(archive_org_url)
-    chmod_file('archive.org.txt', cwd=str(out_dir))
+    chmod_file(str(out_dir / output), cwd=str(out_dir))
     output = archive_org_url

     return ArchiveResult(

archivebox/extractors/dom.py

@@ -19,6 +19,9 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'output.html'
+
 @enforce_types
 def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

@@ -26,8 +29,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.html').exists():
-        if (out_dir / 'output.html').stat().st_size > 1:
+    if not overwrite and (out_dir / get_output_path()).exists():
+        if (out_dir / get_output_path()).stat().st_size > 1:
             return False

     return SAVE_DOM

@@ -37,7 +40,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print HTML of site to file using chrome --dump-html"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.html'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     cmd = [
         *chrome_args(),

archivebox/extractors/favicon.py

@@ -8,8 +8,8 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..system import chmod_file, run
 from ..util import (
     enforce_types,
     domain,
     dedupe,
 )
 from ..config import (
     TIMEOUT,

@@ -33,6 +33,11 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti

     return SAVE_FAVICON

+@enforce_types
+def get_output_path():
+    return 'favicon.ico'
+
+
 @enforce_types
 def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""

archivebox/extractors/git.py

@@ -26,6 +26,19 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'git/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    try:
+        return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/'
+    except IndexError:
+        pass
+
+    return get_output_path()
+
 @enforce_types
 def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

@@ -33,7 +46,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'git').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     is_clonable_url = (

@@ -51,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """download full site using git"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'git'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [

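The `get_embed_path` above points the embed at the first cloned repo subdirectory when one exists. A hedged standalone sketch of that glob-first-entry-with-fallback pattern (paths are hypothetical):

    from pathlib import Path

    def embed_dir(snapshot_dir: Path, output: str = 'git/') -> str:
        """Return 'git/<first entry>/' if the clone produced anything, else 'git/'."""
        try:
            first = list((snapshot_dir / output).glob('*'))[0]  # IndexError if dir is empty/missing
            return output + first.name + '/'
        except IndexError:
            return output

    # e.g. embed_dir(Path('archive/1610000000.0'))
    #   -> 'git/some-repo/' if archive/1610000000.0/git/some-repo/ exists, else 'git/'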
archivebox/extractors/headers.py

@@ -23,10 +23,14 @@ from ..config import (
 )
 from ..logging_util import TimedProgress

+
+def get_output_path():
+    return 'headers.json'
+
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'headers.json').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_HEADERS

@@ -38,7 +42,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)

     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute()
-    output: ArchiveOutput = 'headers.json'
+    output: ArchiveOutput = get_output_path()

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')

@@ -59,7 +63,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     try:
         json_headers = get_headers(link.url, timeout=timeout)
         output_folder.mkdir(exist_ok=True)
-        atomic_write(str(output_folder / "headers.json"), json_headers)
+        atomic_write(str(output_folder / get_output_path()), json_headers)
     except (Exception, OSError) as err:
         status = 'failed'
         output = err

archivebox/extractors/htmltotext.py

@@ -19,6 +19,12 @@ from ..util import (
 )
 from .title import get_html

+
+def get_output_path():
+    return "htmltotext.txt"
+
+
 class HTMLTextExtractor(HTMLParser):
     TEXT_ATTRS = [
         "alt", "cite", "href", "label",

@@ -109,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'htmltotext.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_HTMLTOTEXT

@@ -120,7 +126,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """extract search-indexing-friendly text from an HTML document"""

     out_dir = Path(out_dir or link.link_dir)
-    output = "htmltotext.txt"
+    output = get_output_path()
     cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']

     timer = TimedProgress(timeout, prefix=' ')

archivebox/extractors/media.py

@@ -22,13 +22,27 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'media/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    out_dir = archiveresult.snapshot_dir / get_output_path()
+    try:
+        return get_output_path() + list(out_dir.glob('*.mp4'))[0].name
+    except IndexError:
+        return get_output_path()
+
 @enforce_types
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'media').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_MEDIA

@@ -38,7 +52,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'media'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     # later options take precedence

archivebox/extractors/mercury.py

@@ -24,6 +24,12 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'mercury/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 @enforce_types
 def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:

@@ -44,7 +50,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'mercury').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_MERCURY

@@ -55,8 +61,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     """download reader friendly version using @postlight/mercury-parser"""

     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "mercury"
-    output = "mercury"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')

archivebox/extractors/pdf.py

@@ -19,13 +19,17 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'output.pdf'
+
 @enforce_types
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.pdf').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_PDF

@@ -36,7 +40,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print PDF of site to file using chrome --headless"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.pdf'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--print-to-pdf',

@@ -51,7 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
         hints = (result.stderr or result.stdout).decode()
         raise ArchiveError('Failed to save PDF', hints)

-        chmod_file('output.pdf', cwd=str(out_dir))
+        chmod_file(get_output_path(), cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err

archivebox/extractors/readability.py

@@ -22,6 +22,12 @@ from ..config import (
 from ..logging_util import TimedProgress
 from .title import get_html

+def get_output_path():
+    return 'readability/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:

@@ -29,7 +35,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'readability').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_READABILITY

@@ -40,8 +46,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     """download reader friendly version using @mozilla/readability"""

     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "readability"
-    output = "readability"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()

     # Readability Docs: https://github.com/mozilla/readability

archivebox/extractors/screenshot.py

@@ -19,6 +19,9 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'screenshot.png'
+
 @enforce_types
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

@@ -26,7 +29,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'screenshot.png').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_SCREENSHOT

@@ -36,7 +39,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """take screenshot of site using chrome --headless"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'screenshot.png'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--screenshot',

archivebox/extractors/singlefile.py

@@ -26,13 +26,17 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'singlefile.html'
+
 @enforce_types
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'singlefile.html').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_SINGLEFILE

@@ -43,7 +47,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """download full site using single-file"""

     out_dir = out_dir or Path(link.link_dir)
-    output = "singlefile.html"
+    output = get_output_path()

     browser_args = chrome_args(CHROME_TIMEOUT=0)

archivebox/extractors/title.py

@@ -60,6 +60,7 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False

+
 @enforce_types
 def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     """

@@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     else:
         return document

+
+def get_output_path():
+    # TODO: actually save title to this file
+    # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem)
+    return 'title.json'
+
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     # if link already has valid title, skip it

archivebox/extractors/wget.py

@@ -35,6 +35,18 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    # TODO: actually save output into this folder, instead of do {domain}/**/index.html
+    return 'wget/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    link = archiveresult.snapshot.as_link()
+    return wget_output_path(link)
+
 @enforce_types
 def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     output_path = wget_output_path(link)

archivebox/index/html.py

@@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str:
     cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'

     def calc_snapshot_icons():
-        from core.models import EXTRACTORS
+        from core.models import EXTRACTOR_CHOICES
         # start = datetime.now(timezone.utc)

         archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)

@@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
         # Missing specific entry for WARC

         extractor_outputs = defaultdict(lambda: None)
-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             for result in archive_results:
                 if result.extractor == extractor and result:
                     extractor_outputs[extractor] = result

-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             if extractor not in exclude:
                 existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                 # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)

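One detail that makes the loops above safe: `defaultdict(lambda: None)` means looking up an extractor with no result yields `None` instead of raising `KeyError`, so the `existing = ...` chain short-circuits cleanly. Minimal demonstration:

    from collections import defaultdict

    extractor_outputs = defaultdict(lambda: None)
    extractor_outputs['wget'] = 'some result'

    extractor_outputs['wget']   # -> 'some result'
    extractor_outputs['pdf']    # -> None (no KeyError; the key is created on first access)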