Merge branch 'dev' into link-removal2

This commit is contained in:
Nick Sweeting 2021-01-30 03:43:46 -05:00
commit 1fe95474c2
52 changed files with 896 additions and 550 deletions

View file

@ -96,7 +96,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I
if method_name not in details["history"]:
details["history"][method_name] = []
if should_run(snapshot, out_dir) or overwrite:
if should_run(snapshot, out_dir, overwrite):
log_archive_method_started(method_name)
result = method_function(snapshot=snapshot, out_dir=out_dir)

View file

@ -25,14 +25,17 @@ from ..config import (
from ..logging_util import TimedProgress
# output = '{domain}/'
@enforce_types
def should_save_archive_dot_org(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
def should_save_archive_dot_org(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
out_dir = out_dir or Path(snapshot.snapshot_dir)
if is_static_file(snapshot.url):
return False
if (out_dir / "archive.org.txt").exists():
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'archive.org.txt').exists():
# if open(path, 'r').read().strip() != 'None':
return False

View file

@ -20,18 +20,21 @@ from ..config import (
from ..logging_util import TimedProgress
# output = 'output.html'
@enforce_types
def should_save_dom(snapshot: Model, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the DOM dump extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``url`` and ``snapshot_dir`` are inspected.
        out_dir: directory to look for existing output in; defaults to
            ``snapshot.snapshot_dir``. Kept as the second positional
            parameter so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy, an existing ``output.html`` does not
            prevent the extractor from running.

    Returns:
        False for static-file URLs or when ``output.html`` already exists
        (and ``overwrite`` is falsy); otherwise the ``SAVE_DOM`` setting.
    """
    if is_static_file(snapshot.url):
        return False

    out_dir = out_dir or Path(snapshot.snapshot_dir)
    if not overwrite and (out_dir / 'output.html').exists():
        return False

    return SAVE_DOM
@enforce_types
def save_dom(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html"""

View file

@ -21,14 +21,17 @@ from ..config import (
from ..logging_util import TimedProgress
# output = 'favicon.ico'
@enforce_types
def should_save_favicon(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the favicon extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``snapshot_dir`` is the default output dir.
        out_dir: directory to look for ``favicon.ico`` in; defaults to
            ``snapshot.snapshot_dir``. Kept as the second positional
            parameter so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy, an existing ``favicon.ico`` is ignored.

    Returns:
        False when ``favicon.ico`` already exists and ``overwrite`` is
        falsy; otherwise the ``SAVE_FAVICON`` setting.
    """
    out_dir = out_dir or snapshot.snapshot_dir
    if not overwrite and (Path(out_dir) / 'favicon.ico').exists():
        return False

    return SAVE_FAVICON
@enforce_types
def save_favicon(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download site favicon from google's favicon api"""

View file

@ -28,14 +28,20 @@ from ..config import (
from ..logging_util import TimedProgress
# output = 'git/'
# @contents = output.glob('*.*')
# @exists = self.contents.exists()
# @size => get_size(self.contents)
# @num_files => len(self.contents)
@enforce_types
def should_save_git(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
def should_save_git(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
out_dir = out_dir or snapshot.snapshot_dir
if is_static_file(snapshot.url):
return False
if (out_dir / "git").exists():
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'git').exists():
return False
is_clonable_url = (

View file

@ -23,12 +23,21 @@ from ..config import (
)
from ..logging_util import TimedProgress
# output = 'headers.json'
@enforce_types
def should_save_headers(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the headers extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``snapshot_dir`` is the default output dir.
        out_dir: directory to look for ``headers.json`` in; defaults to
            ``snapshot.snapshot_dir``. Kept as the second positional
            parameter so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy (and ``SAVE_HEADERS`` is enabled), run even
            if ``headers.json`` already exists.

    Returns:
        False when ``SAVE_HEADERS`` is falsy; True when ``overwrite`` is
        truthy; otherwise True only if ``headers.json`` does not exist yet.
    """
    # The config switch wins over everything, including overwrite.
    if not SAVE_HEADERS:
        return False

    if overwrite:
        return True

    output = Path(out_dir or snapshot.snapshot_dir) / 'headers.json'
    return not output.exists()
@enforce_types

View file

@ -22,14 +22,17 @@ from ..config import (
from ..logging_util import TimedProgress
# output = 'media/'
@enforce_types
def should_save_media(snapshot: Model, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the media extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``url`` and ``snapshot_dir`` are inspected.
        out_dir: directory to look for a ``media`` entry in; defaults to
            ``snapshot.snapshot_dir``. Kept as the second positional
            parameter so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy, an existing ``media`` entry is ignored.

    Returns:
        False for static-file URLs or when ``media`` already exists (and
        ``overwrite`` is falsy); otherwise the ``SAVE_MEDIA`` setting.
    """
    if is_static_file(snapshot.url):
        return False

    # Wrap in Path so the `/` join works even if snapshot_dir is a plain str.
    out_dir = out_dir or Path(snapshot.snapshot_dir)
    if not overwrite and (out_dir / 'media').exists():
        return False

    return SAVE_MEDIA

View file

@ -39,13 +39,16 @@ def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> Archi
@enforce_types
def should_save_mercury(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the mercury extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``url`` and ``snapshot_dir`` are inspected.
        out_dir: directory to look for a ``mercury`` entry in; defaults to
            ``snapshot.snapshot_dir``. Kept as the second positional
            parameter so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy, an existing ``mercury`` entry is ignored.

    Returns:
        False for static-file URLs or when ``mercury`` already exists (and
        ``overwrite`` is falsy); otherwise whether both ``SAVE_MERCURY``
        and ``MERCURY_VERSION`` are truthy.
    """
    if is_static_file(snapshot.url):
        return False

    output = Path(out_dir or snapshot.snapshot_dir) / 'mercury'
    if not overwrite and output.exists():
        return False

    # bool() keeps the declared return type even if MERCURY_VERSION is not a bool.
    return bool(SAVE_MERCURY and MERCURY_VERSION)
@enforce_types

View file

@ -19,14 +19,16 @@ from ..config import (
)
from ..logging_util import TimedProgress
# output = 'output.pdf'
@enforce_types
def should_save_pdf(snapshot: Model, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the PDF extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``url`` and ``snapshot_dir`` are inspected.
        out_dir: directory to look for ``output.pdf`` in; defaults to
            ``snapshot.snapshot_dir``. Kept as the second positional
            parameter so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy, an existing ``output.pdf`` is ignored.

    Returns:
        False for static-file URLs or when ``output.pdf`` already exists
        (and ``overwrite`` is falsy); otherwise the ``SAVE_PDF`` setting.
    """
    if is_static_file(snapshot.url):
        return False

    out_dir = out_dir or Path(snapshot.snapshot_dir)
    if not overwrite and (out_dir / 'output.pdf').exists():
        return False

    return SAVE_PDF

View file

@ -25,6 +25,7 @@ from ..config import (
)
from ..logging_util import TimedProgress
@enforce_types
def get_html(snapshot: Model, path: Path) -> str:
"""
@ -47,14 +48,20 @@ def get_html(snapshot: Model, path: Path) -> str:
else:
return document
# output = 'readability/'
@enforce_types
def should_save_readability(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the readability extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``url`` and ``snapshot_dir`` are inspected.
        out_dir: directory to look for a ``readability`` entry in; defaults
            to ``snapshot.snapshot_dir``. Kept as the second positional
            parameter so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy, an existing ``readability`` entry is ignored.

    Returns:
        False for static-file URLs or when ``readability`` already exists
        (and ``overwrite`` is falsy); otherwise whether both
        ``SAVE_READABILITY`` and ``READABILITY_VERSION`` are truthy.
    """
    if is_static_file(snapshot.url):
        return False

    # Use snapshot_dir (not link_dir) for the default output location.
    output = Path(out_dir or snapshot.snapshot_dir) / 'readability'
    if not overwrite and output.exists():
        return False

    # bool() keeps the declared return type even if the version is not a bool.
    return bool(SAVE_READABILITY and READABILITY_VERSION)
@enforce_types

View file

@ -20,14 +20,16 @@ from ..config import (
from ..logging_util import TimedProgress
# output = 'screenshot.png'
@enforce_types
def should_save_screenshot(snapshot: Model, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the screenshot extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``url`` and ``snapshot_dir`` are inspected.
        out_dir: directory to look for ``screenshot.png`` in; defaults to
            ``snapshot.snapshot_dir``. Kept as the second positional
            parameter so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy, an existing ``screenshot.png`` is ignored.

    Returns:
        False for static-file URLs or when ``screenshot.png`` already
        exists (and ``overwrite`` is falsy); otherwise the
        ``SAVE_SCREENSHOT`` setting.
    """
    if is_static_file(snapshot.url):
        return False

    out_dir = out_dir or Path(snapshot.snapshot_dir)
    if not overwrite and (out_dir / 'screenshot.png').exists():
        return False

    return SAVE_SCREENSHOT

View file

@ -25,13 +25,16 @@ from ..logging_util import TimedProgress
@enforce_types
def should_save_singlefile(snapshot: Model, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the singlefile extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``url`` and ``snapshot_dir`` are inspected.
        out_dir: directory to look for ``singlefile.html`` in; defaults to
            ``snapshot.snapshot_dir``. Kept as the second positional
            parameter so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy, an existing ``singlefile.html`` is ignored.

    Returns:
        False for static-file URLs or when ``singlefile.html`` already
        exists (and ``overwrite`` is falsy); otherwise the
        ``SAVE_SINGLEFILE`` setting.
    """
    if is_static_file(snapshot.url):
        return False

    out_dir = out_dir or Path(snapshot.snapshot_dir)
    if not overwrite and (out_dir / 'singlefile.html').exists():
        return False

    return SAVE_SINGLEFILE
@enforce_types

View file

@ -62,13 +62,15 @@ class TitleParser(HTMLParser):
self.inside_title_tag = False
# output = '{title}'

@enforce_types
def should_save_title(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the title extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot whose ``url`` and ``title`` are inspected.
        out_dir: unused here; accepted (as the second positional parameter)
            so ``should_run(snapshot, out_dir, overwrite)`` works.
        overwrite: when truthy, an existing title does not prevent the
            extractor from running.

    Returns:
        False for static-file URLs, or when the snapshot already has a
        title that does not start with ``http`` (and ``overwrite`` is
        falsy); otherwise the ``SAVE_TITLE`` setting.
    """
    if is_static_file(snapshot.url):
        return False

    # if snapshot already has valid title, skip it
    if not overwrite and snapshot.title and not snapshot.title.lower().startswith('http'):
        return False

    return SAVE_TITLE

View file

@ -38,10 +38,10 @@ from ..logging_util import TimedProgress
@enforce_types
def should_save_wget(snapshot: Model, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """Decide whether the wget extractor should run for ``snapshot``.

    Args:
        snapshot: snapshot passed to ``wget_output_path`` and whose
            ``snapshot_dir`` is the default output dir.
        out_dir: directory the wget output path is resolved against;
            defaults to ``snapshot.snapshot_dir``. Kept as the second
            positional parameter so ``should_run(snapshot, out_dir,
            overwrite)`` works.
        overwrite: when truthy, existing wget output is ignored.

    Returns:
        False when ``wget_output_path`` reports a path that already exists
        under ``out_dir`` (and ``overwrite`` is falsy); otherwise the
        ``SAVE_WGET`` setting.
    """
    output_path = wget_output_path(snapshot)
    out_dir = out_dir or Path(snapshot.snapshot_dir)
    if not overwrite and output_path and (out_dir / output_path).exists():
        return False

    return SAVE_WGET
@ -68,7 +68,7 @@ def save_wget(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOU
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
*(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
*(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
*(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
@ -177,11 +177,22 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
if html_files:
return str(html_files[0].relative_to(snapshot.snapshot_dir))
# sometimes wget'd URLs have no ext and return non-html
# e.g. /some/example/rss/all -> some RSS XML content)
# /some/other/url.o4g -> some binary unrecognized ext)
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
for file_present in search_dir.iterdir():
if file_present == last_part_of_url:
return str(search_dir / file_present)
# Move up one directory level
search_dir = search_dir.parent
if search_dir == snapshot.snapshot_dir:
break
search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+") / urldecode(full_path)
if not search_dir.is_dir():