Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-27 21:24:16 -04:00)
Merge branch 'dev' into link-removal2
Commit 1fe95474c2
52 changed files with 896 additions and 550 deletions
@@ -96,7 +96,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I
             if method_name not in details["history"]:
                 details["history"][method_name] = []

-            if should_run(snapshot, out_dir) or overwrite:
+            if should_run(snapshot, out_dir, overwrite):
                 log_archive_method_started(method_name)

                 result = method_function(snapshot=snapshot, out_dir=out_dir)
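The call-site change above is subtle but meaningful: previously `should_run(snapshot, out_dir) or overwrite` forced every method to run whenever overwrite was set, even for methods whose `should_save_*` check would normally veto them (static files, disabled SAVE_* flags). Passing the flag into `should_run`, which presumably forwards it to each extractor's check, lets those vetoes still apply. A rough, self-contained sketch of the difference (toy function, not the real ArchiveBox helpers):

    # Toy stand-in for an extractor's should_save_*() check; names are illustrative.
    def should_save_toy(url: str, overwrite: bool = False, already_saved: bool = False) -> bool:
        if url.endswith('.pdf'):                 # stand-in for is_static_file()
            return False                         # never applicable, even when forced
        if not overwrite and already_saved:
            return False                         # skip existing output unless forced
        return True

    url = 'https://example.com/paper.pdf'
    old_style = should_save_toy(url, already_saved=True) or True          # old: overwrite ORed in -> True
    new_style = should_save_toy(url, overwrite=True, already_saved=True)  # new: check keeps its veto -> False
    print(old_style, new_style)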
@@ -25,14 +25,17 @@ from ..config import (
 from ..logging_util import TimedProgress


 # output = '{domain}/'


 @enforce_types
-def should_save_archive_dot_org(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+def should_save_archive_dot_org(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
     out_dir = out_dir or Path(snapshot.snapshot_dir)
     if is_static_file(snapshot.url):
         return False

-    if (out_dir / "archive.org.txt").exists():
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'archive.org.txt').exists():
         # if open(path, 'r').read().strip() != 'None':
         return False

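This hunk sets the pattern repeated in most of the extractor changes below: each `should_save_*()` gains an `overwrite` parameter, and the "output already exists" early return only applies when `overwrite` is false. A minimal self-contained sketch of that shape (the file name and SAVE_EXAMPLE flag are placeholders, not real ArchiveBox config):

    from pathlib import Path
    from typing import Optional

    SAVE_EXAMPLE = True   # placeholder for a SAVE_* config flag

    def should_save_example(snapshot_dir: str, overwrite: bool = False,
                            out_dir: Optional[Path] = None) -> bool:
        out_dir = out_dir or Path(snapshot_dir)
        # Existing output only blocks the extractor when overwrite is False.
        if not overwrite and (out_dir / 'example.txt').exists():
            return False
        return SAVE_EXAMPLE

The same small change (new parameter plus a `not overwrite and` guard) recurs in the dom, favicon, git, media, pdf, screenshot, and singlefile hunks that follow.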
@@ -20,18 +20,21 @@ from ..config import (
 from ..logging_util import TimedProgress


 # output = 'output.html'


 @enforce_types
-def should_save_dom(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+def should_save_dom(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
     out_dir = out_dir or Path(snapshot.snapshot_dir)
     if is_static_file(snapshot.url):
         return False

-    if (out_dir / 'output.html').exists():
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'output.html').exists():
         return False

     return SAVE_DOM


 @enforce_types
 def save_dom(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """print HTML of site to file using chrome --dump-html"""
@@ -21,14 +21,17 @@ from ..config import (
 from ..logging_util import TimedProgress


 # output = 'favicon.ico'


 @enforce_types
-def should_save_favicon(snapshot: Model, out_dir: Optional[str]=None) -> bool:
+def should_save_favicon(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or snapshot.snapshot_dir
-    if (Path(out_dir) / 'favicon.ico').exists():
+    if not overwrite and (Path(out_dir) / 'favicon.ico').exists():
         return False

     return SAVE_FAVICON


 @enforce_types
 def save_favicon(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""
@@ -28,14 +28,20 @@ from ..config import (
 from ..logging_util import TimedProgress


 # output = 'git/'
+# @contents = output.glob('*.*')
+# @exists = self.contents.exists()
+# @size => get_size(self.contents)
+# @num_files => len(self.contents)

 @enforce_types
-def should_save_git(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+def should_save_git(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
     out_dir = out_dir or snapshot.snapshot_dir
     if is_static_file(snapshot.url):
         return False

-    if (out_dir / "git").exists():
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'git').exists():
         return False

     is_clonable_url = (
@@ -23,12 +23,21 @@ from ..config import (
 )
 from ..logging_util import TimedProgress


 # output = 'headers.json'

 @enforce_types
-def should_save_headers(snapshot: Model, out_dir: Optional[str]=None) -> bool:
+def should_save_headers(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or snapshot.snapshot_dir

+    if not SAVE_HEADERS:
+        return False
+
+    if overwrite:
+        return True
+
     output = Path(out_dir or snapshot.snapshot_dir) / 'headers.json'
-    return not output.exists() and SAVE_HEADERS
+    return not output.exists()


 @enforce_types
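The headers check is reorganized into explicit early returns rather than a single boolean expression, which makes the precedence clear: a disabled SAVE_HEADERS wins over everything, overwrite wins over an existing output, and only then does the existence check apply. A small standalone sketch of that decision order (flag and path are placeholders):

    from pathlib import Path

    SAVE_HEADERS = True   # placeholder for the config flag

    def should_save_headers_sketch(out_dir: Path, overwrite: bool = False) -> bool:
        if not SAVE_HEADERS:
            return False                                   # feature disabled: never run
        if overwrite:
            return True                                    # forced: always re-run
        return not (out_dir / 'headers.json').exists()     # otherwise run only if output is missing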
@@ -22,14 +22,17 @@ from ..config import (
 from ..logging_util import TimedProgress


 # output = 'media/'

 @enforce_types
-def should_save_media(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+def should_save_media(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
     out_dir = out_dir or snapshot.snapshot_dir

     if is_static_file(snapshot.url):
         return False

-    if (out_dir / "media").exists():
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'media').exists():
         return False

     return SAVE_MEDIA
@@ -39,13 +39,16 @@ def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> Archi


 @enforce_types
-def should_save_mercury(snapshot: Model, out_dir: Optional[str]=None) -> bool:
+def should_save_mercury(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or snapshot.snapshot_dir
     if is_static_file(snapshot.url):
         return False

     output = Path(out_dir or snapshot.snapshot_dir) / 'mercury'
-    return SAVE_MERCURY and MERCURY_VERSION and (not output.exists())
+    if not overwrite and output.exists():
+        return False
+
+    return SAVE_MERCURY and MERCURY_VERSION


 @enforce_types
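For mercury (and, below, readability) the old one-liner `SAVE_X and X_VERSION and (not output.exists())` is split apart so that overwrite only bypasses the existence check, while the config flag and binary-availability checks still apply. A compact sketch of the resulting logic (names are placeholders):

    SAVE_MERCURY = True          # placeholder config flag
    MERCURY_VERSION = '1.0.0'    # placeholder: falsy when the binary is missing

    def should_save_mercury_sketch(output_exists: bool, overwrite: bool = False) -> bool:
        if not overwrite and output_exists:
            return False                                     # skip existing output unless forced
        return bool(SAVE_MERCURY and MERCURY_VERSION)        # still requires the flag and the binary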
@@ -19,14 +19,16 @@ from ..config import (
 )
 from ..logging_util import TimedProgress

 # output = 'output.pdf'

 @enforce_types
-def should_save_pdf(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+def should_save_pdf(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
     out_dir = out_dir or Path(snapshot.snapshot_dir)
     if is_static_file(snapshot.url):
         return False

-    if (out_dir / "output.pdf").exists():
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'output.pdf').exists():
         return False

     return SAVE_PDF
@@ -25,6 +25,7 @@ from ..config import (
 )
 from ..logging_util import TimedProgress


 @enforce_types
 def get_html(snapshot: Model, path: Path) -> str:
     """
@@ -47,14 +48,20 @@ def get_html(snapshot: Model, path: Path) -> str:
     else:
         return document


 # output = 'readability/'

 @enforce_types
-def should_save_readability(snapshot: Model, out_dir: Optional[str]=None) -> bool:
+def should_save_readability(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or snapshot.link_dir
     if is_static_file(snapshot.url):
         return False

     output = Path(out_dir or snapshot.snapshot_dir) / 'readability'
-    return SAVE_READABILITY and READABILITY_VERSION and (not output.exists())
+    if not overwrite and output.exists():
+        return False
+
+    return SAVE_READABILITY and READABILITY_VERSION


 @enforce_types
@@ -20,14 +20,16 @@ from ..config import (
 from ..logging_util import TimedProgress


 # output = 'screenshot.png'

 @enforce_types
-def should_save_screenshot(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+def should_save_screenshot(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
     out_dir = out_dir or Path(snapshot.snapshot_dir)
     if is_static_file(snapshot.url):
         return False

-    if (out_dir / "screenshot.png").exists():
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'screenshot.png').exists():
         return False

     return SAVE_SCREENSHOT
@@ -25,13 +25,16 @@ from ..logging_util import TimedProgress


 @enforce_types
-def should_save_singlefile(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+def should_save_singlefile(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
     out_dir = out_dir or Path(snapshot.snapshot_dir)
     if is_static_file(snapshot.url):
         return False

-    output = out_dir / 'singlefile.html'
-    return SAVE_SINGLEFILE and SINGLEFILE_VERSION and (not output.exists())
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'singlefile.html').exists():
+        return False
+
+    return SAVE_SINGLEFILE


 @enforce_types
@@ -62,13 +62,15 @@ class TitleParser(HTMLParser):
         self.inside_title_tag = False


-@enforce_types
-def should_save_title(snapshot: Model, out_dir: Optional[str]=None) -> bool:
-    # if link already has valid title, skip it
-    if snapshot.title and not snapshot.title.lower().startswith('http'):
-        return False
+# output = '{title}'
+
+@enforce_types
+def should_save_title(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[str]=None) -> bool:
+    if is_static_file(snapshot.url):
+        return False
+
+    # if snapshot already has valid title, skip it
+    if not overwrite and snapshot.title and not snapshot.title.lower().startswith('http'):
+        return False

     return SAVE_TITLE
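Unlike the file-based extractors, the title check's "already exists" condition is the `snapshot.title` attribute itself; the `startswith('http')` test filters out placeholder titles where the URL was stored in place of a real title. A toy version showing the three outcomes (assumes SAVE_TITLE is enabled):

    def should_save_title_sketch(title: str, overwrite: bool = False) -> bool:
        # Mirrors the merged check: a real stored title skips the fetch unless forced.
        if not overwrite and title and not title.lower().startswith('http'):
            return False
        return True

    print(should_save_title_sketch('Example Domain'))                  # False: title already present
    print(should_save_title_sketch('Example Domain', overwrite=True))  # True: forced re-fetch
    print(should_save_title_sketch('https://example.com'))             # True: URL placeholder, fetch a real title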
@@ -38,10 +38,10 @@ from ..logging_util import TimedProgress


 @enforce_types
-def should_save_wget(snapshot: Model, out_dir: Optional[Path]=None) -> bool:
+def should_save_wget(snapshot: Model, overwrite: Optional[bool]=False, out_dir: Optional[Path]=None) -> bool:
     output_path = wget_output_path(snapshot)
     out_dir = out_dir or Path(snapshot.snapshot_dir)
-    if output_path and (out_dir / output_path).exists():
+    if not overwrite and output_path and (out_dir / output_path).exists():
         return False

     return SAVE_WGET
@@ -68,7 +68,7 @@ def save_wget(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOU
         *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
         *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
         *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
-        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
+        *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
         *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
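The only change in this hunk wraps COOKIES_FILE in str(), presumably because the config value can be a Path object while the rest of the argv list is plain strings. A quick illustration of why that matters for code that later joins or logs the command (the cookie path here is hypothetical):

    from pathlib import Path

    COOKIES_FILE = Path('/tmp/cookies.txt')   # hypothetical location
    cmd = [
        'wget',
        *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
        'https://example.com',
    ]
    print(' '.join(cmd))   # ' '.join() would raise TypeError if a bare Path were left in the list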
@@ -177,11 +177,22 @@ def wget_output_path(snapshot: Model) -> Optional[str]:
                 if html_files:
                     return str(html_files[0].relative_to(snapshot.snapshot_dir))

+                # sometimes wget'd URLs have no ext and return non-html
+                # e.g. /some/example/rss/all -> some RSS XML content)
+                #      /some/other/url.o4g -> some binary unrecognized ext)
+                # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
+                last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
+                for file_present in search_dir.iterdir():
+                    if file_present == last_part_of_url:
+                        return str(search_dir / file_present)
+
         # Move up one directory level
         search_dir = search_dir.parent

         if search_dir == snapshot.snapshot_dir:
             break


     search_dir = Path(snapshot.snapshot_dir) / domain(snapshot.url).replace(":", "+") / urldecode(full_path)
     if not search_dir.is_dir():
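The added fallback in wget_output_path handles URLs whose last path segment has no extension and whose response isn't HTML (an RSS feed, an unrecognized binary extension): it looks for a file named like the url-decoded last URL segment, then moves the search directory up one level at a time until it reaches the snapshot root. A self-contained sketch of that search, with illustrative names rather than the real ArchiveBox helpers:

    from pathlib import Path
    from urllib.parse import unquote as urldecode

    def find_wget_output(snapshot_dir: Path, domain_dir: str, url_path: str):
        # Start in the deepest directory wget could have created for this URL.
        search_dir = snapshot_dir / domain_dir / urldecode(url_path.strip('/'))
        last_part_of_url = urldecode(url_path.rstrip('/').rsplit('/', 1)[-1])
        while search_dir != snapshot_dir and search_dir != search_dir.parent:
            if search_dir.is_dir():
                for file_present in search_dir.iterdir():
                    if file_present.name == last_part_of_url:
                        return str(file_present.relative_to(snapshot_dir))
            search_dir = search_dir.parent   # move up one directory level
        return None

    # Example lookup; returns None unless such a file actually exists on disk.
    print(find_wget_output(Path('/data/archive/1234'), 'getpocket.com', 'users/nikisweeting/feed/all'))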