Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-15 15:44:26 -04:00)
new archivebox update speed improvements

commit 6a4e568d1b (parent 2d32f05a62)
8 changed files with 36 additions and 13 deletions
@@ -186,6 +186,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                         ts
                     ) + "\n" + str(e) + "\n"))
                     #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

+                    # print(f' ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
+                    raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
+                        method_name,
+                        link.url,
+                    )) from e
+
+
         # print(' ', stats)

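The new `raise ... from e` keeps the original extractor error attached as `__cause__`, so the log shows both the wrapper message and the root cause. A minimal standalone sketch of that chaining (the `save_example` helper and URL are illustrative only, not part of the commit):

def save_example(url):  # hypothetical stand-in for an archive_methods.save_* extractor
    raise ValueError('boom')

try:
    try:
        save_example('https://example.com')
    except Exception as e:
        raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
            'example',
            'https://example.com',
        )) from e
except Exception as wrapped:
    print(wrapped)            # the wrapper message with method name and URL
    print(wrapped.__cause__)  # the original ValueError('boom')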
@@ -218,7 +225,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa

     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
-        get_link = lambda x: x.as_link()
+        get_link = lambda x: x.as_link_with_details()
         all_links = all_links.iterator()
     else:
         num_links: int = len(all_links)
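For context, `archive_links()` branches on whether it was handed a Django QuerySet or a plain list, using `.count()` and `.iterator()` so the whole table never has to be materialized, and the lambda now resolves each row with `as_link_with_details()`. A Django-free sketch of the same branching, with a hypothetical `FakeQuerySet` standing in for the real QuerySet:

from typing import List, Union

class FakeQuerySet:
    """Stand-in exposing only the two QuerySet methods the branch relies on."""
    def __init__(self, rows):
        self.rows = rows
    def count(self):
        return len(self.rows)       # real QuerySet: a cheap COUNT(*) query
    def iterator(self):
        return iter(self.rows)      # real QuerySet: stream rows without caching them all

def normalize(all_links: Union[List[str], FakeQuerySet]):
    if isinstance(all_links, FakeQuerySet):
        num_links = all_links.count()
        get_link = lambda x: x      # real code: lambda x: x.as_link_with_details()
        all_links = all_links.iterator()
    else:
        num_links = len(all_links)
        get_link = lambda x: x
    return num_links, [get_link(x) for x in all_links]

print(normalize(FakeQuerySet(['https://a.example', 'https://b.example'])))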
@@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

     out_dir = Path(out_dir or link.link_dir)
     output = "htmltotext.txt"
+    cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']

     timer = TimedProgress(timeout, prefix=' ')
     extracted_text = None
+    status = 'failed'
     try:
         extractor = HTMLTextExtractor()
         document = get_html(link, out_dir)
@@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
         extracted_text = str(extractor)

         atomic_write(str(out_dir / output), extracted_text)
+        status = 'succeeded'
     except (Exception, OSError) as err:
-        status = 'failed'
         output = err
-        cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
     finally:
         timer.end()

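Together, these two hunks switch `save_htmltotext()` to a pessimistic status: `status` starts as 'failed' before the `try` block and only flips to 'succeeded' after the text has actually been written, so the `except` branch no longer needs to reset anything. A compact sketch of that pattern, with an illustrative `extract_text()` in place of the real extractor:

import tempfile
from pathlib import Path

def extract_text(out_dir: Path, get_text) -> str:
    output = 'htmltotext.txt'
    status = 'failed'                 # pessimistic default, set before any work happens
    try:
        text = get_text()             # may raise for missing or undecodable sources
        (out_dir / output).write_text(text)
        status = 'succeeded'          # only reached when the write completed
    except (Exception, OSError) as err:
        output = err                  # keep the error object as the recorded output
    return status

with tempfile.TemporaryDirectory() as tmp:
    print(extract_text(Path(tmp), lambda: 'hello world'))  # succeeded
    print(extract_text(Path(tmp), lambda: 1 / 0))          # failed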
@@ -77,6 +77,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
+    result = None
     try:
         result = run(cmd, cwd=str(out_dir), timeout=timeout)

@@ -84,7 +85,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
         # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
         output_tail = [
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
             if line.strip()
         ]
         hints = (
@@ -94,12 +95,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

         # Check for common failure cases
         if (result.returncode > 0) or not (out_dir / output).is_file():
-            raise ArchiveError('SingleFile was not able to archive the page', hints)
+            raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
         chmod_file(output, cwd=str(out_dir))
     except (Exception, OSError) as err:
         status = 'failed'
         # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
         cmd[2] = browser_args.replace('"', "\\\"")
+        err.hints = (result.stdout + result.stderr).decode().split('\n')
         output = err
     finally:
         timer.end()
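The singlefile hunks widen the captured output tail from 3 to 5 lines, include the exit status in the `ArchiveError` message, and attach the decoded stdout/stderr to `err.hints` so the failure log has something to show. A generic sketch of that capture-and-hint pattern, assuming nothing about SingleFile itself (the command below is just a placeholder):

import subprocess
import sys

def run_with_hints(cmd, n_tail: int = 5):
    result = subprocess.run(cmd, capture_output=True)
    output_tail = [
        line.strip()
        for line in (result.stdout + result.stderr).decode().rsplit('\n', n_tail)[-n_tail:]
        if line.strip()
    ]
    if result.returncode > 0:
        err = RuntimeError(f'command failed (status={result.returncode})')
        err.hints = output_tail          # same idea as err.hints = ... in the diff
        raise err
    return output_tail

print(run_with_hints([sys.executable, '-c', 'print("all done")']))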
@@ -75,7 +75,7 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
             with open(abs_path / source, "r", encoding="utf-8") as f:
                 document = f.read()
                 break
-        except (FileNotFoundError, TypeError):
+        except (FileNotFoundError, TypeError, UnicodeDecodeError):
             continue
     if document is None:
         return download_url(link.url, timeout=timeout)
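Adding `UnicodeDecodeError` to the except tuple means a binary or badly encoded local candidate file no longer aborts the loop; the function just moves on to the next candidate and ultimately falls back to downloading the URL. A self-contained sketch of that fallback chain (`download_fallback` is a hypothetical stand-in for `download_url`):

from pathlib import Path

def read_first_decodable(abs_path: Path, sources, download_fallback):
    document = None
    for source in sources:
        try:
            with open(abs_path / source, "r", encoding="utf-8") as f:
                document = f.read()
                break
        except (FileNotFoundError, TypeError, UnicodeDecodeError):
            continue   # missing, None, or undecodable candidates are skipped, not fatal
    if document is None:
        return download_fallback()
    return document

print(read_first_decodable(Path('.'), ['singlefile.html', 'dom.html'],
                           lambda: '<html>downloaded instead</html>'))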
@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type

 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = (snapshot.as_link() for snapshot in snapshots.iterator())
     return {
         link.link_dir: link
         for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option

 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = (snapshot.as_link() for snapshot in snapshots.iterator())
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio

 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = (snapshot.as_link() for snapshot in snapshots.iterator())
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
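All three folder helpers make the same swap: a list comprehension calling `as_link_with_details()` becomes a generator expression calling the lighter `as_link()`, so nothing is resolved until the surrounding dict comprehension (or a later filter) actually consumes a snapshot. This is presumably where much of the update speed-up in the commit title comes from. A tiny sketch of the eager-vs-lazy difference, with an illustrative `cheap()` in place of `as_link()`:

def cheap(snapshot_id):                       # stand-in for snapshot.as_link()
    print(f'resolving {snapshot_id}')
    return {'link_dir': f'/data/archive/{snapshot_id}'}

snapshots = [1, 2, 3]

links_eager = [cheap(s) for s in snapshots]   # old style: resolves all three immediately
links_lazy = (cheap(s) for s in snapshots)    # new style: nothing resolved yet

print('building index...')
by_dir = {link['link_dir']: link for link in links_lazy}   # resolution happens here, one at a time
print(len(by_dir), len(links_eager))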
@@ -432,12 +432,14 @@ def log_archive_method_finished(result: "ArchiveResult"):
                 **ANSI,
             ),
         ]

+        # import pudb; pudb.set_trace()
+
         # Prettify error output hints string and limit to five lines
         hints = getattr(result.output, 'hints', None) or ()
         if hints:
             if isinstance(hints, (list, tuple, type(_ for _ in ()))):
-                hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
+                hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
             else:
                 if isinstance(hints, bytes):
                     hints = hints.decode()
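The old comprehension silently dropped any hint that was already a `str` (only `bytes` entries survived the filter); the new one decodes bytes and stringifies everything else, so mixed hint lists are shown in full. A quick before/after demonstration with made-up hint values:

hints = [b'chrome exited with code 1', 'cmd: single-file --browser-args ...', 404]

old = [hint.decode() for hint in hints if isinstance(hint, bytes)]
new = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]

print(old)   # ['chrome exited with code 1']  <- the str and int hints were lost
print(new)   # ['chrome exited with code 1', 'cmd: single-file --browser-args ...', '404']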
@@ -791,6 +791,8 @@ def update(resume: Optional[float]=None,
            out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Import any new links from subscriptions and retry any previously failed/skipped links"""

+    from core.models import ArchiveResult
+
     check_data_folder(out_dir=out_dir)
     check_dependencies()
     new_links: List[Link] = [] # TODO: Remove input argument: only_new
@@ -798,19 +800,23 @@ def update(resume: Optional[float]=None,
     extractors = extractors.split(",") if extractors else []

     # Step 1: Filter for selected_links
+    print('[*] Finding matching Snapshots to update...')
+    print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
     matching_snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
         after=after,
     )
-
+    print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
     matching_folders = list_folders(
         links=matching_snapshots,
         status=status,
         out_dir=out_dir,
     )
-    all_links = [link for link in matching_folders.values() if link]
+    all_links = (link for link in matching_folders.values() if link)
+    print(' - Sorting by most unfinished -> least unfinished + date archived...')
+    all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))

     if index_only:
         for link in all_links:
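The reworked filtering step prints progress as it narrows down snapshots, keeps the matching links as a generator, and then orders them by how unfinished they are: the sort key is the number of existing `ArchiveResult` rows for each snapshot, with the timestamp as a tie-breaker, so the least-archived snapshots get retried first. A Django-free sketch of that ordering, with a `result_count` dict standing in for the per-link `ArchiveResult.objects.filter(...).count()` query:

links = [
    {'url': 'https://a.example', 'timestamp': '1600000003'},
    {'url': 'https://b.example', 'timestamp': '1600000001'},
    {'url': 'https://c.example', 'timestamp': '1600000002'},
]
result_count = {'https://a.example': 5, 'https://b.example': 0, 'https://c.example': 0}

ordered = sorted(links, key=lambda link: (result_count[link['url']], link['timestamp']))
print([link['url'] for link in ordered])   # b (0 results, earlier ts), then c, then a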
@@ -836,6 +842,7 @@
         if extractors:
             archive_kwargs["methods"] = extractors

+
     archive_links(to_archive, overwrite=overwrite, **archive_kwargs)

     # Step 4: Re-write links index with updated titles, icons, and resources
@@ -179,7 +179,11 @@ def download_url(url: str, timeout: int=None) -> str:
     if encoding is not None:
         response.encoding = encoding

-    return response.text
+    try:
+        return response.text
+    except UnicodeDecodeError:
+        # if response is non-test (e.g. image or other binary files), just return the filename instead
+        return url.rsplit('/', 1)[-1]

 @enforce_types
 def get_headers(url: str, timeout: int=None) -> str:
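Wrapping `response.text` catches the case where the downloaded body is not decodable text (an image or other binary file) and returns the last path segment of the URL instead of raising. A standalone sketch of that behaviour, using a minimal fake response object in place of the real requests response:

class FakeResponse:
    """Minimal stand-in for a requests-style response with a decodable-or-not body."""
    def __init__(self, content: bytes, encoding: str = 'utf-8'):
        self.content = content
        self.encoding = encoding
    @property
    def text(self) -> str:
        return self.content.decode(self.encoding)   # raises UnicodeDecodeError for binary bodies

def body_or_filename(url: str, response: FakeResponse) -> str:
    try:
        return response.text
    except UnicodeDecodeError:
        # binary payload: return just the filename portion of the URL
        return url.rsplit('/', 1)[-1]

print(body_or_filename('https://example.com/page.html', FakeResponse(b'<html>ok</html>')))
print(body_or_filename('https://example.com/logo.png', FakeResponse(b'\x89PNG\r\n\x1a\n')))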