Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-15 15:44:26 -04:00)
new archivebox update speed improvements

commit 6a4e568d1b (parent 2d32f05a62)
8 changed files with 36 additions and 13 deletions
@@ -186,6 +186,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                         ts
                     ) + "\n" + str(e) + "\n"))
                     #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

+                    # print(f' ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
+                    raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
+                        method_name,
+                        link.url,
+                    )) from e
+
+
         # print(' ', stats)

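The new `raise ... from e` keeps the original extractor error attached as `__cause__`, so the log shows both the wrapper message and the root cause. A minimal standalone sketch of that chaining (the `save_example` helper and URL are illustrative only, not part of the commit):

def save_example(url):  # hypothetical stand-in for an archive_methods.save_* extractor
    raise ValueError('boom')

try:
    try:
        save_example('https://example.com')
    except Exception as e:
        raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
            'example',
            'https://example.com',
        )) from e
except Exception as wrapped:
    print(wrapped)            # the wrapper message with method name and URL
    print(wrapped.__cause__)  # the original ValueError('boom')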
@@ -218,7 +225,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa

     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
-        get_link = lambda x: x.as_link()
+        get_link = lambda x: x.as_link_with_details()
         all_links = all_links.iterator()
     else:
         num_links: int = len(all_links)
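For context, `archive_links()` branches on whether it was handed a Django QuerySet or a plain list, using `.count()` and `.iterator()` so the whole table never has to be materialized, and the lambda now resolves each row with `as_link_with_details()`. A Django-free sketch of the same branching, with a hypothetical `FakeQuerySet` standing in for the real QuerySet:

from typing import List, Union

class FakeQuerySet:
    """Stand-in exposing only the two QuerySet methods the branch relies on."""
    def __init__(self, rows):
        self.rows = rows
    def count(self):
        return len(self.rows)       # real QuerySet: a cheap COUNT(*) query
    def iterator(self):
        return iter(self.rows)      # real QuerySet: stream rows without caching them all

def normalize(all_links: Union[List[str], FakeQuerySet]):
    if isinstance(all_links, FakeQuerySet):
        num_links = all_links.count()
        get_link = lambda x: x      # real code: lambda x: x.as_link_with_details()
        all_links = all_links.iterator()
    else:
        num_links = len(all_links)
        get_link = lambda x: x
    return num_links, [get_link(x) for x in all_links]

print(normalize(FakeQuerySet(['https://a.example', 'https://b.example'])))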
@@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

     out_dir = Path(out_dir or link.link_dir)
     output = "htmltotext.txt"
+    cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']

     timer = TimedProgress(timeout, prefix=' ')
     extracted_text = None
+    status = 'failed'
     try:
         extractor = HTMLTextExtractor()
         document = get_html(link, out_dir)
@@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
         extracted_text = str(extractor)

         atomic_write(str(out_dir / output), extracted_text)
+        status = 'succeeded'
     except (Exception, OSError) as err:
-        status = 'failed'
         output = err
-        cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
     finally:
         timer.end()

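Together, these two hunks switch `save_htmltotext()` to a pessimistic status: `status` starts as 'failed' before the `try` block and only flips to 'succeeded' after the text has actually been written, so the `except` branch no longer needs to reset anything. A compact sketch of that pattern, with an illustrative `extract_text()` in place of the real extractor:

import tempfile
from pathlib import Path

def extract_text(out_dir: Path, get_text) -> str:
    output = 'htmltotext.txt'
    status = 'failed'                 # pessimistic default, set before any work happens
    try:
        text = get_text()             # may raise for missing or undecodable sources
        (out_dir / output).write_text(text)
        status = 'succeeded'          # only reached when the write completed
    except (Exception, OSError) as err:
        output = err                  # keep the error object as the recorded output
    return status

with tempfile.TemporaryDirectory() as tmp:
    print(extract_text(Path(tmp), lambda: 'hello world'))  # succeeded
    print(extract_text(Path(tmp), lambda: 1 / 0))          # failed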
@@ -77,6 +77,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
+    result = None
     try:
         result = run(cmd, cwd=str(out_dir), timeout=timeout)

@@ -84,7 +85,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
         # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
         output_tail = [
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
             if line.strip()
         ]
         hints = (
@@ -94,12 +95,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

         # Check for common failure cases
         if (result.returncode > 0) or not (out_dir / output).is_file():
-            raise ArchiveError('SingleFile was not able to archive the page', hints)
+            raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
         chmod_file(output, cwd=str(out_dir))
     except (Exception, OSError) as err:
         status = 'failed'
         # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
         cmd[2] = browser_args.replace('"', "\\\"")
+        err.hints = (result.stdout + result.stderr).decode().split('\n')
         output = err
     finally:
         timer.end()
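The singlefile hunks widen the captured output tail from 3 to 5 lines, include the exit status in the `ArchiveError` message, and attach the decoded stdout/stderr to `err.hints` so the failure log has something to show. A generic sketch of that capture-and-hint pattern, assuming nothing about SingleFile itself (the command below is just a placeholder):

import subprocess
import sys

def run_with_hints(cmd, n_tail: int = 5):
    result = subprocess.run(cmd, capture_output=True)
    output_tail = [
        line.strip()
        for line in (result.stdout + result.stderr).decode().rsplit('\n', n_tail)[-n_tail:]
        if line.strip()
    ]
    if result.returncode > 0:
        err = RuntimeError(f'command failed (status={result.returncode})')
        err.hints = output_tail          # same idea as err.hints = ... in the diff
        raise err
    return output_tail

print(run_with_hints([sys.executable, '-c', 'print("all done")']))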
@@ -75,7 +75,7 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
             with open(abs_path / source, "r", encoding="utf-8") as f:
                 document = f.read()
                 break
-        except (FileNotFoundError, TypeError):
+        except (FileNotFoundError, TypeError, UnicodeDecodeError):
             continue
     if document is None:
         return download_url(link.url, timeout=timeout)
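Adding `UnicodeDecodeError` to the except tuple means a binary or badly encoded local candidate file no longer aborts the loop; the function just moves on to the next candidate and ultimately falls back to downloading the URL. A self-contained sketch of that fallback chain (`download_fallback` is a hypothetical stand-in for `download_url`):

from pathlib import Path

def read_first_decodable(abs_path: Path, sources, download_fallback):
    document = None
    for source in sources:
        try:
            with open(abs_path / source, "r", encoding="utf-8") as f:
                document = f.read()
                break
        except (FileNotFoundError, TypeError, UnicodeDecodeError):
            continue   # missing, None, or undecodable candidates are skipped, not fatal
    if document is None:
        return download_fallback()
    return document

print(read_first_decodable(Path('.'), ['singlefile.html', 'dom.html'],
                           lambda: '<html>downloaded instead</html>'))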
@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type

 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = (snapshot.as_link() for snapshot in snapshots.iterator())
     return {
         link.link_dir: link
         for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option

 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = (snapshot.as_link() for snapshot in snapshots.iterator())
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio

 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = (snapshot.as_link() for snapshot in snapshots.iterator())
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
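All three folder helpers make the same swap: a list comprehension calling `as_link_with_details()` becomes a generator expression calling the lighter `as_link()`, so nothing is resolved until the surrounding dict comprehension (or a later filter) actually consumes a snapshot. This is presumably where much of the update speed-up in the commit title comes from. A tiny sketch of the eager-vs-lazy difference, with an illustrative `cheap()` in place of `as_link()`:

def cheap(snapshot_id):                       # stand-in for snapshot.as_link()
    print(f'resolving {snapshot_id}')
    return {'link_dir': f'/data/archive/{snapshot_id}'}

snapshots = [1, 2, 3]

links_eager = [cheap(s) for s in snapshots]   # old style: resolves all three immediately
links_lazy = (cheap(s) for s in snapshots)    # new style: nothing resolved yet

print('building index...')
by_dir = {link['link_dir']: link for link in links_lazy}   # resolution happens here, one at a time
print(len(by_dir), len(links_eager))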
@@ -432,12 +432,14 @@ def log_archive_method_finished(result: "ArchiveResult"):
                 **ANSI,
             ),
         ]

+        # import pudb; pudb.set_trace()
+
         # Prettify error output hints string and limit to five lines
         hints = getattr(result.output, 'hints', None) or ()
         if hints:
             if isinstance(hints, (list, tuple, type(_ for _ in ()))):
-                hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
+                hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
             else:
                 if isinstance(hints, bytes):
                     hints = hints.decode()
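The old comprehension silently dropped any hint that was already a `str` (only `bytes` entries survived the filter); the new one decodes bytes and stringifies everything else, so mixed hint lists are shown in full. A quick before/after demonstration with made-up hint values:

hints = [b'chrome exited with code 1', 'cmd: single-file --browser-args ...', 404]

old = [hint.decode() for hint in hints if isinstance(hint, bytes)]
new = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]

print(old)   # ['chrome exited with code 1']  <- the str and int hints were lost
print(new)   # ['chrome exited with code 1', 'cmd: single-file --browser-args ...', '404']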
@@ -791,6 +791,8 @@ def update(resume: Optional[float]=None,
            out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Import any new links from subscriptions and retry any previously failed/skipped links"""

+    from core.models import ArchiveResult
+
     check_data_folder(out_dir=out_dir)
     check_dependencies()
     new_links: List[Link] = [] # TODO: Remove input argument: only_new
@@ -798,19 +800,23 @@ def update(resume: Optional[float]=None,
     extractors = extractors.split(",") if extractors else []

     # Step 1: Filter for selected_links
+    print('[*] Finding matching Snapshots to update...')
+    print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
     matching_snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
         after=after,
     )
-
+    print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
     matching_folders = list_folders(
         links=matching_snapshots,
         status=status,
         out_dir=out_dir,
     )
-    all_links = [link for link in matching_folders.values() if link]
+    all_links = (link for link in matching_folders.values() if link)
+    print(' - Sorting by most unfinished -> least unfinished + date archived...')
+    all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))

     if index_only:
         for link in all_links:
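The reworked filtering step prints progress as it narrows down snapshots, keeps the matching links as a generator, and then orders them by how unfinished they are: the sort key is the number of existing `ArchiveResult` rows for each snapshot, with the timestamp as a tie-breaker, so the least-archived snapshots get retried first. A Django-free sketch of that ordering, with a `result_count` dict standing in for the per-link `ArchiveResult.objects.filter(...).count()` query:

links = [
    {'url': 'https://a.example', 'timestamp': '1600000003'},
    {'url': 'https://b.example', 'timestamp': '1600000001'},
    {'url': 'https://c.example', 'timestamp': '1600000002'},
]
result_count = {'https://a.example': 5, 'https://b.example': 0, 'https://c.example': 0}

ordered = sorted(links, key=lambda link: (result_count[link['url']], link['timestamp']))
print([link['url'] for link in ordered])   # b (0 results, earlier ts), then c, then a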
@@ -836,6 +842,7 @@
         if extractors:
             archive_kwargs["methods"] = extractors

+
     archive_links(to_archive, overwrite=overwrite, **archive_kwargs)

     # Step 4: Re-write links index with updated titles, icons, and resources
@@ -179,7 +179,11 @@ def download_url(url: str, timeout: int=None) -> str:
     if encoding is not None:
         response.encoding = encoding

-    return response.text
+    try:
+        return response.text
+    except UnicodeDecodeError:
+        # if response is non-test (e.g. image or other binary files), just return the filename instead
+        return url.rsplit('/', 1)[-1]

 @enforce_types
 def get_headers(url: str, timeout: int=None) -> str:
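Wrapping `response.text` catches the case where the downloaded body is not decodable text (an image or other binary file) and returns the last path segment of the URL instead of raising. A standalone sketch of that behaviour, using a minimal fake response object in place of the real requests response:

class FakeResponse:
    """Minimal stand-in for a requests-style response with a decodable-or-not body."""
    def __init__(self, content: bytes, encoding: str = 'utf-8'):
        self.content = content
        self.encoding = encoding
    @property
    def text(self) -> str:
        return self.content.decode(self.encoding)   # raises UnicodeDecodeError for binary bodies

def body_or_filename(url: str, response: FakeResponse) -> str:
    try:
        return response.text
    except UnicodeDecodeError:
        # binary payload: return just the filename portion of the URL
        return url.rsplit('/', 1)[-1]

print(body_or_filename('https://example.com/page.html', FakeResponse(b'<html>ok</html>')))
print(body_or_filename('https://example.com/logo.png', FakeResponse(b'\x89PNG\r\n\x1a\n')))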