split up utils into separate files

Nick Sweeting 2019-04-30 23:13:04 -04:00
parent daf5951897
commit 95007d9137
23 changed files with 820 additions and 759 deletions

View file

@@ -5,16 +5,11 @@ import os
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, DEVNULL, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
DEVNULL,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
VERSION,
@@ -24,6 +19,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress
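
The same reorganization repeats in every extractor file below: helpers that used to be pulled from the monolithic ..util module now come from the module that owns them — run/PIPE/DEVNULL/chmod_file from ..system, TimedProgress from ..cli.logging, and ArchiveError from ..index.schema — while pure helpers like enforce_types and is_static_file stay in ..util. A rough before/after sketch of the pattern (names taken from the hunks in this diff, not a complete file listing):

    # before: everything funneled through util.py
    from ..util import (
        enforce_types,
        TimedProgress,
        run,
        PIPE,
        DEVNULL,
        is_static_file,
        ArchiveError,
        chmod_file,
    )

    # after: each helper imported from the module that now owns it
    from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
    from ..system import run, PIPE, DEVNULL, chmod_file
    from ..util import enforce_types, is_static_file
    from ..cli.logging import TimedProgress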

View file

@@ -4,22 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_DOM,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress

View file

@@ -5,14 +5,8 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
domain,
run,
PIPE,
chmod_file,
)
from ..system import chmod_file, run, PIPE
from ..util import enforce_types, domain
from ..config import (
TIMEOUT,
SAVE_FAVICON,
@@ -20,6 +14,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY,
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@@ -4,15 +4,11 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
domain,
extension,
without_query,
@@ -26,6 +22,7 @@ from ..config import (
GIT_DOMAINS,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress

View file

@@ -4,15 +4,11 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
MEDIA_TIMEOUT,
@@ -21,6 +17,7 @@ from ..config import (
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@@ -4,23 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_PDF,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@@ -4,22 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_SCREENSHOT,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress

View file

@@ -1,14 +1,14 @@
__package__ = 'archivebox.extractors'
import re
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
TimedProgress,
is_static_file,
ArchiveError,
fetch_page_title,
download_url,
htmldecode,
)
from ..config import (
TIMEOUT,
@@ -16,6 +16,14 @@ from ..config import (
CURL_BINARY,
CURL_VERSION,
)
from ..cli.logging import TimedProgress
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'      # start matching text after <title> tag
    r'(.[^<>]+)',      # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
@enforce_types
@@ -44,7 +52,9 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
output = fetch_page_title(link.url, timeout=timeout, progress=False)
html = download_url(link.url, timeout=timeout)
match = re.search(HTML_TITLE_REGEX, html)
output = htmldecode(match.group(1).strip()) if match else None
if not output:
raise ArchiveError('Unable to detect page title')
except Exception as err:
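
The title extractor no longer relies on a fetch_page_title() helper from ..util; save_title() now downloads the page with download_url() and applies HTML_TITLE_REGEX itself. A minimal standalone sketch of that regex-then-decode step (html.unescape stands in for the project's htmldecode() helper here, and the sample markup is made up):

    import re
    from html import unescape   # stand-in for ..util.htmldecode in this sketch

    HTML_TITLE_REGEX = re.compile(
        r'<title.*?>'    # start matching text after <title> tag
        r'(.[^<>]+)',    # get everything up to these symbols
        re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
    )

    def extract_title(html: str):
        # same regex + decode flow save_title() runs on the downloaded page
        match = re.search(HTML_TITLE_REGEX, html)
        return unescape(match.group(1).strip()) if match else None

    print(extract_title('<html><head><title>Example &amp; Co</title></head></html>'))
    # -> Example & Co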

View file

@@ -1,18 +1,22 @@
__package__ = 'archivebox.extractors'
import os
import re
from typing import Optional
from datetime import datetime
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
wget_output_path,
ArchiveError,
is_static_file,
without_scheme,
without_fragment,
without_query,
path,
domain,
urldecode,
)
from ..config import (
TIMEOUT,
@@ -26,7 +30,7 @@ from ..config import (
WGET_USER_AGENT,
COOKIES_FILE,
)
from ..cli.logging import TimedProgress
@enforce_types
@@ -121,3 +125,72 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status=status,
**timer.stats,
)
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
if is_static_file(link.url):
return without_scheme(without_fragment(link.url))
    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
    # Since the wget algorithm for -E (appending .html) is incredibly complex
    # and there's no way to get the computed output path from wget,
    # rather than trying to reverse-engineer how it calculates the path,
    # we just look in the output folder and read the filename wget used from the filesystem
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = os.path.join(
        link.link_dir,
        domain(link.url),
        urldecode(full_path),
    )

    for _ in range(4):
        if os.path.exists(search_dir):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                ]
                if html_files:
                    path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
                    return os.path.join(path_from_link_dir, html_files[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        if search_dir == link.link_dir:
            break

    return None
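
As a worked illustration of the fallback search above (all paths are hypothetical; the real values depend on the link's output folder): for a URL like https://example.com/abc/?v=1, wget -E typically writes example.com/abc/index.html?v=1.html, so the search starts in <link_dir>/example.com/abc and climbs at most four levels back toward link_dir until it finds an html-like file. A self-contained sketch of that lookup using only the standard library (POSIX paths assumed):

    import os, re, tempfile

    # fake output tree mimicking what wget -E might produce for https://example.com/abc/?v=1
    link_dir = tempfile.mkdtemp()
    out_dir = os.path.join(link_dir, 'example.com', 'abc')
    os.makedirs(out_dir)
    open(os.path.join(out_dir, 'index.html?v=1.html'), 'w').close()

    # same idea as wget_output_path(): start at the deepest expected folder,
    # look for an html-ish file, and walk up toward link_dir if it isn't there
    search_dir, result = out_dir, None
    for _ in range(4):
        if os.path.isdir(search_dir):
            html_files = [f for f in os.listdir(search_dir)
                          if re.search(r'.+\.[Ss]?[Hh][Tt][Mm][Ll]?$', f, re.I)]
            if html_files:
                result = os.path.join(search_dir.split(link_dir)[-1].strip('/'), html_files[0])
                break
        search_dir = search_dir.rsplit('/', 1)[0]
        if search_dir == link_dir:
            break

    print(result)   # -> example.com/abc/index.html?v=1.html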