Merge branch 'master' into django

This commit is contained in:
Nick Sweeting 2020-06-25 21:30:29 -04:00
commit cb67b09f9d
29 changed files with 418 additions and 911 deletions

View file

@@ -1 +1 @@
-0.4.2
+0.4.3

View file

@@ -3,4 +3,5 @@ __package__ = 'archivebox'
from . import core
from . import cli
# The main CLI source code is in 'archivebox/main.py'
from .main import *
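
Re-exporting main's public functions at the package root keeps the Python API usable directly from the archivebox package while the implementations live in archivebox/main.py. A quick sketch of what the star-import enables (assuming version is among the exported CLI functions, like the other command entrypoints):

    from archivebox.main import version
    from archivebox import version  # same function, re-exported by `from .main import *`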

View file

@@ -44,6 +44,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'TIMEOUT': {'type': int, 'default': 60},
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
'URL_BLACKLIST': {'type': str, 'default': None},
},
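
Each entry above pairs a config key with a coercion type and a default. A minimal sketch of how a typed lookup could resolve the two new keys from the environment (load_config_val here is illustrative, not ArchiveBox's actual loader):

    import os
    from typing import Any

    def load_config_val(key: str, type: type=str, default: Any=None) -> Any:
        # Coerce the raw environment string using the schema's declared type.
        val = os.environ.get(key)
        if val is None:
            return default
        if type is bool:
            return val.lower() in ('true', 'yes', '1')
        return type(val)

    RESTRICT_FILE_NAMES = load_config_val('RESTRICT_FILE_NAMES', type=str, default='windows')
    URL_BLACKLIST = load_config_val('URL_BLACKLIST', type=str, default=None)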
@@ -77,6 +78,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
@@ -85,6 +87,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': True},
},
'DEPENDENCY_CONFIG': {
@@ -130,7 +133,7 @@ DEFAULT_CLI_COLORS = {
ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
STATICFILE_EXTENSIONS = {
-# 99.999% of the time, URLs ending in these extentions are static files
+# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
@@ -147,7 +150,7 @@ STATICFILE_EXTENSIONS = {
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
-# Thse are always treated as pages, not as static files, never add them:
+# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
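
This set backs a cheap pre-check: if a URL's final path extension is in STATICFILE_EXTENSIONS, the target can be downloaded as-is instead of rendered. A sketch of that check (ArchiveBox has a similar helper; the exact implementation may differ):

    from urllib.parse import urlparse

    STATICFILE_EXTENSIONS = {'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp'}  # abridged

    def is_static_file(url: str) -> bool:
        # Compare the lowercased final extension of the URL path against the set.
        path = urlparse(url).path
        ext = path.rsplit('.', 1)[-1].lower() if '.' in path else ''
        return ext in STATICFILE_EXTENSIONS

    assert is_static_file('https://example.com/photo.JPG')
    assert not is_static_file('https://example.com/page.html')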
@@ -210,8 +213,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
-'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_ARCHIVE_DOT_ORG'])},
+'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['FETCH_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
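
Derived defaults are lambdas evaluated against the config assembled so far, so adding FETCH_TITLE to the expression is all it takes for curl to stay enabled when title fetching is the only curl-based extractor left on. Condensed:

    config = {
        'USE_CURL': True,
        'SAVE_FAVICON': False,
        'FETCH_TITLE': True,
        'SAVE_ARCHIVE_DOT_ORG': False,
    }
    use_curl = lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['FETCH_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])
    assert use_curl(config) is True  # curl kept just for title fetching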
@@ -480,6 +484,7 @@ def find_chrome_binary() -> Optional[str]:
'chromium-browser',
'chromium',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
'chrome',
'google-chrome',
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'google-chrome-stable',
@@ -506,6 +511,7 @@ def find_chrome_data_dir() -> Optional[str]:
'~/.config/chromium',
'~/Library/Application Support/Chromium',
'~/AppData/Local/Chromium/User Data',
'~/.config/chrome',
'~/.config/google-chrome',
'~/Library/Application Support/Google/Chrome',
'~/AppData/Local/Google/Chrome/User Data',
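
Both candidate lists feed a first-match search: binary names are resolved against $PATH, data dirs by tilde expansion and an existence check. A sketch of the loop (shutil.which-based; the exact ArchiveBox implementation may differ):

    import os
    import shutil
    from typing import Iterable, Optional

    def find_first(candidates: Iterable[str]) -> Optional[str]:
        # Return the first name that is an executable on $PATH
        # or an existing absolute/user-expanded path.
        for candidate in candidates:
            resolved = shutil.which(candidate) or os.path.expanduser(candidate)
            if resolved and os.path.exists(resolved):
                return resolved
        return None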

View file

@@ -13,6 +13,7 @@ from ..config import (
CURL_BINARY,
CURL_VERSION,
CHECK_SSL_VALIDITY,
CURL_USER_AGENT,
)
from ..cli.logging import TimedProgress
@@ -37,14 +38,16 @@ def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
'--max-time', str(timeout),
'--location',
'--output', str(output),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
]
-status = 'succeeded'
+status = 'pending'
timer = TimedProgress(timeout, prefix=' ')
try:
run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
chmod_file(output, cwd=out_dir)
status = 'succeeded'
except Exception as err:
status = 'failed'
output = err
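
Starting at 'pending' and setting 'succeeded' only after both run() and chmod_file() return means an exception in either step can no longer be reported as success. The guard pattern in isolation (stub functions stand in for the real calls):

    def do_download() -> None: ...      # stands in for run(cmd, ...)
    def finalize_output() -> None: ...  # stands in for chmod_file(output, ...)

    status = 'pending'
    try:
        do_download()
        finalize_output()
        status = 'succeeded'  # only reached if both steps completed
    except Exception:
        status = 'failed'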

View file

@@ -24,6 +24,7 @@ from ..config import (
SAVE_WARC,
WGET_BINARY,
WGET_VERSION,
RESTRICT_FILE_NAMES,
CHECK_SSL_VALIDITY,
SAVE_WGET_REQUISITES,
WGET_AUTO_COMPRESSION,
@@ -66,14 +67,14 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
-'--restrict-file-names=windows',
'--timeout={}'.format(timeout),
-*([] if SAVE_WARC else ['--timestamping']),
+*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
*(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
*(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
*(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
+*([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
link.url,
]
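
With the default RESTRICT_FILE_NAMES='windows' the generated command is identical to before; an empty value now omits the flag entirely, and any other wget-supported value (e.g. 'unix,ascii') passes straight through. For example:

    RESTRICT_FILE_NAMES = 'windows'
    cmd = ['wget', *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else [])]
    assert cmd == ['wget', '--restrict-file-names=windows']

    RESTRICT_FILE_NAMES = ''
    cmd = ['wget', *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else [])]
    assert cmd == ['wget']  # flag dropped entirely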

View file

@@ -325,7 +325,8 @@ def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
# Patch HTML main index
html_path = os.path.join(out_dir, 'index.html')
with open(html_path, 'r') as f:
-html = f.read().split('\n')
+html = f.read().splitlines()
for idx, line in enumerate(html):
if title and ('<span data-title-for="{}"'.format(link.url) in line):
html[idx] = '<span>{}</span>'.format(title)
@@ -333,7 +334,7 @@ def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
html[idx] = '<span>{}</span>'.format(successful)
break
-atomic_write('\n'.join(html), html_path)
+atomic_write(html_path, '\n'.join(html))
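
Unlike split('\n'), splitlines() drops the empty trailing element produced by the file's final newline, so repeatedly patching and rejoining the index no longer accumulates blank lines. Compare:

    text = 'line1\nline2\n'
    assert text.split('\n') == ['line1', 'line2', '']  # trailing empty entry
    assert text.splitlines() == ['line1', 'line2']     # clean list of lines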
### Link Details Index

View file

@@ -41,7 +41,7 @@ TITLE_LOADING_MSG = 'Not yet archived...'
def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
"""parse an archive index html file and return the list of urls"""
-index_path = os.path.join(out_dir, HTML_INDEX_FILENAME)
+index_path = join(out_dir, HTML_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
for line in f:
@@ -58,7 +58,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished:
copy_and_overwrite(join(TEMPLATES_DIR, STATIC_DIR_NAME), join(out_dir, STATIC_DIR_NAME))
rendered_html = main_index_template(links, finished=finished)
-atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
+atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)
@enforce_types
@@ -116,7 +116,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
rendered_html = link_details_template(link)
-atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
+atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)
@enforce_types

View file

@@ -74,7 +74,7 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
'last_run_cmd': sys.argv,
'links': links,
}
-atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
+atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json)
### Link Details Index
@@ -86,7 +86,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
path = os.path.join(out_dir, JSON_INDEX_FILENAME)
-atomic_write(link._asdict(extended=True), path)
+atomic_write(path, link._asdict(extended=True))
@enforce_types

View file

@@ -13,7 +13,6 @@ import os
from typing import Tuple, List
from datetime import datetime
-from ..index.schema import Link
from ..system import atomic_write
from ..config import (
ANSI,
@@ -29,6 +28,7 @@ from ..util import (
enforce_types,
URL_REGEX,
)
+from ..index.schema import Link
from ..cli.logging import pretty_path, TimedProgress
from .pocket_html import parse_pocket_html_export
from .pinboard_rss import parse_pinboard_rss_export
@@ -93,8 +93,7 @@ def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
-atomic_write(raw_text, source_path)
+atomic_write(source_path, raw_text)
return source_path
@@ -112,6 +111,7 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
# Source is a URL that needs to be downloaded
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
print('{}[*] [{}] Downloading {}{}'.format(
ANSI['green'],
@@ -134,10 +134,11 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI
raise SystemExit(1)
else:
# Source is a path to a local file on the filesystem
with open(path, 'r') as f:
raw_source_text = f.read()
-atomic_write(raw_source_text, source_path)
+atomic_write(source_path, raw_source_text)
print(' > {}'.format(pretty_path(source_path)))
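
Every input ends up as a timestamped snapshot under sources/, labeled by origin: 'stdin' for piped text, the URL's domain for downloaded sources, the file's basename for local ones. The naming scheme in isolation:

    import os
    from datetime import datetime

    ts = str(datetime.now().timestamp()).split('.', 1)[0]  # whole-second timestamp
    # e.g. sources/stdin-1593134400.txt or sources/example.com-1593134400.txt
    source_path = os.path.join('sources', '{}-{}.txt'.format('stdin', ts))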

View file

@@ -8,6 +8,7 @@ import json as pyjson
from typing import Optional, Union, Set, Tuple
from crontab import CronTab
from atomicwrites import atomic_write as awrite
from subprocess import (
Popen,
@@ -22,10 +23,10 @@ from .util import enforce_types, ExtendedEncoder
from .config import OUTPUT_PERMISSIONS
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched version of subprocess.run to fix blocking IO making timeout ineffective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
@@ -59,30 +60,14 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False,
return CompletedProcess(process.args, retcode, stdout, stderr)
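
The wrapper exists so a child process that fills its pipe buffers and hangs still honors timeout. Typical call, as the extractors use it (import path assumed to be archivebox.system):

    from subprocess import PIPE
    from archivebox.system import run  # assumed import path for the wrapper above

    result = run(['curl', '--silent', 'https://example.com'],
                 stdout=PIPE, stderr=PIPE, timeout=15)
    print(result.returncode, len(result.stdout))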
-def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
+def atomic_write(path: str, contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
    """Safe atomic write to filesystem by writing to temp file + atomic rename"""
-    try:
-        tmp_file = '{}.tmp'.format(path)
-        if isinstance(contents, bytes):
-            args = {'mode': 'wb+'}
-        else:
-            args = {'mode': 'w+', 'encoding': 'utf-8'}
-        with open(tmp_file, **args) as f:
-            if isinstance(contents, dict):
-                pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
-            else:
-                f.write(contents)
-            os.fsync(f.fileno())
-        os.rename(tmp_file, path)
-        chmod_file(path)
-    finally:
-        if os.path.exists(tmp_file):
-            os.remove(tmp_file)
+    with awrite(path, overwrite=overwrite) as f:
+        if isinstance(contents, dict):
+            pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
+        else:
+            f.write(contents)
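
The hand-rolled tmp-file/fsync/rename/cleanup sequence is delegated to the atomicwrites package (imported above as awrite), and callers now pass the destination path first. Usage under the new signature (import path assumed):

    from archivebox.system import atomic_write

    atomic_write('index.json', {'version': '0.4.3'})  # dicts serialized via ExtendedEncoder
    atomic_write('index.html', '<html>...</html>')    # str contents written as-is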
@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
@@ -105,7 +90,8 @@ def copy_and_overwrite(from_path: str, to_path: str):
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
-atomic_write(src.read(), to_path)
+contents = src.read()
+atomic_write(to_path, contents)
@enforce_types

View file

@@ -6,6 +6,37 @@
<title>Archived Sites</title>
<meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
<style>
:root {
--bg-main: #efefef;
--accent-1: #aa1e55;
--accent-2: #ffebeb;
--accent-3: #efefef;
--text-1: #1c1c1c;
--text-2: #eaeaea;
--text-main: #1a1a1a;
--font-main: "Gill Sans", Helvetica, sans-serif;
}
/* Dark Mode (WIP) */
/*
@media (prefers-color-scheme: dark) {
:root {
--accent-2: hsl(160, 100%, 96%);
--text-1: #eaeaea;
--text-2: #1a1a1a;
--bg-main: #101010;
}
#table-bookmarks_wrapper,
#table-bookmarks_wrapper img,
tbody td:nth-child(3),
tbody td:nth-child(3) span,
footer {
filter: invert(100%);
}
}*/
html, body {
width: 100%;
height: 100%;
@@ -14,11 +45,12 @@
text-align: center;
margin: 0px;
padding: 0px;
-font-family: "Gill Sans", Helvetica, sans-serif;
+font-family: var(--font-main);
}
.header-top small {
font-weight: 200;
-color: #efefef;
+color: var(--accent-3);
}
.header-top {
@@ -31,8 +63,8 @@
font-size: calc(11px + 0.84vw);
font-weight: 200;
padding: 4px 4px;
-border-bottom: 3px solid #aa1e55;
-background-color: #aa1e55;
+border-bottom: 3px solid var(--accent-1);
+background-color: var(--accent-1);
}
input[type=search] {
width: 22vw;
@@ -86,7 +118,7 @@
height: 35px;
}
tbody tr:nth-child(odd) {
-background-color: #ffebeb !important;
+background-color: var(--accent-2) !important;
}
table tr td {
white-space: nowrap;
@@ -146,7 +178,7 @@
color:black;
}
tr td a.title small {
-background-color: #efefef;
+background-color: var(--accent-3);
border-radius: 4px;
float:right
}

View file

@@ -4,6 +4,34 @@
<title>Archived Sites</title>
<meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
<style>
:root {
--accent-1: #aa1e55;
--accent-2: #ffebeb;
--accent-3: #efefef;
--bg-main: #efefef;
--text-main: black;
--text-1: #1a1a1a;
--text-2: #eaeaea;
}
@media (prefers-color-scheme: dark) {
:root {
--accent-2: hsl(160, 100%, 96%);
--text-1: #eaeaea;
--text-2: #1a1a1a;
--bg-main: #101010;
}
#table-bookmarks_wrapper,
#table-bookmarks_wrapper img,
tbody td:nth-child(3),
tbody td:nth-child(3) span,
footer {
filter: invert(100%);
}
}
html, body {
width: 100%;
height: 100%;
@@ -13,7 +41,10 @@
margin: 0px;
padding: 0px;
font-family: "Gill Sans", Helvetica, sans-serif;
background: var(--bg-main);
color: var(--text-main);
}
<<<<<<< HEAD:archivebox/themes/legacy/main_index.html
.header-top small {
font-weight: 200;
color: #efefef;
@@ -24,6 +55,33 @@
height: auto;
min-height: 40px;
margin: 0px;
=======
header {
background-color: var(--accent-1);
color: var(--text-1);
padding: 10px;
padding-top: 0px;
padding-bottom: 15px;
/*height: 40px;*/
}
header h1 {
margin: 7px 0px;
font-size: 35px;
font-weight: 300;
color: var(--text-1);
}
header h1 img {
height: 44px;
vertical-align: bottom;
}
header a {
text-decoration: none !important;
color: var(--text-1);
}
.header-center {
margin: auto;
float: none;
>>>>>>> master:archivebox/templates/index.html
text-align: center;
color: white;
font-size: calc(11px + 0.84vw);
@@ -32,11 +90,17 @@
border-bottom: 3px solid #aa1e55;
background-color: #aa1e55;
}
<<<<<<< HEAD:archivebox/themes/legacy/main_index.html
input[type=search] {
width: 22vw;
border-radius: 4px;
border: 1px solid #aeaeae;
padding: 3px 5px;
=======
.header-center small {
color: var(--text-2);
opacity: 0.7;
>>>>>>> master:archivebox/templates/index.html
}
.nav > div {
min-height: 30px;
@@ -45,9 +109,14 @@
text-decoration: none;
color: rgba(0,0,0,0.6);
}
<<<<<<< HEAD:archivebox/themes/legacy/main_index.html
.header-top a:hover {
text-decoration: none;
color: rgba(0,0,0,0.9);
=======
header + div {
padding-top: 10px;
>>>>>>> master:archivebox/templates/index.html
}
.header-top .col-lg-4 {
text-align: center;
@@ -84,7 +153,7 @@
height: 35px;
}
tbody tr:nth-child(odd) {
-background-color: #ffebeb !important;
+background-color: var(--accent-2) !important;
}
table tr td {
white-space: nowrap;
@@ -144,7 +213,7 @@
color:black;
}
tr td a.title small {
-background-color: #efefef;
+background-color: var(--accent-3);
border-radius: 4px;
float:right
}

View file

@@ -1,5 +1,6 @@
import re
import ssl
import json as pyjson
from typing import List, Optional, Any
@@ -12,8 +13,7 @@ from html import escape, unescape
from datetime import datetime
from dateutil import parser as dateparser
-import json as pyjson
-from base32_crockford import encode as base32_encode # type: ignore
+from base32_crockford import encode as base32_encode # type: ignore
from .config import (
TIMEOUT,
@@ -23,6 +23,12 @@ from .config import (
CHROME_OPTIONS,
)
try:
import chardet
detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
except ImportError:
detect_encoding = lambda rawdata: "utf-8"
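
The lambda pair gives download_url a best-effort charset sniffer when chardet is installed and a plain utf-8 fallback when it is not. A slightly defensive variant, guarding against chardet returning None for short or ambiguous input (which the version above would pass straight to decode()):

    try:
        import chardet
        detect_encoding = lambda rawdata: chardet.detect(rawdata)['encoding'] or 'utf-8'
    except ImportError:
        detect_encoding = lambda rawdata: 'utf-8'

    # A Shift-JIS page served without a charset header now decodes correctly
    # instead of raising UnicodeDecodeError under the old hardcoded utf-8.
    rawdata = ('日本語のテキスト。' * 30).encode('shift_jis')
    print(detect_encoding(rawdata))  # e.g. 'SHIFT_JIS'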
### Parsing Helpers
# All of these are (str) -> str
@@ -158,8 +164,9 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
insecure = ssl._create_unverified_context()
resp = urlopen(req, timeout=timeout, context=insecure)
-encoding = resp.headers.get_content_charset() or 'utf-8' # type: ignore
-return resp.read().decode(encoding)
+rawdata = resp.read()
+encoding = resp.headers.get_content_charset() or detect_encoding(rawdata)
+return rawdata.decode(encoding)
@enforce_types