From abde871a3c93d1aee74aab92b4f5557df3c6c35d Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 25 Sep 2020 08:13:28 -0500 Subject: [PATCH 01/29] fix: Wget absolute path generating issues --- archivebox/extractors/wget.py | 2 +- archivebox/themes/legacy/main_index_row.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index dac0bdd3..da88dc5f 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -179,7 +179,7 @@ def wget_output_path(link: Link) -> Optional[str]: if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M) ] if html_files: - return str(html_files[0]) + return str(html_files[0].relative_to(link.link_dir)) # Move up one directory level search_dir = search_dir.parent diff --git a/archivebox/themes/legacy/main_index_row.html b/archivebox/themes/legacy/main_index_row.html index 03a36af6..a9037f83 100644 --- a/archivebox/themes/legacy/main_index_row.html +++ b/archivebox/themes/legacy/main_index_row.html @@ -2,7 +2,7 @@ $bookmarked_date - + $title $tags From 5975c27a6a685db8274645106a670850c37b6ec8 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 25 Sep 2020 13:48:19 -0500 Subject: [PATCH 02/29] fix: Remove trailing slash from public index --- archivebox/core/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 902eef01..1c24fe4d 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -14,11 +14,11 @@ def get_icons(snapshot: Snapshot) -> str: return format_html( '' - '🌐 ' + '🌐 ' '📄 ' '🖥 ' '🅷 ' - '🆆 ' + '🆆 ' '🗜 ' '📼 ' '📦 ' From 73418836f85ed257bd002df8289dcac1b9e9ff7c Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:05:39 -0500 Subject: [PATCH 03/29] Replaced os.path in server.py --- tests/mock_server/server.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py index 9e5bea05..4283574f 100644 --- a/tests/mock_server/server.py +++ b/tests/mock_server/server.py @@ -1,4 +1,3 @@ -from os.path import abspath from os import getcwd from pathlib import Path @@ -10,20 +9,20 @@ def index(): @route("/static/") def static_path(filename): - template_path = abspath(getcwd()) / Path("tests/mock_server/templates") + template_path = Path.cwd().resolve() / "tests/mock_server/templates" response = static_file(filename, root=template_path) return response @route("/static_no_content_type/") def static_no_content_type(filename): - template_path = abspath(getcwd()) / Path("tests/mock_server/templates") + template_path = Path.cwd().resolve() / "tests/mock_server/templates" response = static_file(filename, root=template_path) response.set_header("Content-Type", "") return response @route("/static/headers/") def static_path_with_headers(filename): - template_path = abspath(getcwd()) / Path("tests/mock_server/templates") + template_path = Path.cwd().resolve() / "tests/mock_server/templates" response = static_file(filename, root=template_path) response.add_header("Content-Language", "en") response.add_header("Content-Script-Type", "text/javascript") @@ -32,7 +31,7 @@ def static_path_with_headers(filename): @route("/static/400/", method="HEAD") def static_400(filename): - template_path = abspath(getcwd()) / Path("tests/mock_server/templates") + template_path = Path.cwd().resolve() / "tests/mock_server/templates" response = static_file(filename, root=template_path) response.status = 400 response.add_header("Status-Code", "400") @@ -40,7 +39,7 @@ def static_400(filename): @route("/static/400/", method="GET") def static_200(filename): - template_path = abspath(getcwd()) / Path("tests/mock_server/templates") + template_path = Path.cwd().resolve() / "tests/mock_server/templates" response = static_file(filename, root=template_path) response.add_header("Status-Code", "200") return response From 2c62abb2705150831df55b66eaea882e28af4b1d Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:09:34 -0500 Subject: [PATCH 04/29] Replaced os.path in init parsers --- archivebox/parsers/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index 520b9609..5d0d5ca5 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -8,7 +8,6 @@ For examples of supported import formats see tests/. __package__ = 'archivebox.parsers' import re -import os from io import StringIO from typing import IO, Tuple, List, Optional @@ -128,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) @enforce_types def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str: ts = str(datetime.now().timestamp()).split('.', 1)[0] - source_path = os.path.join(out_dir, SOURCES_DIR_NAME, filename.format(ts=ts)) + source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts)) atomic_write(source_path, raw_text) log_source_saved(source_file=source_path) return source_path @@ -138,7 +137,7 @@ def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: Path=OUTPUT_DIR) -> str: """download a given url's content into output/sources/domain-.txt""" ts = str(datetime.now().timestamp()).split('.', 1)[0] - source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts)) + source_path = str(OUTPUT_DIR / SOURCES_DIR_NAME / filename.format(basename=basename(path), ts=ts)) if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')): # Source is a URL that needs to be downloaded From 7d513b9b19a995a456fe1dd93d17b96fc146d062 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:14:34 -0500 Subject: [PATCH 05/29] Replaced os.path in schema.py --- archivebox/index/schema.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 7508890d..dd8fe7cd 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -1,6 +1,5 @@ __package__ = 'archivebox.index' -import os from pathlib import Path from datetime import datetime, timedelta @@ -250,7 +249,7 @@ class Link: @property def link_dir(self) -> str: from ..config import CONFIG - return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp) + return Path(CONFIG['ARCHIVE_DIR']) / self.timestamp @property def archive_path(self) -> str: @@ -369,7 +368,7 @@ class Link: ) return any( - os.path.exists(os.path.join(ARCHIVE_DIR, self.timestamp, path)) + (Path(ARCHIVE_DIR) / self.timestamp / path).exists() for path in output_paths ) From 9264ad88e00721482048f7a68a40e9fe8184d519 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 15:30:06 -0500 Subject: [PATCH 06/29] Fixed string casting --- archivebox/index/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index dd8fe7cd..68d840a2 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -249,7 +249,7 @@ class Link: @property def link_dir(self) -> str: from ..config import CONFIG - return Path(CONFIG['ARCHIVE_DIR']) / self.timestamp + return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp) @property def archive_path(self) -> str: From 8b03c37fbb2eeaceebd2303ec7b3b82e819b07fd Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:21:41 -0500 Subject: [PATCH 07/29] Replaced os.path in json.py --- archivebox/index/json.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/archivebox/index/json.py b/archivebox/index/json.py index 76e6ec80..36c5ccdb 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -45,8 +45,8 @@ MAIN_INDEX_HEADER = { def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: """parse an archive index json file and return the list of links""" - index_path = os.path.join(out_dir, JSON_INDEX_FILENAME) - if os.path.exists(index_path): + index_path = Path(out_dir) / JSON_INDEX_FILENAME + if index_path.exists(): with open(index_path, 'r', encoding='utf-8') as f: links = pyjson.load(f)['links'] for link_json in links: @@ -86,7 +86,7 @@ def write_json_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: 'last_run_cmd': sys.argv, 'links': links, } - atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json) + atomic_write(str(Path(out_dir) / JSON_INDEX_FILENAME), main_index_json) ### Link Details Index @@ -96,15 +96,15 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None: """write a json file with some info about the link""" out_dir = out_dir or link.link_dir - path = os.path.join(out_dir, JSON_INDEX_FILENAME) - atomic_write(path, link._asdict(extended=True)) + path = Path(out_dir) / JSON_INDEX_FILENAME + atomic_write(str(path), link._asdict(extended=True)) @enforce_types def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]: """load the json link index from a given directory""" - existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME) - if os.path.exists(existing_index): + existing_index = Path(out_dir) / JSON_INDEX_FILENAME + if existing_index.exists(): with open(existing_index, 'r', encoding='utf-8') as f: try: link_json = pyjson.load(f) @@ -118,9 +118,9 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]: """read through all the archive data folders and return the parsed links""" - for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)): + for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME): if entry.is_dir(follow_symlinks=True): - if os.path.exists(os.path.join(entry.path, 'index.json')): + if (Path(entry.path) / 'index.json').exists(): try: link = parse_json_link_details(entry.path) except KeyError: From 78f706276113016c8591b8681ce1226b707927ee Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:30:53 -0500 Subject: [PATCH 08/29] Replaced os.path in html.py --- archivebox/index/html.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index a46611d6..793a60af 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -1,7 +1,5 @@ __package__ = 'archivebox.index' -import os - from string import Template from datetime import datetime from typing import List, Optional, Iterator, Mapping @@ -30,11 +28,10 @@ from ..config import ( FAVICON_FILENAME, ) -join = lambda *paths: os.path.join(*paths) -MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html') -MINIMAL_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index_minimal.html') -MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html') -LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html') +MAIN_INDEX_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index.html') +MINIMAL_INDEX_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index_minimal.html') +MAIN_INDEX_ROW_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index_row.html') +LINK_DETAILS_TEMPLATE = str(Path(TEMPLATES_DIR) / 'link_details.html') TITLE_LOADING_MSG = 'Not yet archived...' @@ -44,8 +41,8 @@ TITLE_LOADING_MSG = 'Not yet archived...' def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]: """parse an archive index html file and return the list of urls""" - index_path = join(out_dir, HTML_INDEX_FILENAME) - if os.path.exists(index_path): + index_path = Path(out_dir) / HTML_INDEX_FILENAME + if index_path.exists(): with open(index_path, 'r', encoding='utf-8') as f: for line in f: if 'class="link-url"' in line: @@ -56,12 +53,12 @@ def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]: def write_html_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None: """write the html link index to a given path""" - copy_and_overwrite(join(TEMPLATES_DIR, FAVICON_FILENAME), join(out_dir, FAVICON_FILENAME)) - copy_and_overwrite(join(TEMPLATES_DIR, ROBOTS_TXT_FILENAME), join(out_dir, ROBOTS_TXT_FILENAME)) - copy_and_overwrite(join(TEMPLATES_DIR, STATIC_DIR_NAME), join(out_dir, STATIC_DIR_NAME)) + copy_and_overwrite(str(Path(TEMPLATES_DIR) / FAVICON_FILENAME), str(out_dir / FAVICON_FILENAME)) + copy_and_overwrite(str(Path(TEMPLATES_DIR) / ROBOTS_TXT_FILENAME), str(out_dir / ROBOTS_TXT_FILENAME)) + copy_and_overwrite(str(Path(TEMPLATES_DIR) / STATIC_DIR_NAME), str(out_dir / STATIC_DIR_NAME)) rendered_html = main_index_template(links, finished=finished) - atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html) + atomic_write(str(out_dir / HTML_INDEX_FILENAME), rendered_html) @enforce_types @@ -100,7 +97,7 @@ def main_index_row_template(link: Link) -> str: # before pages are finished archiving, show fallback loading favicon 'favicon_url': ( - join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico') + str(Path(ARCHIVE_DIR_NAME) / link.timestamp / 'favicon.ico') # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs=' ), @@ -119,7 +116,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: out_dir = out_dir or link.link_dir rendered_html = link_details_template(link) - atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html) + atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html) @enforce_types From ad04fb530049a8428013e96f2ec3dbbe0ef8f669 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:32:26 -0500 Subject: [PATCH 09/29] Replaced os.path in init index --- archivebox/index/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 06832dbc..a496e03c 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -575,7 +575,7 @@ def is_archived(link: Link) -> bool: return is_valid(link) and link.is_archived def is_unarchived(link: Link) -> bool: - if not os.path.exists(link.link_dir): + if not Path(link.link_dir).exists(): return True return not link.is_archived From 3fb410a604e13dec5f88c0b89f0ffc696aed29b5 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:42:57 -0500 Subject: [PATCH 10/29] Replaced os.path in favicon.py --- archivebox/extractors/favicon.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index fe8895a5..86d2c506 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -1,6 +1,5 @@ __package__ = 'archivebox.extractors' -import os from pathlib import Path from typing import Optional @@ -22,7 +21,7 @@ from ..logging_util import TimedProgress @enforce_types def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool: out_dir = out_dir or link.link_dir - if os.path.exists(os.path.join(out_dir, 'favicon.ico')): + if (Path(out_dir) / 'favicon.ico').exists(): return False return SAVE_FAVICON From fa364ed728b3f6f458938a2abfd3d3a36b763299 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:43:14 -0500 Subject: [PATCH 11/29] Replaced od.path in init cli --- archivebox/cli/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index aa26715b..83055e8e 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -6,12 +6,13 @@ import sys import argparse from typing import Optional, Dict, List, IO +from pathlib import Path from ..config import OUTPUT_DIR from importlib import import_module -CLI_DIR = os.path.dirname(os.path.abspath(__file__)) +CLI_DIR = Path(__file__).resolve().parent # these common commands will appear sorted before any others for ease-of-use meta_cmds = ('help', 'version') From ce71747538f6703d57d657f639ea80fc12ed347d Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:43:35 -0500 Subject: [PATCH 12/29] replaced os.path in init extractors --- archivebox/extractors/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 53a77941..60f20adf 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -75,7 +75,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s out_dir = out_dir or Path(link.link_dir) try: - is_new = not os.path.exists(out_dir) + is_new = not Path(out_dir).exists() if is_new: os.makedirs(out_dir) From 0e7c337dcb5d0c138c783033512fe3cde44f1b5f Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:54:32 -0500 Subject: [PATCH 13/29] Replaced os.path in settings.py --- archivebox/core/settings.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 14b3b369..5cb15cdb 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -2,6 +2,7 @@ __package__ = 'archivebox.core' import os import sys +from pathlib import Path from django.utils.crypto import get_random_string @@ -49,9 +50,9 @@ TEMPLATES = [ { 'BACKEND': 'django.template.backends.django.DjangoTemplates', 'DIRS': [ - os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME), - os.path.join(PYTHON_DIR, 'themes', 'default'), - os.path.join(PYTHON_DIR, 'themes'), + Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME, + Path(PYTHON_DIR) / 'themes' / 'default', + Path(PYTHON_DIR) / 'themes', ], 'APP_DIRS': True, 'OPTIONS': { @@ -70,7 +71,7 @@ WSGI_APPLICATION = 'core.wsgi.application' DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', - 'NAME': os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME), + 'NAME': Path(OUTPUT_DIR) / SQL_INDEX_FILENAME, } } @@ -105,7 +106,7 @@ SHELL_PLUS_PRINT_SQL = False IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner'] IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell' if IS_SHELL: - os.environ['PYTHONSTARTUP'] = os.path.join(PYTHON_DIR, 'core', 'welcome_message.py') + os.environ['PYTHONSTARTUP'] = Path(PYTHON_DIR) / 'core' / 'welcome_message.py' LANGUAGE_CODE = 'en-us' @@ -122,6 +123,6 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' STATIC_URL = '/static/' STATICFILES_DIRS = [ - os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME, 'static'), - os.path.join(PYTHON_DIR, 'themes', 'default', 'static'), + Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME / 'static', + Path(PYTHON_DIR) / 'themes' / 'default' / 'static', ] From 897bace84dec33b71aed48c576b2a62fb58ed8e3 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Thu, 1 Oct 2020 12:07:57 -0500 Subject: [PATCH 14/29] Fixed paths in settings --- archivebox/core/settings.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 5cb15cdb..44065de4 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -50,9 +50,9 @@ TEMPLATES = [ { 'BACKEND': 'django.template.backends.django.DjangoTemplates', 'DIRS': [ - Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME, - Path(PYTHON_DIR) / 'themes' / 'default', - Path(PYTHON_DIR) / 'themes', + str(Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME), + str(Path(PYTHON_DIR) / 'themes' / 'default'), + str(Path(PYTHON_DIR) / 'themes'), ], 'APP_DIRS': True, 'OPTIONS': { @@ -71,7 +71,7 @@ WSGI_APPLICATION = 'core.wsgi.application' DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', - 'NAME': Path(OUTPUT_DIR) / SQL_INDEX_FILENAME, + 'NAME': str(Path(OUTPUT_DIR) / SQL_INDEX_FILENAME), } } @@ -106,7 +106,7 @@ SHELL_PLUS_PRINT_SQL = False IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner'] IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell' if IS_SHELL: - os.environ['PYTHONSTARTUP'] = Path(PYTHON_DIR) / 'core' / 'welcome_message.py' + os.environ['PYTHONSTARTUP'] = str(Path(PYTHON_DIR) / 'core' / 'welcome_message.py') LANGUAGE_CODE = 'en-us' @@ -123,6 +123,6 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' STATIC_URL = '/static/' STATICFILES_DIRS = [ - Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME / 'static', - Path(PYTHON_DIR) / 'themes' / 'default' / 'static', + str(Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME / 'static'), + str(Path(PYTHON_DIR) / 'themes' / 'default' / 'static'), ] From 16b5ca32077d6dc49aa182952c0017d3bb3c50ea Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 14:54:51 -0500 Subject: [PATCH 15/29] Replaced os.path in init config --- archivebox/config/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 4cd78609..3d7e3730 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -431,7 +431,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: with open(f'{config_path}.bak', 'r') as old: atomic_write(config_path, old.read()) - if os.path.exists(f'{config_path}.bak'): + if Path(f'{config_path}.bak').exists(): os.remove(f'{config_path}.bak') return {} @@ -540,7 +540,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]: if node_modules_bin.exists(): return str(node_modules_bin.resolve()) - return shutil.which(os.path.expanduser(binary)) or binary + return shutil.which(Path(binary).expanduser()) or binary def bin_hash(binary: Optional[str]) -> Optional[str]: if binary is None: @@ -634,17 +634,17 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: } def get_external_locations(config: ConfigDict) -> ConfigValue: - abspath = lambda path: None if path is None else os.path.abspath(path) + abspath = lambda path: None if path is None else Path(path).resolve() return { 'CHROME_USER_DATA_DIR': { 'path': abspath(config['CHROME_USER_DATA_DIR']), 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'], - 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')), + 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(), }, 'COOKIES_FILE': { 'path': abspath(config['COOKIES_FILE']), 'enabled': config['USE_WGET'] and config['COOKIES_FILE'], - 'is_valid': False if config['COOKIES_FILE'] is None else os.path.exists(config['COOKIES_FILE']), + 'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(), }, } @@ -828,7 +828,7 @@ def check_system_config(config: ConfigDict=CONFIG) -> None: # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR))) if config['CHROME_USER_DATA_DIR'] is not None: - if not os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')): + if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(): stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red') stderr(f' {config["CHROME_USER_DATA_DIR"]}') stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.') From 25ac18c8b7cb28631f777fc9ada2da9907dc1e8a Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 15:04:46 -0500 Subject: [PATCH 16/29] Replaced os.path in system.py --- archivebox/system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/system.py b/archivebox/system.py index f7d1d41c..e07c69c7 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -64,7 +64,7 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS) -> @enforce_types def copy_and_overwrite(from_path: str, to_path: str): """copy a given file or directory to a given path, overwriting the destination""" - if os.path.isdir(from_path): + if Path(from_path).is_dir(): shutil.rmtree(to_path, ignore_errors=True) shutil.copytree(from_path, to_path) else: From 01461a98a77dcf6c4c83c3f25c892ccf403e8529 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 15:05:03 -0500 Subject: [PATCH 17/29] Replaced os.path in logging_util.py --- archivebox/logging_util.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 089d49ab..eef0c30e 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -390,7 +390,7 @@ def log_list_finished(links): def log_removal_started(links: List["Link"], yes: bool, delete: bool): print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI)) if delete: - file_counts = [link.num_outputs for link in links if os.path.exists(link.link_dir)] + file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()] print( f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)' @@ -445,9 +445,9 @@ def log_shell_welcome_msg(): @enforce_types def pretty_path(path: Union[Path, str]) -> str: """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" - pwd = os.path.abspath('.') + pwd = Path('.').resolve() # parent = os.path.abspath(os.path.join(pwd, os.path.pardir)) - return str(path).replace(pwd + '/', './') + return str(path).replace(str(pwd) + '/', './') @enforce_types @@ -518,11 +518,11 @@ def printable_folder_status(name: str, folder: Dict) -> str: color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-' if folder['path']: - if os.path.exists(folder['path']): + if Path(folder['path']).exists(): num_files = ( f'{len(os.listdir(folder["path"]))} files' - if os.path.isdir(folder['path']) else - printable_filesize(os.path.getsize(folder['path'])) + if Path(folder['path']).is_dir() else + printable_filesize(Path(folder['path']).stat().st_size) ) else: num_files = 'missing' From 3e26ab3ce3695452141d99671dd0e5865ae9a096 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Wed, 30 Sep 2020 15:35:51 -0500 Subject: [PATCH 18/29] Replaced os.path in clic tests --- archivebox/cli/tests.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py index 1f44784d..4d7016aa 100755 --- a/archivebox/cli/tests.py +++ b/archivebox/cli/tests.py @@ -7,6 +7,7 @@ import os import sys import shutil import unittest +from pathlib import Path from contextlib import contextmanager @@ -109,13 +110,13 @@ class TestInit(unittest.TestCase): with output_hidden(): archivebox_init.main([]) - assert os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)) - assert os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME)) - assert os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)) + assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() + assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() + assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0 def test_conflicting_init(self): - with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f: + with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+') as f: f.write('test') try: @@ -125,9 +126,9 @@ class TestInit(unittest.TestCase): except SystemExit: pass - assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)) - assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME)) - assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)) + assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() + assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() + assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() try: load_main_index(out_dir=OUTPUT_DIR) assert False, 'load_main_index should raise an exception when no index is present' @@ -159,7 +160,7 @@ class TestAdd(unittest.TestCase): assert len(all_links) == 30 def test_add_arg_file(self): - test_file = os.path.join(OUTPUT_DIR, 'test.txt') + test_file = Path(OUTPUT_DIR) / 'test.txt' with open(test_file, 'w+') as f: f.write(test_urls) From 8d3295458c2ffaf0e2fa01279f0e5e14930f9e6e Mon Sep 17 00:00:00 2001 From: Adam Wolf Date: Sat, 3 Oct 2020 14:57:55 -0500 Subject: [PATCH 19/29] Add a bookmarklet The bookmarklet lets you quickly open the Add page with the URL already populated in the URLs box. --- archivebox/core/views.py | 11 +++++++++++ archivebox/themes/default/add_links.html | 6 ++++++ 2 files changed, 17 insertions(+) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 4144b2db..7cd8b104 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -114,12 +114,23 @@ class AddView(UserPassesTestMixin, FormView): template_name = "add_links.html" form_class = AddLinkForm + def get_initial(self): + """Prefill the AddLinkForm with the 'url' GET parameter""" + if self.request.method == 'GET': + url = self.request.GET.get('url', None) + if url: + return {'url': url} + else: + return super().get_initial() + def test_func(self): return PUBLIC_ADD_VIEW or self.request.user.is_authenticated def get_context_data(self, *args, **kwargs): context = super().get_context_data(*args, **kwargs) context["title"] = "Add URLs" + # We can't just call request.build_absolute_uri in the template, because it would include query parameters + context["absolute_add_path"] = self.request.build_absolute_uri(self.request.path) return context def form_valid(self, form): diff --git a/archivebox/themes/default/add_links.html b/archivebox/themes/default/add_links.html index cb6f4341..0b384f5c 100644 --- a/archivebox/themes/default/add_links.html +++ b/archivebox/themes/default/add_links.html @@ -49,6 +49,12 @@ (it's safe to leave this page, adding will continue in the background) + {% if absolute_add_path %} +
+

Bookmark this link to quickly add to your archive: + Add to ArchiveBox

+
+ {% endif %}