From d9fd1e38111b91090b10f1bf73e5b67f7151fffe Mon Sep 17 00:00:00 2001 From: BlipRanger Date: Thu, 10 Dec 2020 10:51:57 -0500 Subject: [PATCH 01/93] Add selector for archive modes --- archivebox/core/forms.py | 20 +++++++++++++++++++- archivebox/core/views.py | 5 +++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 8f48929b..4905464d 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -10,10 +10,28 @@ CHOICES = ( ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'), ) +ARCHIVE_METHODS = [ + ('title', 'title'), + ('favicon', 'favicon'), + ('wget', 'wget'), + ('warc', 'warc'), + ('pdf', 'pdf'), + ('screenshot', 'screenshot'), + ('dom', 'dom'), + ('singlefile', 'singlefile'), + ('git', 'git'), + ('media', 'media'), + ('archive_org', 'archive_org'), +] + + class AddLinkForm(forms.Form): url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True) depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0') - + archiveMethods = forms.MultipleChoiceField( + required=False, + widget=forms.SelectMultiple, + choices=ARCHIVE_METHODS,) class TagWidgetMixin: def format_value(self, value): diff --git a/archivebox/core/views.py b/archivebox/core/views.py index dfea7700..5faf3a29 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -138,12 +138,17 @@ class AddView(UserPassesTestMixin, FormView): url = form.cleaned_data["url"] print(f'[+] Adding URL: {url}') depth = 0 if form.cleaned_data["depth"] == "0" else 1 + extractors = "" + for extractor in form.cleaned_data["archiveMethods"]: + extractors = extractors + extractor + ',' input_kwargs = { "urls": url, "depth": depth, "update_all": False, "out_dir": OUTPUT_DIR, } + if extractors: + input_kwargs.append("extractors": extractors) add_stdout = StringIO() with redirect_stdout(add_stdout): 
add(**input_kwargs) From 8b0ff2dfee1a6549a7275fe84fbffb2f60ed5fb8 Mon Sep 17 00:00:00 2001 From: BlipRanger Date: Thu, 10 Dec 2020 11:08:27 -0500 Subject: [PATCH 02/93] update instead of append --- archivebox/core/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 5faf3a29..a195ea24 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -148,7 +148,7 @@ class AddView(UserPassesTestMixin, FormView): "out_dir": OUTPUT_DIR, } if extractors: - input_kwargs.append("extractors": extractors) + input_kwargs.update({"extractors": extractors}) add_stdout = StringIO() with redirect_stdout(add_stdout): add(**input_kwargs) From 7ce1f631830bc114823191379486ee37bd6f45ee Mon Sep 17 00:00:00 2001 From: BlipRanger Date: Thu, 10 Dec 2020 12:44:38 -0500 Subject: [PATCH 03/93] Update archivebox/core/forms.py Format cleanup Co-authored-by: Nick Sweeting --- archivebox/core/forms.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 4905464d..14893d96 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -28,11 +28,11 @@ ARCHIVE_METHODS = [ class AddLinkForm(forms.Form): url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True) depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0') - archiveMethods = forms.MultipleChoiceField( - required=False, - widget=forms.SelectMultiple, - choices=ARCHIVE_METHODS,) - + archive_methods = forms.MultipleChoiceField( + required=False, + widget=forms.SelectMultiple, + choices=ARCHIVE_METHODS, + ) class TagWidgetMixin: def format_value(self, value): if value is not None and not isinstance(value, str): From 35809eab1c09f327c7aee9c66194f4825b795181 Mon Sep 17 00:00:00 2001 From: BlipRanger Date: Thu, 10 Dec 2020 12:45:30 -0500 Subject: 
[PATCH 04/93] Update archivebox/core/views.py Cleaner handling of the archive methods input Co-authored-by: Nick Sweeting --- archivebox/core/views.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index a195ea24..a9578869 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -138,9 +138,7 @@ class AddView(UserPassesTestMixin, FormView): url = form.cleaned_data["url"] print(f'[+] Adding URL: {url}') depth = 0 if form.cleaned_data["depth"] == "0" else 1 - extractors = "" - for extractor in form.cleaned_data["archiveMethods"]: - extractors = extractors + extractor + ',' + extractors = ','.join(form.cleaned_data["archive_methods"]) input_kwargs = { "urls": url, "depth": depth, From 6f462b45d7dd6bc5a0d49a3329c592d32c610b9f Mon Sep 17 00:00:00 2001 From: BlipRanger Date: Thu, 10 Dec 2020 12:46:16 -0500 Subject: [PATCH 05/93] Update archivebox/core/forms.py Cleaner handling of the ARCHIVE_METHODS values Co-authored-by: Nick Sweeting --- archivebox/core/forms.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 14893d96..25d393ad 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -10,18 +10,11 @@ CHOICES = ( ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'), ) +from ..extractors import get_default_archive_methods + ARCHIVE_METHODS = [ - ('title', 'title'), - ('favicon', 'favicon'), - ('wget', 'wget'), - ('warc', 'warc'), - ('pdf', 'pdf'), - ('screenshot', 'screenshot'), - ('dom', 'dom'), - ('singlefile', 'singlefile'), - ('git', 'git'), - ('media', 'media'), - ('archive_org', 'archive_org'), + (name, name) + for name, _, _ in get_default_archive_methods() ] From 9fa70b3452836cafb975cb0dbb37b52a74ab68eb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 15:48:46 +0200 Subject: [PATCH 06/93] add extractors arg to oneshot command and bump 
version to v0.5.1 --- archivebox/cli/archivebox_add.py | 2 +- archivebox/cli/archivebox_oneshot.py | 8 ++++++++ archivebox/main.py | 9 +++++---- package.json | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index b4e65231..41c7554d 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -89,8 +89,8 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional index_only=command.index_only, overwrite=command.overwrite, init=command.init, - out_dir=pwd or OUTPUT_DIR, extractors=command.extract, + out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py index 2353d101..af68bac2 100644 --- a/archivebox/cli/archivebox_oneshot.py +++ b/archivebox/cli/archivebox_oneshot.py @@ -36,6 +36,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ' ~/Desktop/sites_list.csv\n' ) ) + parser.add_argument( + "--extract", + type=str, + help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \ + This does not take precedence over the configuration", + default="" + ) parser.add_argument( '--out-dir', type=str, @@ -55,6 +62,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional oneshot( url=stdin_url or url, out_dir=Path(command.out_dir).resolve(), + extractors=command.extract, ) diff --git a/archivebox/main.py b/archivebox/main.py index 6463bab6..97c13c4e 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -511,7 +511,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None: @enforce_types -def oneshot(url: str, out_dir: Path=OUTPUT_DIR): +def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR): """ Create a single URL archive folder with an index.json and index.html, and all the archive method outputs. 
You can run this to archive single pages without needing to create a whole collection with archivebox init. @@ -523,7 +523,8 @@ def oneshot(url: str, out_dir: Path=OUTPUT_DIR): color='red' ) raise SystemExit(2) - methods = ignore_methods(['title']) + + methods = extractors.split(",") if extractors else ignore_methods(['title']) archive_link(oneshot_link[0], out_dir=out_dir, methods=methods) return oneshot_link @@ -534,8 +535,8 @@ def add(urls: Union[str, List[str]], index_only: bool=False, overwrite: bool=False, init: bool=False, - out_dir: Path=OUTPUT_DIR, - extractors: str="") -> List[Link]: + extractors: str="", + out_dir: Path=OUTPUT_DIR) -> List[Link]: """Add a new URL or list of URLs to your archive""" assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' diff --git a/package.json b/package.json index 8d88a3fd..36545fb7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.5.0", + "version": "0.5.1", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT", From c084e70ea84f24a88a33512a8d9856e36c7d93a5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 16:21:09 +0200 Subject: [PATCH 07/93] fix TEMPLATES_DIR location --- archivebox/config.py | 4 ++-- archivebox/core/settings.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 32f27dfa..68d32939 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -275,7 +275,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}}, 'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent}, - 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME / 'legacy'}, + 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME}, 'OUTPUT_DIR': 
{'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()}, 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME}, @@ -682,7 +682,7 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: 'TEMPLATES_DIR': { 'path': (config['TEMPLATES_DIR']).resolve(), 'enabled': True, - 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(), + 'is_valid': (config['TEMPLATES_DIR'] / config['ACTIVE_THEME'] / 'static').exists(), }, # 'NODE_MODULES_DIR': { # 'path': , diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 43a1e153..e8ed6b16 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -12,6 +12,7 @@ from ..config import ( ALLOWED_HOSTS, PACKAGE_DIR, ACTIVE_THEME, + TEMPLATES_DIR_NAME, SQL_INDEX_FILENAME, OUTPUT_DIR, ) @@ -68,14 +69,14 @@ AUTHENTICATION_BACKENDS = [ STATIC_URL = '/static/' STATICFILES_DIRS = [ - str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME / 'static'), - str(Path(PACKAGE_DIR) / 'themes' / 'default' / 'static'), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME / 'static'), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default' / 'static'), ] TEMPLATE_DIRS = [ - str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME), - str(Path(PACKAGE_DIR) / 'themes' / 'default'), - str(Path(PACKAGE_DIR) / 'themes'), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default'), + str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME), ] TEMPLATES = [ From 6623497f187ab9cb847b875f5489403a0b9d51d5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 16:21:33 +0200 Subject: [PATCH 08/93] fix MERCURY_PATH in version output when missing --- archivebox/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 68d32939..7700a7de 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -320,7 +320,7 @@ 
DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, - 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if (c['USE_MERCURY'] and c['MERCURY_BINARY']) else None}, # mercury is unversioned + 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(bin_path(c['MERCURY_BINARY'])) else None}, # mercury is unversioned 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, @@ -595,7 +595,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]: if node_modules_bin.exists(): return str(node_modules_bin.resolve()) - return shutil.which(Path(binary).expanduser()) or binary + return shutil.which(Path(binary).expanduser()) or shutil.which(binary) or binary def bin_hash(binary: Optional[str]) -> Optional[str]: if binary is None: From 30f8d3f1917cefd10f33564a907aeb1027cd43fe Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 16:21:52 +0200 Subject: [PATCH 09/93] show python implementation name and flip verison output order for easier reading when wrapped on small screens --- archivebox/logging_util.py | 27 +++++++++++++++++---------- archivebox/main.py | 3 ++- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 7bce3313..f2b86735 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -19,6 +19,7 @@ if TYPE_CHECKING: from .util import enforce_types from .config import ( ConfigDict, + OUTPUT_DIR, PYTHON_ENCODING, ANSI, IS_TTY, @@ -514,19 +515,24 @@ def printable_folder_status(name: str, folder: Dict) -> str: else: num_files = 'missing' - if ' ' in str(folder['path']): - folder['path'] = f'"{folder["path"]}"' + path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if 
folder['path'] else '' + if path and ' ' in path: + path = f'"{path}"' + + # if path is just a plain dot, replace it back with the full path for clarity + if path == '.': + path = str(OUTPUT_DIR) return ' '.join(( ANSI[color], symbol, ANSI['reset'], - name.ljust(22), - (str(folder["path"]) or '').ljust(76), + name.ljust(21), num_files.ljust(14), ANSI[color], - note, + note.ljust(8), ANSI['reset'], + path.ljust(76), )) @@ -546,17 +552,18 @@ def printable_dependency_version(name: str, dependency: Dict) -> str: else: color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' - if ' ' in (dependency["path"] or ''): - dependency["path"] = f'"{dependency["path"]}"' + path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else '' + if path and ' ' in path: + path = f'"{path}"' return ' '.join(( ANSI[color], symbol, ANSI['reset'], - name.ljust(22), - (dependency["path"] or '').ljust(76), + name.ljust(21), version.ljust(14), ANSI[color], - note, + note.ljust(8), ANSI['reset'], + path.ljust(76), )) diff --git a/archivebox/main.py b/archivebox/main.py index 97c13c4e..6476fd7d 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -69,6 +69,7 @@ from .config import ( ANSI, IS_TTY, IN_DOCKER, + PYTHON_VERSION, USER, ARCHIVEBOX_BINARY, ONLY_NEW, @@ -218,7 +219,7 @@ def version(quiet: bool=False, else: print('ArchiveBox v{}'.format(VERSION)) p = platform.uname() - print(p.system, platform.platform(), p.machine) + print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, f'(in Docker)' if IN_DOCKER else f'(not in Docker)') print() print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) From 09fa58ea4b5a908a2a0e24ebfa3195293231fe55 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 16:22:38 +0200 Subject: [PATCH 10/93] ignore egg info in root --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 677066cf..736b884e 100644 --- a/.gitignore +++ b/.gitignore 
@@ -11,6 +11,7 @@ venv/ node_modules/ # Packaging artifacts +archivebox.egg-info archivebox-*.tar.gz build/ dist/ From e17c30ed2b4600c8e0477d9faee9e789b69be6a2 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 16:49:00 +0200 Subject: [PATCH 11/93] save test output in tests/out --- .gitignore | 1 + bin/test.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 736b884e..e29719e4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.pyc __pycache__/ .mypy_cache/ +tests/out/ # Python and Node dependencies venv/ diff --git a/bin/test.sh b/bin/test.sh index 3c472812..b33921af 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -14,4 +14,4 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" source "$DIR/.venv/bin/activate" -pytest -s +pytest -s --basetemp=tests/out From e03d17c20873829375596a5f9da7002a5cc5ab93 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 16:49:18 +0200 Subject: [PATCH 12/93] test extract flag on oneshot --- tests/test_oneshot.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/test_oneshot.py b/tests/test_oneshot.py index 8e4016da..4057a6ad 100644 --- a/tests/test_oneshot.py +++ b/tests/test_oneshot.py @@ -9,11 +9,21 @@ def test_oneshot_command_exists(tmp_path, disable_extractors_dict): def test_oneshot_command_saves_page_in_right_folder(tmp_path, disable_extractors_dict): disable_extractors_dict.update({"SAVE_DOM": "true"}) - process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], - capture_output=True, env=disable_extractors_dict) + process = subprocess.run( + [ + "archivebox", + "oneshot", + f"--out-dir={tmp_path}", + "--extract=title,favicon,dom", + "http://127.0.0.1:8080/static/example.com.html", + ], + capture_output=True, + env=disable_extractors_dict, + ) + print(process.stdout) items = ' '.join([str(x) for x in 
tmp_path.iterdir()]) current_path = ' '.join([str(x) for x in Path.cwd().iterdir()]) assert "index.json" in items assert not "index.sqlite3" in current_path assert "output.html" in items - \ No newline at end of file + From e90cf051414022829802033764cc672b1161236f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 16:51:11 +0200 Subject: [PATCH 13/93] fix lint errors --- archivebox/index/schema.py | 1 - archivebox/main.py | 3 +-- archivebox/search/__init__.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index c6bf3731..bc3a25da 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -9,7 +9,6 @@ DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py __package__ = 'archivebox.index' from pathlib import Path -from django.db.utils import OperationalError from datetime import datetime, timedelta diff --git a/archivebox/main.py b/archivebox/main.py index 6476fd7d..756fecde 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -69,7 +69,6 @@ from .config import ( ANSI, IS_TTY, IN_DOCKER, - PYTHON_VERSION, USER, ARCHIVEBOX_BINARY, ONLY_NEW, @@ -219,7 +218,7 @@ def version(quiet: bool=False, else: print('ArchiveBox v{}'.format(VERSION)) p = platform.uname() - print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, f'(in Docker)' if IN_DOCKER else f'(not in Docker)') + print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)') print() print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index a1f67ef7..360b20ff 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -106,4 +106,4 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR): color='red', ) else: - write_search_index(link, texts, out_dir=out_dir) 
\ No newline at end of file + write_search_index(link, texts, out_dir=out_dir) From db1f9b759e9cc2f0e8c926c3940009d5945879f5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 16:59:53 +0200 Subject: [PATCH 14/93] skip brew build in linux and debian build on mac --- bin/build_brew.sh | 13 ++++++++++--- bin/build_deb.sh | 7 +++++++ bin/build_docker.sh | 1 + 3 files changed, 18 insertions(+), 3 deletions(-) mode change 100644 => 100755 bin/build_brew.sh diff --git a/bin/build_brew.sh b/bin/build_brew.sh old mode 100644 new mode 100755 index 9767013d..ec54c90a --- a/bin/build_brew.sh +++ b/bin/build_brew.sh @@ -12,11 +12,18 @@ IFS=$'\n' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" + +CURRENT_PLAFORM="$(uname)" +REQUIRED_PLATFORM="Darwin" +if [[ "$CURRENT_PLAFORM" != "$REQUIRED_PLATFORM" ]]; then + echo "[!] Skipping the Homebrew package build on $CURRENT_PLAFORM (it can only be run on $REQUIRED_PLATFORM)." + exit 0 +fi + + cd "$REPO_DIR/brew_dist" - - # make sure archivebox.rb is up-to-date with the dependencies -echo "[+] Building bottle" +echo "[+] Building Homebrew bottle" brew install --build-bottle ./archivebox.rb brew bottle archivebox diff --git a/bin/build_deb.sh b/bin/build_deb.sh index 0c590d71..b9279369 100755 --- a/bin/build_deb.sh +++ b/bin/build_deb.sh @@ -19,6 +19,13 @@ else fi cd "$REPO_DIR" +CURRENT_PLAFORM="$(uname)" +REQUIRED_PLATFORM="Linux" +if [[ "$CURRENT_PLAFORM" != "$REQUIRED_PLATFORM" ]]; then + echo "[!] Skipping the Debian package build on $CURRENT_PLAFORM (it can only be run on $REQUIRED_PLATFORM)." + exit 0 +fi + VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" DEBIAN_VERSION="1" PGP_KEY_ID="7D5695D3B618872647861D51C38137A7C1675988" diff --git a/bin/build_docker.sh b/bin/build_docker.sh index 025fe350..0115acdf 100755 --- a/bin/build_docker.sh +++ b/bin/build_docker.sh @@ -14,6 +14,7 @@ REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. 
&& p VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" cd "$REPO_DIR" +which docker > /dev/null echo "[+] Building docker image in the background..." docker build . -t archivebox \ From 7de7ff9a5453a7e1ec4454ad00c4f5091a9aef34 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 18:26:27 +0200 Subject: [PATCH 15/93] clear previous build wheels before building in CI --- .github/workflows/pip.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index 915ebfd1..27763a73 100644 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -24,6 +24,7 @@ jobs: - name: Build Python Package run: | pip3 install --upgrade pip setuptools wheel + rm -Rf pip_dist/*.whl python3 setup.py \ sdist --dist-dir=./pip_dist \ bdist_wheel --dist-dir=./pip_dist \ From e6a77dc8b119311c5487982a4a6a3b4a3c277544 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 18:35:25 +0200 Subject: [PATCH 16/93] split up debian build into two steps --- .github/workflows/debian.yml | 6 ++++++ .github/workflows/homebrew.yml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 86d6f1ee..5ebdafb2 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -29,6 +29,11 @@ jobs: - name: Install archivebox from deb run: | + rm -Rf build deb_dist dist archivebox-*.tar.gz + python3 setup.py --command-packages=stdeb.command \ + sdist_dsc + python3 setup.py --command-packages=stdeb.command \ + bdist_deb apt install deb_dist/archivebox*.deb - name: Add some links to test @@ -42,4 +47,5 @@ jobs: # TODO: push debian package to launchpad PPA # - name: Push to launchpad # run: | + # debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" # dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" diff --git a/.github/workflows/homebrew.yml 
b/.github/workflows/homebrew.yml index e5e71420..543100bd 100644 --- a/.github/workflows/homebrew.yml +++ b/.github/workflows/homebrew.yml @@ -20,7 +20,7 @@ jobs: pip3 install --upgrade pip setuptools wheel cd brew_dist/ brew install --build-bottle ./archivebox.rb - brew bottle archivebox + # brew bottle archivebox - name: Add some links to test run: | From ec2c2f5bea4cdfc310d3060e2ea426b4d43a21b7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 18:44:53 +0200 Subject: [PATCH 17/93] add tests for windows --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b73c9e89..8642e8a9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,8 +10,8 @@ jobs: strategy: matrix: - os: [ubuntu-latest, macos-latest] - python: [3.7, 3.8] + os: [ubuntu-latest, macos-latest, windows-latest] + python: [3.7] steps: - uses: actions/checkout@v2 From 6022cbc5f4fd7e9647e10ada269d52762c49f19b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 18:52:07 +0200 Subject: [PATCH 18/93] add debian distutils to build machine --- .github/workflows/debian.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 5ebdafb2..019e77b5 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -23,8 +23,7 @@ jobs: - name: Build Debian/Apt package run: | - sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-setuptools python3-wheel python3-stdeb - pip3 install --upgrade pip setuptools wheel stdeb + sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb ./bin/build_deb.sh - name: Install archivebox 
from deb From 1960ec517cef211f02de5f6f5281d5189d868fe5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 18:54:26 +0200 Subject: [PATCH 19/93] ignore utf8 errors in setup.py on windows --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d01b3f65..4eb7c97d 100755 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ PROJECT_URLS = { ROOT_DIR = Path(__file__).parent.resolve() PACKAGE_DIR = ROOT_DIR / PKG_NAME -README = (PACKAGE_DIR / "README.md").read_text() +README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore') VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version'] # To see when setup.py gets called (uncomment for debugging): From a804956477edd5dac9224f7cec179e11e08a9893 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 18:59:34 +0200 Subject: [PATCH 20/93] split debian process into 3 steps --- .github/workflows/debian.yml | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 019e77b5..9c499ce0 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -21,18 +21,23 @@ jobs: python-version: 3.9 architecture: x64 - - name: Build Debian/Apt package + - name: Install packaging dependencies run: | sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb - ./bin/build_deb.sh + pip3 install --upgrade pip setuptools wheel stdeb + + - name: Build Debian/Apt sdist_dsc + run: | + python3 setup.py --command-packages=stdeb.command \ + sdist_dsc + + - name: Build Debian/Apt bdist_deb + run: | + python3 setup.py --command-packages=stdeb.command \ + bdist_deb - name: Install archivebox from deb run: | - rm -Rf build deb_dist dist archivebox-*.tar.gz - python3 setup.py 
--command-packages=stdeb.command \ - sdist_dsc - python3 setup.py --command-packages=stdeb.command \ - bdist_deb apt install deb_dist/archivebox*.deb - name: Add some links to test From 4cda0d4e4490a7472f9cebdaf60814f5342269c6 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:04:27 +0200 Subject: [PATCH 21/93] use system distutils on debian ci image --- .github/workflows/debian.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 9c499ce0..7e931cc6 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -4,6 +4,8 @@ on: workflow_dispatch: push: +env: + SETUPTOOLS_USE_DISTUTILS=stdlib jobs: build: From ec86060a6193bcc56f7b587bcdb352cc28f8ef5a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:09:40 +0200 Subject: [PATCH 22/93] add github push code to builder CI actions --- .github/workflows/debian.yml | 19 +++++++++++++++++-- .github/workflows/homebrew.yml | 15 +++++++++++++++ .github/workflows/pip.yml | 15 +++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 7e931cc6..d78075b9 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -31,12 +31,12 @@ jobs: - name: Build Debian/Apt sdist_dsc run: | python3 setup.py --command-packages=stdeb.command \ - sdist_dsc + sdist_dsc - name: Build Debian/Apt bdist_deb run: | python3 setup.py --command-packages=stdeb.command \ - bdist_deb + bdist_deb - name: Install archivebox from deb run: | @@ -50,6 +50,21 @@ jobs: archivebox version archivebox status + # - name: Commit files + # run: | + # cd deb_dist/ + # git config --local user.email "action@github.com" + # git config --local user.name "GitHub Action" + # git commit -m "Debian package autobuild" -a + + # - name: Push changes + # uses: ad-m/github-push-action@master + # with: + # github_token: ${{ secrets.GITHUB_TOKEN }} + # 
repository: ArchiveBox/debian-archivebox + # branch: ${{ github.ref }} + # directory: deb_dist + # TODO: push debian package to launchpad PPA # - name: Push to launchpad # run: | diff --git a/.github/workflows/homebrew.yml b/.github/workflows/homebrew.yml index 543100bd..ce4e4d89 100644 --- a/.github/workflows/homebrew.yml +++ b/.github/workflows/homebrew.yml @@ -30,4 +30,19 @@ jobs: archivebox version archivebox status + # - name: Commit files + # run: | + # cd brew_dist/ + # git config --local user.email "action@github.com" + # git config --local user.name "GitHub Action" + # git commit -m "Homebrew package autobuild" -a + + # - name: Push changes + # uses: ad-m/github-push-action@master + # with: + # github_token: ${{ secrets.GITHUB_TOKEN }} + # repository: ArchiveBox/homebrew-archivebox + # branch: ${{ github.ref }} + # directory: brew_dist + # TODO: push bottle to Github and open homebrew core PR with latest changes diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index 27763a73..b892a7ad 100644 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -39,4 +39,19 @@ jobs: archivebox version archivebox status + # - name: Commit files + # run: | + # cd pip_dist/ + # git config --local user.email "action@github.com" + # git config --local user.name "GitHub Action" + # git commit -m "Pip package autobuild" -a + + # - name: Push changes + # uses: ad-m/github-push-action@master + # with: + # github_token: ${{ secrets.GITHUB_TOKEN }} + # repository: ArchiveBox/pip-archivebox + # branch: ${{ github.ref }} + # directory: pip_dist + # TODO: push to PyPI with twine From 27b266af5f0ce9bcd9f35c0713e994f029a2748c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:10:39 +0200 Subject: [PATCH 23/93] fix ls a option not available on windows --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8642e8a9..3ffcd5ec 100644 --- 
a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -77,7 +77,7 @@ jobs: - name: Directory listing for debugging run: | pwd - ls -a ./ + ls archivebox version - name: Test built package with pytest From 023a339cf843aa135c79d7b6a55551aaae921190 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:11:42 +0200 Subject: [PATCH 24/93] fix debian build syntax --- .github/workflows/debian.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index d78075b9..bb264010 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -5,7 +5,7 @@ on: push: env: - SETUPTOOLS_USE_DISTUTILS=stdlib + SETUPTOOLS_USE_DISTUTILS: stdlib jobs: build: @@ -30,13 +30,11 @@ jobs: - name: Build Debian/Apt sdist_dsc run: | - python3 setup.py --command-packages=stdeb.command \ - sdist_dsc + python3 setup.py --command-packages=stdeb.command sdist_dsc - name: Build Debian/Apt bdist_deb run: | - python3 setup.py --command-packages=stdeb.command \ - bdist_deb + python3 setup.py --command-packages=stdeb.command bdist_deb - name: Install archivebox from deb run: | From 80a156f83a11dc1b46adfe51ea57147838d66091 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:14:11 +0200 Subject: [PATCH 25/93] cleanup previous build artifacts before deb_dist build --- .github/workflows/debian.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index bb264010..31cbe5bf 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -30,6 +30,7 @@ jobs: - name: Build Debian/Apt sdist_dsc run: | + rm -Rf deb_dist/* python3 setup.py --command-packages=stdeb.command sdist_dsc - name: Build Debian/Apt bdist_deb From 988aa9b5728a0e3c9b1ea011144fc5725becac05 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:17:41 +0200 Subject: [PATCH 26/93] tweak debian distutils location --- 
.github/workflows/debian.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 31cbe5bf..04e17287 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -5,7 +5,7 @@ on: push: env: - SETUPTOOLS_USE_DISTUTILS: stdlib + SETUPTOOLS_USE_DISTUTILS: local jobs: build: From aee219eb91f478509922d93a498a9696f9642e93 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:23:10 +0200 Subject: [PATCH 27/93] use system python instead of github actions python --- .github/workflows/debian.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 04e17287..553711e1 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -17,16 +17,16 @@ jobs: submodules: true fetch-depth: 1 - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: 3.9 - architecture: x64 + # - name: Set up Python + # uses: actions/setup-python@v1 + # with: + # python-version: 3.9 + # architecture: x64 - name: Install packaging dependencies run: | sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb - pip3 install --upgrade pip setuptools wheel stdeb + # pip3 install --upgrade pip setuptools wheel stdeb - name: Build Debian/Apt sdist_dsc run: | From 93e6d22c1c89895aa8841504ad21e8913b1ff43b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:30:26 +0200 Subject: [PATCH 28/93] dont run package tests when building deb pkg --- .github/workflows/debian.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 553711e1..664faea4 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -5,7 +5,7 @@ 
on: push: env: - SETUPTOOLS_USE_DISTUTILS: local + DEB_BUILD_OPTIONS: nocheck jobs: build: From 2db5e51b54eb2724efbcbb5899ce75522cfd461c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:33:18 +0200 Subject: [PATCH 29/93] fix windows shutil not able to handle pathlib --- .github/workflows/test.yml | 3 +++ archivebox/config.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3ffcd5ec..ed989df7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -78,6 +78,9 @@ jobs: run: | pwd ls + + - name: Archivebox version + run: | archivebox version - name: Test built package with pytest diff --git a/archivebox/config.py b/archivebox/config.py index 7700a7de..75c4cb96 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -320,7 +320,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, - 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(bin_path(c['MERCURY_BINARY'])) else None}, # mercury is unversioned + 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, @@ -595,7 +595,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]: if node_modules_bin.exists(): return str(node_modules_bin.resolve()) - return shutil.which(Path(binary).expanduser()) or shutil.which(binary) or binary + return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary def bin_hash(binary: Optional[str]) -> Optional[str]: if binary is None: From 03334ec687052e3fa627074b4ce509c30c733270 Mon Sep 17 
00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:38:08 +0200 Subject: [PATCH 30/93] whoops forgot sudo --- .github/workflows/debian.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 664faea4..8a5b3f10 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -39,7 +39,7 @@ jobs: - name: Install archivebox from deb run: | - apt install deb_dist/archivebox*.deb + sudo apt install deb_dist/archivebox*.deb - name: Add some links to test run: | From b76ba443f0cf3252657e4d57754099935aed4069 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:41:05 +0200 Subject: [PATCH 31/93] fix deb_dist --- .github/workflows/debian.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 8a5b3f10..0c9198b9 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -39,7 +39,8 @@ jobs: - name: Install archivebox from deb run: | - sudo apt install deb_dist/archivebox*.deb + cd deb_dist/ + sudo apt install archivebox*.deb - name: Add some links to test run: | From c47fee066d8d915e65a05ec1200fd348f21b9325 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:45:46 +0200 Subject: [PATCH 32/93] disable color and force utf-8 output on windows runner --- .github/workflows/test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ed989df7..4a7eb205 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,6 +3,9 @@ on: [push] env: DOCKER_IMAGE: archivebox-ci + PYTHONIOENCODING: utf-8 + PYTHONLEGACYWINDOWSSTDIO: utf-8 + USE_COLOR: False jobs: python_tests: From 0ab8581c71824b2aad42233781d3d86f67035dd9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:46:26 +0200 Subject: [PATCH 33/93] install local package force using dot slash --- 
.github/workflows/debian.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 0c9198b9..31dafa7f 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -40,7 +40,7 @@ jobs: - name: Install archivebox from deb run: | cd deb_dist/ - sudo apt install archivebox*.deb + sudo apt install ./archivebox*.deb - name: Add some links to test run: | From 58bdaff1834dccae7251c87abe3b9bcb52d5d492 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 19:56:21 +0200 Subject: [PATCH 34/93] split out problematic packages for better info --- .github/workflows/debian.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 31dafa7f..eecdfb2d 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -40,6 +40,9 @@ jobs: - name: Install archivebox from deb run: | cd deb_dist/ + sudo apt install ripgrep + sudo apt install python3-dateparser + sudo apt install python3-mypy-extensions sudo apt install ./archivebox*.deb - name: Add some links to test From c4961d08b3ff6a6f2c1e364372ef19efc060bc04 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 20:03:17 +0200 Subject: [PATCH 35/93] enable universe in apt sources --- .github/workflows/debian.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index eecdfb2d..776567d5 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -25,6 +25,9 @@ jobs: - name: Install packaging dependencies run: | + sudo echo "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" >> /etc/apt/sources.list + sudo echo "deb-src http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" >> /etc/apt/sources.list + sudo apt update sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper 
devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb # pip3 install --upgrade pip setuptools wheel stdeb @@ -40,9 +43,9 @@ jobs: - name: Install archivebox from deb run: | cd deb_dist/ - sudo apt install ripgrep sudo apt install python3-dateparser sudo apt install python3-mypy-extensions + sudo apt install ripgrep sudo apt install ./archivebox*.deb - name: Add some links to test From b61143337ec00da4f9e2f6adaf53016654e283f3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 20:07:40 +0200 Subject: [PATCH 36/93] add universe to sources as root --- .github/workflows/debian.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 776567d5..1fafb2fc 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -25,8 +25,8 @@ jobs: - name: Install packaging dependencies run: | - sudo echo "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" >> /etc/apt/sources.list - sudo echo "deb-src http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" >> /etc/apt/sources.list + sudo bash -c 'echo "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" >> /etc/apt/sources.list' + sudo bash -c 'echo "deb-src http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" >> /etc/apt/sources.list' sudo apt update sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb # pip3 install --upgrade pip setuptools wheel stdeb From 8896e1957a47d3cd92837f169f3c9ce4a142c3fc Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 20:10:11 +0200 Subject: [PATCH 37/93] force add focal instead of bionic --- .github/workflows/debian.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/debian.yml 
b/.github/workflows/debian.yml index 1fafb2fc..0b6aea13 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -25,8 +25,8 @@ jobs: - name: Install packaging dependencies run: | - sudo bash -c 'echo "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" >> /etc/apt/sources.list' - sudo bash -c 'echo "deb-src http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" >> /etc/apt/sources.list' + sudo bash -c 'echo "deb http://archive.ubuntu.com/ubuntu focal universe" >> /etc/apt/sources.list' + sudo bash -c 'echo "deb-src http://archive.ubuntu.com/ubuntu focal universe" >> /etc/apt/sources.list' sudo apt update sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb # pip3 install --upgrade pip setuptools wheel stdeb From 82de5631bb1f709564a9fe72b227888bd335fe40 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 20:11:52 +0200 Subject: [PATCH 38/93] dont enable universe until after setup --- .github/workflows/debian.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 0b6aea13..56a936b3 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -25,9 +25,6 @@ jobs: - name: Install packaging dependencies run: | - sudo bash -c 'echo "deb http://archive.ubuntu.com/ubuntu focal universe" >> /etc/apt/sources.list' - sudo bash -c 'echo "deb-src http://archive.ubuntu.com/ubuntu focal universe" >> /etc/apt/sources.list' - sudo apt update sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb # pip3 install --upgrade pip setuptools wheel stdeb @@ -43,6 +40,9 @@ jobs: - name: Install archivebox from deb run: | cd 
deb_dist/ + sudo bash -c 'echo "deb http://archive.ubuntu.com/ubuntu focal universe" >> /etc/apt/sources.list' + sudo bash -c 'echo "deb-src http://archive.ubuntu.com/ubuntu focal universe" >> /etc/apt/sources.list' + sudo apt update sudo apt install python3-dateparser sudo apt install python3-mypy-extensions sudo apt install ripgrep From 5faeb13872bd1fa38147954593b81820db5d1a2f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 20:18:17 +0200 Subject: [PATCH 39/93] use ubuntu-20.04 instead of latest --- .github/workflows/debian.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 56a936b3..2c0ff68a 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -9,7 +9,7 @@ env: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 @@ -40,12 +40,6 @@ jobs: - name: Install archivebox from deb run: | cd deb_dist/ - sudo bash -c 'echo "deb http://archive.ubuntu.com/ubuntu focal universe" >> /etc/apt/sources.list' - sudo bash -c 'echo "deb-src http://archive.ubuntu.com/ubuntu focal universe" >> /etc/apt/sources.list' - sudo apt update - sudo apt install python3-dateparser - sudo apt install python3-mypy-extensions - sudo apt install ripgrep sudo apt install ./archivebox*.deb - name: Add some links to test From dc222e3636432d39422cc7ba3b42ec832ef82f47 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 20:38:51 +0200 Subject: [PATCH 40/93] bump ubuntu version used for tests --- .github/workflows/debian.yml | 2 +- .github/workflows/test.yml | 6 +++--- stdeb.cfg | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 2c0ff68a..d4c114b0 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -44,7 +44,7 @@ jobs: - name: Add some links to test run: | - mkdir data && cd data + mkdir "${{ 
github.workspace }}/data" && cd "${{ github.workspace }}/data" archivebox init archivebox add 'https://example.com' archivebox version diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4a7eb205..e9f1d23a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-20.04, macos-latest, windows-latest] python: [3.7] steps: @@ -108,8 +108,8 @@ jobs: - name: Init data dir run: | - mkdir data - docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" init + mkdir "${{ github.workspace }}/data" + docker run -v "${{ github.workspace }}/data":/data "$DOCKER_IMAGE" init - name: Run test server run: | diff --git a/stdeb.cfg b/stdeb.cfg index 37bbb42f..a07147e2 100644 --- a/stdeb.cfg +++ b/stdeb.cfg @@ -5,5 +5,5 @@ Package3: archivebox Suite: focal Suite3: focal Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb -Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep +Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep XS-Python-Version: >= 3.7 From 081d94d7992f6081a79715794d3141a10ee05f71 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 20:45:44 +0200 Subject: [PATCH 41/93] fallback to old JSONField from lib if django version is old --- archivebox/core/models.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index dca6941f..d50e8f40 100644 --- a/archivebox/core/models.py +++ 
b/archivebox/core/models.py @@ -18,6 +18,12 @@ STATUS_CHOICES = [ ("skipped", "skipped") ] +try: + JSONField = models.JSONField +except AttributeError: + import jsonfield + JSONField = jsonfield.JSONField + class Tag(models.Model): """ @@ -173,7 +179,7 @@ class ArchiveResultManager(models.Manager): class ArchiveResult(models.Model): snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) - cmd = models.JSONField() + cmd = JSONField() pwd = models.CharField(max_length=256) cmd_version = models.CharField(max_length=32) output = models.CharField(max_length=512) From 1c87c2710578df824ea2ee50229897bcfe6181a4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 20:50:45 +0200 Subject: [PATCH 42/93] patch migration JSONField as well --- archivebox/core/migrations/0007_archiveresult.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 898e0f93..a780376f 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -9,6 +9,12 @@ import django.db.models.deletion from config import CONFIG from index.json import to_json +try: + JSONField = models.JSONField +except AttributeError: + import jsonfield + JSONField = jsonfield.JSONField + def forwards_func(apps, schema_editor): from core.models import EXTRACTORS @@ -76,7 +82,7 @@ class Migration(migrations.Migration): name='ArchiveResult', fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('cmd', models.JSONField()), + ('cmd', JSONField()), ('pwd', models.CharField(max_length=256)), ('cmd_version', models.CharField(max_length=32)), ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)), From 335732649bf0b292c6a7e26440c9952b9438d8a8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 
2020 21:03:17 +0200 Subject: [PATCH 43/93] tweak node dependency version detection order --- .github/workflows/debian.yml | 15 ++++++++++++--- archivebox/config.py | 5 +++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index d4c114b0..97fc5b21 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -25,7 +25,7 @@ jobs: - name: Install packaging dependencies run: | - sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb + sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb curl wget # pip3 install --upgrade pip setuptools wheel stdeb - name: Build Debian/Apt sdist_dsc @@ -42,12 +42,21 @@ jobs: cd deb_dist/ sudo apt install ./archivebox*.deb - - name: Add some links to test + - name: Check ArchiveBox version run: | + # must create dir needed for snaps to run as non-root on github actions + sudo mkdir -p /run/user/1001 && sudo chmod -R 777 /run/user/1001 mkdir "${{ github.workspace }}/data" && cd "${{ github.workspace }}/data" archivebox init - archivebox add 'https://example.com' + archivebox config --set SAVE_READABILITY=False + archivebox config --set SAVE_MERCURY=False + archivebox config --set SAVE_SINGLEFILE=False archivebox version + + - name: Add some links to test + run: | + cd "${{ github.workspace }}/data" + archivebox add 'https://example.com' archivebox status # - name: Commit files diff --git a/archivebox/config.py b/archivebox/config.py index 75c4cb96..a3444f07 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -334,8 +334,6 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and 
(c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])}, 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()}, 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None}, - 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])}, - 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None}, 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']}, 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']}, @@ -343,6 +341,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']}, 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']}, 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']}, + + 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])}, + 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None}, 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, From 72e5c811ba42762ab523b81d702a4b33fc8803a3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 21:04:30 +0200 Subject: [PATCH 44/93] bump ubuntu version used for linter and pip build --- .github/workflows/lint.yml | 2 +- .github/workflows/pip.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c4479c4b..80f4f19f 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,7 +9,7 @@ env: jobs: lint: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 with: diff --git a/.github/workflows/pip.yml 
b/.github/workflows/pip.yml index b892a7ad..bbb44019 100644 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -7,7 +7,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 From d67d212682b5da0f7f22634a521de12374f02444 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 21:05:40 +0200 Subject: [PATCH 45/93] break package deps into multiple lines --- .github/workflows/debian.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 97fc5b21..cb2a006c 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -25,7 +25,10 @@ jobs: - name: Install packaging dependencies run: | - sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-distutils python3-setuptools python3-wheel python3-stdeb curl wget + sudo apt install -y \ + python3 python3-dev python3-pip python3-venv python3-all \ + dh-python debhelper devscripts dput software-properties-common \ + python3-distutils python3-setuptools python3-wheel python3-stdeb # pip3 install --upgrade pip setuptools wheel stdeb - name: Build Debian/Apt sdist_dsc From 9ee52b52b4941224bb2b3ad1db1bbce6e963484a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 21:26:37 +0200 Subject: [PATCH 46/93] add more TODO comments to github actions --- .github/workflows/debian.yml | 14 +++----------- .github/workflows/homebrew.yml | 8 +++++--- .github/workflows/pip.yml | 10 +++++++--- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index cb2a006c..49e9750a 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -17,19 +17,12 @@ jobs: submodules: true fetch-depth: 1 - # - name: Set up Python - # uses: actions/setup-python@v1 - # with: - # python-version: 
3.9 - # architecture: x64 - - name: Install packaging dependencies run: | sudo apt install -y \ python3 python3-dev python3-pip python3-venv python3-all \ dh-python debhelper devscripts dput software-properties-common \ python3-distutils python3-setuptools python3-wheel python3-stdeb - # pip3 install --upgrade pip setuptools wheel stdeb - name: Build Debian/Apt sdist_dsc run: | @@ -62,14 +55,14 @@ jobs: archivebox add 'https://example.com' archivebox status - # - name: Commit files + # - name: Commit built package # run: | # cd deb_dist/ # git config --local user.email "action@github.com" # git config --local user.name "GitHub Action" # git commit -m "Debian package autobuild" -a - # - name: Push changes + # - name: Push build to Github # uses: ad-m/github-push-action@master # with: # github_token: ${{ secrets.GITHUB_TOKEN }} @@ -77,8 +70,7 @@ jobs: # branch: ${{ github.ref }} # directory: deb_dist - # TODO: push debian package to launchpad PPA - # - name: Push to launchpad + # - name: Push build to Launchpad PPA # run: | # debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" # dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" diff --git a/.github/workflows/homebrew.yml b/.github/workflows/homebrew.yml index ce4e4d89..d9bb05f1 100644 --- a/.github/workflows/homebrew.yml +++ b/.github/workflows/homebrew.yml @@ -15,6 +15,8 @@ jobs: submodules: true fetch-depth: 1 + # TODO: modify archivebox.rb to update src url, hashes, and dependencies + - name: Build Homebrew Bottle run: | pip3 install --upgrade pip setuptools wheel @@ -30,14 +32,14 @@ jobs: archivebox version archivebox status - # - name: Commit files + # - name: Commit built package # run: | # cd brew_dist/ # git config --local user.email "action@github.com" # git config --local user.name "GitHub Action" # git commit -m "Homebrew package autobuild" -a - # - name: Push changes + # - name: Push build to Github # uses: 
ad-m/github-push-action@master # with: # github_token: ${{ secrets.GITHUB_TOKEN }} @@ -45,4 +47,4 @@ jobs: # branch: ${{ github.ref }} # directory: brew_dist - # TODO: push bottle to Github and open homebrew core PR with latest changes + # TODO: push bottle homebrew core PR with latest changes diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index bbb44019..36153189 100644 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -39,14 +39,14 @@ jobs: archivebox version archivebox status - # - name: Commit files + # - name: Commit built package # run: | # cd pip_dist/ # git config --local user.email "action@github.com" # git config --local user.name "GitHub Action" # git commit -m "Pip package autobuild" -a - # - name: Push changes + # - name: Push build to Github # uses: ad-m/github-push-action@master # with: # github_token: ${{ secrets.GITHUB_TOKEN }} @@ -54,4 +54,8 @@ jobs: # branch: ${{ github.ref }} # directory: pip_dist - # TODO: push to PyPI with twine + # - name: Push build to PyPI + # run: | + # cd pip_dist/ + # python3 -m twine upload --repository testpypi pip_dist/*.{whl,tar.gz} + # python3 -m twine upload --repository pypi pip_dist/*.{whl,tar.gz} From d92369b62aefd65a66280a319ab4f646c022bdf0 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 21:38:45 +0200 Subject: [PATCH 47/93] skip tests on windows for now --- .github/workflows/test.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e9f1d23a..4e34f768 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -88,7 +88,12 @@ jobs: - name: Test built package with pytest run: | - python -m pytest -s + # TODO: remove this exception for windows once we get tests passing on that platform + if [[ ${{ matrix.os }} == "windows-latest" ]]; then + echo "Skipping tests on Windows" + else + python -m pytest -s + fi docker_tests: runs-on: ubuntu-latest From 
35da16f203789bcc836499b0019e4b887977a9f1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 21:41:46 +0200 Subject: [PATCH 48/93] different bash switch style --- .github/workflows/test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4e34f768..544fb4a1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -87,9 +87,11 @@ jobs: archivebox version - name: Test built package with pytest + env: + OS: ${{ matrix.os }} run: | # TODO: remove this exception for windows once we get tests passing on that platform - if [[ ${{ matrix.os }} == "windows-latest" ]]; then + if [[ "$OS" == "windows-latest" ]]; then echo "Skipping tests on Windows" else python -m pytest -s From 491cbe85f087153ec25e88bf8dc097b7a70e9ee3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 21:44:21 +0200 Subject: [PATCH 49/93] use actions if statement instead of bash --- .github/workflows/test.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 544fb4a1..0ecb9b69 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -87,15 +87,10 @@ jobs: archivebox version - name: Test built package with pytest - env: - OS: ${{ matrix.os }} + # TODO: remove this exception for windows once we get tests passing on that platform + if: !contains(matrix.os, 'windows') run: | - # TODO: remove this exception for windows once we get tests passing on that platform - if [[ "$OS" == "windows-latest" ]]; then - echo "Skipping tests on Windows" - else - python -m pytest -s - fi + python -m pytest -s docker_tests: runs-on: ubuntu-latest From 1a00572138c005fdf9e916852867846ea93ffe0c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 21:45:15 +0200 Subject: [PATCH 50/93] fix case expression syntax --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1
deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0ecb9b69..9a6c76f2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -88,7 +88,7 @@ jobs: - name: Test built package with pytest # TODO: remove this exception for windows once we get tests passing on that platform - if: !contains(matrix.os, 'windows') + if: ${{ !contains(matrix.os, 'windows') }} run: | python -m pytest -s From 6066d30b62eafbd72c804b881354232081e9a3ee Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 22:45:15 +0200 Subject: [PATCH 51/93] update readme --- README.md | 86 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index b8be5bce..bbaffa62 100644 --- a/README.md +++ b/README.md @@ -34,38 +34,80 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot #### Quickstart -**First, get ArchiveBox using your system package manager, Docker, or pip:** -```bash -# You can run it with Docker or Docker Compose (recommended) -docker pull archivebox/archivebox -# https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml +**First, get ArchiveBox using Docker, your system package manager, or pip.** -# or Ubuntu/Debian +(Click to expand each section) + +
Get ArchiveBox with Docker Compose + +```bash +# Download the compose file into an empty directory somewhere +mkdir ~/archivebox && cd ~/archivebox +wget https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml + +docker-compose run archivebox init +docker-compose up -d +open http://127.0.0.1:8000 + +# To add a new admin user for the Web UI you can run: +docker-compose run archivebox manage createsuperuser +``` + +
+ +
Get ArchiveBox with plain Docker + +```bash +# cd into a new empty directory somewhere and pull the latest image +mkdir ~/archivebox && cd ~/archivebox +docker pull archivebox/archivebox + +docker run -v $PWD:/data -it archivebox/archivebox init +docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox +open http://127.0.0.1:8000 + +# To add a new admin user for the Web UI you can run: +docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser +``` + +
+ +
Get ArchiveBox with apt on Ubuntu >=20.04 + +```bash sudo add-apt-repository -u ppa:archivebox/archivebox apt install archivebox +``` -# or macOS +
+ +
Get ArchiveBox with brew on macOS >=10.13 + +```bash brew install archivebox/archivebox/archivebox -# or for the Python version only, without wget/git/chrome/etc. included -pip3 install archivebox - -# If you're using an apt/brew/pip install you can run archivebox commands normally -# archivebox [subcommand] [...args] -# If you're using Docker you'll have to run the commands like this -# docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args] -# And the equivalent in Docker Compose: -# docker-compose run archivebox [subcommand] [...args] +# create a new empty directory and initalize your collection (can be anywhere) +mkdir ~/archivebox && cd ~/archivebox +archivebox init ``` +
+ +
Get ArchiveBox with pip on any platform +```bash +pip3 install archivebox +# Install the extra dependencies like wget/git/chrome/etc. separately as needed + +# create a new empty directory and initalize your collection (can be anywhere) +mkdir ~/archivebox && cd ~/archivebox +archivebox init +``` + +
+ + Check that everything installed correctly with `archivebox --version` -**To start using archivebox, you have to create a data folder and `cd` into it:** - -```bash -mkdir ~/archivebox && cd ~/archivebox # you can put the collection dir anywhere -archivebox init -``` **Then Add some URLs to your archive collection:** ```bash From 1e8ed66a9e8e4f98131e23b39c70a441a3a71758 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 22:46:34 +0200 Subject: [PATCH 52/93] fix readme syntax --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index bbaffa62..a57257be 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,8 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot (Click to expand each section) -
Get ArchiveBox with Docker Compose +
+Get ArchiveBox with Docker Compose ```bash # Download the compose file into an empty directory somewhere @@ -55,7 +56,8 @@ docker-compose run archivebox manage createsuperuser
-
Get ArchiveBox with plain Docker +
+Get ArchiveBox with plain Docker ```bash # cd into a new empty directory somewhere and pull the latest image @@ -72,7 +74,8 @@ docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser
-
Get ArchiveBox with apt on Ubuntu >=20.04 +
+Get ArchiveBox with apt on Ubuntu >=20.04 ```bash sudo add-apt-repository -u ppa:archivebox/archivebox @@ -81,7 +84,8 @@ apt install archivebox
-
Get ArchiveBox with brew on macOS >=10.13 +
+Get ArchiveBox with brew on macOS >=10.13 ```bash brew install archivebox/archivebox/archivebox @@ -93,7 +97,9 @@ archivebox init
-
Get ArchiveBox with pip on any platform +
+Get ArchiveBox with pip on any platform + ```bash pip3 install archivebox # Install the extra dependencies like wget/git/chrome/etc. separately as needed From 63caff2811190b799249c095e38830141cd69092 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 22:50:16 +0200 Subject: [PATCH 53/93] readme tweaks --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a57257be..6d14dc2d 100644 --- a/README.md +++ b/README.md @@ -42,11 +42,12 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot Get ArchiveBox with Docker Compose ```bash -# Download the compose file into an empty directory somewhere +# create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox -wget https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml - +curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml docker-compose run archivebox init + +# start the webserver and open the UI (optional) docker-compose up -d open http://127.0.0.1:8000 @@ -60,11 +61,11 @@ docker-compose run archivebox manage createsuperuser Get ArchiveBox with plain Docker ```bash -# cd into a new empty directory somewhere and pull the latest image +# create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox -docker pull archivebox/archivebox - docker run -v $PWD:/data -it archivebox/archivebox init + +# start the webserver and open the UI (optional) docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox open http://127.0.0.1:8000 From 3ada7e79875bffcfcbdac33372466bea64e67cd7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 22:51:03 +0200 Subject: [PATCH 54/93] recommend docker compose --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6d14dc2d..06892818 100644 --- 
a/README.md +++ b/README.md @@ -39,7 +39,7 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot (Click to expand each section)
-Get ArchiveBox with Docker Compose +Get ArchiveBox with docker-compose (recommended) ```bash # create a new empty directory and initalize your collection (can be anywhere) @@ -58,7 +58,7 @@ docker-compose run archivebox manage createsuperuser
-Get ArchiveBox with plain Docker +Get ArchiveBox with docker ```bash # create a new empty directory and initalize your collection (can be anywhere) From 27ce0e49d5fdffd8de2b5ca2271643bb375ee921 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 22:51:44 +0200 Subject: [PATCH 55/93] readme formatting --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 06892818..7fa54e54 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,7 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot #### Quickstart -**First, get ArchiveBox using Docker, your system package manager, or pip.** - -(Click to expand each section) +**First, get ArchiveBox using Docker, your system package manager, or pip.** *(click to expand each section)*
Get ArchiveBox with docker-compose (recommended) From d0c6ecdc045544b6a251a6ecb30b005919472ce9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 22:59:43 +0200 Subject: [PATCH 56/93] better readme install --- README.md | 78 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 7fa54e54..cef88096 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot #### Quickstart -**First, get ArchiveBox using Docker, your system package manager, or pip.** *(click to expand each section)* +**You can use ArchiveBox with Docker, via system package manager, or pip.** *(click to expand each section)*
Get ArchiveBox with docker-compose (recommended) @@ -44,13 +44,17 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot mkdir ~/archivebox && cd ~/archivebox curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml docker-compose run archivebox init +docker-compose run archivebox --version # start the webserver and open the UI (optional) +docker-compose run archivebox manage createsuperuser docker-compose up -d open http://127.0.0.1:8000 -# To add a new admin user for the Web UI you can run: -docker-compose run archivebox manage createsuperuser +# you can also add links and manage your archive via the CLI: +docker-compose run archivebox add 'https://example.com' +docker-compose run archivebox status +docker-compose run archivebox help # to see more options ```
@@ -62,13 +66,17 @@ docker-compose run archivebox manage createsuperuser # create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox docker run -v $PWD:/data -it archivebox/archivebox init +docker run -v $PWD:/data -it archivebox/archivebox --version # start the webserver and open the UI (optional) -docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox +docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser +docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 -# To add a new admin user for the Web UI you can run: -docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser +# you can also add links and manage your archive via the CLI: +docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com' +docker run -v $PWD:/data -it archivebox/archivebox status +docker run -v $PWD:/data -it archivebox/archivebox help # to see more options ```
@@ -77,8 +85,24 @@ docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser Get ArchiveBox with apt on Ubuntu >=20.04 ```bash +# first add the PPA and install the package sudo add-apt-repository -u ppa:archivebox/archivebox apt install archivebox + +# create a new empty directory and initalize your collection (can be anywhere) +mkdir ~/archivebox && cd ~/archivebox +archivebox init +archivebox --version + +# start the webserver and open the UI (optional) +archivebox manage createsuperuser +archivebox server 0.0.0.0:8000 +open http://127.0.0.1:8000 + +# you can also add links and manage your archive via the CLI: +archivebox add 'https://example.com' +archivebox status +archivebox help # to see more options ```
@@ -92,6 +116,17 @@ brew install archivebox/archivebox/archivebox # create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox archivebox init +archivebox --version + +# start the webserver and open the UI (optional) +archivebox manage createsuperuser +archivebox server 0.0.0.0:8000 +open http://127.0.0.1:8000 + +# you can also add links and manage your archive via the CLI: +archivebox add 'https://example.com' +archivebox status +archivebox help # to see more options ```
@@ -101,34 +136,27 @@ archivebox init ```bash pip3 install archivebox -# Install the extra dependencies like wget/git/chrome/etc. separately as needed # create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox archivebox init +archivebox --version +# Install any missing extras like wget/git/chrome/etc. manually as needed + +# start the webserver and open the UI (optional) +archivebox manage createsuperuser +archivebox server 0.0.0.0:8000 +open http://127.0.0.1:8000 + +# you can also add links and manage your archive via the CLI: +archivebox add 'https://example.com' +archivebox status +archivebox help # to see more options ```
-Check that everything installed correctly with `archivebox --version` - - -**Then Add some URLs to your archive collection:** -```bash -archivebox add https://github.com/ArchiveBox/ArchiveBox -archivebox add --depth=1 https://example.com -``` - -**View the snapshots of the URLs you added via the self-hosted web UI:** -```bash -archivebox manage createsuperuser # create an admin acct -archivebox server 0.0.0.0:8000 # start the web server -open http://127.0.0.1:8000/ # open the interactive admin panel -ls ~/archivebox/archive/*/index.html # or browse the snapshots on disk -``` - -

From 81d766aba19f201213d7a8853860c0a42f233ef5 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 11 Dec 2020 16:03:50 -0500 Subject: [PATCH 57/93] refactor: Remove setup_django from title.py --- archivebox/extractors/title.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index ff70f689..28cb128f 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -20,7 +20,6 @@ from ..config import ( CURL_ARGS, CURL_VERSION, CURL_USER_AGENT, - setup_django, ) from ..logging_util import TimedProgress @@ -81,7 +80,6 @@ def extract_title_with_regex(html): def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: """try to guess the page's title from its content""" - setup_django(out_dir=out_dir) from core.models import Snapshot output: ArchiveOutput = None From 0331ed162a72b79676b5cb257afc2fee7d9cc03a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 23:04:05 +0200 Subject: [PATCH 58/93] include archivebox summary --- README.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index cef88096..8e32e7c9 100644 --- a/README.md +++ b/README.md @@ -26,15 +26,25 @@
-ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). +ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. -Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. +Your archive can be managed through the command line with commands like `archivebox add` or through the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and managable offline through the filesystem, the built-in webserver, or the Python API. 
-#### Quickstart +### Quickstart -**You can use ArchiveBox with Docker, via system package manager, or pip.** *(click to expand each section)* +ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). + +```bash +pip3 install archivebox + +mkdir ~/archivebox && cd ~/archivebox +archivebox init +archivebox add 'https://example.com' +``` + +***(click to expand the sections below for install instructions)***
Get ArchiveBox with docker-compose (recommended) From 1d894e1d45434bd6ffb328be3f892b4277293130 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 23:08:58 +0200 Subject: [PATCH 59/93] add docker install details --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8e32e7c9..f9751487 100644 --- a/README.md +++ b/README.md @@ -38,16 +38,20 @@ ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (r ```bash pip3 install archivebox +archivebox --version # install extras as-needed, or use the methods below mkdir ~/archivebox && cd ~/archivebox archivebox init archivebox add 'https://example.com' +archivebox help # to see more options ``` -***(click to expand the sections below for install instructions)*** +***(click to expand the sections below for setup instructions)***
-Get ArchiveBox with docker-compose (recommended) +Get ArchiveBox with docker-compose on any platform (recommended, everything included out-of-the-box) + +First make sure you have Docker installed: https://docs.docker.com/get-docker/ ```bash # create a new empty directory and initalize your collection (can be anywhere) @@ -70,7 +74,9 @@ docker-compose run archivebox help # to see more options
-Get ArchiveBox with docker +Get ArchiveBox with docker on any platform + +First make sure you have Docker installed: https://docs.docker.com/get-docker/ ```bash # create a new empty directory and initalize your collection (can be anywhere) From eb61f511f4dc30bb6ee33805919805b5e1afc453 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 23:09:44 +0200 Subject: [PATCH 60/93] clarify note about arm --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f9751487..6e1a0698 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot ### Quickstart -ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). +ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both Arm and Intel CPUs). 
```bash pip3 install archivebox From ab4059bb56c6ead5a7a886a0aae113447ee49c9c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 23:11:22 +0200 Subject: [PATCH 61/93] formatting --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6e1a0698..6abf6185 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,8 @@ ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (r ```bash pip3 install archivebox -archivebox --version # install extras as-needed, or use the methods below +archivebox --version +# install extras manually as-needed, or use one of full setup methods below mkdir ~/archivebox && cd ~/archivebox archivebox init @@ -46,7 +47,7 @@ archivebox add 'https://example.com' archivebox help # to see more options ``` -***(click to expand the sections below for setup instructions)*** +***(click to expand the sections below for full setup instructions)***
Get ArchiveBox with docker-compose on any platform (recommended, everything included out-of-the-box) From 61b2b847b8cec4c6fde4e252d9f54c7a06e04a83 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 23:21:14 +0200 Subject: [PATCH 62/93] add note about included extras --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6abf6185..5f13cafe 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (r ```bash pip3 install archivebox archivebox --version -# install extras manually as-needed, or use one of full setup methods below +# install extras manually as-needed, or use one of full setup methods below to get everything included out-of-the-box mkdir ~/archivebox && cd ~/archivebox archivebox init From 997dccc67647d41eefd58b5b80bc4757e5f5e6dc Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 23:22:11 +0200 Subject: [PATCH 63/93] wording tweaks --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f13cafe..0d729b04 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (r ```bash pip3 install archivebox archivebox --version -# install extras manually as-needed, or use one of full setup methods below to get everything included out-of-the-box +# install extras as-needed, or use one of full setup methods below to get everything out-of-the-box mkdir ~/archivebox && cd ~/archivebox archivebox init From 29be247c6b3557dd70c1cbe9d0132c59d9fb6c12 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 23:23:56 +0200 Subject: [PATCH 64/93] formatting --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 0d729b04..064f005e 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ archivebox add 'https://example.com' 
archivebox help # to see more options ``` -***(click to expand the sections below for full setup instructions)*** +*(click to expand the sections below for full setup instructions)*
Get ArchiveBox with docker-compose on any platform (recommended, everything included out-of-the-box) @@ -102,7 +102,6 @@ docker run -v $PWD:/data -it archivebox/archivebox help # to see more options Get ArchiveBox with apt on Ubuntu >=20.04 ```bash -# first add the PPA and install the package sudo add-apt-repository -u ppa:archivebox/archivebox apt install archivebox From a28547cbcaf4140527586d8d58d80d833da29a21 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 11 Dec 2020 16:27:15 -0500 Subject: [PATCH 65/93] refactor: Remove get_empty_snapshot queryset function and generate it directly --- archivebox/index/__init__.py | 9 ++------- archivebox/main.py | 4 ++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index bf1d0c6a..53ce3f26 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -243,12 +243,6 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: log_indexing_process_finished() -@enforce_types -def get_empty_snapshot_queryset(out_dir: Path=OUTPUT_DIR): - setup_django(out_dir, check_db=True) - from core.models import Snapshot - return Snapshot.objects.none() - @enforce_types def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: """parse and load existing index with any new links from import_path merged in""" @@ -390,8 +384,9 @@ def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: color='red', ) raise SystemExit(2) + from core.models import Snapshot - qsearch = get_empty_snapshot_queryset() + qsearch = Snapshot.objects.none() for pattern in filter_patterns: try: qsearch |= query_search_index(pattern) diff --git a/archivebox/main.py b/archivebox/main.py index 756fecde..eb8cd6a0 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -29,7 +29,6 @@ from .util import enforce_types # type: ignore from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT from .index import ( 
load_main_index, - get_empty_snapshot_queryset, parse_links_from_source, dedupe_links, write_main_index, @@ -265,6 +264,7 @@ def run(subcommand: str, @enforce_types def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: """Initialize a new ArchiveBox collection in the current directory""" + from core.models import Snapshot Path(out_dir).mkdir(exist_ok=True) is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR) @@ -335,7 +335,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: print() print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI)) - all_links = get_empty_snapshot_queryset() + all_links = Snapshot.objects.none() pending_links: Dict[str, Link] = {} if existing_index: From 0682758e9e9d473d8fcc4294d85183607361edac Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 23:32:48 +0200 Subject: [PATCH 66/93] add note about windows being in beta --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 064f005e..654a1c52 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot ### Quickstart -ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both Arm and Intel CPUs). +ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS (stable), Windows (beta), and Linux/BSD (stable on both Intel and ARM CPUs). 
```bash pip3 install archivebox From c40ca7b8e946a8d8a3ca9ec91d41ed867e4c0ec8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Dec 2020 23:42:23 +0200 Subject: [PATCH 67/93] add scheduled archiving note --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 654a1c52..60bd60ee 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,9 @@ ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. -Your archive can be managed through the command line with commands like `archivebox add` or through the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. +Your archive can be managed through the command line with commands like `archivebox add`, through the built-in Web UI `archivebox server`, or via the Python library API (beta). It can ingest bookmarks from a browser or service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. You can also schedule regular/realtime imports with `archivebox schedule`. -The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and managable offline through the filesystem, the built-in webserver, or the Python API. 
+The main index is a self-contained `index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: several types of HTML snapshots (wget, Chrome headless, singlefile), PDF snapshotting, screenshotting, WARC archiving, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and manageable offline through the filesystem, the built-in webserver, or the Python library API. ### Quickstart @@ -188,7 +188,7 @@ For more information, see the Date: Fri, 11 Dec 2020 16:43:48 -0500 Subject: [PATCH 68/93] refactor: Remove setup_django from search --- archivebox/search/__init__.py | 3 +-- archivebox/search/backends/ripgrep.py | 3 +-- tests/test_oneshot.py | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index 360b20ff..6191ede9 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -6,7 +6,7 @@ from django.db.models import QuerySet from archivebox.index.schema import Link from archivebox.util import enforce_types -from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE +from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE from .utils import get_indexable_content, log_index_started @@ -49,7 +49,6 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: @enforce_types def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: - setup_django(out_dir, check_db=True) from core.models import Snapshot if search_backend_enabled(): diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py index ff02008d..e2e03c9b 100644
--- a/archivebox/search/backends/ripgrep.py +++ b/archivebox/search/backends/ripgrep.py @@ -2,7 +2,7 @@ import re from subprocess import run, PIPE, DEVNULL from typing import List, Generator -from archivebox.config import setup_django, ARCHIVE_DIR +from archivebox.config import ARCHIVE_DIR from archivebox.util import enforce_types RG_IGNORE_EXTENSIONS = ('css','js','orig','svg') @@ -30,7 +30,6 @@ def search(text: str) -> List[str]: if is_rg_installed.returncode: raise Exception("ripgrep binary not found, install ripgrep to use this search backend") - setup_django(check_db=True) from core.models import Snapshot rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)] diff --git a/tests/test_oneshot.py b/tests/test_oneshot.py index 4057a6ad..560ac43c 100644 --- a/tests/test_oneshot.py +++ b/tests/test_oneshot.py @@ -20,7 +20,6 @@ def test_oneshot_command_saves_page_in_right_folder(tmp_path, disable_extractors capture_output=True, env=disable_extractors_dict, ) - print(process.stdout) items = ' '.join([str(x) for x in tmp_path.iterdir()]) current_path = ' '.join([str(x) for x in Path.cwd().iterdir()]) assert "index.json" in items From ce53b0220c12c459c1bb06d95ccc47bd501b5b49 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 11 Dec 2020 17:36:31 -0500 Subject: [PATCH 69/93] refactor: Remove setup_django from index --- archivebox/index/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 53ce3f26..4f4ac3d4 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -18,7 +18,6 @@ from ..util import ( ExtendedEncoder, ) from ..config import ( - setup_django, ARCHIVE_DIR_NAME, SQL_INDEX_FILENAME, JSON_INDEX_FILENAME, @@ -246,7 +245,6 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: @enforce_types def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: """parse and load 
existing index with any new links from import_path merged in""" - setup_django(out_dir, check_db=True) from core.models import Snapshot try: return Snapshot.objects.all() From 57d1a3d4e546d2383f23533bd743568a2621ee03 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 11 Dec 2020 17:49:16 -0500 Subject: [PATCH 70/93] refactor: Remove setup_django from html.py --- archivebox/index/html.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 4ead04ce..a62e2c7e 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -23,7 +23,6 @@ from ..config import ( GIT_SHA, FOOTER_INFO, HTML_INDEX_FILENAME, - setup_django, ) MAIN_INDEX_TEMPLATE = 'main_index.html' @@ -111,7 +110,6 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str: """render a given html template string with the given template content""" from django.template.loader import render_to_string - setup_django(check_db=False) return render_to_string(template, context) From a57a5b6b837770996fb44f2271fcc92872cec636 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 11 Dec 2020 18:02:56 -0500 Subject: [PATCH 71/93] refactor: call setup_django with the `check_db` attribute for the commands that actually need the database --- archivebox/cli/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 3df41809..f9a55efd 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -63,7 +63,7 @@ def run_subcommand(subcommand: str, if subcommand not in meta_cmds: from ..config import setup_django - setup_django(in_memory_db=subcommand in fake_db) + setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds) module = import_module('.archivebox_{}'.format(subcommand), __package__) module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore From c3ced9d8258c44165420825804a6dc654ac8aa32 Mon Sep 17 
00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:10:55 +0200 Subject: [PATCH 72/93] add section break --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 60bd60ee..a7060c5f 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,9 @@ archivebox help # to see more options
- + +--- +

From d421ab58c1734a47bd6a1859238b041edde426e5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:30:18 +0200 Subject: [PATCH 73/93] add static export instructions --- README.md | 110 ++++++++++-------------------------------------------- 1 file changed, 19 insertions(+), 91 deletions(-) diff --git a/README.md b/README.md index a7060c5f..2fc01818 100644 --- a/README.md +++ b/README.md @@ -107,14 +107,20 @@ apt install archivebox # create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox +npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' archivebox init archivebox --version -# start the webserver and open the UI (optional) +# start the webserver and open the web UI (optional) archivebox manage createsuperuser archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 +# to export a static HTML/json version of the index: +archivebox list --html --with-headers > index.html +archivebox list --json --with-headers > index.json +open ./index.html + # you can also add links and manage your archive via the CLI: archivebox add 'https://example.com' archivebox status @@ -131,6 +137,7 @@ brew install archivebox/archivebox/archivebox # create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox +npm install --prefix . 
'git+https://github.com/ArchiveBox/ArchiveBox.git' archivebox init archivebox --version @@ -139,6 +146,11 @@ archivebox manage createsuperuser archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 +# to export a static HTML/json version of the index: +archivebox list --html --with-headers > index.html +archivebox list --json --with-headers > index.json +open ./index.html + # you can also add links and manage your archive via the CLI: archivebox add 'https://example.com' archivebox status @@ -155,6 +167,7 @@ pip3 install archivebox # create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox +npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' archivebox init archivebox --version # Install any missing extras like wget/git/chrome/etc. manually as needed @@ -164,6 +177,11 @@ archivebox manage createsuperuser archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 +# to export a static HTML/json version of the index: +archivebox list --html --with-headers > index.html +archivebox list --json --with-headers > index.json +open ./index.html + # you can also add links and manage your archive via the CLI: archivebox add 'https://example.com' archivebox status @@ -171,7 +189,6 @@ archivebox help # to see more options ```
- --- @@ -308,95 +325,6 @@ archivebox add 'https://example.com#2020-10-25' --- -# Setup - -## Docker Compose - -*This is the recommended way of running ArchiveBox.* - -It comes with everything working out of the box, including all extractors, -a headless browser runtime, a full webserver, and CLI interface. - -```bash -# docker-compose run archivebox [args] - -mkdir archivebox && cd archivebox -wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml' -docker-compose run archivebox init -docker-compose run archivebox add 'https://example.com' -docker-compose run archivebox manage createsuperuser -docker-compose up -open http://127.0.0.1:8000 -``` - -## Docker - -```bash -# docker run -v $PWD:/data -it archivebox/archivebox [args] - -mkdir archivebox && cd archivebox -docker run -v $PWD:/data -it archivebox/archivebox init -docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com' -docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser - -# run the webserver to access the web UI -docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000 -open http://127.0.0.1:8000 - -# or export a static version of the index if you dont want to run a server -docker run -v $PWD:/data -it archivebox/archivebox list --html --with-headers > index.html -docker run -v $PWD:/data -it archivebox/archivebox list --json --with-headers > index.json -open ./index.html -``` - - -## Bare Metal - -```bash -# archivebox [args] - -# on Debian/Ubuntu -sudo add-apt-repository -u ppa:archivebox/archivebox -apt install archivebox - -# on macOS -brew install archivebox/archivebox/archivebox -``` - -Initialize your archive in a directory somewhere and add some links: -```bash -mkdir ~/archivebox && cd archivebox -npm install --prefix . 
'git+https://github.com/ArchiveBox/ArchiveBox.git' -archivebox init -archivebox add 'https://example.com' # add URLs as args pipe them in via stdin -archivebox add --depth=1 https://example.com/table-of-contents.html -# it can injest links from many formats, including RSS/JSON/XML/MD/TXT and more -curl https://getpocket.com/users/USERNAME/feed/all | archivebox add -``` - -Start the webserver to access the web UI: -```bash -archivebox manage createsuperuser -archivebox server 0.0.0.0:8000 - -open http://127.0.0.1:8000 -``` - -Or export a static HTML version of the index if you don't want to run a webserver: -```bash -archivebox list --html --with-headers > index.html -archivebox list --json --with-headers > index.json -open ./index.html -``` - -To view more information about your dependencies, data, or the CLI: -```bash -archivebox version -archivebox status -archivebox help -``` ---- -
From 0a97f502ff90fae730bbbbf464e9420a7720791b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:33:03 +0200 Subject: [PATCH 74/93] make setup instructions shorter --- README.md | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 2fc01818..82e2f9cf 100644 --- a/README.md +++ b/README.md @@ -116,14 +116,11 @@ archivebox manage createsuperuser archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 -# to export a static HTML/json version of the index: -archivebox list --html --with-headers > index.html -archivebox list --json --with-headers > index.json -open ./index.html - -# you can also add links and manage your archive via the CLI: +# you can also add URLs and manage the archive via the CLI and filesystem: archivebox add 'https://example.com' archivebox status +archivebox list --html --with-headers > index.html +archivebox list --json --with-headers > index.json archivebox help # to see more options ``` @@ -141,19 +138,16 @@ npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' archivebox init archivebox --version -# start the webserver and open the UI (optional) +# start the webserver and open the web UI (optional) archivebox manage createsuperuser archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 -# to export a static HTML/json version of the index: -archivebox list --html --with-headers > index.html -archivebox list --json --with-headers > index.json -open ./index.html - -# you can also add links and manage your archive via the CLI: +# you can also add URLs and manage the archive via the CLI and filesystem: archivebox add 'https://example.com' archivebox status +archivebox list --html --with-headers > index.html +archivebox list --json --with-headers > index.json archivebox help # to see more options ``` @@ -172,19 +166,16 @@ archivebox init archivebox --version # Install any missing extras like wget/git/chrome/etc. 
manually as needed -# start the webserver and open the UI (optional) +# start the webserver and open the web UI (optional) archivebox manage createsuperuser archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 -# to export a static HTML/json version of the index: -archivebox list --html --with-headers > index.html -archivebox list --json --with-headers > index.json -open ./index.html - -# you can also add links and manage your archive via the CLI: +# you can also add URLs and manage the archive via the CLI and filesystem: archivebox add 'https://example.com' archivebox status +archivebox list --html --with-headers > index.html +archivebox list --json --with-headers > index.json archivebox help # to see more options ``` From be9a1a9ad7f4be2e60731d1ca619ebe4e781c2d3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:37:07 +0200 Subject: [PATCH 75/93] add note about why docker-compose is the recommended method --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 82e2f9cf..b2830247 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,7 @@ archivebox help # to see more options Get ArchiveBox with docker-compose on any platform (recommended, everything included out-of-the-box) First make sure you have Docker installed: https://docs.docker.com/get-docker/ +This is the recommended way to run ArchiveBox because it includes *all* the extractors like chrome, wget, youtube-dl, git, etc., as well as full-text search with sonic, and many other great features. 
```bash # create a new empty directory and initalize your collection (can be anywhere) From 34c30401d74b7260de2aa13630e2c2e5264f1d42 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:37:39 +0200 Subject: [PATCH 76/93] missing newline --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b2830247..00bb4a5b 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ archivebox help # to see more options
Get ArchiveBox with docker-compose on any platform (recommended, everything included out-of-the-box) -First make sure you have Docker installed: https://docs.docker.com/get-docker/ +First make sure you have Docker installed: https://docs.docker.com/get-docker/ This is the recommended way to run ArchiveBox because it includes *all* the extractors like chrome, wget, youtube-dl, git, etc., as well as full-text search with sonic, and many other great features. ```bash From bfd21a3912873be6c66e8338e52a7017ca591956 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:40:58 +0200 Subject: [PATCH 77/93] add note about archivebox schedule --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 00bb4a5b..5e1c6de3 100644 --- a/README.md +++ b/README.md @@ -248,7 +248,7 @@ archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12' See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. -It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly. +It also includes a built-in scheduled import feature with `archivebox schedule` and browser bookmarklet, so you can pull in URLs from RSS feeds, websites, or the filesystem regularly/on-demand. 
## Output formats From 13c8b976be298295ac61d3211405773e22322c0a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:43:37 +0200 Subject: [PATCH 78/93] add more examples to quickstart --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 5e1c6de3..0362f9f3 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ archivebox --version mkdir ~/archivebox && cd ~/archivebox archivebox init archivebox add 'https://example.com' +archivebox schedule --every day https://getpocket.com/users/USERNAME/feed/all +archivebox oneshot --extract=media https://www.youtube.com/watch?v=dQw4w9WgXcQ archivebox help # to see more options ``` From d82d864f54b648da48f1c06b1b26f2feadd243e1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:45:07 +0200 Subject: [PATCH 79/93] add more examples --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0362f9f3..e050bce0 100644 --- a/README.md +++ b/README.md @@ -44,8 +44,9 @@ archivebox --version mkdir ~/archivebox && cd ~/archivebox archivebox init archivebox add 'https://example.com' +archivebox add --depth=1 'https://example.com' archivebox schedule --every day https://getpocket.com/users/USERNAME/feed/all -archivebox oneshot --extract=media https://www.youtube.com/watch?v=dQw4w9WgXcQ +archivebox oneshot --extract=title,favicon,media https://www.youtube.com/watch?v=dQw4w9WgXcQ archivebox help # to see more options ``` From 87eccfdbda2ca8b56a9f767b9aebbe777ff402c3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:45:27 +0200 Subject: [PATCH 80/93] newline --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e050bce0..89363268 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ archivebox --version mkdir ~/archivebox && cd ~/archivebox archivebox init + archivebox add 'https://example.com' archivebox add --depth=1 'https://example.com' archivebox 
schedule --every day https://getpocket.com/users/USERNAME/feed/all From d9e76752b7165dcb1005607ded0134be759e1e78 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:45:54 +0200 Subject: [PATCH 81/93] add comment --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 89363268..3f70c61b 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ pip3 install archivebox archivebox --version # install extras as-needed, or use one of full setup methods below to get everything out-of-the-box -mkdir ~/archivebox && cd ~/archivebox +mkdir ~/archivebox && cd ~/archivebox # this can be anywhere archivebox init archivebox add 'https://example.com' From 327308573ad4961c1d3f0db089ab196cc1a9afcf Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:50:16 +0200 Subject: [PATCH 82/93] better dev instructions in readme --- README.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3f70c61b..b64b1f35 100644 --- a/README.md +++ b/README.md @@ -435,22 +435,18 @@ All contributions to ArchiveBox are welcomed! Check our [issues](https://github. First, install the system dependencies from the "Bare Metal" section above. 
Then you can clone the ArchiveBox repo and install ```python3 -git clone https://github.com/ArchiveBox/ArchiveBox -cd ArchiveBox +git clone https://github.com/ArchiveBox/ArchiveBox && cd ArchiveBox git checkout master # or the branch you want to test -git pull -git submodule init -git submodule update +git pull --recurse-submodules # Install ArchiveBox + python dependencies python3 -m venv .venv && source .venv/bin/activate && pip install -e .[dev] -# or -pipenv install --dev && pipenv shell +# or with pipenv: pipenv install --dev && pipenv shell # Install node dependencies npm install -# Optional: install the extractor dependencies +# Optional: install extractor dependencies manually or with helper script ./bin/setup.sh # Optional: develop via docker by mounting the code dir into the container @@ -490,6 +486,8 @@ You can also run all these in Docker. For more examples see the Github Actions C # or individually: ./bin/build_docs.sh ./bin/build_pip.sh +./bin/build_deb.sh +./bin/build_brew.sh ./bin/build_docker.sh ``` From 33d360bcff9164ca9fe65802c6666b639a2d058a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:53:32 +0200 Subject: [PATCH 83/93] better install instructions --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b64b1f35..bfeda329 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ The main index is a self-contained `index.sqlite3` file, and each snapshot is st ### Quickstart -ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS (stable), Windows (beta), and Linux/BSD (stable on both Intel and ARM CPUs). 
+ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). +It works on macOS (with `brew`/`pip3`), Windows (beta with `docker`/`pip3`), and Linux/BSD (Intel and ARM CPUs with `docker`/`apt`/`pip3`). ```bash pip3 install archivebox @@ -56,7 +57,8 @@ archivebox help # to see more options
Get ArchiveBox with docker-compose on any platform (recommended, everything included out-of-the-box) -First make sure you have Docker installed: https://docs.docker.com/get-docker/ +First make sure you have Docker installed: https://docs.docker.com/get-docker/ +

This is the recommended way to run ArchiveBox because it includes *all* the extractors like chrome, wget, youtube-dl, git, etc., as well as full-text search with sonic, and many other great features. ```bash @@ -82,8 +84,7 @@ docker-compose run archivebox help # to see more options
Get ArchiveBox with docker on any platform -First make sure you have Docker installed: https://docs.docker.com/get-docker/ - +First make sure you have Docker installed: https://docs.docker.com/get-docker/
```bash # create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox From 096749da8759e4435dfb049b4273783b5c2eb3f6 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 01:55:10 +0200 Subject: [PATCH 84/93] better spacing --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bfeda329..fc009b89 100644 --- a/README.md +++ b/README.md @@ -206,7 +206,7 @@ ArchiveBox is a command line tool, self-hostable web-archiving server, and Pytho To use ArchiveBox you start by creating a folder for your data to live in (it can be anywhere on your system), and running `archivebox init` inside of it. That will create a sqlite3 index and an `ArchiveBox.conf` file. After that, you can continue to add/export/manage/etc using the CLI `archivebox help`, or you can run the Web UI (recommended). If you only want to archive a single site, you can run `archivebox oneshot` to avoid having to create a whole collection. -The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage. +The CLI is considered "stable", the ArchiveBox Python API and REST APIs are "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is "alpha". At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots. 
From ebc5a2bd3c5e16b4584324bf7297385739642353 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 02:03:51 +0200 Subject: [PATCH 85/93] release notes --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fc009b89..b7359c1f 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ The main index is a self-contained `index.sqlite3` file, and each snapshot is st ### Quickstart ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). -It works on macOS (with `brew`/`pip3`), Windows (beta with `docker`/`pip3`), and Linux/BSD (Intel and ARM CPUs with `docker`/`apt`/`pip3`). +It works on Linux/BSD (Intel and ARM CPUs with `docker`/`apt`/`pip3`), macOS (with `brew`/`pip3`), and Windows (beta with `docker`/`pip3`). 
```bash pip3 install archivebox From 0cff57da027c554e603519565b2a6b5586380fc3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 13:10:42 +0200 Subject: [PATCH 86/93] minor readme tweaks --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b7359c1f..61fe1753 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ archivebox init archivebox add 'https://example.com' archivebox add --depth=1 'https://example.com' -archivebox schedule --every day https://getpocket.com/users/USERNAME/feed/all +archivebox schedule --every=day https://getpocket.com/users/USERNAME/feed/all archivebox oneshot --extract=title,favicon,media https://www.youtube.com/watch?v=dQw4w9WgXcQ archivebox help # to see more options ``` @@ -298,8 +298,8 @@ archivebox add 'https://example.com/any/url/you/want/to/keep/secret/' # without first disabling share the URL with 3rd party APIs: archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org -archivebox config --set SAVE_FAVICON=False # optional: only the domain is leaked, not full URL -archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google +archivebox config --set SAVE_FAVICON=False # optional: only the domain is leaked, not full URL +archivebox config --set CHROME_BINARY=chromium # optional: switch to chromium to avoid Chrome phoning home to Google ``` Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. 
From 1b4f8788959190f7351824a8ed6031bc51b1a9da Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 13:20:24 +0200 Subject: [PATCH 87/93] add deb sources --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 61fe1753..5f9aed21 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,7 @@ The main index is a self-contained `index.sqlite3` file, and each snapshot is st ### Quickstart -ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). -It works on Linux/BSD (Intel and ARM CPUs with `docker`/`apt`/`pip3`), macOS (with `brew`/`pip3`), and Windows (beta with `docker`/`pip3`). +It works on Linux/BSD (Intel and ARM CPUs with `docker`/`apt`/`pip3`), macOS (with `docker`/`brew`/`pip3`), and Windows (beta with `docker`/`pip3`). ```bash pip3 install archivebox @@ -109,7 +108,7 @@ docker run -v $PWD:/data -it archivebox/archivebox help # to see more options ```bash sudo add-apt-repository -u ppa:archivebox/archivebox -apt install archivebox +sudo apt install archivebox # create a new empty directory and initalize your collection (can be anywhere) mkdir ~/archivebox && cd ~/archivebox @@ -130,6 +129,13 @@ archivebox list --json --with-headers > index.json archivebox help # to see more options ``` +For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`: +```bash +deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main +deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main +``` +(you may need to install some other dependencies manually however) +
From 31ab762ee1de45d9435a356622b227d581607150 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 13:25:03 +0200 Subject: [PATCH 88/93] add missing outputs to readme list --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 5f9aed21..54e0b24e 100644 --- a/README.md +++ b/README.md @@ -274,11 +274,14 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te - **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details - **Title:** `title` title of the site - **Favicon:** `favicon.ico` favicon of the site +- **Headers:** `headers.json` Any HTTP headers the site returns are saved in a json file +- **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile - **WGET Clone:** `example.com/page-name.html` wget clone of the site, with .html appended if not present - **WARC:** `warc/.gz` gzipped WARC of all the resources fetched while archiving - **PDF:** `output.pdf` Printed PDF of site using headless chrome - **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome +- **Readability:** `article.html/json` Article text extraction using Readability - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links From 24d4c446247aafbef9787cfb9fd9a78675437b52 Mon Sep 17 00:00:00 2001 From: jdcaballerov Date: Sat, 12 Dec 2020 07:36:31 -0500 Subject: [PATCH 89/93] Add ripgrep configs --- archivebox/config.py | 11 +++++++++++ archivebox/search/backends/ripgrep.py | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 
a3444f07..d3e34151 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -161,6 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'USE_CHROME': {'type': bool, 'default': True}, 'USE_NODE': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, + 'USE_RIPGREP': {'type': bool, 'default': True}, 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, @@ -170,6 +171,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'NODE_BINARY': {'type': str, 'default': 'node'}, + 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, 'CHROME_BINARY': {'type': str, 'default': None}, 'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, @@ -312,6 +314,8 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, + 'USE_RIPGREP': {'default': lambda c: c['USE_RIPGREP']}, + 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, @@ -827,6 +831,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_CHROME'], 'is_valid': bool(config['CHROME_VERSION']), }, + 'RIPGREP_BINARY': { + 'path': bin_path(config['RIPGREP_BINARY']), + 'version': config['RIPGREP_VERSION'], + 'hash': bin_hash(config['RIPGREP_BINARY']), + 'enabled': config['USE_RIPGREP'], + 'is_valid': bool(config['RIPGREP_VERSION']), + }, } def get_chrome_info(config: ConfigDict) -> ConfigValue: diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py index e2e03c9b..b37eca20 100644 --- 
a/archivebox/search/backends/ripgrep.py +++ b/archivebox/search/backends/ripgrep.py @@ -2,7 +2,7 @@ import re from subprocess import run, PIPE, DEVNULL from typing import List, Generator -from archivebox.config import ARCHIVE_DIR +from archivebox.config import ARCHIVE_DIR, RIPGREP_BINARY from archivebox.util import enforce_types RG_IGNORE_EXTENSIONS = ('css','js','orig','svg') @@ -26,7 +26,7 @@ def flush(snapshot_ids: Generator[str, None, None]): @enforce_types def search(text: str) -> List[str]: - is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL) + is_rg_installed = run(['which', RIPGREP_BINARY], stdout=DEVNULL, stderr=DEVNULL) if is_rg_installed.returncode: raise Exception("ripgrep binary not found, install ripgrep to use this search backend") From 50df10886346f12d16124fd8cf5a09a41ff9ee3c Mon Sep 17 00:00:00 2001 From: jdcaballerov <743513+jdcaballerov@users.noreply.github.com> Date: Sat, 12 Dec 2020 08:34:00 -0500 Subject: [PATCH 90/93] Update archivebox/config.py Co-authored-by: Nick Sweeting --- archivebox/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/archivebox/config.py b/archivebox/config.py index d3e34151..6c42eef5 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -314,7 +314,6 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, - 'USE_RIPGREP': {'default': lambda c: c['USE_RIPGREP']}, 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, From aa53f4f088bd5eca63db394d71597c32cdcb9d6c Mon Sep 17 00:00:00 2001 From: jdcaballerov <743513+jdcaballerov@users.noreply.github.com> Date: Sat, 12 Dec 2020 08:36:01 -0500 Subject: [PATCH 91/93] Update archivebox/search/backends/ripgrep.py Co-authored-by: Nick Sweeting --- archivebox/search/backends/ripgrep.py | 
4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py index b37eca20..b6532bfd 100644 --- a/archivebox/search/backends/ripgrep.py +++ b/archivebox/search/backends/ripgrep.py @@ -26,8 +26,7 @@ def flush(snapshot_ids: Generator[str, None, None]): @enforce_types def search(text: str) -> List[str]: - is_rg_installed = run(['which', RIPGREP_BINARY], stdout=DEVNULL, stderr=DEVNULL) - if is_rg_installed.returncode: + if not RIPGREP_VERSION: raise Exception("ripgrep binary not found, install ripgrep to use this search backend") from core.models import Snapshot @@ -44,4 +43,3 @@ def search(text: str) -> List[str]: snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)] return snap_ids - From 9b6afa36a386c9e8f7c8d09c8f7a80ec70a285db Mon Sep 17 00:00:00 2001 From: jdcaballerov <743513+jdcaballerov@users.noreply.github.com> Date: Sat, 12 Dec 2020 08:36:08 -0500 Subject: [PATCH 92/93] Update archivebox/search/backends/ripgrep.py Co-authored-by: Nick Sweeting --- archivebox/search/backends/ripgrep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py index b6532bfd..887a66d6 100644 --- a/archivebox/search/backends/ripgrep.py +++ b/archivebox/search/backends/ripgrep.py @@ -2,7 +2,7 @@ import re from subprocess import run, PIPE, DEVNULL from typing import List, Generator -from archivebox.config import ARCHIVE_DIR, RIPGREP_BINARY +from archivebox.config import ARCHIVE_DIR, RIPGREP_VERSION from archivebox.util import enforce_types RG_IGNORE_EXTENSIONS = ('css','js','orig','svg') From 326fe69eead7d5509ae9fa4ed716474536b37847 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 12 Dec 2020 12:35:32 -0500 Subject: [PATCH 93/93] fix lint error --- archivebox/search/backends/ripgrep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py index 887a66d6..840d2d2d 100644 --- a/archivebox/search/backends/ripgrep.py +++ b/archivebox/search/backends/ripgrep.py @@ -1,5 +1,5 @@ import re -from subprocess import run, PIPE, DEVNULL +from subprocess import run, PIPE from typing import List, Generator from archivebox.config import ARCHIVE_DIR, RIPGREP_VERSION