diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 220707b9..086e3d7b 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,8 +1,8 @@
---
name: 🐞 Bug report
about: Create a report to help us improve
-title: 'Bugfix: ...'
-labels: 'changes: bugfixes'
+title: 'Bug: ...'
+labels: 'bug'
assignees: ''
---
diff --git a/.gitmodules b/.gitmodules
index 0993934a..196c9a92 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -23,3 +23,6 @@
[submodule "archivebox/vendor/django-taggit"]
path = archivebox/vendor/django-taggit
url = https://github.com/jazzband/django-taggit
+[submodule "archivebox/vendor/python-atomicwrites"]
+ path = archivebox/vendor/python-atomicwrites
+ url = https://github.com/untitaker/python-atomicwrites
diff --git a/Dockerfile b/Dockerfile
index 8cf2da30..b11d3382 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -50,13 +50,6 @@ RUN apt-get update -qq \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& rm -rf /var/lib/apt/lists/*
-# Install apt development dependencies
-# RUN apt-get install -qq \
-# && apt-get install -qq -y --no-install-recommends \
-# python3 python3-dev python3-pip python3-venv python3-all \
-# dh-python debhelper devscripts dput software-properties-common \
-# python3-distutils python3-setuptools python3-wheel python3-stdeb
-
# Install Node environment
RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
&& echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \
@@ -79,17 +72,26 @@ WORKDIR "$CODE_DIR"
ENV PATH="${PATH}:$VENV_PATH/bin"
RUN python -m venv --clear --symlinks "$VENV_PATH" \
&& pip install --upgrade --quiet pip setuptools
-ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt"
+ADD "./setup.py" "$CODE_DIR/"
+ADD "./README.md" "./package.json" "$CODE_DIR/archivebox/"
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
build-essential python-dev python3-dev \
- # && pip install --upgrade pip \
- && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \
- && pip install --quiet "sonic-client==0.0.5" \
+ && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
+ && pip install --quiet -r /tmp/requirements.txt \
&& apt-get purge -y build-essential python-dev python3-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
+# Install apt development dependencies
+# RUN apt-get install -qq \
+# && apt-get install -qq -y --no-install-recommends \
+# python3 python3-dev python3-pip python3-venv python3-all \
+# dh-python debhelper devscripts dput software-properties-common \
+# python3-distutils python3-setuptools python3-wheel python3-stdeb
+# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \
+ # && pip install --quiet -r /tmp/dev_requirements.txt
+
# Install ArchiveBox Python package and its dependencies
WORKDIR "$CODE_DIR"
ADD . "$CODE_DIR"
@@ -115,5 +117,8 @@ RUN /app/bin/docker_entrypoint.sh archivebox version
VOLUME "$DATA_DIR"
EXPOSE 8000
+HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
+ CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
+
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
-CMD ["archivebox", "server", "0.0.0.0:8000"]
+CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]
diff --git a/README.md b/README.md
index 76b51be3..d6c3d8ff 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ archivebox help
### Quickstart
-**🖥 Supported OSs:** Linux/BSD, macOS, Windows **🎮 CPU Architectures:** x86, amd64, arm7, arm8 (raspi >=3)
+**🖥 Supported OSs:** Linux/BSD, macOS, Windows (w/ Docker) **🎮 CPU Architectures:** x86, amd64, arm7, arm8 (raspi >=3)
**📦 Distributions:** `docker`/`apt`/`brew`/`pip3`/`npm` (in order of completeness)
*(click to expand your preferred **► `distribution`** below for full setup instructions)*
@@ -103,22 +103,29 @@ archivebox help
First make sure you have Docker installed: https://docs.docker.com/get-docker/
+Download the [`docker-compose.yml`](https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml) file.
-# create a new empty directory and initalize your collection (can be anywhere)
-mkdir ~/archivebox && cd ~/archivebox
curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
-docker-compose run archivebox init
-docker-compose run archivebox --version
+
-# start the webserver and open the UI (optional)
+Start the server.
+
+docker-compose run archivebox server --quick-init
docker-compose run archivebox manage createsuperuser
-docker-compose up -d
-open 'http://127.0.0.1:8000'
+
+Open [`http://127.0.0.1:8000`](http://127.0.0.1:8000).
+
+
# you can also add links and manage your archive via the CLI:
docker-compose run archivebox add 'https://example.com'
docker-compose run archivebox status
docker-compose run archivebox help # to see more options
+
+# when passing stdin/stdout via the cli, use the -T flag
+echo 'https://example.com' | docker-compose run -T archivebox add
+docker-compose run -T archivebox list --html --with-headers > index.html
This is the recommended way to run ArchiveBox because it includes all the extractors like:
@@ -127,7 +134,7 @@ chrome, wget, youtube-dl, git, etc., full-text search w/ sonic, and many other g
-Get ArchiveBox with docker on any platform
+Get ArchiveBox with docker on macOS/Linux/Windows
First make sure you have Docker installed: https://docs.docker.com/get-docker/
@@ -145,21 +152,30 @@ open http://127.0.0.1:8000
docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'
docker run -v $PWD:/data -it archivebox/archivebox status
docker run -v $PWD:/data -it archivebox/archivebox help # to see more options
+
+# when passing stdin/stdout via the cli, use only -i (not -it)
+echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add
+docker run -v $PWD:/data -i archivebox/archivebox list --html --with-headers > index.html
-Get ArchiveBox with apt on Ubuntu >=20.04
+Get ArchiveBox with apt on Ubuntu/Debian
-First make sure you're on Ubuntu >= 20.04, or scroll down for older/non-Ubuntu instructions.
+This method should work on all Ubuntu/Debian-based systems, including x86, amd64, arm7, and arm8 CPUs (e.g. Raspberry Pi >=3).
+
+If you're on Ubuntu >= 20.04, add the `apt` repository with `add-apt-repository`:
+(on other Ubuntu/Debian-based systems follow the ♰ instructions below)
# add the repo to your sources and install the archivebox package using apt
sudo apt install software-properties-common
sudo add-apt-repository -u ppa:archivebox/archivebox
sudo apt install archivebox
+
+
# create a new empty directory and initialize your collection (can be anywhere)
mkdir ~/archivebox && cd ~/archivebox
npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
@@ -178,21 +194,25 @@ archivebox list --json --with-headers > index.json
archivebox help # to see more options
-For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`:
+♰ On other Ubuntu/Debian-based systems add these sources directly to /etc/apt/sources.list:
-deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
-deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
+echo "deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" > /etc/apt/sources.list.d/archivebox.list
+echo "deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" >> /etc/apt/sources.list.d/archivebox.list
+sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys C258F79DCC02E369
+sudo apt update
+sudo apt install archivebox
+sudo snap install chromium
+archivebox --version
+# then scroll back up and continue the initialization instructions above
-Then run `apt update; apt install archivebox; archivebox --version`.
-
(you may need to install some other dependencies manually however)
-Get ArchiveBox with brew on macOS >=10.13
+Get ArchiveBox with brew on macOS
First make sure you have Homebrew installed: https://brew.sh/#install
@@ -252,13 +272,12 @@ archivebox help # to see more options
No matter which install method you choose, they all roughly follow this 3-step process and all provide the same CLI, Web UI, and on-disk data format.
-
-
-1. Install ArchiveBox: `apt/brew/pip3 install archivebox`
-2. Start a collection: `archivebox init`
-3. Start archiving: `archivebox add 'https://example.com'`
-
-
+
+- Install ArchiveBox: apt/brew/pip3 install archivebox
+- Start a collection: archivebox init
+- Start archiving: archivebox add 'https://example.com'
+- View the archive: archivebox server or archivebox list ..., ls ./archive/*/index.html
+
@@ -307,8 +326,13 @@ archivebox add < ~/Downloads/firefox_bookmarks_export.html
archivebox add < any_text_with_urls_in_it.txt
archivebox add --depth=1 'https://example.com/some/downloads.html'
archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12'
-```
+# (if using docker add -i when passing via stdin)
+echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add
+
+# (if using docker-compose add -T when passing via stdin)
+echo 'https://example.com' | docker-compose run -T archivebox add
+```
-
TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file)
-
[Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](http://i.imgur.com/AtcvUZA.png), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](http://help.opera.com/Windows/12.10/en/importexport.html), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive))
@@ -328,6 +352,8 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
# to browse your index statically without running the archivebox server, run:
archivebox list --html --with-headers > index.html
archivebox list --json --with-headers > index.json
+# if running these commands with docker-compose, add -T:
+# docker-compose run -T archivebox list ...
# then open the static index in a browser
open index.html
@@ -338,13 +364,13 @@ ls ./archive/
/
- **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
- **Title**, **Favicon**, **Headers** Response headers, site favicon, and parsed site title
+- **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile
- **Wget Clone:** `example.com/page-name.html` wget clone of the site with `warc/.gz`
- Chrome Headless
- - **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile
- **PDF:** `output.pdf` Printed PDF of site using headless chrome
- **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome
- **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
- - **Readability:** `article.html/json` Article text extraction using Readability
+- **Article Text:** `article.html/json` Article text extraction using Readability & Mercury
- **Archive.org Permalink:** `archive.org.txt` A link to the saved site on archive.org
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
@@ -534,7 +560,8 @@ Whether you want to learn which organizations are the big players in the web arc
_A collection of the most active internet archiving communities and initiatives._
- Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
-- Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
+- Reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter
+- Hire us to develop an internet archiving solution for you [@MonadicalSAS](https://twitter.com/MonadicalSAS) [Monadical.com](https://monadical.com)
@@ -719,7 +746,10 @@ archivebox manage dbshell
-This project is maintained mostly in my spare time with the help from generous contributors and Monadical.com.
+
+This project is maintained mostly in my spare time with help from generous contributors and Monadical (✨ hire them for dev work!).
+
+
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index f9a55efd..890065a4 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -63,7 +63,11 @@ def run_subcommand(subcommand: str,
if subcommand not in meta_cmds:
from ..config import setup_django
- setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds)
+
+ cmd_requires_db = subcommand in archive_cmds
+ init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
+
+ setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 41c7554d..a96888b0 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -22,6 +22,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
add_help=True,
formatter_class=SmartFormatter,
)
+ parser.add_argument(
+ '--tag', '-t',
+ type=str,
+ default='',
+ help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
+ )
parser.add_argument(
'--update-all', #'-n',
action='store_true',
@@ -75,7 +81,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
)
command = parser.parse_args(args or ())
urls = command.urls
- stdin_urls = accept_stdin(stdin)
+
+ stdin_urls = ''
+ if not urls:
+ stdin_urls = accept_stdin(stdin)
+
if (stdin_urls and urls) or (not stdin and not urls):
stderr(
'[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
@@ -85,6 +95,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
add(
urls=stdin_urls or urls,
depth=command.depth,
+ tag=command.tag,
update_all=command.update_all,
index_only=command.index_only,
overwrite=command.overwrite,
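A note on the pattern introduced here: stdin is only consulted when no positional URLs were given, so piped input and CLI arguments stay mutually exclusive and the command never blocks waiting on an unpiped stdin. A rough sketch of the guard, with `accept_stdin()`'s behavior assumed (the real helper lives in `logging_util.py`):

```python
# Rough sketch of the stdin-vs-arguments precedence used above; accept_stdin's
# exact behavior is an assumption (return piped text, or '' when stdin is a TTY).
import sys

def accept_stdin(stdin=None):
    stdin = stdin or sys.stdin
    if stdin and not stdin.isatty():   # something was actually piped in
        return stdin.read().strip()
    return ''

def resolve_urls(urls):
    stdin_urls = ''
    if not urls:                       # never touch stdin if args were passed
        stdin_urls = accept_stdin()
    if stdin_urls and urls:
        raise SystemExit('[X] Pass URLs via stdin OR as arguments, not both')
    return stdin_urls or urls
```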
diff --git a/archivebox/cli/archivebox_config.py b/archivebox/cli/archivebox_config.py
index f81286c6..25621972 100644
--- a/archivebox/cli/archivebox_config.py
+++ b/archivebox/cli/archivebox_config.py
@@ -45,7 +45,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
help='KEY or KEY=VALUE formatted config values to get or set',
)
command = parser.parse_args(args or ())
- config_options_str = accept_stdin(stdin)
+
+ config_options_str = ''
+ if not command.config_options:
+ config_options_str = accept_stdin(stdin)
config(
config_options_str=config_options_str,
diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py
index 6255ef26..5753269c 100755
--- a/archivebox/cli/archivebox_init.py
+++ b/archivebox/cli/archivebox_init.py
@@ -27,11 +27,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
action='store_true',
help='Ignore unrecognized files in current directory and initialize anyway',
)
+ parser.add_argument(
+ '--quick', '-q',
+ action='store_true',
+ help='Run any updates or migrations without rechecking all snapshot dirs',
+ )
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
init(
force=command.force,
+ quick=command.quick,
out_dir=pwd or OUTPUT_DIR,
)
diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
index 3838cf60..5477bfc8 100644
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -12,6 +12,7 @@ from ..main import list_all
from ..util import docstring
from ..config import OUTPUT_DIR
from ..index import (
+ LINK_FILTERS,
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@@ -23,7 +24,7 @@ from ..index import (
get_corrupted_folders,
get_unrecognized_folders,
)
-from ..logging_util import SmartFormatter, accept_stdin, stderr
+from ..logging_util import SmartFormatter, reject_stdin, stderr
@docstring(list_all.__doc__)
@@ -44,7 +45,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
group.add_argument(
'--json', #'-j',
action='store_true',
- help="Print the output in JSON format with all columns included.",
+ help="Print the output in JSON format with all columns included",
)
group.add_argument(
'--html',
@@ -59,19 +60,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
parser.add_argument(
'--sort', #'-s',
type=str,
- help="List the links sorted using the given key, e.g. timestamp or updated.",
+ help="List the links sorted using the given key, e.g. timestamp or updated",
default=None,
)
parser.add_argument(
'--before', #'-b',
type=float,
- help="List only links bookmarked before the given timestamp.",
+ help="List only links bookmarked before (less than) the given timestamp",
default=None,
)
parser.add_argument(
'--after', #'-a',
type=float,
- help="List only links bookmarked after the given timestamp.",
+ help="List only links bookmarked after (greater than or equal to) the given timestamp",
default=None,
)
parser.add_argument(
@@ -96,9 +97,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
)
)
parser.add_argument(
- '--filter-type',
+ '--filter-type', '-t',
type=str,
- choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
+ choices=(*LINK_FILTERS.keys(), 'search'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
@@ -107,20 +108,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
nargs='*',
type=str,
default=None,
- help='List only URLs matching these filter patterns.'
+ help='List only URLs matching these filter patterns'
)
command = parser.parse_args(args or ())
- filter_patterns_str = accept_stdin(stdin)
+    reject_stdin(__command__, stdin)
if command.with_headers and not (command.json or command.html or command.csv):
stderr(
- '[X] --with-headers can only be used with --json, --html or --csv options.\n',
+ '[X] --with-headers can only be used with --json, --html or --csv options\n',
color='red',
)
raise SystemExit(2)
matching_folders = list_all(
- filter_patterns_str=filter_patterns_str,
filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
status=command.status,
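`LINK_FILTERS` itself is not shown in this diff; deriving the `--filter-type` choices from it keeps the CLI in sync with the index code instead of repeating a hardcoded tuple in every subcommand. A hypothetical sketch of its shape (the real mapping lives in `archivebox/index/__init__.py` and may differ in detail):

```python
# Hypothetical sketch of the LINK_FILTERS mapping referenced above: each
# --filter-type name maps to a callable that builds a Django Q object.
# Requires a configured Django environment; the exact patterns are assumptions.
from django.db.models import Q

LINK_FILTERS = {
    'exact':     lambda pattern: Q(url=pattern),
    'substring': lambda pattern: Q(url__icontains=pattern),
    'regex':     lambda pattern: Q(url__iregex=pattern),
    'domain':    lambda pattern: (Q(url__istartswith=f'http://{pattern}')
                                  | Q(url__istartswith=f'https://{pattern}')),
    'tag':       lambda pattern: Q(tags__name=pattern),
}

# which is why the argparse choices become: (*LINK_FILTERS.keys(), 'search')
```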
diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py
index af68bac2..411cce8b 100644
--- a/archivebox/cli/archivebox_oneshot.py
+++ b/archivebox/cli/archivebox_oneshot.py
@@ -50,8 +50,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
help= "Path to save the single archive folder to, e.g. ./example.com_archive"
)
command = parser.parse_args(args or ())
+ stdin_url = None
url = command.url
- stdin_url = accept_stdin(stdin)
+ if not url:
+ stdin_url = accept_stdin(stdin)
+
if (stdin_url and url) or (not stdin and not url):
stderr(
'[X] You must pass a URL/path to add via stdin or CLI arguments.\n',
diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py
index cb073e95..dadf2654 100644
--- a/archivebox/cli/archivebox_remove.py
+++ b/archivebox/cli/archivebox_remove.py
@@ -61,7 +61,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
help='URLs matching this filter pattern will be removed from the index.'
)
command = parser.parse_args(args or ())
- filter_str = accept_stdin(stdin)
+
+ filter_str = None
+ if not command.filter_patterns:
+ filter_str = accept_stdin(stdin)
remove(
filter_str=filter_str,
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index a4d96dc9..4cc050dd 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -38,10 +38,20 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
action='store_true',
help='Enable DEBUG=True mode with more verbose errors',
)
+ parser.add_argument(
+ '--nothreading',
+ action='store_true',
+ help='Force runserver to run in single-threaded mode',
+ )
parser.add_argument(
'--init',
action='store_true',
- help='Run archivebox init before starting the server',
+ help='Run a full archivebox init/upgrade before starting the server',
+ )
+ parser.add_argument(
+ '--quick-init', '-i',
+ action='store_true',
+ help='Run quick archivebox init/upgrade before starting the server',
)
parser.add_argument(
'--createsuperuser',
@@ -52,10 +62,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
reject_stdin(__command__, stdin)
server(
- runserver_args=command.runserver_args,
+ runserver_args=command.runserver_args + (['--nothreading'] if command.nothreading else []),
reload=command.reload,
debug=command.debug,
init=command.init,
+ quick_init=command.quick_init,
createsuperuser=command.createsuperuser,
out_dir=pwd or OUTPUT_DIR,
)
diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index 6748096e..500d4c07 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -12,6 +12,7 @@ from ..main import update
from ..util import docstring
from ..config import OUTPUT_DIR
from ..index import (
+ LINK_FILTERS,
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@@ -89,9 +90,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
)
)
parser.add_argument(
- '--filter-type',
+ '--filter-type', '-t',
type=str,
- choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
+ choices=(*LINK_FILTERS.keys(), 'search'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
@@ -110,7 +111,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
default=""
)
command = parser.parse_args(args or ())
- filter_patterns_str = accept_stdin(stdin)
+
+ filter_patterns_str = None
+ if not command.filter_patterns:
+ filter_patterns_str = accept_stdin(stdin)
update(
resume=command.resume,
diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py
new file mode 100644
index 00000000..04c54df8
--- /dev/null
+++ b/archivebox/cli/tests.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+
+
+import os
+import sys
+import shutil
+import unittest
+from pathlib import Path
+
+from contextlib import contextmanager
+
+TEST_CONFIG = {
+ 'USE_COLOR': 'False',
+ 'SHOW_PROGRESS': 'False',
+
+ 'OUTPUT_DIR': 'data.tests',
+
+ 'SAVE_ARCHIVE_DOT_ORG': 'False',
+ 'SAVE_TITLE': 'False',
+
+ 'USE_CURL': 'False',
+ 'USE_WGET': 'False',
+ 'USE_GIT': 'False',
+ 'USE_CHROME': 'False',
+ 'USE_YOUTUBEDL': 'False',
+}
+
+OUTPUT_DIR = 'data.tests'
+os.environ.update(TEST_CONFIG)
+
+from ..main import init
+from ..index import load_main_index
+from ..config import (
+ SQL_INDEX_FILENAME,
+ JSON_INDEX_FILENAME,
+ HTML_INDEX_FILENAME,
+)
+
+from . import (
+ archivebox_init,
+ archivebox_add,
+ archivebox_remove,
+)
+
+HIDE_CLI_OUTPUT = True
+
+test_urls = '''
+https://example1.com/what/is/happening.html?what=1#how-about-this=1
+https://example2.com/what/is/happening/?what=1#how-about-this=1
+HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
+https://example4.com/what/is/happening.html
+https://example5.com/
+https://example6.com
+
+http://example7.com
+[https://example8.com/what/is/this.php?what=1]
+[and http://example9.com?what=1&other=3#and-thing=2]
+https://example10.com#and-thing=2 "
+abcdef
+sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
+example13.bada
+and example14.badb
+htt://example15.badc
+'''
+
+stdout = sys.stdout
+stderr = sys.stderr
+
+
+@contextmanager
+def output_hidden(show_failing=True):
+ if not HIDE_CLI_OUTPUT:
+ yield
+ return
+
+ sys.stdout = open('stdout.txt', 'w+', encoding='utf-8')
+ sys.stderr = open('stderr.txt', 'w+', encoding='utf-8')
+ try:
+ yield
+ sys.stdout.close()
+ sys.stderr.close()
+ sys.stdout = stdout
+ sys.stderr = stderr
+ except Exception:
+ sys.stdout.close()
+ sys.stderr.close()
+ sys.stdout = stdout
+ sys.stderr = stderr
+ if show_failing:
+ with open('stdout.txt', 'r', encoding='utf-8') as f:
+ print(f.read())
+ with open('stderr.txt', 'r', encoding='utf-8') as f:
+ print(f.read())
+ raise
+ finally:
+ os.remove('stdout.txt')
+ os.remove('stderr.txt')
+
+
+class TestInit(unittest.TestCase):
+ def setUp(self):
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ def tearDown(self):
+ shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+ def test_basic_init(self):
+ with output_hidden():
+ archivebox_init.main([])
+
+ assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
+ assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
+ assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
+ assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
+
+ def test_conflicting_init(self):
+ with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f:
+ f.write('test')
+
+ try:
+ with output_hidden(show_failing=False):
+ archivebox_init.main([])
+ assert False, 'Init should have exited with an exception'
+ except SystemExit:
+ pass
+
+ assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
+ assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
+ assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
+ try:
+ load_main_index(out_dir=OUTPUT_DIR)
+ assert False, 'load_main_index should raise an exception when no index is present'
+ except Exception:
+ pass
+
+ def test_no_dirty_state(self):
+ with output_hidden():
+ init()
+ shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+ with output_hidden():
+ init()
+
+
+class TestAdd(unittest.TestCase):
+ def setUp(self):
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+ with output_hidden():
+ init()
+
+ def tearDown(self):
+ shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+ def test_add_arg_url(self):
+ with output_hidden():
+ archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
+
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 30
+
+ def test_add_arg_file(self):
+ test_file = Path(OUTPUT_DIR) / 'test.txt'
+ with open(test_file, 'w+', encoding='utf-8') as f:
+ f.write(test_urls)
+
+ with output_hidden():
+ archivebox_add.main([str(test_file)])
+
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 12
+ os.remove(test_file)
+
+ def test_add_stdin_url(self):
+ with output_hidden():
+ archivebox_add.main([], stdin=test_urls)
+
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 12
+
+
+class TestRemove(unittest.TestCase):
+ def setUp(self):
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+ with output_hidden():
+ init()
+ archivebox_add.main([], stdin=test_urls)
+
+ # def tearDown(self):
+ # shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+
+ def test_remove_exact(self):
+ with output_hidden():
+ archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
+
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 11
+
+ def test_remove_regex(self):
+ with output_hidden():
+ archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])
+
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 4
+
+ def test_remove_domain(self):
+ with output_hidden():
+ archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
+
+ all_links = load_main_index(out_dir=OUTPUT_DIR)
+ assert len(all_links) == 10
+
+ def test_remove_none(self):
+ try:
+ with output_hidden(show_failing=False):
+ archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
+ assert False, 'Should raise if no URLs match'
+ except Exception:
+ pass
+
+
+if __name__ == '__main__':
+ if '--verbose' in sys.argv or '-v' in sys.argv:
+ HIDE_CLI_OUTPUT = False
+
+ unittest.main()
diff --git a/archivebox/config.py b/archivebox/config.py
index 3d48344f..1c284ae7 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -29,10 +29,12 @@ import json
import getpass
import platform
import shutil
+import sqlite3
import django
from hashlib import md5
from pathlib import Path
+from datetime import datetime
from typing import Optional, Type, Tuple, Dict, Union, List
from subprocess import run, PIPE, DEVNULL
from configparser import ConfigParser
@@ -77,6 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
+ 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
},
'ARCHIVE_METHOD_TOGGLES': {
@@ -99,8 +102,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'ARCHIVE_METHOD_OPTIONS': {
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
- 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
+ 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
+ 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
@@ -111,7 +115,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
- 'YOUTUBEDL_ARGS': {'type': list, 'default': ['--write-description',
+ 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: ['--write-description',
'--write-info-json',
'--write-annotations',
'--write-thumbnail',
@@ -122,7 +126,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--ignore-errors',
'--geo-bypass',
'--add-metadata',
- '--max-filesize=750m',
+ '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
]},
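This change works because config defaults that are callables receive the partially-resolved config dict, letting one option (`MEDIA_MAX_SIZE`) feed into another (`YOUTUBEDL_ARGS`). A simplified sketch of that resolution, assuming options are resolved in schema order (the real loader in `config.py` is more involved):

```python
# Simplified sketch of callable config defaults, as used by YOUTUBEDL_ARGS above.
CONFIG_SCHEMA = {
    'MEDIA_MAX_SIZE': {'default': '750m'},
    'YOUTUBEDL_ARGS': {'default': lambda c: [
        '--ignore-errors',
        '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
    ]},
}

def load_config(schema):
    config = {}
    for key, opts in schema.items():   # assumes schema order respects dependencies
        default = opts['default']
        config[key] = default(config) if callable(default) else default
    return config

print(load_config(CONFIG_SCHEMA)['YOUTUBEDL_ARGS'])
# ['--ignore-errors', '--max-filesize=750m']
```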
@@ -287,7 +291,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']},
- 'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},
'PYTHON_BINARY': {'default': lambda c: sys.executable},
'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
@@ -459,7 +462,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
config_file.optionxform = str
config_file.read(config_path)
- with open(config_path, 'r') as old:
+ with open(config_path, 'r', encoding='utf-8') as old:
atomic_write(f'{config_path}.bak', old.read())
find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]
@@ -480,14 +483,14 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
from django.utils.crypto import get_random_string
- chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.'
+ chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
random_secret_key = get_random_string(50, chars)
if 'SERVER_CONFIG' in config_file:
config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
else:
config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}
- with open(config_path, 'w+') as new:
+ with open(config_path, 'w+', encoding='utf-8') as new:
config_file.write(new)
try:
@@ -499,7 +502,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
}
except:
# something went horribly wrong, revert to the previous version
- with open(f'{config_path}.bak', 'r') as old:
+ with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
atomic_write(config_path, old.read())
if Path(f'{config_path}.bak').exists():
@@ -1062,23 +1065,72 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
try:
import django
+ from django.core.management import call_command
+
sys.path.append(str(config['PACKAGE_DIR']))
os.environ.setdefault('OUTPUT_DIR', str(output_dir))
assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
+ # Check to make sure JSON extension is available in our Sqlite3 instance
+ try:
+ cursor = sqlite3.connect(':memory:').cursor()
+ cursor.execute('SELECT JSON(\'{"a": "b"}\')')
+ except sqlite3.OperationalError as exc:
+ stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
+ hint([
+ 'Upgrade your Python version or install the extension manually:',
+ 'https://code.djangoproject.com/wiki/JSON1Extension'
+ ])
+
if in_memory_db:
- # Put the db in memory and run migrations in case any command requires it
- from django.core.management import call_command
+ # some commands (e.g. oneshot) don't store a long-lived sqlite3 db file on disk.
+ # in those cases we create a temporary in-memory db and run the migrations
+ # immediately to get a usable in-memory database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
django.setup()
call_command("migrate", interactive=False, verbosity=0)
else:
+ # Otherwise use default sqlite3 file-based database and initialize django
+ # without running migrations automatically (user runs them manually by calling init)
django.setup()
+
+
+ from django.conf import settings
+
+ # log startup message to the error log
+ with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
+ command = ' '.join(sys.argv)
+ ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S')
+ f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
+
if check_db:
+ # Enable WAL mode in sqlite3
+ from django.db import connection
+ with connection.cursor() as cursor:
+ cursor.execute("PRAGMA journal_mode")
+ current_mode = cursor.fetchone()[0]
+ if current_mode != 'wal':
+ cursor.execute("PRAGMA journal_mode=wal;")
+
+ # Create cache table in DB if needed
+ try:
+ from django.core.cache import cache
+ cache.get('test', None)
+ except django.db.utils.OperationalError:
+ call_command("createcachetable", verbosity=0)
+
+
+ # if archivebox gets imported multiple times, we have to close
+ # the sqlite3 connection whenever we re-init from scratch to avoid
+ # multiple threads sharing the same connection by accident
+ from django.db import connections
+ for conn in connections.all():
+ conn.close_if_unusable_or_obsolete()
+
sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
assert sql_index_path.exists(), (
f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
+
except KeyboardInterrupt:
raise SystemExit(2)
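The JSON1 probe added above is cheap because it only opens a throwaway in-memory database, so it can run on every startup. The same check works standalone:

```python
# Standalone version of the JSON1-extension probe used in setup_django() above;
# safe to run anywhere since it only touches an in-memory database.
import sqlite3

def sqlite_supports_json1() -> bool:
    try:
        cursor = sqlite3.connect(':memory:').cursor()
        cursor.execute('SELECT JSON(\'{"a": "b"}\')')
        return True
    except sqlite3.OperationalError:
        return False

print(sqlite_supports_json1())  # False means: upgrade Python / install JSON1
```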
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index bacc53c0..91feb07b 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -1,6 +1,7 @@
__package__ = 'archivebox.core'
from io import StringIO
+from pathlib import Path
from contextlib import redirect_stdout
from django.contrib import admin
@@ -13,15 +14,15 @@ from django import forms
from ..util import htmldecode, urldecode, ansi_to_html
-from core.models import Snapshot, Tag
-from core.forms import AddLinkForm, TagField
+from core.models import Snapshot, ArchiveResult, Tag
+from core.forms import AddLinkForm
from core.mixins import SearchResultsAdminMixin
from index.html import snapshot_icons
from logging_util import printable_filesize
from main import add, remove
-from config import OUTPUT_DIR
+from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
from extractors import archive_links
# Admin URLs
@@ -36,77 +37,34 @@ from extractors import archive_links
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
-def update_snapshots(modeladmin, request, queryset):
- archive_links([
- snapshot.as_link()
- for snapshot in queryset
- ], out_dir=OUTPUT_DIR)
-update_snapshots.short_description = "Archive"
-def update_titles(modeladmin, request, queryset):
- archive_links([
- snapshot.as_link()
- for snapshot in queryset
- ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
-update_titles.short_description = "Pull title"
+class ArchiveResultInline(admin.TabularInline):
+ model = ArchiveResult
-def overwrite_snapshots(modeladmin, request, queryset):
- archive_links([
- snapshot.as_link()
- for snapshot in queryset
- ], overwrite=True, out_dir=OUTPUT_DIR)
-overwrite_snapshots.short_description = "Re-archive (overwrite)"
+class TagInline(admin.TabularInline):
+ model = Snapshot.tags.through
-def verify_snapshots(modeladmin, request, queryset):
- for snapshot in queryset:
- print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))
-
-verify_snapshots.short_description = "Check"
-
-def delete_snapshots(modeladmin, request, queryset):
- remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
-
-delete_snapshots.short_description = "Delete"
+from django.contrib.admin.helpers import ActionForm
-class SnapshotAdminForm(forms.ModelForm):
- tags = TagField(required=False)
-
- class Meta:
- model = Snapshot
- fields = "__all__"
-
- def save(self, commit=True):
- # Based on: https://stackoverflow.com/a/49933068/3509554
-
- # Get the unsave instance
- instance = forms.ModelForm.save(self, False)
- tags = self.cleaned_data.pop("tags")
-
- #update save_m2m
- def new_save_m2m():
- instance.save_tags(tags)
-
- # Do we need to save all changes now?
- self.save_m2m = new_save_m2m
- if commit:
- instance.save()
-
- return instance
+class SnapshotActionForm(ActionForm):
+ tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False)
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'url_str', 'files', 'size')
sort_fields = ('title_str', 'url_str', 'added')
- readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
+ readonly_fields = ('uuid', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
search_fields = ['url__icontains', 'timestamp', 'title', 'tags__name']
- fields = (*readonly_fields, 'title', 'tags')
+ fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields)
list_filter = ('added', 'updated', 'tags')
ordering = ['-added']
- actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
- actions_template = 'admin/actions_as_select.html'
- form = SnapshotAdminForm
- list_per_page = 40
+ actions = ['delete_snapshots', 'overwrite_snapshots', 'update_snapshots', 'update_titles', 'verify_snapshots', 'add_tag', 'remove_tag']
+ autocomplete_fields = ['tags']
+ inlines = [ArchiveResultInline]
+ list_per_page = SNAPSHOTS_PER_PAGE
+
+ action_form = SnapshotActionForm
def get_urls(self):
urls = super().get_urls()
@@ -116,21 +74,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
return custom_urls + urls
def get_queryset(self, request):
+ self.request = request
return super().get_queryset(request).prefetch_related('tags')
def tag_list(self, obj):
return ', '.join(obj.tags.values_list('name', flat=True))
- def id_str(self, obj):
+ # TODO: figure out a different way to do this, you can't nest forms so this doesn't work
+ # def action(self, obj):
+ # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
+ # # action: update_snapshots
+ # # select_across: 0
+ # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
+ # return format_html(
+ # '''
+ #
+ # ''',
+ # csrf.get_token(self.request),
+ # obj.id,
+ # )
+
+ def uuid(self, obj):
return format_html(
-            '<code style="font-size: 10px">{}</code><br/>',
-            obj.url_hash[:8],
+            '<code style="font-size: 10px">{}</code><br/>'
+            '<a href="/archive/{}/index.html">View index ➡️</a> '
+            '<a href="/admin/core/snapshot/{}/change/">View actions ⚙️</a>',
+ obj.id,
+ obj.timestamp,
+ obj.id,
)
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
- format_html('{} ', tag.id, tag)
+ format_html('{} ', tag.id, tag)
for tag in obj.tags.all()
if str(tag).strip()
)
@@ -152,7 +135,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
return snapshot_icons(obj)
def size(self, obj):
- archive_size = obj.archive_size
+ archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
if archive_size:
size_txt = printable_filesize(archive_size)
if archive_size > 52428800:
@@ -190,28 +173,135 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
rendered_response = self.changelist_view(request)
# Restore values
- self.change_list_template = saved_change_list_template
+ self.change_list_template = saved_change_list_template
self.list_per_page = saved_list_per_page
self.list_max_show_all = saved_list_max_show_all
return rendered_response
+
+
+ def update_snapshots(self, request, queryset):
+ archive_links([
+ snapshot.as_link()
+ for snapshot in queryset
+ ], out_dir=OUTPUT_DIR)
+ update_snapshots.short_description = "Archive"
+
+ def update_titles(self, request, queryset):
+ archive_links([
+ snapshot.as_link()
+ for snapshot in queryset
+ ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
+ update_titles.short_description = "Pull title"
+
+ def overwrite_snapshots(self, request, queryset):
+ archive_links([
+ snapshot.as_link()
+ for snapshot in queryset
+ ], overwrite=True, out_dir=OUTPUT_DIR)
+ overwrite_snapshots.short_description = "Re-archive (overwrite)"
+
+ def verify_snapshots(self, request, queryset):
+ for snapshot in queryset:
+ print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))
+
+ verify_snapshots.short_description = "Check"
+
+ def delete_snapshots(self, request, queryset):
+ remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
+
+ delete_snapshots.short_description = "Delete"
+
+ def add_tag(self, request, queryset):
+ tag = request.POST['tag']
+ for obj in queryset:
+ obj.tags.add(tag)
+
+ add_tag.short_description = "Add tag"
+
+ def remove_tag(self, request, queryset):
+ tag = request.POST['tag']
+ for obj in queryset:
+ obj.tags.remove(tag)
+
+ remove_tag.short_description = "Remove tag"
+
- id_str.short_description = 'ID'
title_str.short_description = 'Title'
url_str.short_description = 'Original URL'
- id_str.admin_order_field = 'id'
title_str.admin_order_field = 'title'
url_str.admin_order_field = 'url'
+
+
class TagAdmin(admin.ModelAdmin):
- list_display = ('slug', 'name', 'id')
+ list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
sort_fields = ('id', 'name', 'slug')
- readonly_fields = ('id',)
+ readonly_fields = ('id', 'num_snapshots', 'snapshots')
search_fields = ('id', 'name', 'slug')
fields = (*readonly_fields, 'name', 'slug')
+ actions = ['delete_selected']
+ ordering = ['-id']
+ def num_snapshots(self, obj):
+ return format_html(
+ '<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
+ obj.id,
+ obj.snapshot_set.count(),
+ )
+
+ def snapshots(self, obj):
+ total_count = obj.snapshot_set.count()
+ return mark_safe('<br/>'.join(
+ format_html(
+ '{} <a href="/admin/core/snapshot/{}/change/"><b>[{}]</b></a> {}',
+ snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
+ snap.id,
+ snap.timestamp,
+ snap.url,
+ )
+ for snap in obj.snapshot_set.order_by('-updated')[:10]
+ ) + (f'<br/>and {total_count-10} more...' if obj.snapshot_set.count() > 10 else ''))
+
+
+class ArchiveResultAdmin(admin.ModelAdmin):
+ list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'cmd_str', 'status', 'output_str')
+ sort_fields = ('start_ts', 'extractor', 'status')
+ readonly_fields = ('id', 'uuid', 'snapshot_str')
+ search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
+ fields = (*readonly_fields, 'snapshot', 'snapshot__tags', 'extractor', 'status', 'start_ts', 'end_ts', 'pwd', 'cmd', 'cmd_version', 'output')
+ autocomplete_fields = ['snapshot']
+
+ list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
+ ordering = ['-start_ts']
+ list_per_page = SNAPSHOTS_PER_PAGE
+
+ def snapshot_str(self, obj):
+ return format_html(
+ '<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
+ '<small>{}</small>',
+ obj.snapshot.timestamp,
+ obj.snapshot.timestamp,
+ obj.snapshot.url[:128],
+ )
+
+ def cmd_str(self, obj):
+ return format_html(
+ '<pre>{}</pre>',
+ ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd),
+ )
+
+ def output_str(self, obj):
+ return format_html(
+ '<a href="/archive/{}/{}">↗️</a> <pre>{}</pre>',
+ obj.snapshot.timestamp,
+ obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
+ obj.output,
+ )
+
+ snapshot_str.short_description = 'snapshot'
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
@@ -266,4 +356,5 @@ admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
+admin.site.register(ArchiveResult, ArchiveResultAdmin)
admin.site.disable_action('delete_selected')
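The refactor above replaces module-level action functions with methods referenced by name in `actions`, and uses a custom `ActionForm` so the changelist's action dropdown carries an extra `tag` input. Condensed to its essentials (a sketch assuming a configured Django project, not the full admin above):

```python
# Condensed sketch of the admin-action pattern used above: method-based actions
# named as strings, plus an ActionForm that adds a field next to the dropdown.
from django import forms
from django.contrib import admin
from django.contrib.admin.helpers import ActionForm

from core.models import Snapshot, Tag  # ArchiveBox's own models


class SnapshotActionForm(ActionForm):
    # rendered beside the action <select> on the changelist page
    tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False)


class SnapshotAdmin(admin.ModelAdmin):
    action_form = SnapshotActionForm
    actions = ['add_tag']                  # strings resolve to methods below

    def add_tag(self, request, queryset):
        tag = request.POST['tag']          # pk submitted by the extra form field
        for obj in queryset:
            obj.tags.add(tag)              # .add() accepts pks as well as objects
    add_tag.short_description = 'Add tag'
```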
diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py
index ed584c68..e3e904df 100644
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@@ -20,7 +20,8 @@ ARCHIVE_METHODS = [
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
- depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
+ tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
+ depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
archive_methods = forms.MultipleChoiceField(
label="Archive methods (select at least 1, otherwise all will be used by default)",
required=False,
diff --git a/archivebox/core/migrations/0009_auto_20210216_1038.py b/archivebox/core/migrations/0009_auto_20210216_1038.py
new file mode 100644
index 00000000..2817fe54
--- /dev/null
+++ b/archivebox/core/migrations/0009_auto_20210216_1038.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-16 10:38
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0008_auto_20210105_1421'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='updated',
+ field=models.DateTimeField(auto_now=True, db_index=True, null=True),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0010_auto_20210216_1055.py b/archivebox/core/migrations/0010_auto_20210216_1055.py
new file mode 100644
index 00000000..0af61a39
--- /dev/null
+++ b/archivebox/core/migrations/0010_auto_20210216_1055.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-16 10:55
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0009_auto_20210216_1038'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='start_ts',
+ field=models.DateTimeField(db_index=True),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0011_auto_20210216_1331.py b/archivebox/core/migrations/0011_auto_20210216_1331.py
new file mode 100644
index 00000000..d2226674
--- /dev/null
+++ b/archivebox/core/migrations/0011_auto_20210216_1331.py
@@ -0,0 +1,24 @@
+# Generated by Django 3.1.3 on 2021-02-16 13:31
+
+from django.db import migrations, models
+import uuid
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0010_auto_20210216_1055'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='archiveresult',
+ name='uuid',
+ field=models.UUIDField(default=uuid.uuid4, editable=False),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='extractor',
+ field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0012_auto_20210216_1425.py b/archivebox/core/migrations/0012_auto_20210216_1425.py
new file mode 100644
index 00000000..310058ac
--- /dev/null
+++ b/archivebox/core/migrations/0012_auto_20210216_1425.py
@@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2021-02-16 14:25
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0011_auto_20210216_1331'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='cmd_version',
+ field=models.CharField(blank=True, default=None, max_length=128, null=True),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='output',
+ field=models.CharField(max_length=1024),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0013_auto_20210218_0729.py b/archivebox/core/migrations/0013_auto_20210218_0729.py
new file mode 100644
index 00000000..d3fe3b4f
--- /dev/null
+++ b/archivebox/core/migrations/0013_auto_20210218_0729.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 07:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0012_auto_20210216_1425'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='title',
+ field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0014_auto_20210218_0729.py b/archivebox/core/migrations/0014_auto_20210218_0729.py
new file mode 100644
index 00000000..db81934f
--- /dev/null
+++ b/archivebox/core/migrations/0014_auto_20210218_0729.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 07:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0013_auto_20210218_0729'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='title',
+ field=models.CharField(blank=True, db_index=True, max_length=1024, null=True),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0015_auto_20210218_0730.py b/archivebox/core/migrations/0015_auto_20210218_0730.py
new file mode 100644
index 00000000..b782a217
--- /dev/null
+++ b/archivebox/core/migrations/0015_auto_20210218_0730.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 07:30
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0014_auto_20210218_0729'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='title',
+ field=models.CharField(blank=True, db_index=True, max_length=512, null=True),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0016_auto_20210218_1204.py b/archivebox/core/migrations/0016_auto_20210218_1204.py
new file mode 100644
index 00000000..4637feab
--- /dev/null
+++ b/archivebox/core/migrations/0016_auto_20210218_1204.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 12:04
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0015_auto_20210218_0730'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='tags',
+ field=models.ManyToManyField(blank=True, to='core.Tag'),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0017_auto_20210219_0211.py b/archivebox/core/migrations/0017_auto_20210219_0211.py
new file mode 100644
index 00000000..221a250b
--- /dev/null
+++ b/archivebox/core/migrations/0017_auto_20210219_0211.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-19 02:11
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0016_auto_20210218_1204'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='tag',
+ name='slug',
+ field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0018_auto_20210327_0952.py b/archivebox/core/migrations/0018_auto_20210327_0952.py
new file mode 100644
index 00000000..d0f3dde1
--- /dev/null
+++ b/archivebox/core/migrations/0018_auto_20210327_0952.py
@@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2021-03-27 09:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0017_auto_20210219_0211'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='tag',
+ name='name',
+ field=models.CharField(max_length=100, unique=True),
+ ),
+ migrations.AlterField(
+ model_name='tag',
+ name='slug',
+ field=models.SlugField(blank=True, max_length=100, unique=True),
+ ),
+ ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 13d75b66..e7741920 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -2,12 +2,15 @@ __package__ = 'archivebox.core'
import uuid
-from django.db import models, transaction
+from django.db import models
from django.utils.functional import cached_property
from django.utils.text import slugify
+from django.core.cache import cache
from django.db.models import Case, When, Value, IntegerField
-from ..util import parse_date
+from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
+from ..system import get_dir_size
+from ..util import parse_date, base_url, hashurl
from ..index.schema import Link
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
@@ -29,8 +32,11 @@ class Tag(models.Model):
"""
Based on django-taggit model
"""
- name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
- slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
+ name = models.CharField(unique=True, blank=False, max_length=100)
+
+ # slug is autoset on save from name, never set it manually
+ slug = models.SlugField(unique=True, blank=True, max_length=100)
+
class Meta:
verbose_name = "Tag"
@@ -49,20 +55,21 @@ class Tag(models.Model):
if self._state.adding and not self.slug:
self.slug = self.slugify(self.name)
- with transaction.atomic():
- slugs = set(
- type(self)
- ._default_manager.filter(slug__startswith=self.slug)
- .values_list("slug", flat=True)
- )
+ # if the name is different but the slug conflicts with another tag's slug, append a counter
+ # with transaction.atomic():
+ slugs = set(
+ type(self)
+ ._default_manager.filter(slug__startswith=self.slug)
+ .values_list("slug", flat=True)
+ )
- i = None
- while True:
- slug = self.slugify(self.name, i)
- if slug not in slugs:
- self.slug = slug
- return super().save(*args, **kwargs)
- i = 1 if i is None else i+1
+ i = None
+ while True:
+ slug = self.slugify(self.name, i)
+ if slug not in slugs:
+ self.slug = slug
+ return super().save(*args, **kwargs)
+ i = 1 if i is None else i+1
else:
return super().save(*args, **kwargs)
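In plain terms, the loop kept above appends an incrementing counter to the slug until it no longer collides with an existing one. Roughly, with a stand-in `slugify()` (the real method delegates to `django.utils.text.slugify`):

```python
# Rough illustration of the slug de-duplication loop in Tag.save() above;
# the separator and slugify() implementation here are stand-ins.
def slugify(name, i=None):
    slug = name.lower().strip().replace(' ', '-')
    return f'{slug}_{i}' if i is not None else slug

def unique_slug(name, existing_slugs):
    i = None
    while True:
        slug = slugify(name, i)
        if slug not in existing_slugs:
            return slug
        i = 1 if i is None else i + 1

print(unique_slug('Some Tag', {'some-tag', 'some-tag_1'}))  # -> 'some-tag_2'
```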
@@ -73,11 +80,11 @@ class Snapshot(models.Model):
url = models.URLField(unique=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
- title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
+ title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
added = models.DateTimeField(auto_now_add=True, db_index=True)
- updated = models.DateTimeField(null=True, blank=True, db_index=True)
- tags = models.ManyToManyField(Tag)
+ updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
+ tags = models.ManyToManyField(Tag, blank=True)
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
@@ -109,13 +116,24 @@ class Snapshot(models.Model):
from ..index import load_link_details
return load_link_details(self.as_link())
- def tags_str(self) -> str:
- return ','.join(self.tags.order_by('name').values_list('name', flat=True))
+ def tags_str(self, nocache=True) -> str:
+ cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
+ calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
+ if nocache:
+ tags_str = calc_tags_str()
+ cache.set(cache_key, tags_str)
+ return tags_str
+ return cache.get_or_set(cache_key, calc_tags_str)
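+ # e.g. a (hypothetical) cached entry: '1a2b3c4d...-1616838751.0-tags' -> 'django,python'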
@cached_property
def bookmarked(self):
return parse_date(self.timestamp)
+ @cached_property
+ def bookmarked_date(self):
+ # TODO: remove this
+ return self.bookmarked
+
@cached_property
def is_archived(self):
return self.as_link().is_archived
@@ -126,23 +144,31 @@ class Snapshot(models.Model):
@cached_property
def url_hash(self):
- return self.as_link().url_hash
+ return hashurl(self.url)
@cached_property
def base_url(self):
- return self.as_link().base_url
+ return base_url(self.url)
@cached_property
def link_dir(self):
- return self.as_link().link_dir
+ return str(ARCHIVE_DIR / self.timestamp)
@cached_property
def archive_path(self):
- return self.as_link().archive_path
+ return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
@cached_property
def archive_size(self):
- return self.as_link().archive_size
+ cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
+
+ def calc_dir_size():
+ try:
+ return get_dir_size(self.link_dir)[0]
+ except Exception:
+ return 0
+
+ return cache.get_or_set(cache_key, calc_dir_size)
@cached_property
def history(self):
@@ -151,17 +177,40 @@ class Snapshot(models.Model):
@cached_property
def latest_title(self):
- if ('title' in self.history
- and self.history['title']
- and (self.history['title'][-1].status == 'succeeded')
- and self.history['title'][-1].output.strip()):
- return self.history['title'][-1].output.strip()
+ if self.title:
+ return self.title # whoopdedoo that was easy
+
+ try:
+ # take longest successful title from ArchiveResult db history
+ return sorted(
+ self.archiveresult_set\
+ .filter(extractor='title', status='succeeded', output__isnull=False)\
+ .values_list('output', flat=True),
+ key=lambda r: len(r),
+ )[-1]
+ except IndexError:
+ pass
+
+ try:
+ # take longest successful title from Link json index file history
+ return sorted(
+ (
+ result.output.strip()
+ for result in self.history['title']
+ if result.status == 'succeeded' and result.output.strip()
+ ),
+ key=lambda r: len(r),
+ )[-1]
+ except (KeyError, IndexError):
+ pass
+
return None
def save_tags(self, tags=()):
tags_id = []
for tag in tags:
- tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
+ if tag.strip():
+ tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
self.tags.clear()
self.tags.add(*tags_id)
@@ -178,15 +227,18 @@ class ArchiveResultManager(models.Manager):
class ArchiveResult(models.Model):
+ id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')
+ uuid = models.UUIDField(default=uuid.uuid4, editable=False)
+
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
+ extractor = models.CharField(choices=EXTRACTORS, max_length=32)
cmd = JSONField()
pwd = models.CharField(max_length=256)
- cmd_version = models.CharField(max_length=32, default=None, null=True, blank=True)
- output = models.CharField(max_length=512)
- start_ts = models.DateTimeField()
+ cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
+ output = models.CharField(max_length=1024)
+ start_ts = models.DateTimeField(db_index=True)
end_ts = models.DateTimeField()
status = models.CharField(max_length=16, choices=STATUS_CHOICES)
- extractor = models.CharField(choices=EXTRACTORS, max_length=32)
objects = ArchiveResultManager()
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index e73c93d9..6a795702 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -2,6 +2,9 @@ __package__ = 'archivebox.core'
import os
import sys
+import re
+import logging
+import tempfile
from pathlib import Path
from django.utils.crypto import get_random_string
@@ -14,6 +17,7 @@ from ..config import (
TEMPLATES_DIR_NAME,
SQL_INDEX_FILENAME,
OUTPUT_DIR,
+ LOGS_DIR,
)
@@ -62,6 +66,40 @@ AUTHENTICATION_BACKENDS = [
'django.contrib.auth.backends.ModelBackend',
]
+# only enable the debug toolbar when in DEBUG mode with --nothreading (it doesn't work in multithreaded mode)
+DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
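+# e.g. running the dev server with something like `archivebox manage runserver --nothreading` while DEBUG=True would enable it (assumed invocation)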
+if DEBUG_TOOLBAR:
+ try:
+ import debug_toolbar # noqa
+ DEBUG_TOOLBAR = True
+ except ImportError:
+ DEBUG_TOOLBAR = False
+
+if DEBUG_TOOLBAR:
+ INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
+ INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
+ DEBUG_TOOLBAR_CONFIG = {
+ "SHOW_TOOLBAR_CALLBACK": lambda request: True,
+ "RENDER_PANELS": True,
+ }
+ DEBUG_TOOLBAR_PANELS = [
+ 'debug_toolbar.panels.history.HistoryPanel',
+ 'debug_toolbar.panels.versions.VersionsPanel',
+ 'debug_toolbar.panels.timer.TimerPanel',
+ 'debug_toolbar.panels.settings.SettingsPanel',
+ 'debug_toolbar.panels.headers.HeadersPanel',
+ 'debug_toolbar.panels.request.RequestPanel',
+ 'debug_toolbar.panels.sql.SQLPanel',
+ 'debug_toolbar.panels.staticfiles.StaticFilesPanel',
+ # 'debug_toolbar.panels.templates.TemplatesPanel',
+ 'debug_toolbar.panels.cache.CachePanel',
+ 'debug_toolbar.panels.signals.SignalsPanel',
+ 'debug_toolbar.panels.logging.LoggingPanel',
+ 'debug_toolbar.panels.redirects.RedirectsPanel',
+ 'debug_toolbar.panels.profiling.ProfilingPanel',
+ 'djdt_flamegraph.FlamegraphPanel',
+ ]
+ MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
################################################################################
### Staticfile and Template Settings
@@ -107,6 +145,22 @@ DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': DATABASE_NAME,
+ 'OPTIONS': {
+ 'timeout': 60,
+ 'check_same_thread': False,
+ },
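+ # the longer timeout and check_same_thread=False help avoid 'database is locked'
+ # errors when multiple threads share the same sqlite3 database file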
+ # DB setup is sometimes modified at runtime by setup_django() in config.py
+ }
+}
+
+CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache'
+# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache'
+# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache'
+
+CACHES = {
+ 'default': {
+ 'BACKEND': CACHE_BACKEND,
+ 'LOCATION': 'django_cache_default',
}
}
@@ -117,7 +171,7 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
### Security Settings
################################################################################
-SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.')
+SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
ALLOWED_HOSTS = ALLOWED_HOSTS.split(',')
@@ -131,6 +185,8 @@ SESSION_COOKIE_AGE = 1209600 # 2 weeks
SESSION_EXPIRE_AT_BROWSER_CLOSE = False
SESSION_SAVE_EVERY_REQUEST = True
+SESSION_ENGINE = "django.contrib.sessions.backends.db"
+
AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
{'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
@@ -163,3 +219,73 @@ USE_TZ = False
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
+
+
+################################################################################
+### Logging Settings
+################################################################################
+
+IGNORABLE_404_URLS = [
+ re.compile(r'apple-touch-icon.*\.png$'),
+ re.compile(r'favicon\.ico$'),
+ re.compile(r'robots\.txt$'),
+ re.compile(r'.*\.(css|js)\.map$'),
+]
+
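+# e.g. the filter below drops lines like '"GET /static/admin/css/base.css HTTP/1.1" 200 16378'
+# and '"GET /favicon.ico HTTP/1.1" 404 1234' while keeping all other request log lines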
+class NoisyRequestsFilter(logging.Filter):
+ def filter(self, record):
+ logline = record.getMessage()
+
+ # ignore harmless 404s for the patterns in IGNORABLE_404_URLS
+ for ignorable_url_pattern in IGNORABLE_404_URLS:
+ ignorable_log_pattern = re.compile(f'^"GET /.*/?{ignorable_url_pattern.pattern[:-1]} HTTP/.*" (200|30.|404) .+$', re.I | re.M)
+ if ignorable_log_pattern.match(logline):
+ return 0
+
+ # ignore staticfile requests that 200 or 30*
+ ignorable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
+ if ignorable_200_log_pattern.match(logline):
+ return 0
+
+ return 1
+
+if LOGS_DIR.exists():
+ ERROR_LOG = (LOGS_DIR / 'errors.log')
+else:
+ # meh, too many edge cases here around creating the log dir w/ correct permissions
+ # can't be bothered, just trash the log and let them figure it out via stdout/stderr
+ ERROR_LOG = tempfile.NamedTemporaryFile().name
+
+LOGGING = {
+ 'version': 1,
+ 'disable_existing_loggers': False,
+ 'handlers': {
+ 'console': {
+ 'class': 'logging.StreamHandler',
+ },
+ 'logfile': {
+ 'level': 'ERROR',
+ 'class': 'logging.handlers.RotatingFileHandler',
+ 'filename': ERROR_LOG,
+ 'maxBytes': 1024 * 1024 * 25, # 25 MB
+ 'backupCount': 10,
+ },
+ },
+ 'filters': {
+ 'noisyrequestsfilter': {
+ '()': NoisyRequestsFilter,
+ }
+ },
+ 'loggers': {
+ 'django': {
+ 'handlers': ['console', 'logfile'],
+ 'level': 'INFO',
+ 'filters': ['noisyrequestsfilter'],
+ },
+ 'django.server': {
+ 'handlers': ['console', 'logfile'],
+ 'level': 'INFO',
+ 'filters': ['noisyrequestsfilter'],
+ }
+ },
+}
diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py
index 182e4dca..87a302b8 100644
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@@ -2,6 +2,7 @@ from django.contrib import admin
from django.urls import path, include
from django.views import static
+from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView
@@ -13,8 +14,8 @@ from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
urlpatterns = [
path('public/', PublicIndexView.as_view(), name='public-index'),
- path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
- path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
+ path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
+ path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
@@ -35,35 +36,43 @@ urlpatterns = [
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
path('', HomepageView.as_view(), name='Home'),
]
+urlpatterns += staticfiles_urlpatterns()
- # # Proposed UI URLs spec
- # path('', HomepageView)
- # path('/add', AddView)
- # path('/public', PublicIndexView)
- # path('/snapshot/:slug', SnapshotView)
-
- # path('/admin', admin.site.urls)
- # path('/accounts', django.contrib.auth.urls)
+if settings.DEBUG_TOOLBAR:
+ import debug_toolbar
+ urlpatterns += [
+ path('__debug__/', include(debug_toolbar.urls)),
+ ]
- # # Prposed REST API spec
- # # :slugs can be uuid, short_uuid, or any of the unique index_fields
- # path('api/v1/'),
- # path('api/v1/core/' [GET])
- # path('api/v1/core/snapshot/', [GET, POST, PUT]),
- # path('api/v1/core/snapshot/:slug', [GET, PATCH, DELETE]),
- # path('api/v1/core/archiveresult', [GET, POST, PUT]),
- # path('api/v1/core/archiveresult/:slug', [GET, PATCH, DELETE]),
- # path('api/v1/core/tag/', [GET, POST, PUT]),
- # path('api/v1/core/tag/:slug', [GET, PATCH, DELETE]),
- # path('api/v1/cli/', [GET])
- # path('api/v1/cli/{add,list,config,...}', [POST]), # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode
+# # Proposed FUTURE URLs spec
+# path('', HomepageView)
+# path('/add', AddView)
+# path('/public', PublicIndexView)
+# path('/snapshot/:slug', SnapshotView)
- # path('api/v1/extractors/', [GET])
- # path('api/v1/extractors/:extractor/', [GET]),
- # path('api/v1/extractors/:extractor/:func', [GET, POST]), # pass query as args directly to chosen function
+# path('/admin', admin.site.urls)
+# path('/accounts', django.contrib.auth.urls)
- # future, just an idea:
- # path('api/v1/scheduler/', [GET])
- # path('api/v1/scheduler/task/', [GET, POST, PUT]),
- # path('api/v1/scheduler/task/:slug', [GET, PATCH, DELETE]),
+# # Proposed REST API spec
+# # :slugs can be uuid, short_uuid, or any of the unique index_fields
+# path('api/v1/'),
+# path('api/v1/core/' [GET])
+# path('api/v1/core/snapshot/', [GET, POST, PUT]),
+# path('api/v1/core/snapshot/:slug', [GET, PATCH, DELETE]),
+# path('api/v1/core/archiveresult', [GET, POST, PUT]),
+# path('api/v1/core/archiveresult/:slug', [GET, PATCH, DELETE]),
+# path('api/v1/core/tag/', [GET, POST, PUT]),
+# path('api/v1/core/tag/:slug', [GET, PATCH, DELETE]),
+
+# path('api/v1/cli/', [GET])
+# path('api/v1/cli/{add,list,config,...}', [POST]), # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode
+
+# path('api/v1/extractors/', [GET])
+# path('api/v1/extractors/:extractor/', [GET]),
+# path('api/v1/extractors/:extractor/:func', [GET, POST]), # pass query as args directly to chosen function
+
+# future, just an idea:
+# path('api/v1/scheduler/', [GET])
+# path('api/v1/scheduler/task/', [GET, POST, PUT]),
+# path('api/v1/scheduler/task/:slug', [GET, PATCH, DELETE]),
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 0e19fad6..36794a8d 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -4,8 +4,8 @@ from io import StringIO
from contextlib import redirect_stdout
from django.shortcuts import render, redirect
-
-from django.http import HttpResponse
+from django.http import HttpResponse, Http404
+from django.utils.html import format_html, mark_safe
from django.views import View, static
from django.views.generic.list import ListView
from django.views.generic import FormView
@@ -22,6 +22,7 @@ from ..config import (
PUBLIC_ADD_VIEW,
VERSION,
FOOTER_INFO,
+ SNAPSHOTS_PER_PAGE,
)
from main import add
from ..util import base_url, ansi_to_html
@@ -43,10 +44,6 @@ class SnapshotView(View):
# render static html index from filesystem archive//index.html
def get(self, request, path):
- # missing trailing slash -> redirect to index
- if '/' not in path:
- return redirect(f'{path}/index.html')
-
if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
@@ -55,46 +52,163 @@ class SnapshotView(View):
except (IndexError, ValueError):
slug, archivefile = path.split('/', 1)[0], 'index.html'
- all_pages = list(Snapshot.objects.all())
-
# slug is a timestamp
- by_ts = {page.timestamp: page for page in all_pages}
- try:
- # print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
- response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True)
- response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"'
- return response
- except KeyError:
- pass
+ if slug.replace('.','').isdigit():
- # slug is a hash
- by_hash = {page.url_hash: page for page in all_pages}
- try:
- timestamp = by_hash[slug].timestamp
- return redirect(f'/archive/{timestamp}/{archivefile}')
- except KeyError:
- pass
+ # missing trailing slash -> redirect to index
+ if '/' not in path:
+ return redirect(f'{path}/index.html')
+ try:
+ try:
+ snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
+ response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
+ response["Link"] = f'<{snapshot.url}>; rel="canonical"'
+ return response
+ except Snapshot.DoesNotExist:
+ if Snapshot.objects.filter(timestamp__startswith=slug).exists():
+ raise Snapshot.MultipleObjectsReturned
+ else:
+ raise
+ except Snapshot.DoesNotExist:
+ # Snapshot does not exist
+ return HttpResponse(
+ format_html(
+ (
+ '<center><br/><br/><br/>'
+ 'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
+ 'You can <a href="/add/?url={}" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
+ '</center>'
+ ),
+ slug,
+ path,
+ ),
+ content_type="text/html",
+ status=404,
+ )
+ except Snapshot.MultipleObjectsReturned:
+ snapshot_hrefs = mark_safe('<br/>').join(
+ format_html(
+ '{} <code>{}</code> <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {}',
+ snap.added.strftime('%Y-%m-%d %H:%M:%S'),
+ snap.timestamp,
+ snap.timestamp,
+ snap.url,
+ snap.title or '',
+ )
+ for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added')
+ )
+ return HttpResponse(
+ format_html(
+ (
+ '<center><br/>'
+ 'Multiple Snapshots match the given timestamp/UUID <code>{}</code><br/><pre>'
+ ),
+ slug,
+ ) + snapshot_hrefs + format_html(
+ (
+ '</pre><br/>'
+ 'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
+ '</center>'
+ )
+ ),
+ content_type="text/html",
+ status=404,
+ )
+ except Http404:
+ # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
+ return HttpResponse(
+ format_html(
+ (
+ '<center><br/><br/><br/>'
+ f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html"><b><code>[{snapshot.timestamp}]</code></b></a><br/>'
+ f'exists in DB, but resource <b><code>{snapshot.timestamp}/'
+ '{}'
+ f'</code></b><br/>'
+ 'does not exist in snapshot dir yet.<br/><br/>'
+ 'Maybe this resource type is not available for this Snapshot,<br/>'
+ 'or the archiving process has not completed yet?<br/>'
+ f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre>'
+ '<div style="text-align: left; width: 100%; max-width: 400px">'
+ '<br/><i><b>Next steps:</b></i><br/>'
+ f'- list all the <a href="/archive/{snapshot.timestamp}/">Snapshot files <code>.*</code></a><br/>'
+ f'- view the <a href="/archive/{snapshot.timestamp}/index.html">Snapshot <code>./index.html</code></a><br/>'
+ f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/">Snapshot admin</a> to edit<br/>'
+ f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}">Snapshot actions</a> to re-archive<br/>'
+ '- or return to <a href="/" target="_top">the main index...</a>'
+ '</div>'
+ '</center>'
+ ),
+ archivefile,
+ ),
+ content_type="text/html",
+ status=404,
+ )
# slug is a URL
- by_url = {page.base_url: page for page in all_pages}
try:
- # TODO: add multiple snapshot support by showing index of all snapshots
- # for given url instead of redirecting to timestamp index
- timestamp = by_url[base_url(path)].timestamp
- return redirect(f'/archive/{timestamp}/index.html')
- except KeyError:
- pass
-
- return HttpResponse(
- 'No archived link matches the given timestamp or hash.',
- content_type="text/plain",
- status=404,
- )
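+ # e.g. for path 'example.com/page' this tries the exact URL first, then the exact base_url, then base_url as a prefix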
+ try:
+ # try exact match on full url first
+ snapshot = Snapshot.objects.get(
+ Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
+ )
+ except Snapshot.DoesNotExist:
+ # fall back to match on exact base_url
+ try:
+ snapshot = Snapshot.objects.get(
+ Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
+ )
+ except Snapshot.DoesNotExist:
+ # fall back to matching base_url as prefix
+ snapshot = Snapshot.objects.get(
+ Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
+ )
+ return redirect(f'/archive/{snapshot.timestamp}/index.html')
+ except Snapshot.DoesNotExist:
+ return HttpResponse(
+ format_html(
+ (
+ '<center><br/><br/><br/>'
+ 'No Snapshots match the given url: <code>{}</code><br/><br/>'
+ 'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
+ '+ <a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/>'
+ '</center>'
+ ),
+ base_url(path),
+ path if '://' in path else f'https://{path}',
+ path,
+ ),
+ content_type="text/html",
+ status=404,
+ )
+ except Snapshot.MultipleObjectsReturned:
+ snapshot_hrefs = mark_safe('<br/>').join(
+ format_html(
+ '{} <code>{}</code> <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {}',
+ snap.added.strftime('%Y-%m-%d %H:%M:%S'),
+ snap.timestamp,
+ snap.timestamp,
+ snap.url,
+ snap.title or '',
+ )
+ for snap in Snapshot.objects.filter(
+ Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
+ ).only('url', 'timestamp', 'title', 'added').order_by('-added')
+ )
+ return HttpResponse(
+ format_html(
+ (
+ '<center><br/>'
+ 'Multiple Snapshots match the given URL <code>{}</code><br/><pre>'
+ ),
+ base_url(path),
+ ) + snapshot_hrefs + format_html(
+ (
+ '</pre><br/>'
+ 'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
+ '</center>'
+ )
+ ),
+ content_type="text/html",
+ status=404,
+ )
+
class PublicIndexView(ListView):
template_name = 'public_index.html'
model = Snapshot
- paginate_by = 100
+ paginate_by = SNAPSHOTS_PER_PAGE
ordering = ['title']
def get_context_data(self, **kwargs):
@@ -105,12 +219,14 @@ class PublicIndexView(ListView):
}
def get_queryset(self, **kwargs):
- qs = super().get_queryset(**kwargs)
+ qs = super().get_queryset(**kwargs)
query = self.request.GET.get('q')
if query:
qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))
+
for snapshot in qs:
- snapshot.icons = snapshot_icons(snapshot)
+ # lazy-load the icons, otherwise they would be rendered for the entire index at once
+ # (bind snapshot as a default arg so each lambda captures its own snapshot, not the loop variable)
+ snapshot.icons = lambda snapshot=snapshot: snapshot_icons(snapshot)
return qs
def get(self, *args, **kwargs):
@@ -130,9 +246,9 @@ class AddView(UserPassesTestMixin, FormView):
if self.request.method == 'GET':
url = self.request.GET.get('url', None)
if url:
- return {'url': url}
- else:
- return super().get_initial()
+ return {'url': url if '://' in url else f'https://{url}'}
+
+ return super().get_initial()
def test_func(self):
return PUBLIC_ADD_VIEW or self.request.user.is_authenticated
@@ -145,15 +261,18 @@ class AddView(UserPassesTestMixin, FormView):
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION,
'FOOTER_INFO': FOOTER_INFO,
+ 'stdout': '',
}
def form_valid(self, form):
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
+ tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = {
"urls": url,
+ "tag": tag,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
diff --git a/archivebox/core/wsgi.py b/archivebox/core/wsgi.py
index f933afae..94993b92 100644
--- a/archivebox/core/wsgi.py
+++ b/archivebox/core/wsgi.py
@@ -7,10 +7,10 @@ For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""
-import os
+
+from archivebox.config import setup_django
+setup_django(in_memory_db=False, check_db=True)
from django.core.wsgi import get_wsgi_application
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
-
application = get_wsgi_application()
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 15968097..09b56c66 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -44,16 +44,16 @@ def get_default_archive_methods():
return [
('title', should_save_title, save_title),
('favicon', should_save_favicon, save_favicon),
- ('wget', should_save_wget, save_wget),
+ ('headers', should_save_headers, save_headers),
('singlefile', should_save_singlefile, save_singlefile),
('pdf', should_save_pdf, save_pdf),
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
- ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
+ ('wget', should_save_wget, save_wget),
+ ('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them
('mercury', should_save_mercury, save_mercury),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
- ('headers', should_save_headers, save_headers),
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
]
@@ -115,6 +115,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
+
+ # bump the updated time on the main Snapshot here, this is critical
+ # to be able to cache summaries of the ArchiveResults for a given
+ # snapshot without having to load all the results from the DB each time.
+ # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
+ # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
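+ # e.g. a cached key like '1a2b3c4d-1616838751.0-tags' goes stale automatically once save() bumps Snapshot.updated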
+ snapshot.save()
else:
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py
index 1f382190..a0883113 100644
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -31,7 +31,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'archive.org.txt').exists():
- # if open(path, 'r').read().strip() != 'None':
+ # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
return False
return SAVE_ARCHIVE_DOT_ORG
diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py
index d9e32c0a..e7d20362 100644
--- a/archivebox/extractors/mercury.py
+++ b/archivebox/extractors/mercury.py
@@ -54,11 +54,13 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / "mercury"
- output = str(output_folder)
+ output = "mercury"
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
+ output_folder.mkdir(exist_ok=True)
+
# Get plain text version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
@@ -71,6 +73,11 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
except json.JSONDecodeError:
raise ShellError(cmd, result)
+ if article_text.get('failed'):
+ raise ArchiveError('Mercury was not able to get article text from the URL')
+
+ atomic_write(str(output_folder / "content.txt"), article_text["content"])
+
# Get HTML version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
@@ -82,9 +89,10 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
except json.JSONDecodeError:
raise ShellError(cmd, result)
- output_folder.mkdir(exist_ok=True)
+ if article_json.get('failed'):
+ raise ArchiveError('Mercury was not able to get article HTML from the URL')
+
atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
- atomic_write(str(output_folder / "content.txt"), article_text["content"])
atomic_write(str(output_folder / "article.json"), article_json)
# Check for common failure cases
diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
index 6e48cd9a..d7c1e303 100644
--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@@ -35,7 +35,7 @@ def get_html(link: Link, path: Path) -> str:
document = None
for source in sources:
try:
- with open(abs_path / source, "r") as f:
+ with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
@@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / "readability"
- output = str(output_folder)
+ output = "readability"
# Readability Docs: https://github.com/mozilla/readability
@@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
temp_doc.write(document.encode("utf-8"))
temp_doc.close()
+ if not document or len(document) < 10:
+ raise ArchiveError('Readability could not find HTML to parse for article text')
+
cmd = [
DEPENDENCIES['READABILITY_BINARY']['path'],
- temp_doc.name
+ temp_doc.name,
]
result = run(cmd, cwd=out_dir, timeout=timeout)
- result_json = json.loads(result.stdout)
+ try:
+ result_json = json.loads(result.stdout)
+ except json.JSONDecodeError:
+ raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
+
output_folder.mkdir(exist_ok=True)
readability_content = result_json.pop("textContent")
atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
@@ -112,6 +119,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
except (Exception, OSError) as err:
status = 'failed'
output = err
+ cmd = [cmd[0], './{singlefile,dom}.html']
finally:
timer.end()
@@ -121,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
cmd_version=READABILITY_VERSION,
output=output,
status=status,
- index_texts= [readability_content] if readability_content else [],
+ index_texts=[readability_content] if readability_content else [],
**timer.stats,
)
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 04ab0a8d..d3d1bedc 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -356,6 +356,7 @@ LINK_FILTERS = {
'regex': lambda pattern: Q(url__iregex=pattern),
'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
'tag': lambda pattern: Q(tags__name=pattern),
+ 'timestamp': lambda pattern: Q(timestamp=pattern),
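+ # e.g. `archivebox list --filter-type=timestamp 1616838751.0` matches a single snapshot by its exact timestamp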
}
@enforce_types
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index ebfe7d78..c4f66f55 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -1,11 +1,12 @@
__package__ = 'archivebox.index'
-from datetime import datetime
-from typing import List, Optional, Iterator, Mapping
from pathlib import Path
+from datetime import datetime
+from collections import defaultdict
+from typing import List, Optional, Iterator, Mapping
from django.utils.html import format_html, mark_safe
-from collections import defaultdict
+from django.core.cache import cache
from .schema import Link
from ..system import atomic_write
@@ -20,7 +21,6 @@ from ..util import (
from ..config import (
OUTPUT_DIR,
VERSION,
- GIT_SHA,
FOOTER_INFO,
HTML_INDEX_FILENAME,
SAVE_ARCHIVE_DOT_ORG,
@@ -60,7 +60,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
return render_django_template(template, {
'version': VERSION,
- 'git_sha': GIT_SHA,
+ 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
'num_links': str(len(links)),
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
@@ -116,71 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
def snapshot_icons(snapshot) -> str:
- from core.models import EXTRACTORS
+ cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
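+ # e.g. (hypothetical) '1a2b3c4d5e6f-1616838751.0-snapshot-icons', rebuilt whenever the snapshot is re-saved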
+
+ def calc_snapshot_icons():
+ from core.models import EXTRACTORS
+ # start = datetime.now()
- # start = datetime.now()
+ archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+ link = snapshot.as_link()
+ path = link.archive_path
+ canon = link.canonical_outputs()
+ output = ""
+ output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
+ icons = {
+ "singlefile": "❶",
+ "wget": "🆆",
+ "dom": "🅷",
+ "pdf": "📄",
+ "screenshot": "💻",
+ "media": "📼",
+ "git": "🅶",
+ "archive_org": "🏛",
+ "readability": "🆁",
+ "mercury": "🅼",
+ "warc": "📦"
+ }
+ exclude = ["favicon", "title", "headers", "archive_org"]
+ # Missing specific entry for WARC
- archive_results = snapshot.archiveresult_set.filter(status="succeeded")
- link = snapshot.as_link()
- path = link.archive_path
- canon = link.canonical_outputs()
- output = ""
- output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
- icons = {
- "singlefile": "❶",
- "wget": "🆆",
- "dom": "🅷",
- "pdf": "📄",
- "screenshot": "💻",
- "media": "📼",
- "git": "🅶",
- "archive_org": "🏛",
- "readability": "🆁",
- "mercury": "🅼",
- "warc": "📦"
- }
- exclude = ["favicon", "title", "headers", "archive_org"]
- # Missing specific entry for WARC
+ extractor_outputs = defaultdict(lambda: None)
+ for extractor, _ in EXTRACTORS:
+ for result in archive_results:
+ if result.extractor == extractor and result:
+ extractor_outputs[extractor] = result
- extractor_outputs = defaultdict(lambda: None)
- for extractor, _ in EXTRACTORS:
- for result in archive_results:
- if result.extractor == extractor and result:
- extractor_outputs[extractor] = result
+ for extractor, _ in EXTRACTORS:
+ if extractor not in exclude:
+ existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
+ # if existing:
+ # existing = (Path(path) / existing)
+ # if existing.is_file():
+ # existing = True
+ # elif existing.is_dir():
+ # existing = any(existing.glob('*.*'))
+ output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
+ extractor, icons.get(extractor, "?"))
+ if extractor == "wget":
+ # warc isn't technically its own extractor, so we have to add it after wget
+
+ # get from db (faster but less truthful)
+ exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # get from filesystem (slower but more accurate)
+ # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+ output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
- for extractor, _ in EXTRACTORS:
- if extractor not in exclude:
- existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
- # if existing:
- # existing = (Path(path) / existing)
- # if existing.is_file():
- # existing = True
- # elif existing.is_dir():
- # existing = any(existing.glob('*.*'))
- output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
- extractor, icons.get(extractor, "?"))
- if extractor == "wget":
- # warc isn't technically it's own extractor, so we have to add it after wget
-
- # get from db (faster but less thurthful)
- exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- # get from filesystem (slower but more accurate)
- # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
- output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
+ if extractor == "archive_org":
+ # The check for archive_org is different, so it has to be handled separately
- if extractor == "archive_org":
- # The check for archive_org is different, so it has to be handled separately
+ # get from db (faster)
+ exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+ # get from filesystem (slower)
+ # target_path = Path(path) / "archive.org.txt"
+ # exists = target_path.exists()
+ output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
+ "archive_org", icons.get("archive_org", "?"))
- # get from db (faster)
- exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- # get from filesystem (slower)
- # target_path = Path(path) / "archive.org.txt"
- # exists = target_path.exists()
- output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
- "archive_org", icons.get("archive_org", "?"))
+ result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8">{}</span>', mark_safe(output))
+ # end = datetime.now()
+ # print(((end - start).total_seconds()*1000) // 1, 'ms')
+ return result
- result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8">{}</span>', mark_safe(output))
- # end = datetime.now()
- # print(((end - start).total_seconds()*1000) // 1, 'ms')
- return result
+ return cache.get_or_set(cache_key, calc_snapshot_icons)
+ # return calc_snapshot_icons()
+
+
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index f24b969f..441e6854 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -15,7 +15,6 @@ from ..config import (
VERSION,
OUTPUT_DIR,
FOOTER_INFO,
- GIT_SHA,
DEPENDENCIES,
JSON_INDEX_FILENAME,
ARCHIVE_DIR_NAME,
@@ -30,7 +29,7 @@ MAIN_INDEX_HEADER = {
'meta': {
'project': 'ArchiveBox',
'version': VERSION,
- 'git_sha': GIT_SHA,
+ 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
'source': 'https://github.com/ArchiveBox/ArchiveBox',
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 1ca4e801..00831e19 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass, asdict, field, fields
+from django.utils.functional import cached_property
from ..system import get_dir_size
@@ -133,7 +134,6 @@ class Link:
updated: Optional[datetime] = None
schema: str = 'Link'
-
def __str__(self) -> str:
return f'[{self.timestamp}] {self.url} "{self.title}"'
@@ -190,6 +190,7 @@ class Link:
}
if extended:
info.update({
+ 'snapshot_id': self.snapshot_id,
'link_dir': self.link_dir,
'archive_path': self.archive_path,
@@ -201,6 +202,9 @@ class Link:
'basename': self.basename,
'extension': self.extension,
'is_static': self.is_static,
+
+ 'tags_str': self.tags, # only used to render static index in index/html.py, remove if no longer needed there
+ 'icons': None, # only used to render static index in index/html.py, remove if no longer needed there
'bookmarked_date': self.bookmarked_date,
'updated_date': self.updated_date,
@@ -255,6 +259,11 @@ class Link:
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
+ @cached_property
+ def snapshot_id(self):
+ from core.models import Snapshot
+ return str(Snapshot.objects.only('id').get(url=self.url).id)
+
@classmethod
def field_names(cls):
return [f.name for f in fields(cls)]
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index 1e99f67c..2fcabd61 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -7,7 +7,7 @@ from django.db.models import QuerySet
from django.db import transaction
from .schema import Link
-from ..util import enforce_types
+from ..util import enforce_types, parse_date
from ..config import OUTPUT_DIR
@@ -23,13 +23,15 @@ def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
)
@enforce_types
-def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None:
- with transaction.atomic():
- snapshots.delete()
+def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+ if atomic:
+ with transaction.atomic():
+ return snapshots.delete()
+ return snapshots.delete()
@enforce_types
def write_link_to_sql_index(link: Link):
- from core.models import Snapshot
+ from core.models import Snapshot, ArchiveResult
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
tags = info.pop("tags")
if tags is None:
@@ -41,36 +43,74 @@ def write_link_to_sql_index(link: Link):
while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
- snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
+ snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
snapshot.save_tags(tags)
+
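+ # each history entry is either a raw dict loaded from a JSON index file or an ArchiveResult
+ # dataclass, e.g. {'cmd': [...], 'start_ts': '2021-03-27 09:52:31', 'status': 'succeeded', ...}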
+ for extractor, entries in link.history.items():
+ for entry in entries:
+ if isinstance(entry, dict):
+ result, _ = ArchiveResult.objects.get_or_create(
+ snapshot_id=snapshot.id,
+ extractor=extractor,
+ start_ts=parse_date(entry['start_ts']),
+ defaults={
+ 'end_ts': parse_date(entry['end_ts']),
+ 'cmd': entry['cmd'],
+ 'output': entry['output'],
+ 'cmd_version': entry.get('cmd_version') or 'unknown',
+ 'pwd': entry['pwd'],
+ 'status': entry['status'],
+ }
+ )
+ else:
+ result, _ = ArchiveResult.objects.update_or_create(
+ snapshot_id=snapshot.id,
+ extractor=extractor,
+ start_ts=parse_date(entry.start_ts),
+ defaults={
+ 'end_ts': parse_date(entry.end_ts),
+ 'cmd': entry.cmd,
+ 'output': entry.output,
+ 'cmd_version': entry.cmd_version or 'unknown',
+ 'pwd': entry.pwd,
+ 'status': entry.status,
+ }
+ )
+
return snapshot
@enforce_types
def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
- with transaction.atomic():
- for link in links:
- write_link_to_sql_index(link)
+ for link in links:
+ # with transaction.atomic():
+ # write_link_to_sql_index(link)
+ write_link_to_sql_index(link)
@enforce_types
def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
from core.models import Snapshot
- with transaction.atomic():
- try:
- snap = Snapshot.objects.get(url=link.url)
- except Snapshot.DoesNotExist:
- snap = write_link_to_sql_index(link)
- snap.title = link.title
+ # with transaction.atomic():
+ # try:
+ # snap = Snapshot.objects.get(url=link.url)
+ # except Snapshot.DoesNotExist:
+ # snap = write_link_to_sql_index(link)
+ # snap.title = link.title
+ try:
+ snap = Snapshot.objects.get(url=link.url)
+ except Snapshot.DoesNotExist:
+ snap = write_link_to_sql_index(link)
+ snap.title = link.title
- tag_set = (
- set(tag.strip() for tag in (link.tags or '').split(','))
- )
- tag_list = list(tag_set) or []
+ tag_set = (
+ set(tag.strip() for tag in (link.tags or '').split(','))
+ )
+ tag_list = list(tag_set) or []
- snap.save()
- snap.save_tags(tag_list)
+ snap.save()
+ snap.save_tags(tag_list)
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index f2b86735..492ae55e 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re
import os
import sys
+import stat
import time
import argparse
from math import log
@@ -11,18 +12,21 @@ from pathlib import Path
from datetime import datetime
from dataclasses import dataclass
-from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING
+from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING
if TYPE_CHECKING:
from .index.schema import Link, ArchiveResult
+from .system import get_dir_size
from .util import enforce_types
from .config import (
ConfigDict,
OUTPUT_DIR,
PYTHON_ENCODING,
+ VERSION,
ANSI,
IS_TTY,
+ IN_DOCKER,
TERM_WIDTH,
SHOW_PROGRESS,
SOURCES_DIR_NAME,
@@ -50,6 +54,37 @@ class RuntimeStats:
_LAST_RUN_STATS = RuntimeStats()
+def debug_dict_summary(obj: Dict[Any, Any]) -> None:
+ stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))
+
+
+def get_fd_info(fd) -> Dict[str, Any]:
+ NAME = fd.name[1:-1]
+ FILENO = fd.fileno()
+ MODE = os.fstat(FILENO).st_mode
+ IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
+ IS_PIPE = stat.S_ISFIFO(MODE)
+ IS_FILE = stat.S_ISREG(MODE)
+ IS_TERMINAL = not (IS_PIPE or IS_FILE)
+ IS_LINE_BUFFERED = fd.line_buffering
+ IS_READABLE = fd.readable()
+ return {
+ 'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
+ 'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
+ 'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
+ 'IS_READABLE': IS_READABLE,
+ }
+
+
+# # Log debug information about stdin, stdout, and stderr
+# sys.stdout.write('[>&1] this is python stdout\n')
+# sys.stderr.write('[>&2] this is python stderr\n')
+
+# debug_dict_summary(get_fd_info(sys.stdin))
+# debug_dict_summary(get_fd_info(sys.stdout))
+# debug_dict_summary(get_fd_info(sys.stderr))
+
+
class SmartFormatter(argparse.HelpFormatter):
"""Patched formatter that prints newlines in argparse help strings"""
@@ -62,22 +97,40 @@ class SmartFormatter(argparse.HelpFormatter):
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
- if stdin and not stdin.isatty():
- stdin_raw_text = stdin.read().strip()
+ if not stdin:
+ return None
+
+ if IN_DOCKER:
+ # when TTY is disabled in docker we can't tell if stdin is being piped in or not
+ # if we try to read stdin when it's not piped we will hang indefinitely waiting for it
+ return None
+
+ if not stdin.isatty():
+ # stderr('READING STDIN TO REJECT...')
+ stdin_raw_text = stdin.read()
if stdin_raw_text:
+ # stderr('GOT STDIN!', len(stdin_raw_text))
stderr(f'[X] The "{caller}" command does not accept stdin.', color='red')
stderr(f' Run archivebox "{caller} --help" to see usage and examples.')
stderr()
raise SystemExit(1)
+ return None
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
"""accept any standard input and return it as a string or None"""
+
if not stdin:
return None
- elif stdin and not stdin.isatty():
- stdin_str = stdin.read().strip()
- return stdin_str or None
+
+ if not stdin.isatty():
+ # stderr('READING STDIN TO ACCEPT...')
+ stdin_str = stdin.read()
+
+ if stdin_str:
+ # stderr('GOT STDIN...', len(stdin_str))
+ return stdin_str
+
return None
@@ -174,7 +227,6 @@ def progress_bar(seconds: int, prefix: str='') -> None:
def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
- from .config import VERSION, ANSI
cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -233,11 +285,11 @@ def log_indexing_process_finished():
def log_indexing_started(out_path: str):
if IS_TTY:
- sys.stdout.write(f' > {out_path}')
+ sys.stdout.write(f' > ./{Path(out_path).relative_to(OUTPUT_DIR)}')
def log_indexing_finished(out_path: str):
- print(f'\r √ {out_path}')
+ print(f'\r √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')
### Archiving Stage
@@ -272,8 +324,6 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
total=num_links,
))
print()
- print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
- print(' archivebox server # then visit http://127.0.0.1:8000')
print(' Continue archiving where you left off by running:')
print(' archivebox update --resume={}'.format(timestamp))
@@ -331,6 +381,9 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
else:
_LAST_RUN_STATS.succeeded += 1
+ size = get_dir_size(link_dir)
+ print(' {black}{} files ({}){reset}'.format(size[2], printable_filesize(size[0]), **ANSI))
+
def log_archive_method_started(method: str):
print(' > {}'.format(method))
diff --git a/archivebox/main.py b/archivebox/main.py
index c1751528..5c697c55 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -67,6 +67,7 @@ from .config import (
ConfigDict,
ANSI,
IS_TTY,
+ DEBUG,
IN_DOCKER,
USER,
ARCHIVEBOX_BINARY,
@@ -76,6 +77,7 @@ from .config import (
ARCHIVE_DIR,
LOGS_DIR,
CONFIG_FILE,
+ CONFIG_FILENAME,
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
@@ -84,6 +86,7 @@ from .config import (
SQL_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
+ SEARCH_BACKEND_ENGINE,
check_dependencies,
check_data_folder,
write_config_file,
@@ -125,14 +128,19 @@ ALLOWED_IN_OUTPUT_DIR = {
'node_modules',
'package-lock.json',
'static',
+ 'sonic',
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
SQL_INDEX_FILENAME,
+ f'{SQL_INDEX_FILENAME}-wal',
+ f'{SQL_INDEX_FILENAME}-shm',
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
+ CONFIG_FILENAME,
+ f'{CONFIG_FILENAME}.bak',
}
@enforce_types
@@ -214,9 +222,23 @@ def version(quiet: bool=False,
if quiet:
print(VERSION)
else:
+ # ArchiveBox v0.5.6
+ # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
print('ArchiveBox v{}'.format(VERSION))
p = platform.uname()
- print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
+ print(
+ sys.implementation.name.title(),
+ p.system,
+ platform.platform(),
+ p.machine,
+ )
+ print(
+ f'IN_DOCKER={IN_DOCKER}',
+ f'DEBUG={DEBUG}',
+ f'IS_TTY={IS_TTY}',
+ f'TZ={os.environ.get("TZ", "UTC")}',
+ f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
+ )
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
@@ -261,7 +283,7 @@ def run(subcommand: str,
@enforce_types
-def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
"""Initialize a new ArchiveBox collection in the current directory"""
from core.models import Snapshot
@@ -276,13 +298,12 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()
if is_empty and not existing_index:
- print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
- print(f' {out_dir}')
- print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+ print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
+ print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
elif existing_index:
- print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
- print(f' {out_dir}')
- print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+ # TODO: properly detect and print the existing version in current index as well
+ print('{green}[^] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI))
+ print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
else:
if force:
stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
@@ -303,30 +324,25 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
else:
print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
+ print(f' + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
Path(SOURCES_DIR).mkdir(exist_ok=True)
- print(f' √ {SOURCES_DIR}')
-
Path(ARCHIVE_DIR).mkdir(exist_ok=True)
- print(f' √ {ARCHIVE_DIR}')
-
Path(LOGS_DIR).mkdir(exist_ok=True)
- print(f' √ {LOGS_DIR}')
-
+ print(f' + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
write_config_file({}, out_dir=out_dir)
- print(f' √ {CONFIG_FILE}')
+
if (Path(out_dir) / SQL_INDEX_FILENAME).exists():
- print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
+ print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
else:
- print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
+ print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME
- print(f' √ {DATABASE_FILE}')
- print()
for migration_line in apply_migrations(out_dir):
print(f' {migration_line}')
-
assert DATABASE_FILE.exists()
+ print()
+ print(f' √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')
# from django.contrib.auth.models import User
# if IS_TTY and not User.objects.filter(is_superuser=True).exists():
@@ -334,7 +350,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
# call_command("createsuperuser", interactive=True)
print()
- print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
+ print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI))
all_links = Snapshot.objects.none()
pending_links: Dict[str, Link] = {}
@@ -343,63 +359,77 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
all_links = load_main_index(out_dir=out_dir, warn=False)
print(' √ Loaded {} links from existing main index.'.format(all_links.count()))
- # Links in data folders that dont match their timestamp
- fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
- if fixed:
- print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
- if cant_fix:
- print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
+ if quick:
+ print(' > Skipping full snapshot directory check (quick mode)')
+ else:
+ try:
+ # Links in data folders that don't match their timestamp
+ fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+ if fixed:
+ print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
+ if cant_fix:
+ print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
- # Links in JSON index but not in main index
- orphaned_json_links = {
- link.url: link
- for link in parse_json_main_index(out_dir)
- if not all_links.filter(url=link.url).exists()
- }
- if orphaned_json_links:
- pending_links.update(orphaned_json_links)
- print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
+ # Links in JSON index but not in main index
+ orphaned_json_links = {
+ link.url: link
+ for link in parse_json_main_index(out_dir)
+ if not all_links.filter(url=link.url).exists()
+ }
+ if orphaned_json_links:
+ pending_links.update(orphaned_json_links)
+ print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
- # Links in data dir indexes but not in main index
- orphaned_data_dir_links = {
- link.url: link
- for link in parse_json_links_details(out_dir)
- if not all_links.filter(url=link.url).exists()
- }
- if orphaned_data_dir_links:
- pending_links.update(orphaned_data_dir_links)
- print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+ # Links in data dir indexes but not in main index
+ orphaned_data_dir_links = {
+ link.url: link
+ for link in parse_json_links_details(out_dir)
+ if not all_links.filter(url=link.url).exists()
+ }
+ if orphaned_data_dir_links:
+ pending_links.update(orphaned_data_dir_links)
+ print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
- # Links in invalid/duplicate data dirs
- invalid_folders = {
- folder: link
- for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
- }
- if invalid_folders:
- print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
- print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
- print()
- print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
- print(' archivebox status')
- print(' archivebox list --status=invalid')
+ # Links in invalid/duplicate data dirs
+ invalid_folders = {
+ folder: link
+ for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+ }
+ if invalid_folders:
+ print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+ print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items()))
+ print()
+ print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
+ print(' archivebox status')
+ print(' archivebox list --status=invalid')
+ except (KeyboardInterrupt, SystemExit):
+ stderr()
+ stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red')
+ stderr(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
+ stderr()
+ stderr(' {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI))
+ stderr(' archivebox init --quick')
+ raise SystemExit(1)
+
+ write_main_index(list(pending_links.values()), out_dir=out_dir)
- write_main_index(list(pending_links.values()), out_dir=out_dir)
-
- print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+ print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
if existing_index:
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
else:
- print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
- print()
- print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
- print(' archivebox server # then visit http://127.0.0.1:8000')
- print()
- print(' To add new links, you can run:')
- print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
- print()
- print(' For more usage and examples, run:')
- print(' archivebox help')
+ print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
+
+ if Snapshot.objects.count() < 25: # hide the hints for experienced users
+ print()
+ print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
+ print(' archivebox server # then visit http://127.0.0.1:8000')
+ print()
+ print(' To add new links, you can run:')
+ print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
+ print()
+ print(' For more usage and examples, run:')
+ print(' archivebox help')
json_index = Path(out_dir) / JSON_INDEX_FILENAME
html_index = Path(out_dir) / HTML_INDEX_FILENAME
@@ -531,6 +561,7 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
@enforce_types
def add(urls: Union[str, List[str]],
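+ # tag: a comma-separated string of tag names (e.g. 'tag1,tag2') applied to every imported link below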
+ tag: str='',
depth: int=0,
update_all: bool=not ONLY_NEW,
index_only: bool=False,
@@ -540,6 +571,8 @@ def add(urls: Union[str, List[str]],
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Add a new URL or list of URLs to your archive"""
+ from core.models import Tag
+
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
extractors = extractors.split(",") if extractors else []
@@ -572,26 +605,48 @@ def add(urls: Union[str, List[str]],
new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
+
new_links = dedupe_links(all_links, imported_links)
write_main_index(links=new_links, out_dir=out_dir)
all_links = load_main_index(out_dir=out_dir)
if index_only:
- return all_links
- # Run the archive methods for each link
- archive_kwargs = {
- "out_dir": out_dir,
- }
- if extractors:
- archive_kwargs["methods"] = extractors
- if update_all:
- archive_links(all_links, overwrite=overwrite, **archive_kwargs)
- elif overwrite:
- archive_links(imported_links, overwrite=True, **archive_kwargs)
- elif new_links:
- archive_links(new_links, overwrite=False, **archive_kwargs)
+ # mock-archive all the links using the fake index_only extractor method, just to update their index state
+ if overwrite:
+ archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir)
+ else:
+ archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir)
+ else:
+ # fully run the archive extractor methods for each link
+ archive_kwargs = {
+ "out_dir": out_dir,
+ }
+ if extractors:
+ archive_kwargs["methods"] = extractors
+
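+ # precedence: update_all re-archives everything already in the index, overwrite re-archives just this import batch, otherwise only never-before-seen links get fetched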
+ if update_all:
+ archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+ elif overwrite:
+ archive_links(imported_links, overwrite=True, **archive_kwargs)
+ elif new_links:
+ archive_links(new_links, overwrite=False, **archive_kwargs)
+
+
+ # add any tags to imported links
+ tags = [
+ Tag.objects.get_or_create(name=name.strip())[0]
+ for name in tag.split(',')
+ if name.strip()
+ ]
+ if tags:
+ for link in imported_links:
+ snapshot = link.as_snapshot()
+ snapshot.tags.add(*tags)
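+ # nocache=True recomputes the snapshot's cached tags string so the new tags show up in the index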
+ snapshot.tags_str(nocache=True)
+ snapshot.save()
+ # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
return all_links
@@ -811,11 +866,15 @@ def list_links(snapshots: Optional[QuerySet]=None,
all_snapshots = load_main_index(out_dir=out_dir)
if after is not None:
- all_snapshots = all_snapshots.filter(timestamp__lt=after)
+ all_snapshots = all_snapshots.filter(timestamp__gte=after)
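+ # gte/lt match the flag semantics: --after is inclusive, --before is exclusive (the old comparisons were inverted)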
if before is not None:
- all_snapshots = all_snapshots.filter(timestamp__gt=before)
+ all_snapshots = all_snapshots.filter(timestamp__lt=before)
if filter_patterns:
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
+
+ if not all_snapshots:
+ stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
+
return all_snapshots
@enforce_types
@@ -1061,6 +1120,7 @@ def server(runserver_args: Optional[List[str]]=None,
reload: bool=False,
debug: bool=False,
init: bool=False,
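+ # quick_init skips the slow archive-dir verification pass (equivalent to `archivebox init --quick`)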
+ quick_init: bool=False,
createsuperuser: bool=False,
out_dir: Path=OUTPUT_DIR) -> None:
"""Run the ArchiveBox HTTP server"""
@@ -1069,9 +1129,14 @@ def server(runserver_args: Optional[List[str]]=None,
if init:
run_subcommand('init', stdin=None, pwd=out_dir)
+ print()
+ elif quick_init:
+ run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
+ print()
if createsuperuser:
run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+ print()
# setup config for django runserver
from . import config
@@ -1083,12 +1148,9 @@ def server(runserver_args: Optional[List[str]]=None,
from django.core.management import call_command
from django.contrib.auth.models import User
- admin_user = User.objects.filter(is_superuser=True).order_by('date_joined').only('username').last()
-
print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
- if admin_user:
- hint('The admin username is{lightblue} {}{reset}\n'.format(admin_user.username, **ANSI))
- else:
+ print(' > Logging errors to ./logs/errors.log')
+ if not User.objects.filter(is_superuser=True).exists():
print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
print()
print(' To create an admin user, run:')
@@ -1106,7 +1168,6 @@ def server(runserver_args: Optional[List[str]]=None,
config.SHOW_PROGRESS = False
config.DEBUG = config.DEBUG or debug
-
call_command("runserver", *runserver_args)
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 441c08ac..4af2c5ac 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -68,7 +68,6 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
"""
parse a list of URLS without touching the filesystem
"""
- check_url_parsing_invariants()
timer = TimedProgress(TIMEOUT * 4)
#urls = list(map(lambda x: x + "\n", urls))
@@ -89,8 +88,6 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
RSS feed, bookmarks export, or text file
"""
- check_url_parsing_invariants()
-
timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file:
links, parser = run_parser_functions(file, timer, root_url=root_url)
@@ -173,31 +170,48 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
return source_path
-def check_url_parsing_invariants() -> None:
- """Check that plain text regex URL parsing works as expected"""
-
- # this is last-line-of-defense to make sure the URL_REGEX isn't
- # misbehaving, as the consequences could be disastrous and lead to many
- # incorrect/badly parsed links being added to the archive
-
- test_urls = '''
- https://example1.com/what/is/happening.html?what=1#how-about-this=1
- https://example2.com/what/is/happening/?what=1#how-about-this=1
- HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
- https://example4.com/what/is/happening.html
- https://example5.com/
- https://example6.com
-
- http://example7.com
- [https://example8.com/what/is/this.php?what=1]
- [and http://example9.com?what=1&other=3#and-thing=2]
- https://example10.com#and-thing=2 "
- abcdef
- sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
- example13.bada
- and example14.badb
- htt://example15.badc
- '''
- # print('\n'.join(re.findall(URL_REGEX, test_urls)))
- assert len(re.findall(URL_REGEX, test_urls)) == 12
-
+# Check that plain text regex URL parsing works as expected
+# this is a last line of defense to make sure the URL_REGEX isn't
+# misbehaving due to some OS-level or environment-level quirk (e.g. a bad regex lib)
+# the consequences of bad URL parsing could be disastrous and lead to many
+# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
+_test_url_strs = {
+ 'example.com': 0,
+ '/example.com': 0,
+ '//example.com': 0,
+ ':/example.com': 0,
+ '://example.com': 0,
+ 'htt://example8.com': 0,
+ '/htt://example.com': 0,
+ 'https://example': 1,
+ 'https://localhost/2345': 1,
+ 'https://localhost:1234/123': 1,
+ '://': 0,
+ 'https://': 0,
+ 'http://': 0,
+ 'ftp://': 0,
+ 'ftp://example.com': 0,
+ 'https://example.com': 1,
+ 'https://example.com/': 1,
+ 'https://a.example.com': 1,
+ 'https://a.example.com/': 1,
+ 'https://a.example.com/what/is/happening.html': 1,
+ 'https://a.example.com/what/ís/happening.html': 1,
+ 'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
+ 'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
+ 'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
+ 'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
+ 'https://example.com?what=1#how-about-this=1&2%20baf': 1,
+ 'http://example7.com': 1,
+ '[https://example8.com/what/is/this.php?what=1]': 1,
+ '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
+ 'https://example10.com#and-thing=2 "': 1,
+ 'abcdef': 0,
+ 'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
+ 'http://examplehttp://15.badc': 2,
+ 'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
+ '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
+}
+for url_str, num_urls in _test_url_strs.items():
+ assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
+ f'{url_str} does not contain {num_urls} urls')
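+# NOTE: these asserts run once at module import, so a misbehaving URL_REGEX fails loudly
+# at startup instead of silently adding badly parsed links to the archive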
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index e6d15455..82d1880e 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -16,7 +16,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
if extra_path:
fpath = f'{fpath}/{extra_path}'
- with open(fpath, 'r') as file:
+ with open(fpath, 'r', encoding='utf-8') as file:
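+ # read back with the same utf-8 encoding that system.atomic_write enforces on all text writes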
data = file.read()
if data:
return [data]
diff --git a/archivebox/system.py b/archivebox/system.py
index 2191c70a..3c43eeaf 100644
--- a/archivebox/system.py
+++ b/archivebox/system.py
@@ -10,7 +10,7 @@ from typing import Optional, Union, Set, Tuple
from subprocess import run as subprocess_run
from crontab import CronTab
-from atomicwrites import atomic_write as lib_atomic_write
+from .vendor.atomicwrites import atomic_write as lib_atomic_write
from .util import enforce_types, ExtendedEncoder
from .config import OUTPUT_PERMISSIONS
@@ -37,10 +37,11 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
mode = 'wb+' if isinstance(contents, bytes) else 'w'
+ encoding = None if isinstance(contents, bytes) else 'utf-8' # enforce utf-8 on all text writes
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
try:
- with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
+ with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f:
if isinstance(contents, dict):
dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
elif isinstance(contents, (bytes, str)):
diff --git a/archivebox/templates/admin/actions_as_select.html b/archivebox/templates/admin/actions_as_select.html
index 86a77190..e69de29b 100644
--- a/archivebox/templates/admin/actions_as_select.html
+++ b/archivebox/templates/admin/actions_as_select.html
@@ -1 +0,0 @@
-actions_as_select
diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html
index d8ad8d00..a3d21ba9 100644
--- a/archivebox/templates/admin/base.html
+++ b/archivebox/templates/admin/base.html
@@ -20,7 +20,7 @@
-