diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml
index 86d6f1ee..49e9750a 100644
--- a/.github/workflows/debian.yml
+++ b/.github/workflows/debian.yml
@@ -4,10 +4,12 @@ on:
workflow_dispatch:
push:
+env:
+ DEB_BUILD_OPTIONS: nocheck
jobs:
build:
- runs-on: ubuntu-latest
+ runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
@@ -15,31 +17,60 @@ jobs:
submodules: true
fetch-depth: 1
- - name: Set up Python
- uses: actions/setup-python@v1
- with:
- python-version: 3.9
- architecture: x64
-
- - name: Build Debian/Apt package
+ - name: Install packaging dependencies
run: |
- sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-setuptools python3-wheel python3-stdeb
- pip3 install --upgrade pip setuptools wheel stdeb
- ./bin/build_deb.sh
+ sudo apt install -y \
+ python3 python3-dev python3-pip python3-venv python3-all \
+ dh-python debhelper devscripts dput software-properties-common \
+ python3-distutils python3-setuptools python3-wheel python3-stdeb
+
+ - name: Build Debian/Apt sdist_dsc
+ run: |
+ rm -Rf deb_dist/*
+ python3 setup.py --command-packages=stdeb.command sdist_dsc
+
+ - name: Build Debian/Apt bdist_deb
+ run: |
+ python3 setup.py --command-packages=stdeb.command bdist_deb
- name: Install archivebox from deb
run: |
- apt install deb_dist/archivebox*.deb
+ cd deb_dist/
+ sudo apt install ./archivebox*.deb
+
+ - name: Check ArchiveBox version
+ run: |
+ # must create dir needed for snaps to run as non-root on github actions
+ sudo mkdir -p /run/user/1001 && sudo chmod -R 777 /run/user/1001
+ mkdir "${{ github.workspace }}/data" && cd "${{ github.workspace }}/data"
+ archivebox init
+ archivebox config --set SAVE_READABILITY=False
+ archivebox config --set SAVE_MERCURY=False
+ archivebox config --set SAVE_SINGLEFILE=False
+ archivebox version
- name: Add some links to test
run: |
- mkdir data && cd data
- archivebox init
+ cd "${{ github.workspace }}/data"
archivebox add 'https://example.com'
- archivebox version
archivebox status
- # TODO: push debian package to launchpad PPA
- # - name: Push to launchpad
+ # - name: Commit built package
# run: |
+ # cd deb_dist/
+ # git config --local user.email "action@github.com"
+ # git config --local user.name "GitHub Action"
+ # git commit -m "Debian package autobuild" -a
+
+ # - name: Push build to Github
+ # uses: ad-m/github-push-action@master
+ # with:
+ # github_token: ${{ secrets.GITHUB_TOKEN }}
+ # repository: ArchiveBox/debian-archivebox
+ # branch: ${{ github.ref }}
+ # directory: deb_dist
+
+ # - name: Push build to Launchpad PPA
+ # run: |
+ # debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"
# dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"
diff --git a/.github/workflows/homebrew.yml b/.github/workflows/homebrew.yml
index e5e71420..d9bb05f1 100644
--- a/.github/workflows/homebrew.yml
+++ b/.github/workflows/homebrew.yml
@@ -15,12 +15,14 @@ jobs:
submodules: true
fetch-depth: 1
+ # TODO: modify archivebox.rb to update src url, hashes, and dependencies
+
- name: Build Homebrew Bottle
run: |
pip3 install --upgrade pip setuptools wheel
cd brew_dist/
brew install --build-bottle ./archivebox.rb
- brew bottle archivebox
+ # brew bottle archivebox
- name: Add some links to test
run: |
@@ -30,4 +32,19 @@ jobs:
archivebox version
archivebox status
- # TODO: push bottle to Github and open homebrew core PR with latest changes
+ # - name: Commit built package
+ # run: |
+ # cd brew_dist/
+ # git config --local user.email "action@github.com"
+ # git config --local user.name "GitHub Action"
+ # git commit -m "Homebrew package autobuild" -a
+
+ # - name: Push build to Github
+ # uses: ad-m/github-push-action@master
+ # with:
+ # github_token: ${{ secrets.GITHUB_TOKEN }}
+ # repository: ArchiveBox/homebrew-archivebox
+ # branch: ${{ github.ref }}
+ # directory: brew_dist
+
+ # TODO: push bottle homebrew core PR with latest changes
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index c4479c4b..80f4f19f 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -9,7 +9,7 @@ env:
jobs:
lint:
- runs-on: ubuntu-latest
+ runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
with:
diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml
index 915ebfd1..36153189 100644
--- a/.github/workflows/pip.yml
+++ b/.github/workflows/pip.yml
@@ -7,7 +7,7 @@ on:
jobs:
build:
- runs-on: ubuntu-latest
+ runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
@@ -24,6 +24,7 @@ jobs:
- name: Build Python Package
run: |
pip3 install --upgrade pip setuptools wheel
+ rm -Rf pip_dist/*.whl
python3 setup.py \
sdist --dist-dir=./pip_dist \
bdist_wheel --dist-dir=./pip_dist \
@@ -38,4 +39,23 @@ jobs:
archivebox version
archivebox status
- # TODO: push to PyPI with twine
+ # - name: Commit built package
+ # run: |
+ # cd pip_dist/
+ # git config --local user.email "action@github.com"
+ # git config --local user.name "GitHub Action"
+ # git commit -m "Pip package autobuild" -a
+
+ # - name: Push build to Github
+ # uses: ad-m/github-push-action@master
+ # with:
+ # github_token: ${{ secrets.GITHUB_TOKEN }}
+ # repository: ArchiveBox/pip-archivebox
+ # branch: ${{ github.ref }}
+ # directory: pip_dist
+
+ # - name: Push build to PyPI
+ # run: |
+ # cd pip_dist/
+ # python3 -m twine upload --repository testpypi pip_dist/*.{whl,tar.gz}
+ # python3 -m twine upload --repository pypi pip_dist/*.{whl,tar.gz}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b73c9e89..9a6c76f2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -3,6 +3,9 @@ on: [push]
env:
DOCKER_IMAGE: archivebox-ci
+ PYTHONIOENCODING: utf-8
+ PYTHONLEGACYWINDOWSSTDIO: utf-8
+ USE_COLOR: False
jobs:
python_tests:
@@ -10,8 +13,8 @@ jobs:
strategy:
matrix:
- os: [ubuntu-latest, macos-latest]
- python: [3.7, 3.8]
+ os: [ubuntu-20.04, macos-latest, windows-latest]
+ python: [3.7]
steps:
- uses: actions/checkout@v2
@@ -77,10 +80,15 @@ jobs:
- name: Directory listing for debugging
run: |
pwd
- ls -a ./
+ ls
+
+ - name: Archivebox version
+ run: |
archivebox version
- name: Test built package with pytest
+ # TODO: remove this exception for windows once we get tests passing on that platform
+ if: ${{ !contains(matrix.os, 'windows') }}
run: |
python -m pytest -s
@@ -102,8 +110,8 @@ jobs:
- name: Init data dir
run: |
- mkdir data
- docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" init
+ mkdir "${{ github.workspace }}/data"
+ docker run -v "${{ github.workspace }}/data":/data "$DOCKER_IMAGE" init
- name: Run test server
run: |
diff --git a/.gitignore b/.gitignore
index 677066cf..e29719e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
*.pyc
__pycache__/
.mypy_cache/
+tests/out/
# Python and Node dependencies
venv/
@@ -11,6 +12,7 @@ venv/
node_modules/
# Packaging artifacts
+archivebox.egg-info
archivebox-*.tar.gz
build/
dist/
diff --git a/README.md b/README.md
index b8be5bce..54e0b24e 100644
--- a/README.md
+++ b/README.md
@@ -26,62 +26,175 @@
-ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64).
+ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects.
-Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time.
+Your archive can be managed through the command line with commands like `archivebox add`, through the built-in Web UI `archivebox server`, or via the Python library API (beta). It can ingest bookmarks from a browser or service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. You can also schedule regular/realtime imports with `archivebox schedule`.
-The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and managable offline through the filesystem, the built-in webserver, or the Python API.
+The main index is a self-contained `index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: several types of HTML snapshots (wget, Chrome headless, singlefile), PDF snapshotting, screenshotting, WARC archiving, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and manageable offline through the filesystem, the built-in webserver, or the Python library API.
-#### Quickstart
+### Quickstart
+
+It works on Linux/BSD (Intel and ARM CPUs with `docker`/`apt`/`pip3`), macOS (with `docker`/`brew`/`pip3`), and Windows (beta with `docker`/`pip3`).
-**First, get ArchiveBox using your system package manager, Docker, or pip:**
```bash
-# You can run it with Docker or Docker Compose (recommended)
-docker pull archivebox/archivebox
-# https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml
+pip3 install archivebox
+archivebox --version
+# install extras as-needed, or use one of full setup methods below to get everything out-of-the-box
-# or Ubuntu/Debian
+mkdir ~/archivebox && cd ~/archivebox # this can be anywhere
+archivebox init
+
+archivebox add 'https://example.com'
+archivebox add --depth=1 'https://example.com'
+archivebox schedule --every=day https://getpocket.com/users/USERNAME/feed/all
+archivebox oneshot --extract=title,favicon,media https://www.youtube.com/watch?v=dQw4w9WgXcQ
+archivebox help # to see more options
+```
+
+*(click to expand the sections below for full setup instructions)*
+
+
+Get ArchiveBox with docker-compose
on any platform (recommended, everything included out-of-the-box)
+
+First make sure you have Docker installed: https://docs.docker.com/get-docker/
+
+This is the recommended way to run ArchiveBox because it includes *all* the extractors like chrome, wget, youtube-dl, git, etc., as well as full-text search with sonic, and many other great features.
+
+```bash
+# create a new empty directory and initialize your collection (can be anywhere)
+mkdir ~/archivebox && cd ~/archivebox
+curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml
+docker-compose run archivebox init
+docker-compose run archivebox --version
+
+# start the webserver and open the UI (optional)
+docker-compose run archivebox manage createsuperuser
+docker-compose up -d
+open http://127.0.0.1:8000
+
+# you can also add links and manage your archive via the CLI:
+docker-compose run archivebox add 'https://example.com'
+docker-compose run archivebox status
+docker-compose run archivebox help # to see more options
+```
+
+
+
+
+Get ArchiveBox with docker
on any platform
+
+First make sure you have Docker installed: https://docs.docker.com/get-docker/
+```bash
+# create a new empty directory and initialize your collection (can be anywhere)
+mkdir ~/archivebox && cd ~/archivebox
+docker run -v $PWD:/data -it archivebox/archivebox init
+docker run -v $PWD:/data -it archivebox/archivebox --version
+
+# start the webserver and open the UI (optional)
+docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser
+docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000
+open http://127.0.0.1:8000
+
+# you can also add links and manage your archive via the CLI:
+docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'
+docker run -v $PWD:/data -it archivebox/archivebox status
+docker run -v $PWD:/data -it archivebox/archivebox help # to see more options
+```
+
+
+
+
+Get ArchiveBox with apt
on Ubuntu >=20.04
+
+```bash
sudo add-apt-repository -u ppa:archivebox/archivebox
-apt install archivebox
+sudo apt install archivebox
-# or macOS
+# create a new empty directory and initialize your collection (can be anywhere)
+mkdir ~/archivebox && cd ~/archivebox
+npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
+archivebox init
+archivebox --version
+
+# start the webserver and open the web UI (optional)
+archivebox manage createsuperuser
+archivebox server 0.0.0.0:8000
+open http://127.0.0.1:8000
+
+# you can also add URLs and manage the archive via the CLI and filesystem:
+archivebox add 'https://example.com'
+archivebox status
+archivebox list --html --with-headers > index.html
+archivebox list --json --with-headers > index.json
+archivebox help # to see more options
+```
+
+For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`:
+```bash
+deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
+deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
+```
+(you may need to install some other dependencies manually however)
+
+
+
+
+Get ArchiveBox with brew
on macOS >=10.13
+
+```bash
brew install archivebox/archivebox/archivebox
-# or for the Python version only, without wget/git/chrome/etc. included
+# create a new empty directory and initialize your collection (can be anywhere)
+mkdir ~/archivebox && cd ~/archivebox
+npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
+archivebox init
+archivebox --version
+
+# start the webserver and open the web UI (optional)
+archivebox manage createsuperuser
+archivebox server 0.0.0.0:8000
+open http://127.0.0.1:8000
+
+# you can also add URLs and manage the archive via the CLI and filesystem:
+archivebox add 'https://example.com'
+archivebox status
+archivebox list --html --with-headers > index.html
+archivebox list --json --with-headers > index.json
+archivebox help # to see more options
+```
+
+
+
+
+Get ArchiveBox with pip
on any platform
+
+```bash
pip3 install archivebox
-# If you're using an apt/brew/pip install you can run archivebox commands normally
-# archivebox [subcommand] [...args]
-# If you're using Docker you'll have to run the commands like this
-# docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]
-# And the equivalent in Docker Compose:
-# docker-compose run archivebox [subcommand] [...args]
-```
-
-Check that everything installed correctly with `archivebox --version`
-
-**To start using archivebox, you have to create a data folder and `cd` into it:**
-
-```bash
-mkdir ~/archivebox && cd ~/archivebox # you can put the collection dir anywhere
+# create a new empty directory and initialize your collection (can be anywhere)
+mkdir ~/archivebox && cd ~/archivebox
+npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
archivebox init
+archivebox --version
+# Install any missing extras like wget/git/chrome/etc. manually as needed
+
+# start the webserver and open the web UI (optional)
+archivebox manage createsuperuser
+archivebox server 0.0.0.0:8000
+open http://127.0.0.1:8000
+
+# you can also add URLs and manage the archive via the CLI and filesystem:
+archivebox add 'https://example.com'
+archivebox status
+archivebox list --html --with-headers > index.html
+archivebox list --json --with-headers > index.json
+archivebox help # to see more options
```
-**Then Add some URLs to your archive collection:**
-```bash
-archivebox add https://github.com/ArchiveBox/ArchiveBox
-archivebox add --depth=1 https://example.com
-```
-
-**View the snapshots of the URLs you added via the self-hosted web UI:**
-```bash
-archivebox manage createsuperuser # create an admin acct
-archivebox server 0.0.0.0:8000 # start the web server
-open http://127.0.0.1:8000/ # open the interactive admin panel
-ls ~/archivebox/archive/*/index.html # or browse the snapshots on disk
-```
-
-
+
+
+---
+
@@ -97,9 +210,9 @@ For more information, see the
.gz` gzipped WARC of all the resources fetched while archiving
- **PDF:** `output.pdf` Printed PDF of site using headless chrome
- **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome
- **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
+- **Readability:** `article.html/json` Article text extraction using Readability
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
@@ -191,8 +307,8 @@ archivebox add 'https://example.com/any/url/you/want/to/keep/secret/'
# without first disabling share the URL with 3rd party APIs:
archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org
-archivebox config --set SAVE_FAVICON=False # optional: only the domain is leaked, not full URL
-archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google
+archivebox config --set SAVE_FAVICON=False # optional: only the domain is leaked, not full URL
+archivebox config --set CHROME_BINARY=chromium # optional: switch to chromium to avoid Chrome phoning home to Google
```
Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
@@ -215,95 +331,6 @@ archivebox add 'https://example.com#2020-10-25'
---
-# Setup
-
-## Docker Compose
-
-*This is the recommended way of running ArchiveBox.*
-
-It comes with everything working out of the box, including all extractors,
-a headless browser runtime, a full webserver, and CLI interface.
-
-```bash
-# docker-compose run archivebox [args]
-
-mkdir archivebox && cd archivebox
-wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
-docker-compose run archivebox init
-docker-compose run archivebox add 'https://example.com'
-docker-compose run archivebox manage createsuperuser
-docker-compose up
-open http://127.0.0.1:8000
-```
-
-## Docker
-
-```bash
-# docker run -v $PWD:/data -it archivebox/archivebox [args]
-
-mkdir archivebox && cd archivebox
-docker run -v $PWD:/data -it archivebox/archivebox init
-docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'
-docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser
-
-# run the webserver to access the web UI
-docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000
-open http://127.0.0.1:8000
-
-# or export a static version of the index if you dont want to run a server
-docker run -v $PWD:/data -it archivebox/archivebox list --html --with-headers > index.html
-docker run -v $PWD:/data -it archivebox/archivebox list --json --with-headers > index.json
-open ./index.html
-```
-
-
-## Bare Metal
-
-```bash
-# archivebox [args]
-
-# on Debian/Ubuntu
-sudo add-apt-repository -u ppa:archivebox/archivebox
-apt install archivebox
-
-# on macOS
-brew install archivebox/archivebox/archivebox
-```
-
-Initialize your archive in a directory somewhere and add some links:
-```bash
-mkdir ~/archivebox && cd archivebox
-npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
-archivebox init
-archivebox add 'https://example.com' # add URLs as args pipe them in via stdin
-archivebox add --depth=1 https://example.com/table-of-contents.html
-# it can injest links from many formats, including RSS/JSON/XML/MD/TXT and more
-curl https://getpocket.com/users/USERNAME/feed/all | archivebox add
-```
-
-Start the webserver to access the web UI:
-```bash
-archivebox manage createsuperuser
-archivebox server 0.0.0.0:8000
-
-open http://127.0.0.1:8000
-```
-
-Or export a static HTML version of the index if you don't want to run a webserver:
-```bash
-archivebox list --html --with-headers > index.html
-archivebox list --json --with-headers > index.json
-open ./index.html
-```
-
-To view more information about your dependencies, data, or the CLI:
-```bash
-archivebox version
-archivebox status
-archivebox help
-```
----
-
@@ -418,22 +445,18 @@ All contributions to ArchiveBox are welcomed! Check our [issues](https://github.
First, install the system dependencies from the "Bare Metal" section above.
Then you can clone the ArchiveBox repo and install
```python3
-git clone https://github.com/ArchiveBox/ArchiveBox
-cd ArchiveBox
+git clone https://github.com/ArchiveBox/ArchiveBox && cd ArchiveBox
git checkout master # or the branch you want to test
-git pull
-git submodule init
-git submodule update
+git pull --recurse-submodules
# Install ArchiveBox + python dependencies
python3 -m venv .venv && source .venv/bin/activate && pip install -e .[dev]
-# or
-pipenv install --dev && pipenv shell
+# or with pipenv: pipenv install --dev && pipenv shell
# Install node dependencies
npm install
-# Optional: install the extractor dependencies
+# Optional: install extractor dependencies manually or with helper script
./bin/setup.sh
# Optional: develop via docker by mounting the code dir into the container
@@ -473,6 +496,8 @@ You can also run all these in Docker. For more examples see the Github Actions C
# or individually:
./bin/build_docs.sh
./bin/build_pip.sh
+./bin/build_deb.sh
+./bin/build_brew.sh
./bin/build_docker.sh
```
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index 3df41809..f9a55efd 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -63,7 +63,7 @@ def run_subcommand(subcommand: str,
if subcommand not in meta_cmds:
from ..config import setup_django
- setup_django(in_memory_db=subcommand in fake_db)
+ setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds)
module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index b4e65231..41c7554d 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -89,8 +89,8 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
index_only=command.index_only,
overwrite=command.overwrite,
init=command.init,
- out_dir=pwd or OUTPUT_DIR,
extractors=command.extract,
+ out_dir=pwd or OUTPUT_DIR,
)
diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py
index 2353d101..af68bac2 100644
--- a/archivebox/cli/archivebox_oneshot.py
+++ b/archivebox/cli/archivebox_oneshot.py
@@ -36,6 +36,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
' ~/Desktop/sites_list.csv\n'
)
)
+ parser.add_argument(
+ "--extract",
+ type=str,
+ help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
+ This does not take precedence over the configuration",
+ default=""
+ )
parser.add_argument(
'--out-dir',
type=str,
@@ -55,6 +62,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
oneshot(
url=stdin_url or url,
out_dir=Path(command.out_dir).resolve(),
+ extractors=command.extract,
)
diff --git a/archivebox/config.py b/archivebox/config.py
index 32f27dfa..6c42eef5 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -161,6 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'USE_CHROME': {'type': bool, 'default': True},
'USE_NODE': {'type': bool, 'default': True},
'USE_YOUTUBEDL': {'type': bool, 'default': True},
+ 'USE_RIPGREP': {'type': bool, 'default': True},
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
@@ -170,6 +171,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'NODE_BINARY': {'type': str, 'default': 'node'},
+ 'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None},
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
@@ -275,7 +277,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
- 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME / 'legacy'},
+ 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
@@ -312,6 +314,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
+ 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
@@ -320,7 +323,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
- 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if (c['USE_MERCURY'] and c['MERCURY_BINARY']) else None}, # mercury is unversioned
+ 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
@@ -334,8 +337,6 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
- 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
- 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
@@ -343,6 +344,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
+
+ 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
+ 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
@@ -595,7 +599,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]:
if node_modules_bin.exists():
return str(node_modules_bin.resolve())
- return shutil.which(Path(binary).expanduser()) or binary
+ return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
def bin_hash(binary: Optional[str]) -> Optional[str]:
if binary is None:
@@ -682,7 +686,7 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
'TEMPLATES_DIR': {
'path': (config['TEMPLATES_DIR']).resolve(),
'enabled': True,
- 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
+ 'is_valid': (config['TEMPLATES_DIR'] / config['ACTIVE_THEME'] / 'static').exists(),
},
# 'NODE_MODULES_DIR': {
# 'path': ,
@@ -826,6 +830,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': config['USE_CHROME'],
'is_valid': bool(config['CHROME_VERSION']),
},
+ 'RIPGREP_BINARY': {
+ 'path': bin_path(config['RIPGREP_BINARY']),
+ 'version': config['RIPGREP_VERSION'],
+ 'hash': bin_hash(config['RIPGREP_BINARY']),
+ 'enabled': config['USE_RIPGREP'],
+ 'is_valid': bool(config['RIPGREP_VERSION']),
+ },
}
def get_chrome_info(config: ConfigDict) -> ConfigValue:
diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py
index 051cf50b..86b29bb7 100644
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@@ -10,11 +10,22 @@ CHOICES = (
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
)
+from ..extractors import get_default_archive_methods
+
+ARCHIVE_METHODS = [
+ (name, name)
+ for name, _, _ in get_default_archive_methods()
+]
+
+
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
-
-
+ archive_methods = forms.MultipleChoiceField(
+ required=False,
+ widget=forms.SelectMultiple,
+ choices=ARCHIVE_METHODS,
+ )
class TagWidgetMixin:
def format_value(self, value):
if value is not None and not isinstance(value, str):
diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
index 898e0f93..a780376f 100644
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -9,6 +9,12 @@ import django.db.models.deletion
from config import CONFIG
from index.json import to_json
+try:
+ JSONField = models.JSONField
+except AttributeError:
+ import jsonfield
+ JSONField = jsonfield.JSONField
+
def forwards_func(apps, schema_editor):
from core.models import EXTRACTORS
@@ -76,7 +82,7 @@ class Migration(migrations.Migration):
name='ArchiveResult',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
- ('cmd', models.JSONField()),
+ ('cmd', JSONField()),
('pwd', models.CharField(max_length=256)),
('cmd_version', models.CharField(max_length=32)),
('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index dca6941f..d50e8f40 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -18,6 +18,12 @@ STATUS_CHOICES = [
("skipped", "skipped")
]
+try:
+ JSONField = models.JSONField
+except AttributeError:
+ import jsonfield
+ JSONField = jsonfield.JSONField
+
class Tag(models.Model):
"""
@@ -173,7 +179,7 @@ class ArchiveResultManager(models.Manager):
class ArchiveResult(models.Model):
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
- cmd = models.JSONField()
+ cmd = JSONField()
pwd = models.CharField(max_length=256)
cmd_version = models.CharField(max_length=32)
output = models.CharField(max_length=512)
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 43a1e153..e8ed6b16 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -12,6 +12,7 @@ from ..config import (
ALLOWED_HOSTS,
PACKAGE_DIR,
ACTIVE_THEME,
+ TEMPLATES_DIR_NAME,
SQL_INDEX_FILENAME,
OUTPUT_DIR,
)
@@ -68,14 +69,14 @@ AUTHENTICATION_BACKENDS = [
STATIC_URL = '/static/'
STATICFILES_DIRS = [
- str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME / 'static'),
- str(Path(PACKAGE_DIR) / 'themes' / 'default' / 'static'),
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME / 'static'),
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default' / 'static'),
]
TEMPLATE_DIRS = [
- str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME),
- str(Path(PACKAGE_DIR) / 'themes' / 'default'),
- str(Path(PACKAGE_DIR) / 'themes'),
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME),
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default'),
+ str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
]
TEMPLATES = [
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index a81d98f3..b46e364e 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -150,12 +150,15 @@ class AddView(UserPassesTestMixin, FormView):
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1
+ extractors = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = {
"urls": url,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
}
+ if extractors:
+ input_kwargs.update({"extractors": extractors})
add_stdout = StringIO()
with redirect_stdout(add_stdout):
add(**input_kwargs)
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index ff70f689..28cb128f 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -20,7 +20,6 @@ from ..config import (
CURL_ARGS,
CURL_VERSION,
CURL_USER_AGENT,
- setup_django,
)
from ..logging_util import TimedProgress
@@ -81,7 +80,6 @@ def extract_title_with_regex(html):
def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content"""
- setup_django(out_dir=out_dir)
from core.models import Snapshot
output: ArchiveOutput = None
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index bf1d0c6a..4f4ac3d4 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -18,7 +18,6 @@ from ..util import (
ExtendedEncoder,
)
from ..config import (
- setup_django,
ARCHIVE_DIR_NAME,
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
@@ -243,16 +242,9 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
log_indexing_process_finished()
-@enforce_types
-def get_empty_snapshot_queryset(out_dir: Path=OUTPUT_DIR):
- setup_django(out_dir, check_db=True)
- from core.models import Snapshot
- return Snapshot.objects.none()
-
@enforce_types
def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
- setup_django(out_dir, check_db=True)
from core.models import Snapshot
try:
return Snapshot.objects.all()
@@ -390,8 +382,9 @@ def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type:
color='red',
)
raise SystemExit(2)
+ from core.models import Snapshot
- qsearch = get_empty_snapshot_queryset()
+ qsearch = Snapshot.objects.none()
for pattern in filter_patterns:
try:
qsearch |= query_search_index(pattern)
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index 4ead04ce..a62e2c7e 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -23,7 +23,6 @@ from ..config import (
GIT_SHA,
FOOTER_INFO,
HTML_INDEX_FILENAME,
- setup_django,
)
MAIN_INDEX_TEMPLATE = 'main_index.html'
@@ -111,7 +110,6 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
"""render a given html template string with the given template content"""
from django.template.loader import render_to_string
- setup_django(check_db=False)
return render_to_string(template, context)
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index c6bf3731..bc3a25da 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -9,7 +9,6 @@ DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py
__package__ = 'archivebox.index'
from pathlib import Path
-from django.db.utils import OperationalError
from datetime import datetime, timedelta
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index 7bce3313..f2b86735 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -19,6 +19,7 @@ if TYPE_CHECKING:
from .util import enforce_types
from .config import (
ConfigDict,
+ OUTPUT_DIR,
PYTHON_ENCODING,
ANSI,
IS_TTY,
@@ -514,19 +515,24 @@ def printable_folder_status(name: str, folder: Dict) -> str:
else:
num_files = 'missing'
- if ' ' in str(folder['path']):
- folder['path'] = f'"{folder["path"]}"'
+ path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
+ if path and ' ' in path:
+ path = f'"{path}"'
+
+ # if path is just a plain dot, replace it back with the full path for clarity
+ if path == '.':
+ path = str(OUTPUT_DIR)
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
- name.ljust(22),
- (str(folder["path"]) or '').ljust(76),
+ name.ljust(21),
num_files.ljust(14),
ANSI[color],
- note,
+ note.ljust(8),
ANSI['reset'],
+ path.ljust(76),
))
@@ -546,17 +552,18 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
- if ' ' in (dependency["path"] or ''):
- dependency["path"] = f'"{dependency["path"]}"'
+ path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
+ if path and ' ' in path:
+ path = f'"{path}"'
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
- name.ljust(22),
- (dependency["path"] or '').ljust(76),
+ name.ljust(21),
version.ljust(14),
ANSI[color],
- note,
+ note.ljust(8),
ANSI['reset'],
+ path.ljust(76),
))
diff --git a/archivebox/main.py b/archivebox/main.py
index 6463bab6..eb8cd6a0 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -29,7 +29,6 @@ from .util import enforce_types # type: ignore
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
load_main_index,
- get_empty_snapshot_queryset,
parse_links_from_source,
dedupe_links,
write_main_index,
@@ -218,7 +217,7 @@ def version(quiet: bool=False,
else:
print('ArchiveBox v{}'.format(VERSION))
p = platform.uname()
- print(p.system, platform.platform(), p.machine)
+ print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
@@ -265,6 +264,7 @@ def run(subcommand: str,
@enforce_types
def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
"""Initialize a new ArchiveBox collection in the current directory"""
+ from core.models import Snapshot
Path(out_dir).mkdir(exist_ok=True)
is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
@@ -335,7 +335,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
print()
print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
- all_links = get_empty_snapshot_queryset()
+ all_links = Snapshot.objects.none()
pending_links: Dict[str, Link] = {}
if existing_index:
@@ -511,7 +511,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
@enforce_types
-def oneshot(url: str, out_dir: Path=OUTPUT_DIR):
+def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
"""
Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
You can run this to archive single pages without needing to create a whole collection with archivebox init.
@@ -523,7 +523,8 @@ def oneshot(url: str, out_dir: Path=OUTPUT_DIR):
color='red'
)
raise SystemExit(2)
- methods = ignore_methods(['title'])
+
+ methods = extractors.split(",") if extractors else ignore_methods(['title'])
archive_link(oneshot_link[0], out_dir=out_dir, methods=methods)
return oneshot_link
@@ -534,8 +535,8 @@ def add(urls: Union[str, List[str]],
index_only: bool=False,
overwrite: bool=False,
init: bool=False,
- out_dir: Path=OUTPUT_DIR,
- extractors: str="") -> List[Link]:
+ extractors: str="",
+ out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Add a new URL or list of URLs to your archive"""
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index b281d1a4..6191ede9 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -6,7 +6,7 @@ from django.db.models import QuerySet
from archivebox.index.schema import Link
from archivebox.util import enforce_types
-from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
+from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
from .utils import get_indexable_content, log_index_started
@@ -49,7 +49,6 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
@enforce_types
def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
- setup_django(out_dir, check_db=True)
from core.models import Snapshot
if search_backend_enabled():
@@ -107,4 +106,3 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
)
else:
write_search_index(link, texts, out_dir=out_dir)
-
\ No newline at end of file
diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py
index ff02008d..840d2d2d 100644
--- a/archivebox/search/backends/ripgrep.py
+++ b/archivebox/search/backends/ripgrep.py
@@ -1,8 +1,8 @@
import re
-from subprocess import run, PIPE, DEVNULL
+from subprocess import run, PIPE
from typing import List, Generator
-from archivebox.config import setup_django, ARCHIVE_DIR
+from archivebox.config import ARCHIVE_DIR, RIPGREP_VERSION
from archivebox.util import enforce_types
RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
@@ -26,11 +26,9 @@ def flush(snapshot_ids: Generator[str, None, None]):
@enforce_types
def search(text: str) -> List[str]:
- is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
- if is_rg_installed.returncode:
+ if not RIPGREP_VERSION:
raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
- setup_django(check_db=True)
from core.models import Snapshot
rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)]
@@ -45,4 +43,3 @@ def search(text: str) -> List[str]:
snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]
return snap_ids
-
diff --git a/bin/build_brew.sh b/bin/build_brew.sh
old mode 100644
new mode 100755
index 9767013d..ec54c90a
--- a/bin/build_brew.sh
+++ b/bin/build_brew.sh
@@ -12,11 +12,18 @@ IFS=$'\n'
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
+
+CURRENT_PLATFORM="$(uname)"
+REQUIRED_PLATFORM="Darwin"
+if [[ "$CURRENT_PLATFORM" != "$REQUIRED_PLATFORM" ]]; then
+ echo "[!] Skipping the Homebrew package build on $CURRENT_PLATFORM (it can only be run on $REQUIRED_PLATFORM)."
+ exit 0
+fi
+
+
cd "$REPO_DIR/brew_dist"
-
-
# make sure archivebox.rb is up-to-date with the dependencies
-echo "[+] Building bottle"
+echo "[+] Building Homebrew bottle"
brew install --build-bottle ./archivebox.rb
brew bottle archivebox
diff --git a/bin/build_deb.sh b/bin/build_deb.sh
index 0c590d71..b9279369 100755
--- a/bin/build_deb.sh
+++ b/bin/build_deb.sh
@@ -19,6 +19,13 @@ else
fi
cd "$REPO_DIR"
+CURRENT_PLATFORM="$(uname)"
+REQUIRED_PLATFORM="Linux"
+if [[ "$CURRENT_PLATFORM" != "$REQUIRED_PLATFORM" ]]; then
+ echo "[!] Skipping the Debian package build on $CURRENT_PLATFORM (it can only be run on $REQUIRED_PLATFORM)."
+ exit 0
+fi
+
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
DEBIAN_VERSION="1"
PGP_KEY_ID="7D5695D3B618872647861D51C38137A7C1675988"
diff --git a/bin/build_docker.sh b/bin/build_docker.sh
index 025fe350..0115acdf 100755
--- a/bin/build_docker.sh
+++ b/bin/build_docker.sh
@@ -14,6 +14,7 @@ REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && p
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
cd "$REPO_DIR"
+command -v docker > /dev/null
echo "[+] Building docker image in the background..."
docker build . -t archivebox \
diff --git a/bin/test.sh b/bin/test.sh
index 3c472812..b33921af 100755
--- a/bin/test.sh
+++ b/bin/test.sh
@@ -14,4 +14,4 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
source "$DIR/.venv/bin/activate"
-pytest -s
+pytest -s --basetemp=tests/out
diff --git a/package.json b/package.json
index 8d88a3fd..36545fb7 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "archivebox",
- "version": "0.5.0",
+ "version": "0.5.1",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting ",
"license": "MIT",
diff --git a/setup.py b/setup.py
index d01b3f65..4eb7c97d 100755
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ PROJECT_URLS = {
ROOT_DIR = Path(__file__).parent.resolve()
PACKAGE_DIR = ROOT_DIR / PKG_NAME
-README = (PACKAGE_DIR / "README.md").read_text()
+README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
# To see when setup.py gets called (uncomment for debugging):
diff --git a/stdeb.cfg b/stdeb.cfg
index 37bbb42f..a07147e2 100644
--- a/stdeb.cfg
+++ b/stdeb.cfg
@@ -5,5 +5,5 @@ Package3: archivebox
Suite: focal
Suite3: focal
Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
-Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
+Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
XS-Python-Version: >= 3.7
diff --git a/tests/test_oneshot.py b/tests/test_oneshot.py
index 8e4016da..560ac43c 100644
--- a/tests/test_oneshot.py
+++ b/tests/test_oneshot.py
@@ -9,11 +9,20 @@ def test_oneshot_command_exists(tmp_path, disable_extractors_dict):
def test_oneshot_command_saves_page_in_right_folder(tmp_path, disable_extractors_dict):
disable_extractors_dict.update({"SAVE_DOM": "true"})
- process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"],
- capture_output=True, env=disable_extractors_dict)
+ process = subprocess.run(
+ [
+ "archivebox",
+ "oneshot",
+ f"--out-dir={tmp_path}",
+ "--extract=title,favicon,dom",
+ "http://127.0.0.1:8080/static/example.com.html",
+ ],
+ capture_output=True,
+ env=disable_extractors_dict,
+ )
items = ' '.join([str(x) for x in tmp_path.iterdir()])
current_path = ' '.join([str(x) for x in Path.cwd().iterdir()])
assert "index.json" in items
assert not "index.sqlite3" in current_path
assert "output.html" in items
-
\ No newline at end of file
+