mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 14:44:29 -04:00
Merge branch 'dev' into method_allow_deny
This commit is contained in:
commit
63ad43f46c
33 changed files with 4485 additions and 1748 deletions
|
@ -5,16 +5,21 @@ __pycache__/
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
.github/
|
.github/
|
||||||
|
.git/
|
||||||
|
.pdm-build/
|
||||||
|
.pdm-python/
|
||||||
|
.eggs/
|
||||||
|
|
||||||
venv/
|
venv/
|
||||||
.venv/
|
.venv/
|
||||||
.docker-venv/
|
.docker-venv/
|
||||||
|
node_modules/
|
||||||
|
|
||||||
build/
|
build/
|
||||||
dist/
|
dist/
|
||||||
pip_dist/
|
|
||||||
!pip_dist/archivebox.egg-info/requires.txt
|
|
||||||
brew_dist/
|
brew_dist/
|
||||||
|
deb_dist/
|
||||||
|
pip_dist/
|
||||||
assets/
|
assets/
|
||||||
|
|
||||||
data/
|
data/
|
||||||
|
|
4
.github/workflows/pip.yml
vendored
4
.github/workflows/pip.yml
vendored
|
@ -7,7 +7,7 @@ on:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
runs-on: ubuntu-20.04
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
|
@ -18,7 +18,7 @@ jobs:
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v1
|
uses: actions/setup-python@v1
|
||||||
with:
|
with:
|
||||||
python-version: 3.9
|
python-version: 3.11
|
||||||
architecture: x64
|
architecture: x64
|
||||||
|
|
||||||
- name: Build Python Package
|
- name: Build Python Package
|
||||||
|
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -13,6 +13,8 @@ venv/
|
||||||
node_modules/
|
node_modules/
|
||||||
|
|
||||||
# Packaging artifacts
|
# Packaging artifacts
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build
|
||||||
archivebox.egg-info
|
archivebox.egg-info
|
||||||
archivebox-*.tar.gz
|
archivebox-*.tar.gz
|
||||||
build/
|
build/
|
||||||
|
|
181
Dockerfile
181
Dockerfile
|
@ -16,15 +16,17 @@
|
||||||
# Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
|
# Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
|
||||||
|
|
||||||
|
|
||||||
FROM python:3.11-slim-bullseye
|
FROM debian:bookworm-backports
|
||||||
|
|
||||||
LABEL name="archivebox" \
|
LABEL name="archivebox" \
|
||||||
maintainer="Nick Sweeting <archivebox-docker@sweeting.me>" \
|
maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
|
||||||
description="All-in-one personal internet archiving container" \
|
description="All-in-one personal internet archiving container" \
|
||||||
homepage="https://github.com/ArchiveBox/ArchiveBox" \
|
homepage="https://github.com/ArchiveBox/ArchiveBox" \
|
||||||
documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
|
documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
|
||||||
|
|
||||||
# System-level base config
|
######### Base System Setup ####################################
|
||||||
|
|
||||||
|
# Global system-level config
|
||||||
ENV TZ=UTC \
|
ENV TZ=UTC \
|
||||||
LANGUAGE=en_US:en \
|
LANGUAGE=en_US:en \
|
||||||
LC_ALL=C.UTF-8 \
|
LC_ALL=C.UTF-8 \
|
||||||
|
@ -32,103 +34,146 @@ ENV TZ=UTC \
|
||||||
PYTHONIOENCODING=UTF-8 \
|
PYTHONIOENCODING=UTF-8 \
|
||||||
PYTHONUNBUFFERED=1 \
|
PYTHONUNBUFFERED=1 \
|
||||||
DEBIAN_FRONTEND=noninteractive \
|
DEBIAN_FRONTEND=noninteractive \
|
||||||
APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
|
APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
|
||||||
|
npm_config_loglevel=error
|
||||||
|
|
||||||
# Application-level base config
|
# Application-level config
|
||||||
ENV CODE_DIR=/app \
|
ENV CODE_DIR=/app \
|
||||||
VENV_PATH=/venv \
|
|
||||||
DATA_DIR=/data \
|
DATA_DIR=/data \
|
||||||
NODE_DIR=/node \
|
GLOBAL_VENV=/venv \
|
||||||
|
APP_VENV=/app/.venv \
|
||||||
|
NODE_MODULES=/app/node_modules \
|
||||||
ARCHIVEBOX_USER="archivebox"
|
ARCHIVEBOX_USER="archivebox"
|
||||||
|
|
||||||
|
ENV PATH="$PATH:$GLOBAL_VENV/bin:$APP_VENV/bin:$NODE_MODULES/.bin"
|
||||||
|
|
||||||
|
|
||||||
# Create non-privileged user for archivebox and chrome
|
# Create non-privileged user for archivebox and chrome
|
||||||
RUN groupadd --system $ARCHIVEBOX_USER \
|
RUN echo "[*] Setting up system environment..." \
|
||||||
&& useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER
|
&& groupadd --system $ARCHIVEBOX_USER \
|
||||||
|
&& useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER \
|
||||||
|
&& mkdir -p /etc/apt/keyrings
|
||||||
|
|
||||||
# Install system dependencies
|
# Install system apt dependencies (adding backports to access more recent apt updates)
|
||||||
RUN apt-get update -qq \
|
RUN echo "[+] Installing system dependencies..." \
|
||||||
&& apt-get install -qq -y --no-install-recommends \
|
&& echo 'deb https://deb.debian.org/debian bullseye-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
|
||||||
apt-transport-https ca-certificates gnupg2 zlib1g-dev \
|
&& apt-get update -qq \
|
||||||
dumb-init gosu cron unzip curl \
|
&& apt-get install -qq -y \
|
||||||
|
apt-transport-https ca-certificates gnupg2 curl wget \
|
||||||
|
zlib1g-dev dumb-init gosu cron unzip \
|
||||||
|
nano iputils-ping dnsutils htop procps \
|
||||||
|
# 1. packaging dependencies
|
||||||
|
# 2. docker and init system dependencies
|
||||||
|
# 3. frivolous CLI helpers to make debugging failed archiving easier
|
||||||
|
&& mkdir -p /etc/apt/keyrings \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Install apt dependencies
|
|
||||||
RUN apt-get update -qq \
|
######### Language Environments ####################################
|
||||||
&& apt-get install -qq -y --no-install-recommends \
|
|
||||||
wget curl chromium git ffmpeg youtube-dl ripgrep \
|
|
||||||
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
|
|
||||||
&& ln -s /usr/bin/chromium /usr/bin/chromium-browser \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Install Node environment
|
# Install Node environment
|
||||||
RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
|
RUN echo "[+] Installing Node environment..." \
|
||||||
&& echo 'deb https://deb.nodesource.com/node_18.x buster main' >> /etc/apt/sources.list \
|
&& echo 'deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main' >> /etc/apt/sources.list.d/nodejs.list \
|
||||||
|
&& curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
|
||||||
&& apt-get update -qq \
|
&& apt-get update -qq \
|
||||||
&& apt-get install -qq -y --no-install-recommends \
|
&& apt-get install -qq -y nodejs \
|
||||||
nodejs \
|
&& npm i -g npm \
|
||||||
# && npm install -g npm \
|
&& node --version \
|
||||||
|
&& npm --version
|
||||||
|
|
||||||
|
# Install Python environment
|
||||||
|
RUN echo "[+] Installing Python environment..." \
|
||||||
|
&& apt-get update -qq \
|
||||||
|
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
|
||||||
|
python3 python3-pip python3-venv python3-setuptools python3-wheel python-dev-is-python3 \
|
||||||
|
python3-ldap libldap2-dev libsasl2-dev libssl-dev \
|
||||||
|
&& rm /usr/lib/python3*/EXTERNALLY-MANAGED \
|
||||||
|
&& python3 -m venv --system-site-packages --symlinks $GLOBAL_VENV \
|
||||||
|
&& $GLOBAL_VENV/bin/pip install --upgrade pip pdm setuptools wheel python-ldap \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
######### Extractor Dependencies ##################################
|
||||||
|
|
||||||
|
# Install apt dependencies
|
||||||
|
RUN echo "[+] Installing extractor APT dependencies..." \
|
||||||
|
&& apt-get update -qq \
|
||||||
|
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
|
||||||
|
curl wget git yt-dlp ffmpeg ripgrep \
|
||||||
|
# Packages we have also needed in the past:
|
||||||
|
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
|
||||||
|
# fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install chromium browser using playwright
|
||||||
|
ENV PLAYWRIGHT_BROWSERS_PATH="/browsers"
|
||||||
|
RUN echo "[+] Installing extractor Chromium dependency..." \
|
||||||
|
&& apt-get update -qq \
|
||||||
|
&& $GLOBAL_VENV/bin/pip install playwright \
|
||||||
|
&& $GLOBAL_VENV/bin/playwright install --with-deps chromium \
|
||||||
|
&& CHROME_BINARY="$($GLOBAL_VENV/bin/python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
|
||||||
|
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
|
||||||
|
&& mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
|
||||||
|
&& chown -R $ARCHIVEBOX_USER "/home/${ARCHIVEBOX_USER}/.config"
|
||||||
|
|
||||||
# Install Node dependencies
|
# Install Node dependencies
|
||||||
WORKDIR "$NODE_DIR"
|
|
||||||
ENV PATH="${PATH}:$NODE_DIR/node_modules/.bin" \
|
|
||||||
npm_config_loglevel=error
|
|
||||||
ADD ./package.json ./package.json
|
|
||||||
ADD ./package-lock.json ./package-lock.json
|
|
||||||
RUN npm ci
|
|
||||||
|
|
||||||
# Install Python dependencies
|
|
||||||
WORKDIR "$CODE_DIR"
|
WORKDIR "$CODE_DIR"
|
||||||
ENV PATH="${PATH}:$VENV_PATH/bin"
|
COPY --chown=root:root --chmod=755 "package.json" "package-lock.json" "$CODE_DIR/"
|
||||||
RUN python -m venv --clear --symlinks "$VENV_PATH" \
|
RUN echo "[+] Installing extractor Node dependencies..." \
|
||||||
&& pip install --upgrade --quiet pip setuptools \
|
&& npm ci --prefer-offline --no-audit \
|
||||||
&& mkdir -p "$CODE_DIR/archivebox"
|
&& npm version
|
||||||
ADD "./setup.py" "$CODE_DIR/"
|
|
||||||
ADD "./package.json" "$CODE_DIR/archivebox/"
|
|
||||||
RUN apt-get update -qq \
|
|
||||||
&& apt-get install -qq -y --no-install-recommends \
|
|
||||||
build-essential python-dev python3-dev \
|
|
||||||
&& echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \
|
|
||||||
&& python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
|
|
||||||
&& pip install -r /tmp/requirements.txt \
|
|
||||||
&& pip install --upgrade youtube-dl yt-dlp \
|
|
||||||
&& apt-get purge -y build-essential python-dev python3-dev \
|
|
||||||
&& apt-get autoremove -y \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Install apt development dependencies
|
######### Build Dependencies ####################################
|
||||||
# RUN apt-get install -qq \
|
|
||||||
# && apt-get install -qq -y --no-install-recommends \
|
|
||||||
# python3 python3-dev python3-pip python3-venv python3-all \
|
|
||||||
# dh-python debhelper devscripts dput software-properties-common \
|
|
||||||
# python3-distutils python3-setuptools python3-wheel python3-stdeb
|
|
||||||
# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \
|
|
||||||
# && pip install --quiet -r /tmp/dev_requirements.txt
|
|
||||||
|
|
||||||
# Install ArchiveBox Python package and its dependencies
|
# # Installing Python dependencies to build from source
|
||||||
WORKDIR "$CODE_DIR"
|
# WORKDIR "$CODE_DIR"
|
||||||
ADD . "$CODE_DIR"
|
# COPY --chown=root:root --chmod=755 "./pyproject.toml" "./pdm.lock" "$CODE_DIR/"
|
||||||
RUN chown -R root:root . && chmod a+rX -R . && pip install -e .
|
# RUN echo "[+] Installing project Python dependencies..." \
|
||||||
|
# && apt-get update -qq \
|
||||||
|
# && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
|
||||||
|
# build-essential libssl-dev libldap2-dev libsasl2-dev \
|
||||||
|
# && pdm use -f $GLOBAL_VENV \
|
||||||
|
# && pdm install --fail-fast --no-lock --group :all --no-self \
|
||||||
|
# && pdm build \
|
||||||
|
# && apt-get purge -y \
|
||||||
|
# build-essential libssl-dev libldap2-dev libsasl2-dev \
|
||||||
|
# # these are only needed to build CPython libs, we discard after build phase to shrink layer size
|
||||||
|
# && apt-get autoremove -y \
|
||||||
|
# && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install ArchiveBox Python package from source
|
||||||
|
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
|
||||||
|
RUN echo "[*] Installing ArchiveBox package from /app..." \
|
||||||
|
&& apt-get update -qq \
|
||||||
|
&& $GLOBAL_VENV/bin/pip install -e "$CODE_DIR"[sonic,ldap]
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
|
||||||
# Setup ArchiveBox runtime config
|
# Setup ArchiveBox runtime config
|
||||||
WORKDIR "$DATA_DIR"
|
WORKDIR "$DATA_DIR"
|
||||||
ENV IN_DOCKER=True \
|
ENV IN_DOCKER=True \
|
||||||
|
WGET_BINARY="wget" \
|
||||||
|
YOUTUBEDL_BINARY="yt-dlp" \
|
||||||
CHROME_SANDBOX=False \
|
CHROME_SANDBOX=False \
|
||||||
CHROME_BINARY="/usr/bin/chromium-browser" \
|
CHROME_BINARY="/usr/bin/chromium-browser" \
|
||||||
USE_SINGLEFILE=True \
|
USE_SINGLEFILE=True \
|
||||||
SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \
|
SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
|
||||||
USE_READABILITY=True \
|
USE_READABILITY=True \
|
||||||
READABILITY_BINARY="$NODE_DIR/node_modules/.bin/readability-extractor" \
|
READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
|
||||||
USE_MERCURY=True \
|
USE_MERCURY=True \
|
||||||
MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" \
|
MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
|
||||||
YOUTUBEDL_BINARY="yt-dlp"
|
|
||||||
|
|
||||||
# Print version for nice docker finish summary
|
# Print version for nice docker finish summary
|
||||||
# RUN archivebox version
|
# RUN archivebox version
|
||||||
RUN /app/bin/docker_entrypoint.sh archivebox version
|
RUN echo "[√] Finished Docker build succesfully. Saving build summary in: /version_info.txt" \
|
||||||
|
&& uname -a | tee -a /version_info.txt \
|
||||||
|
&& env --chdir="$NODE_DIR" npm version | tee -a /version_info.txt \
|
||||||
|
&& env --chdir="$CODE_DIR" pdm info | tee -a /version_info.txt \
|
||||||
|
&& "$CODE_DIR/bin/docker_entrypoint.sh" archivebox version 2>&1 | tee -a /version_info.txt
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
|
||||||
# Open up the interfaces to the outside world
|
# Open up the interfaces to the outside world
|
||||||
VOLUME "$DATA_DIR"
|
VOLUME "/data"
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
# Optional:
|
# Optional:
|
||||||
|
|
27
README.md
27
README.md
|
@ -10,13 +10,13 @@
|
||||||
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
|
||||||
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap">Roadmap</a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap">Roadmap</a>
|
||||||
|
|
||||||
<pre lang="bash"><code style="white-space: pre-line">"Your own personal internet archive" (网站存档 / 爬虫)
|
<pre lang="bash" align="center"><code style="white-space: pre-line; text-align: center" align="center">"Your own personal internet archive" (网站存档 / 爬虫)
|
||||||
curl -sSL 'https://get.archivebox.io' | sh
|
curl -sSL 'https://get.archivebox.io' | sh
|
||||||
</code></pre>
|
</code></pre>
|
||||||
|
|
||||||
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
|
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
|
||||||
|
|
||||||
<a href="https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
|
||||||
<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
|
||||||
<a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=active"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=active"/></a>
|
||||||
<a href="https://pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-yellow.svg?logo=python&logoColor=yellow"/></a>
|
<a href="https://pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-yellow.svg?logo=python&logoColor=yellow"/></a>
|
||||||
|
@ -86,7 +86,7 @@ ls ./archive/*/index.json # or browse directly via the filesyste
|
||||||
|
|
||||||
## Key Features
|
## Key Features
|
||||||
|
|
||||||
- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up online, stores all data locally
|
- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE), doesn't require signing up online, stores all data locally
|
||||||
- [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies)
|
- [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies)
|
||||||
- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
||||||
- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl or yt-dlp), articles (readability), code (git), etc.](#output-formats)
|
- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl or yt-dlp), articles (readability), code (git), etc.](#output-formats)
|
||||||
|
@ -119,9 +119,9 @@ ls ./archive/*/index.json # or browse directly via the filesyste
|
||||||
<br/><br/>
|
<br/><br/>
|
||||||
<ol>
|
<ol>
|
||||||
<li>Install <a href="https://docs.docker.com/get-docker/">Docker</a> and <a href="https://docs.docker.com/compose/install/#install-using-pip">Docker Compose</a> on your system (if not already installed).</li>
|
<li>Install <a href="https://docs.docker.com/get-docker/">Docker</a> and <a href="https://docs.docker.com/compose/install/#install-using-pip">Docker Compose</a> on your system (if not already installed).</li>
|
||||||
<li>Download the <a href="https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml" download><code>docker-compose.yml</code></a> file into a new empty directory (can be anywhere).
|
<li>Download the <a href="https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/docker-compose.yml" download><code>docker-compose.yml</code></a> file into a new empty directory (can be anywhere).
|
||||||
<pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox && cd ~/archivebox
|
<pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox && cd ~/archivebox
|
||||||
curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
|
curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/docker-compose.yml'
|
||||||
</code></pre></li>
|
</code></pre></li>
|
||||||
<li>Run the initial setup and create an admin user.
|
<li>Run the initial setup and create an admin user.
|
||||||
<pre lang="bash"><code style="white-space: pre-line">docker compose run archivebox init --setup
|
<pre lang="bash"><code style="white-space: pre-line">docker compose run archivebox init --setup
|
||||||
|
@ -499,7 +499,7 @@ env CHROME_BINARY=chromium archivebox ... # run with a one-off config
|
||||||
|
|
||||||
<sup>These methods also work the same way when run inside Docker, see the <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration">Docker Configuration</a> wiki page for details.</sup>
|
<sup>These methods also work the same way when run inside Docker, see the <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration">Docker Configuration</a> wiki page for details.</sup>
|
||||||
|
|
||||||
**The config loading logic with all the options defined is here: [`archivebox/config.py`](https://github.com/ArchiveBox/ArchiveBox/blob/master/archivebox/config.py).**
|
**The config loading logic with all the options defined is here: [`archivebox/config.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/config.py).**
|
||||||
|
|
||||||
Most options are also documented on the **[Configuration Wiki page](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)**.
|
Most options are also documented on the **[Configuration Wiki page](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)**.
|
||||||
|
|
||||||
|
@ -588,7 +588,8 @@ Each snapshot subfolder `./archive/<timestamp>/` includes a static `index.json`
|
||||||
|
|
||||||
You can export the main index to browse it statically without needing to run a server.
|
You can export the main index to browse it statically without needing to run a server.
|
||||||
|
|
||||||
*Note about large exports: These exports are not paginated, exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.*
|
> **Note**
|
||||||
|
> These exports are not paginated; exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# archivebox list --help
|
# archivebox list --help
|
||||||
|
@ -615,7 +616,7 @@ The paths in the static exports are relative, make sure to keep them next to you
|
||||||
|
|
||||||
### Archiving Private Content
|
### Archiving Private Content
|
||||||
|
|
||||||
<a id="archiving-private-urls"/>
|
<a id="archiving-private-urls"></a>
|
||||||
|
|
||||||
If you're importing pages with private content or URLs containing secret tokens you don't want public (e.g. Google Docs, paywalled content, unlisted videos, etc.), **you may want to disable some of the extractor methods to avoid leaking that content to 3rd party APIs or the public**.
|
If you're importing pages with private content or URLs containing secret tokens you don't want public (e.g. Google Docs, paywalled content, unlisted videos, etc.), **you may want to disable some of the extractor methods to avoid leaking that content to 3rd party APIs or the public**.
|
||||||
|
|
||||||
|
@ -796,7 +797,7 @@ Whether you want to learn which organizations are the big players in the web arc
|
||||||
- [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#communities)
|
- [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#communities)
|
||||||
_A collection of the most active internet archiving communities and initiatives._
|
_A collection of the most active internet archiving communities and initiatives._
|
||||||
- Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
|
- Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
|
||||||
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
|
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://items.ssrc.org/parameters/on-the-importance-of-web-archiving/)" blog post.
|
||||||
- Reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter
|
- Reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter
|
||||||
|
|
||||||
<br/>
|
<br/>
|
||||||
|
@ -867,7 +868,7 @@ All contributions to ArchiveBox are welcomed! Check our [issues](https://github.
|
||||||
|
|
||||||
For low hanging fruit / easy first tickets, see: <a href="https://github.com/ArchiveBox/ArchiveBox/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc+label%3A%22help+wanted%22">ArchiveBox/Issues `#good first ticket` `#help wanted`</a>.
|
For low hanging fruit / easy first tickets, see: <a href="https://github.com/ArchiveBox/ArchiveBox/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc+label%3A%22help+wanted%22">ArchiveBox/Issues `#good first ticket` `#help wanted`</a>.
|
||||||
|
|
||||||
**Python API Documentation:** https://docs.archivebox.io/en/master/archivebox.html#module-archivebox.main
|
**Python API Documentation:** https://docs.archivebox.io/en/dev/archivebox.html#module-archivebox.main
|
||||||
|
|
||||||
### Setup the dev environment
|
### Setup the dev environment
|
||||||
|
|
||||||
|
@ -985,6 +986,7 @@ archivebox init --setup
|
||||||
<details><summary><i>Click to expand...</i></summary>
|
<details><summary><i>Click to expand...</i></summary>
|
||||||
|
|
||||||
Make sure to run this whenever you change things in `models.py`.
|
Make sure to run this whenever you change things in `models.py`.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd archivebox/
|
cd archivebox/
|
||||||
./manage.py makemigrations
|
./manage.py makemigrations
|
||||||
|
@ -993,6 +995,7 @@ cd path/to/test/data/
|
||||||
archivebox shell
|
archivebox shell
|
||||||
archivebox manage dbshell
|
archivebox manage dbshell
|
||||||
```
|
```
|
||||||
|
|
||||||
(uses `pytest -s`)
|
(uses `pytest -s`)
|
||||||
https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
|
https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
|
||||||
|
|
||||||
|
@ -1000,7 +1003,9 @@ https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-dj
|
||||||
|
|
||||||
#### Contributing a new extractor
|
#### Contributing a new extractor
|
||||||
|
|
||||||
<details><summary><i>Click to expand...</i></summary><br/><br/>
|
<details><summary><i>Click to expand...</i></summary>
|
||||||
|
|
||||||
|
<br/><br/>
|
||||||
|
|
||||||
ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page.
|
ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page.
|
||||||
|
|
||||||
|
|
34
SECURITY.md
Normal file
34
SECURITY.md
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
# Security Policy
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security Information
|
||||||
|
|
||||||
|
Please see this wiki page for important notices about ArchiveBox security, publishing your archives securely, and the dangers of executing archived JS:
|
||||||
|
|
||||||
|
https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview
|
||||||
|
|
||||||
|
Also see this section of the README about important caveats when running ArchiveBox:
|
||||||
|
|
||||||
|
https://github.com/ArchiveBox/ArchiveBox?tab=readme-ov-file#caveats
|
||||||
|
|
||||||
|
You can also read these pages for more information about ArchiveBox's internals, development environment, DB schema, and more:
|
||||||
|
|
||||||
|
- https://github.com/ArchiveBox/ArchiveBox#archive-layout
|
||||||
|
- https://github.com/ArchiveBox/ArchiveBox#archivebox-development
|
||||||
|
- https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives
|
||||||
|
- https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Reporting a Vulnerability
|
||||||
|
|
||||||
|
We use GitHub's built-in [Private Reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature to accept vulnerability reports.
|
||||||
|
|
||||||
|
1. Go to the Security tab on our GitHub repo: https://github.com/ArchiveBox/ArchiveBox/security
|
||||||
|
|
||||||
|
2. Click the ["Report a Vulnerability"](https://github.com/ArchiveBox/ArchiveBox/security/advisories/new) button
|
||||||
|
|
||||||
|
3. Fill out the form to submit the details of the report and it will be securely sent to the maintainers
|
||||||
|
|
||||||
|
You can also contact the maintainers via our public [Zulip Chat Server zulip.archivebox.io](https://zulip.archivebox.io) or [Twitter DMs @ArchiveBoxApp](https://twitter.com/ArchiveBoxApp).
|
|
@ -57,9 +57,17 @@ SYSTEM_USER = getpass.getuser() or os.getlogin()
|
||||||
try:
|
try:
|
||||||
import pwd
|
import pwd
|
||||||
SYSTEM_USER = pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
|
SYSTEM_USER = pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
|
||||||
|
except KeyError:
|
||||||
|
# Process' UID might not map to a user in cases such as running the Docker image
|
||||||
|
# (where `archivebox` is 999) as a different UID.
|
||||||
|
pass
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
# pwd is only needed for some linux systems, doesn't exist on windows
|
# pwd is only needed for some linux systems, doesn't exist on windows
|
||||||
pass
|
pass
|
||||||
|
except Exception:
|
||||||
|
# this should never happen, uncomment to debug
|
||||||
|
# raise
|
||||||
|
pass
|
||||||
|
|
||||||
############################### Config Schema ##################################
|
############################### Config Schema ##################################
|
||||||
|
|
||||||
|
@ -82,8 +90,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
|
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
|
||||||
'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
|
'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
|
||||||
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
|
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
|
||||||
|
|
||||||
'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)}, # to avoid downloading code assets as their own pages
|
'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)}, # to avoid downloading code assets as their own pages
|
||||||
'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
|
'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
|
||||||
|
|
||||||
|
'ADMIN_USERNAME': {'type': str, 'default': None},
|
||||||
|
'ADMIN_PASSWORD': {'type': str, 'default': None},
|
||||||
|
|
||||||
'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
|
'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
|
||||||
'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
|
'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
|
||||||
},
|
},
|
||||||
|
@ -100,12 +113,22 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
|
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
|
||||||
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
|
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
|
||||||
'TIME_ZONE': {'type': str, 'default': 'UTC'},
|
'TIME_ZONE': {'type': str, 'default': 'UTC'},
|
||||||
'TIMEZONE': {'type': str, 'default': 'UTC'},
|
'TIMEZONE': {'type': str, 'default': 'UTC'},
|
||||||
'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
|
'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
|
||||||
'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
|
'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
|
||||||
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
|
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
|
||||||
'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
|
'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
|
||||||
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
|
|
||||||
|
'LDAP': {'type': bool, 'default': False},
|
||||||
|
'LDAP_SERVER_URI': {'type': str, 'default': None},
|
||||||
|
'LDAP_BIND_DN': {'type': str, 'default': None},
|
||||||
|
'LDAP_BIND_PASSWORD': {'type': str, 'default': None},
|
||||||
|
'LDAP_USER_BASE': {'type': str, 'default': None},
|
||||||
|
'LDAP_USER_FILTER': {'type': str, 'default': None},
|
||||||
|
'LDAP_USERNAME_ATTR': {'type': str, 'default': None},
|
||||||
|
'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
|
||||||
|
'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
|
||||||
|
'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
|
||||||
},
|
},
|
||||||
|
|
||||||
'ARCHIVE_METHOD_TOGGLES': {
|
'ARCHIVE_METHOD_TOGGLES': {
|
||||||
|
@ -151,10 +174,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'--write-thumbnail',
|
'--write-thumbnail',
|
||||||
'--no-call-home',
|
'--no-call-home',
|
||||||
'--write-sub',
|
'--write-sub',
|
||||||
'--all-subs',
|
'--write-auto-subs',
|
||||||
# There are too many of these and youtube
|
|
||||||
# throttles you with HTTP error 429
|
|
||||||
#'--write-auto-subs',
|
|
||||||
'--convert-subs=srt',
|
'--convert-subs=srt',
|
||||||
'--yes-playlist',
|
'--yes-playlist',
|
||||||
'--continue',
|
'--continue',
|
||||||
|
@ -167,7 +187,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'--ignore-errors',
|
'--ignore-errors',
|
||||||
'--geo-bypass',
|
'--geo-bypass',
|
||||||
'--add-metadata',
|
'--add-metadata',
|
||||||
'--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
|
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
|
||||||
]},
|
]},
|
||||||
|
|
||||||
|
|
||||||
|
@ -216,18 +236,19 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
|
|
||||||
'CURL_BINARY': {'type': str, 'default': 'curl'},
|
'CURL_BINARY': {'type': str, 'default': 'curl'},
|
||||||
'GIT_BINARY': {'type': str, 'default': 'git'},
|
'GIT_BINARY': {'type': str, 'default': 'git'},
|
||||||
'WGET_BINARY': {'type': str, 'default': 'wget'},
|
'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2
|
||||||
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
|
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
|
||||||
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
|
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
|
||||||
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
|
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
|
||||||
#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
|
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
|
||||||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
|
|
||||||
'NODE_BINARY': {'type': str, 'default': 'node'},
|
'NODE_BINARY': {'type': str, 'default': 'node'},
|
||||||
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
|
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
|
||||||
'CHROME_BINARY': {'type': str, 'default': None},
|
'CHROME_BINARY': {'type': str, 'default': None},
|
||||||
|
|
||||||
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
|
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
|
||||||
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
|
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
|
||||||
|
|
||||||
|
'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -420,7 +441,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
|
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
|
||||||
|
|
||||||
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
|
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
|
||||||
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned
|
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
|
||||||
|
|
||||||
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
||||||
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
|
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
|
||||||
|
|
|
@ -20,6 +20,17 @@ from ..config import (
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
LOGS_DIR,
|
LOGS_DIR,
|
||||||
TIMEZONE,
|
TIMEZONE,
|
||||||
|
|
||||||
|
LDAP,
|
||||||
|
LDAP_SERVER_URI,
|
||||||
|
LDAP_BIND_DN,
|
||||||
|
LDAP_BIND_PASSWORD,
|
||||||
|
LDAP_USER_BASE,
|
||||||
|
LDAP_USER_FILTER,
|
||||||
|
LDAP_USERNAME_ATTR,
|
||||||
|
LDAP_FIRSTNAME_ATTR,
|
||||||
|
LDAP_LASTNAME_ATTR,
|
||||||
|
LDAP_EMAIL_ATTR,
|
||||||
)
|
)
|
||||||
|
|
||||||
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
|
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
|
||||||
|
@ -55,6 +66,12 @@ INSTALLED_APPS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# For usage with https://www.jetadmin.io/integrations/django
|
||||||
|
# INSTALLED_APPS += ['jet_django']
|
||||||
|
# JET_PROJECT = 'archivebox'
|
||||||
|
# JET_TOKEN = 'some-api-token-here'
|
||||||
|
|
||||||
|
|
||||||
MIDDLEWARE = [
|
MIDDLEWARE = [
|
||||||
'core.middleware.TimezoneMiddleware',
|
'core.middleware.TimezoneMiddleware',
|
||||||
'django.middleware.security.SecurityMiddleware',
|
'django.middleware.security.SecurityMiddleware',
|
||||||
|
@ -67,11 +84,58 @@ MIDDLEWARE = [
|
||||||
'core.middleware.CacheControlMiddleware',
|
'core.middleware.CacheControlMiddleware',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
### Authentication Settings
|
||||||
|
################################################################################
|
||||||
|
|
||||||
AUTHENTICATION_BACKENDS = [
|
AUTHENTICATION_BACKENDS = [
|
||||||
'django.contrib.auth.backends.RemoteUserBackend',
|
'django.contrib.auth.backends.RemoteUserBackend',
|
||||||
'django.contrib.auth.backends.ModelBackend',
|
'django.contrib.auth.backends.ModelBackend',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if LDAP:
|
||||||
|
try:
|
||||||
|
import ldap
|
||||||
|
from django_auth_ldap.config import LDAPSearch
|
||||||
|
|
||||||
|
global AUTH_LDAP_SERVER_URI
|
||||||
|
global AUTH_LDAP_BIND_DN
|
||||||
|
global AUTH_LDAP_BIND_PASSWORD
|
||||||
|
global AUTH_LDAP_USER_SEARCH
|
||||||
|
global AUTH_LDAP_USER_ATTR_MAP
|
||||||
|
|
||||||
|
AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
|
||||||
|
AUTH_LDAP_BIND_DN = LDAP_BIND_DN
|
||||||
|
AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
|
||||||
|
|
||||||
|
assert AUTH_LDAP_SERVER_URI and LDAP_USERNAME_ATTR and LDAP_USER_FILTER, 'LDAP_* config options must all be set if LDAP=True'
|
||||||
|
|
||||||
|
AUTH_LDAP_USER_SEARCH = LDAPSearch(
|
||||||
|
LDAP_USER_BASE,
|
||||||
|
ldap.SCOPE_SUBTREE,
|
||||||
|
'(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
|
||||||
|
)
|
||||||
|
|
||||||
|
AUTH_LDAP_USER_ATTR_MAP = {
|
||||||
|
'username': LDAP_USERNAME_ATTR,
|
||||||
|
'first_name': LDAP_FIRSTNAME_ATTR,
|
||||||
|
'last_name': LDAP_LASTNAME_ATTR,
|
||||||
|
'email': LDAP_EMAIL_ATTR,
|
||||||
|
}
|
||||||
|
|
||||||
|
AUTHENTICATION_BACKENDS = [
|
||||||
|
'django_auth_ldap.backend.LDAPBackend',
|
||||||
|
]
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
|
||||||
|
# dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
|
||||||
|
# sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
### Debug Settings
|
||||||
|
################################################################################
|
||||||
|
|
||||||
# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
|
# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
|
||||||
DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
|
DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
|
||||||
if DEBUG_TOOLBAR:
|
if DEBUG_TOOLBAR:
|
||||||
|
@ -267,8 +331,8 @@ class NoisyRequestsFilter(logging.Filter):
|
||||||
if LOGS_DIR.exists():
|
if LOGS_DIR.exists():
|
||||||
ERROR_LOG = (LOGS_DIR / 'errors.log')
|
ERROR_LOG = (LOGS_DIR / 'errors.log')
|
||||||
else:
|
else:
|
||||||
# meh too many edge cases here around creating log dir w/ correct permissions
|
# historically too many edge cases here around creating log dir w/ correct permissions early on
|
||||||
# cant be bothered, just trash the log and let them figure it out via stdout/stderr
|
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
|
||||||
ERROR_LOG = tempfile.NamedTemporaryFile().name
|
ERROR_LOG = tempfile.NamedTemporaryFile().name
|
||||||
|
|
||||||
LOGGING = {
|
LOGGING = {
|
||||||
|
|
|
@ -33,6 +33,9 @@ urlpatterns = [
|
||||||
path('admin/', admin.site.urls),
|
path('admin/', admin.site.urls),
|
||||||
|
|
||||||
path('health/', HealthCheckView.as_view(), name='healthcheck'),
|
path('health/', HealthCheckView.as_view(), name='healthcheck'),
|
||||||
|
path('error/', lambda _: 1/0),
|
||||||
|
|
||||||
|
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
|
||||||
|
|
||||||
path('index.html', RedirectView.as_view(url='/')),
|
path('index.html', RedirectView.as_view(url='/')),
|
||||||
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
|
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
chrome_args,
|
chrome_args,
|
||||||
|
chrome_cleanup,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
|
chrome_cleanup()
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
chrome_args,
|
chrome_args,
|
||||||
|
chrome_cleanup,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
|
chrome_cleanup()
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
|
|
|
@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
||||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
try:
|
try:
|
||||||
result_json = json.loads(result.stdout)
|
result_json = json.loads(result.stdout)
|
||||||
assert result_json and 'content' in result_json
|
assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
|
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
||||||
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||||
output_tail = [
|
output_tail = [
|
||||||
line.strip()
|
line.strip()
|
||||||
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
|
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
|
||||||
if line.strip()
|
if line.strip()
|
||||||
]
|
]
|
||||||
hints = (
|
hints = (
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
chrome_args,
|
chrome_args,
|
||||||
|
chrome_cleanup,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
|
chrome_cleanup()
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ from ..logging_util import TimedProgress
|
||||||
|
|
||||||
HTML_TITLE_REGEX = re.compile(
|
HTML_TITLE_REGEX = re.compile(
|
||||||
r'<title.*?>' # start matching text after <title> tag
|
r'<title.*?>' # start matching text after <title> tag
|
||||||
r'(.[^<>]+)', # get everything up to these symbols
|
r'([^<>]+)', # get everything up to these symbols
|
||||||
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
|
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -177,7 +177,7 @@ def snapshot_icons(snapshot) -> str:
|
||||||
# The check for archive_org is different, so it has to be handled separately
|
# The check for archive_org is different, so it has to be handled separately
|
||||||
|
|
||||||
# get from db (faster)
|
# get from db (faster)
|
||||||
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
||||||
# get from filesystem (slower)
|
# get from filesystem (slower)
|
||||||
# target_path = Path(path) / "archive.org.txt"
|
# target_path = Path(path) / "archive.org.txt"
|
||||||
# exists = target_path.exists()
|
# exists = target_path.exists()
|
||||||
|
|
|
@ -441,7 +441,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
|
||||||
|
|
||||||
hints = (
|
hints = (
|
||||||
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
|
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
|
||||||
for line in hints[:5] if line.strip()
|
for line in list(hints)[:5] if line.strip()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -533,11 +533,27 @@ def log_shell_welcome_msg():
|
||||||
### Helpers
|
### Helpers
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def pretty_path(path: Union[Path, str]) -> str:
|
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
|
||||||
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
|
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
|
||||||
pwd = Path('.').resolve()
|
pwd = str(Path(pwd)) # .resolve()
|
||||||
# parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
|
path = str(path)
|
||||||
return str(path).replace(str(pwd) + '/', './')
|
|
||||||
|
if not path:
|
||||||
|
return path
|
||||||
|
|
||||||
|
# replace long absolute paths with ./ relative ones to save on terminal output width
|
||||||
|
if path.startswith(pwd) and (pwd != '/'):
|
||||||
|
path = path.replace(pwd, '.', 1)
|
||||||
|
|
||||||
|
# quote paths containing spaces
|
||||||
|
if ' ' in path:
|
||||||
|
path = f'"{path}"'
|
||||||
|
|
||||||
|
# if path is just a plain dot, replace it back with the absolute path for clarity
|
||||||
|
if path == '.':
|
||||||
|
path = pwd
|
||||||
|
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
|
@ -578,6 +594,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
|
||||||
else:
|
else:
|
||||||
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
|
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
|
||||||
|
|
||||||
|
|
||||||
if folder['path']:
|
if folder['path']:
|
||||||
if Path(folder['path']).exists():
|
if Path(folder['path']).exists():
|
||||||
num_files = (
|
num_files = (
|
||||||
|
@ -592,13 +609,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
|
||||||
# add symbol @ next to filecount if path is a remote filesystem mount
|
# add symbol @ next to filecount if path is a remote filesystem mount
|
||||||
num_files = f'{num_files} @' if num_files else '@'
|
num_files = f'{num_files} @' if num_files else '@'
|
||||||
|
|
||||||
path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
|
path = pretty_path(folder['path'])
|
||||||
if path and ' ' in path:
|
|
||||||
path = f'"{path}"'
|
|
||||||
|
|
||||||
# if path is just a plain dot, replace it back with the full path for clarity
|
|
||||||
if path == '.':
|
|
||||||
path = str(OUTPUT_DIR)
|
|
||||||
|
|
||||||
return ' '.join((
|
return ' '.join((
|
||||||
ANSI[color],
|
ANSI[color],
|
||||||
|
@ -629,9 +640,7 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
|
||||||
else:
|
else:
|
||||||
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
|
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
|
||||||
|
|
||||||
path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
|
path = pretty_path(dependency['path'])
|
||||||
if path and ' ' in path:
|
|
||||||
path = f'"{path}"'
|
|
||||||
|
|
||||||
return ' '.join((
|
return ' '.join((
|
||||||
ANSI[color],
|
ANSI[color],
|
||||||
|
|
|
@ -112,6 +112,8 @@ from .config import (
|
||||||
load_all_config,
|
load_all_config,
|
||||||
CONFIG,
|
CONFIG,
|
||||||
USER_CONFIG,
|
USER_CONFIG,
|
||||||
|
ADMIN_USERNAME,
|
||||||
|
ADMIN_PASSWORD,
|
||||||
get_real_name,
|
get_real_name,
|
||||||
setup_django,
|
setup_django,
|
||||||
)
|
)
|
||||||
|
@ -216,7 +218,7 @@ def version(quiet: bool=False,
|
||||||
if not quiet:
|
if not quiet:
|
||||||
# 0.6.3
|
# 0.6.3
|
||||||
# ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
|
# ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
|
||||||
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep
|
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 FS_USER=501:20 SEARCH_BACKEND=ripgrep
|
||||||
|
|
||||||
p = platform.uname()
|
p = platform.uname()
|
||||||
print(
|
print(
|
||||||
|
@ -236,7 +238,8 @@ def version(quiet: bool=False,
|
||||||
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
|
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
|
||||||
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
|
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
|
||||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||||
f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
|
f'FS_USER={PUID}:{PGID}',
|
||||||
|
f'FS_PERMS={OUTPUT_PERMISSIONS}',
|
||||||
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
|
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
|
||||||
)
|
)
|
||||||
print()
|
print()
|
||||||
|
@ -251,19 +254,19 @@ def version(quiet: bool=False,
|
||||||
|
|
||||||
print()
|
print()
|
||||||
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
|
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
|
||||||
for name, folder in CODE_LOCATIONS.items():
|
for name, path in CODE_LOCATIONS.items():
|
||||||
print(printable_folder_status(name, folder))
|
print(printable_folder_status(name, path))
|
||||||
|
|
||||||
print()
|
print()
|
||||||
print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
|
print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
|
||||||
for name, folder in EXTERNAL_LOCATIONS.items():
|
for name, path in EXTERNAL_LOCATIONS.items():
|
||||||
print(printable_folder_status(name, folder))
|
print(printable_folder_status(name, path))
|
||||||
|
|
||||||
print()
|
print()
|
||||||
if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
|
if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
|
||||||
print('{white}[i] Data locations:{reset}'.format(**ANSI))
|
print('{white}[i] Data locations:{reset}'.format(**ANSI))
|
||||||
for name, folder in DATA_LOCATIONS.items():
|
for name, path in DATA_LOCATIONS.items():
|
||||||
print(printable_folder_status(name, folder))
|
print(printable_folder_status(name, path))
|
||||||
else:
|
else:
|
||||||
print()
|
print()
|
||||||
print('{white}[i] Data locations:{reset}'.format(**ANSI))
|
print('{white}[i] Data locations:{reset}'.format(**ANSI))
|
||||||
|
@ -419,14 +422,16 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
|
||||||
write_main_index(list(pending_links.values()), out_dir=out_dir)
|
write_main_index(list(pending_links.values()), out_dir=out_dir)
|
||||||
|
|
||||||
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
|
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||||
|
|
||||||
|
from django.contrib.auth.models import User
|
||||||
|
|
||||||
|
if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
|
||||||
|
print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
|
||||||
|
User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
|
||||||
|
|
||||||
if existing_index:
|
if existing_index:
|
||||||
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
|
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
|
||||||
else:
|
else:
|
||||||
# TODO: allow creating new supersuer via env vars on first init
|
|
||||||
# if config.HTTP_USER and config.HTTP_PASS:
|
|
||||||
# from django.contrib.auth.models import User
|
|
||||||
# User.objects.create_superuser(HTTP_USER, '', HTTP_PASS)
|
|
||||||
|
|
||||||
print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
|
print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
|
||||||
|
|
||||||
json_index = out_dir / JSON_INDEX_FILENAME
|
json_index = out_dir / JSON_INDEX_FILENAME
|
||||||
|
|
|
@ -34,6 +34,7 @@ from ..index.schema import Link
|
||||||
from ..logging_util import TimedProgress, log_source_saved
|
from ..logging_util import TimedProgress, log_source_saved
|
||||||
|
|
||||||
from . import pocket_api
|
from . import pocket_api
|
||||||
|
from . import readwise_reader_api
|
||||||
from . import wallabag_atom
|
from . import wallabag_atom
|
||||||
from . import pocket_html
|
from . import pocket_html
|
||||||
from . import pinboard_rss
|
from . import pinboard_rss
|
||||||
|
@ -51,6 +52,7 @@ from . import url_list
|
||||||
PARSERS = {
|
PARSERS = {
|
||||||
# Specialized parsers
|
# Specialized parsers
|
||||||
pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
|
pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
|
||||||
|
readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
|
||||||
wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
|
wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
|
||||||
pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
|
pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
|
||||||
pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
|
pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
|
||||||
|
@ -233,6 +235,10 @@ _test_url_strs = {
|
||||||
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
|
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
|
||||||
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
|
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
|
||||||
'<test>http://example7.com</test>': 1,
|
'<test>http://example7.com</test>': 1,
|
||||||
|
'https://<test>': 0,
|
||||||
|
'https://[test]': 0,
|
||||||
|
'http://"test"': 0,
|
||||||
|
'http://\'test\'': 0,
|
||||||
'[https://example8.com/what/is/this.php?what=1]': 1,
|
'[https://example8.com/what/is/this.php?what=1]': 1,
|
||||||
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
|
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
|
||||||
'<what>https://example10.com#and-thing=2 "</about>': 1,
|
'<what>https://example10.com#and-thing=2 "</about>': 1,
|
||||||
|
|
|
@ -17,7 +17,10 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||||
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
|
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
|
||||||
|
|
||||||
json_file.seek(0)
|
json_file.seek(0)
|
||||||
links = json.load(json_file)
|
|
||||||
|
# sometimes the first line is a comment or filepath, so we get everything after the first {
|
||||||
|
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
|
||||||
|
links = json.loads(json_file_json_str)
|
||||||
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
|
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
|
||||||
|
|
||||||
for link in links:
|
for link in links:
|
||||||
|
|
123
archivebox/parsers/readwise_reader_api.py
Normal file
123
archivebox/parsers/readwise_reader_api.py
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
__package__ = "archivebox.parsers"
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from typing import IO, Iterable, Optional
|
||||||
|
from configparser import ConfigParser
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from ..index.schema import Link
|
||||||
|
from ..util import enforce_types
|
||||||
|
from ..system import atomic_write
|
||||||
|
from ..config import (
|
||||||
|
SOURCES_DIR,
|
||||||
|
READWISE_READER_TOKENS,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# File (read and written with ConfigParser by read_cursor/write_cursor) that stores the saved pagination cursor per username.
API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
|
||||||
|
|
||||||
|
|
||||||
|
class ReadwiseReaderAPI:
    """Minimal client for the Readwise Reader document-list endpoint.

    Attributes:
        api_token: Readwise access token sent in the ``Authorization`` header.
        cursor: Pagination cursor sent as ``pageCursor`` on the next request;
            ``None`` requests the first page.
    """

    cursor: Optional[str]

    def __init__(self, api_token, cursor=None) -> None:
        self.api_token = api_token
        self.cursor = cursor

    def _auth_headers(self) -> dict:
        """Build the request headers from this instance's token.

        The token must always come from configuration (``self.api_token``);
        never embed a literal credential in the source.
        """
        return {"Authorization": f"Token {self.api_token}"}

    def get_archive(self):
        """Fetch one page of documents from the "archive" location.

        Returns:
            The ``requests`` response object for the page identified by
            ``self.cursor``.

        Raises:
            requests.HTTPError: if the server answers with an error status
                (raised by ``response.raise_for_status()``).
        """
        response = requests.get(
            url="https://readwise.io/api/v3/list/",
            headers=self._auth_headers(),
            params={
                "location": "archive",
                "pageCursor": self.cursor,
            }
        )
        response.raise_for_status()
        return response
|
||||||
|
|
||||||
|
def get_readwise_reader_articles(api: "ReadwiseReaderAPI"):
    """Lazily yield every article from the archive, following pagination.

    Each page is fetched with ``api.get_archive()`` only once the previous
    page has been fully consumed. Whenever a page reports a non-empty
    ``nextPageCursor`` it is stored on ``api.cursor`` before the next fetch,
    so after exhaustion ``api.cursor`` holds the last cursor that was followed
    (it is left untouched if there was only a single page).

    Implemented as a loop rather than recursion so that very long archives
    cannot exhaust the interpreter's recursion limit.
    """
    while True:
        body = api.get_archive().json()

        yield from body["results"]

        # A missing or empty cursor both mean "no further pages".
        next_cursor = body.get("nextPageCursor")
        if not next_cursor:
            return
        api.cursor = next_cursor
|
||||||
|
|
||||||
|
|
||||||
|
def link_from_article(article: dict, sources: list):
    """Build a Link from one article record returned by the Reader API.

    Reads the ``source_url``, ``title`` and ``updated_at`` keys of ``article``.
    The title falls back to the URL when empty, and the link timestamp is the
    ``updated_at`` ISO datetime converted to a POSIX timestamp string.
    """
    source_url: str = article['source_url']
    display_title = article["title"] or source_url
    updated_at = datetime.fromisoformat(article['updated_at'])

    return Link(
        url=source_url,
        timestamp=str(updated_at.timestamp()),
        title=display_title,
        tags="",
        sources=sources,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def write_cursor(username: str, since: str):
    """Store ``since`` as the saved cursor for ``username`` in API_DB_PATH.

    The file is created empty first if it does not exist, then loaded,
    the ``[username]`` section is replaced with a single ``since`` option,
    and the whole file is rewritten.
    """
    if not API_DB_PATH.exists():
        atomic_write(API_DB_PATH, "")

    cursor_db = ConfigParser()
    # keep option names case-sensitive instead of ConfigParser's default lowercasing
    cursor_db.optionxform = str
    cursor_db.read(API_DB_PATH)

    cursor_db[username] = {"since": since}

    with open(API_DB_PATH, "w+") as db_file:
        cursor_db.write(db_file)
|
||||||
|
|
||||||
|
|
||||||
|
def read_cursor(username: str) -> Optional[str]:
    """Return the cursor stored for ``username`` in API_DB_PATH, if any.

    An empty state file is created first when none exists.

    Args:
        username: Section name to look up.

    Returns:
        The section's ``since`` value, or ``None`` when the section or the
        option is absent.
    """
    # Make sure the state file exists before it is parsed.
    if not API_DB_PATH.exists():
        atomic_write(API_DB_PATH, "")

    state = ConfigParser()
    state.optionxform = str  # keep option names exactly as written (no lowercasing)
    state.read(API_DB_PATH)

    stored = state.get(username, "since", fallback=None)
    return stored
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
def should_parse_as_readwise_reader_api(text: str) -> bool:
    """Tell whether ``text`` begins with the ``readwise-reader://`` scheme."""
    scheme = "readwise-reader://"
    return text[:len(scheme)] == scheme
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse bookmarks from the Readwise Reader API.

    Each line of ``input_buffer`` that starts with ``readwise-reader://`` names
    a username. For every such line the archive is fetched with the token
    configured for that username in READWISE_READER_TOKENS, resuming from the
    cursor saved for it, and one Link is yielded per article. Once a user's
    articles are exhausted, a non-empty cursor left on the client is saved.

    Args:
        input_buffer: Text stream to scan; it is rewound before reading.

    Yields:
        One Link per article, with the originating line as its only source.
    """

    input_buffer.seek(0)
    username_re = re.compile(r"^readwise-reader:\/\/(\w+)")

    for raw_line in input_buffer:
        # Lines for other parsers are ignored.
        if not should_parse_as_readwise_reader_api(raw_line):
            continue

        account = username_re.search(raw_line).group(1)
        client = ReadwiseReaderAPI(READWISE_READER_TOKENS[account], cursor=read_cursor(account))

        for article in get_readwise_reader_articles(client):
            yield link_from_article(article, sources=[raw_line])

        # Save the position only after every article has been yielded.
        if client.cursor:
            write_cursor(account, client.cursor)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level parser metadata: KEY (identifier), NAME (display name) and
# PARSER (entry-point function) defined below.
# NOTE(review): presumably read by the code that registers parsers — confirm.
KEY = "readwise_reader_api"
|
||||||
|
NAME = "Readwise Reader API"
|
||||||
|
PARSER = parse_readwise_reader_api_export
|
|
@ -1,62 +1,3 @@
|
||||||
{% extends "base.html" %}
|
|
||||||
{% load static %}
|
|
||||||
|
|
||||||
{% block body %}
|
|
||||||
<div id="toolbar">
|
|
||||||
<form id="changelist-search" action="{% url 'public-index' %}" method="get">
|
|
||||||
<div>
|
|
||||||
<label for="searchbar"><img src="/static/admin/img/search.svg" alt="Search"></label>
|
|
||||||
<input type="text" size="40" name="q" value="" id="searchbar" autofocus placeholder="Title, URL, tags, timestamp, or content...".>
|
|
||||||
<input type="submit" value="Search" style="height: 36px; padding-top: 6px; margin: 8px"/>
|
|
||||||
<input type="button"
|
|
||||||
value="♺"
|
|
||||||
title="Refresh..."
|
|
||||||
onclick="location.href='{% url 'public-index' %}'"
|
|
||||||
style="background-color: rgba(121, 174, 200, 0.8); height: 30px; font-size: 0.8em; margin-top: 12px; padding-top: 6px; float:right">
|
|
||||||
</input>
|
|
||||||
</div>
|
|
||||||
</form>
|
|
||||||
</div>
|
|
||||||
<table id="table-bookmarks">
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th style="width: 100px;">Bookmarked</th>
|
|
||||||
<th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
|
|
||||||
<th style="width: 140px">Files</th>
|
|
||||||
<th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody>
|
|
||||||
{% for link in object_list %}
|
|
||||||
{% include 'main_index_row.html' with link=link %}
|
|
||||||
{% endfor %}
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
<center>
|
|
||||||
<span class="step-links">
|
|
||||||
{% if page_obj.has_previous %}
|
|
||||||
<a href="{% url 'public-index' %}?page=1">« first</a>
|
|
||||||
<a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
<span class="current">
|
|
||||||
Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
|
|
||||||
</span>
|
|
||||||
|
|
||||||
{% if page_obj.has_next %}
|
|
||||||
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
|
|
||||||
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last »</a>
|
|
||||||
{% endif %}
|
|
||||||
</span>
|
|
||||||
|
|
||||||
{% if page_obj.has_next %}
|
|
||||||
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
|
|
||||||
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last »</a>
|
|
||||||
{% endif %}
|
|
||||||
</span>
|
|
||||||
<br>
|
|
||||||
</center>
|
|
||||||
{% endblock %}
|
|
||||||
{% extends "admin/base_site.html" %}
|
{% extends "admin/base_site.html" %}
|
||||||
{% load i18n admin_urls static admin_list %}
|
{% load i18n admin_urls static admin_list %}
|
||||||
{% load core_tags %}
|
{% load core_tags %}
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
<br/>
|
<br/>
|
||||||
<div class="loader"></div>
|
<div class="loader"></div>
|
||||||
<br/>
|
<br/>
|
||||||
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
|
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for detailed progress...
|
||||||
</center>
|
</center>
|
||||||
</div>
|
</div>
|
||||||
<form id="add-form" method="POST" class="p-form">{% csrf_token %}
|
<form id="add-form" method="POST" class="p-form">{% csrf_token %}
|
||||||
|
@ -46,19 +46,22 @@
|
||||||
</form>
|
</form>
|
||||||
<br/><br/><br/>
|
<br/><br/><br/>
|
||||||
<center id="delay-warning" style="display: none">
|
<center id="delay-warning" style="display: none">
|
||||||
<small>(it's safe to leave this page, adding will continue in the background)</small>
|
<small>(you will be redirected to your <a href="/">Snapshot list</a> momentarily, it's safe to close this page at any time)</small>
|
||||||
</center>
|
</center>
|
||||||
{% if absolute_add_path %}
|
{% if absolute_add_path %}
|
||||||
<center id="bookmarklet">
|
<!-- <center id="bookmarklet">
|
||||||
<p>Bookmark this link to quickly add to your archive:
|
<p>Bookmark this link to quickly add to your archive:
|
||||||
<a href="javascript:void(window.open('{{ absolute_add_path }}?url='+encodeURIComponent(document.location.href)));">Add to ArchiveBox</a></p>
|
<a href="javascript:void(window.open('{{ absolute_add_path }}?url='+encodeURIComponent(document.location.href)));">Add to ArchiveBox</a></p>
|
||||||
</center>
|
</center> -->
|
||||||
{% endif %}
|
{% endif %}
|
||||||
<script>
|
<script>
|
||||||
document.getElementById('add-form').addEventListener('submit', function(event) {
|
document.getElementById('add-form').addEventListener('submit', function(event) {
|
||||||
document.getElementById('in-progress').style.display = 'block'
|
document.getElementById('in-progress').style.display = 'block'
|
||||||
document.getElementById('add-form').style.display = 'none'
|
document.getElementById('add-form').style.display = 'none'
|
||||||
document.getElementById('delay-warning').style.display = 'block'
|
document.getElementById('delay-warning').style.display = 'block'
|
||||||
|
setTimeout(function() {
|
||||||
|
window.location = '/'
|
||||||
|
}, 2000)
|
||||||
return true
|
return true
|
||||||
})
|
})
|
||||||
</script>
|
</script>
|
||||||
|
|
|
@ -17,6 +17,8 @@ from requests.exceptions import RequestException, ReadTimeout
|
||||||
|
|
||||||
from .vendor.base32_crockford import encode as base32_encode # type: ignore
|
from .vendor.base32_crockford import encode as base32_encode # type: ignore
|
||||||
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
||||||
|
from os.path import lexists
|
||||||
|
from os import remove as remove_file
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import chardet
|
import chardet
|
||||||
|
@ -59,7 +61,7 @@ URL_REGEX = re.compile(
|
||||||
r'(?=('
|
r'(?=('
|
||||||
r'http[s]?://' # start matching from allowed schemes
|
r'http[s]?://' # start matching from allowed schemes
|
||||||
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
|
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
|
||||||
r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols
|
r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen)
|
||||||
r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
|
r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
|
||||||
r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols
|
r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols
|
||||||
r'))',
|
r'))',
|
||||||
|
@ -272,6 +274,16 @@ def chrome_args(**options) -> List[str]:
|
||||||
|
|
||||||
return cmd_args
|
return cmd_args
|
||||||
|
|
||||||
|
def chrome_cleanup():
    """
    Remove the stale Chromium SingletonLock that chrome can leave behind when
    it is killed by a timeout or other error.

    Does nothing unless IN_DOCKER is set and the lock path exists.
    """

    from .config import IN_DOCKER

    if not IN_DOCKER:
        return

    singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
    # lexists() is also true for a dangling symlink.
    if lexists(singleton_lock):
        remove_file(singleton_lock)
|
||||||
|
|
||||||
def ansi_to_html(text):
|
def ansi_to_html(text):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -65,8 +65,9 @@ check_platforms || (recreate_builder && check_platforms) || exit 1
|
||||||
|
|
||||||
|
|
||||||
echo "[+] Building archivebox:$VERSION docker image..."
|
echo "[+] Building archivebox:$VERSION docker image..."
|
||||||
#docker build . \
|
# docker builder prune
|
||||||
docker buildx build --platform "$REQUIRED_PLATFORMS" --push . \
|
# docker build . --no-cache -t archivebox-dev \
|
||||||
|
docker buildx build --platform "$REQUIRED_PLATFORMS" --load . \
|
||||||
-t archivebox \
|
-t archivebox \
|
||||||
-t archivebox:$TAG_NAME \
|
-t archivebox:$TAG_NAME \
|
||||||
-t archivebox:$VERSION \
|
-t archivebox:$VERSION \
|
||||||
|
|
|
@ -25,7 +25,10 @@ cd "$REPO_DIR"
|
||||||
rm -Rf build dist
|
rm -Rf build dist
|
||||||
|
|
||||||
echo "[+] Building sdist, bdist_wheel, and egg_info"
|
echo "[+] Building sdist, bdist_wheel, and egg_info"
|
||||||
python3 setup.py \
|
# python3 setup.py \
|
||||||
sdist --dist-dir=./pip_dist \
|
# sdist --dist-dir=./pip_dist \
|
||||||
bdist_wheel --dist-dir=./pip_dist \
|
# bdist_wheel --dist-dir=./pip_dist \
|
||||||
egg_info --egg-base=./pip_dist
|
# egg_info --egg-base=./pip_dist
|
||||||
|
|
||||||
|
# pip install --upgrade pip setuptools build
|
||||||
|
python -m build
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env bash
|
#!/bin/bash
|
||||||
|
|
||||||
DATA_DIR="${DATA_DIR:-/data}"
|
DATA_DIR="${DATA_DIR:-/data}"
|
||||||
ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
|
ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
|
||||||
|
@ -12,22 +12,26 @@ if [[ -n "$PGID" && "$PGID" != 0 ]]; then
|
||||||
groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
|
groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
export PUID="$(id -u archivebox)"
|
||||||
|
export PGID="$(id -g archivebox)"
|
||||||
|
|
||||||
# Set the permissions of the data dir to match the archivebox user
|
# Check the permissions of the data dir (or create if it doesn't exist)
|
||||||
if [[ -d "$DATA_DIR/archive" ]]; then
|
if [[ -d "$DATA_DIR/archive" ]]; then
|
||||||
# check data directory permissions
|
if touch "$DATA_DIR/archive/.permissions_test_safe_to_delete"; then
|
||||||
if [[ ! "$(stat -c %u $DATA_DIR/archive)" = "$(id -u archivebox)" ]]; then
|
# It's fine, we are able to write to the data directory
|
||||||
echo "Change in ownership detected, please be patient while we chown existing files"
|
rm "$DATA_DIR/archive/.permissions_test_safe_to_delete"
|
||||||
echo "This could take some time..."
|
# echo "[√] Permissions are correct"
|
||||||
chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER -R "$DATA_DIR"
|
else
|
||||||
|
# Send the diagnostics to stderr. ">&2" duplicates file descriptor 2, whereas
# a bare ">2" would write the message into a file named "2".
echo "[X] Permissions Error: ArchiveBox is not able to write to your data dir. You need to fix the data dir ownership and retry:" >&2
echo " chown -R $PUID:$PGID data" >&2
echo " https://docs.linuxserver.io/general/understanding-puid-and-pgid" >&2
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
# create data directory
|
# create data directory
|
||||||
mkdir -p "$DATA_DIR/logs"
|
mkdir -p "$DATA_DIR/logs"
|
||||||
chown -R $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR"
|
|
||||||
fi
|
fi
|
||||||
chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR"
|
chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR" "$DATA_DIR"/*
|
||||||
|
|
||||||
|
|
||||||
# Drop permissions to run commands as the archivebox user
|
# Drop permissions to run commands as the archivebox user
|
||||||
if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then
|
if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then
|
||||||
|
|
|
@ -34,6 +34,8 @@ services:
|
||||||
# - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive
|
# - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive
|
||||||
# - PUID=1000 # set to your host user's UID & GID if you encounter permissions issues
|
# - PUID=1000 # set to your host user's UID & GID if you encounter permissions issues
|
||||||
# - PGID=1000
|
# - PGID=1000
|
||||||
|
# - ADMIN_USERNAME=admin # create an admin user on first run with the given user/pass combo
|
||||||
|
# - ADMIN_PASSWORD=SomeSecretPassword
|
||||||
# - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search
|
# - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search
|
||||||
# - SEARCH_BACKEND_HOST_NAME=sonic
|
# - SEARCH_BACKEND_HOST_NAME=sonic
|
||||||
# - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
|
# - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
|
||||||
|
|
3041
package-lock.json
generated
3041
package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
@ -1,13 +1,13 @@
|
||||||
{
|
{
|
||||||
"name": "archivebox",
|
"name": "archivebox",
|
||||||
"version": "0.6.3",
|
"version": "0.7.0",
|
||||||
"description": "ArchiveBox: The self-hosted internet archive",
|
"description": "ArchiveBox: The self-hosted internet archive",
|
||||||
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
|
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
|
||||||
"repository": "github:ArchiveBox/ArchiveBox",
|
"repository": "github:ArchiveBox/ArchiveBox",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
|
"@postlight/parser": "^2.2.3",
|
||||||
"readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
|
"readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
|
||||||
"single-file": "git+https://github.com/gildas-lormeau/SingleFile.git"
|
"single-file-cli": "^1.1.12"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
121
pyproject.toml
Normal file
121
pyproject.toml
Normal file
|
@ -0,0 +1,121 @@
|
||||||
|
[project]
|
||||||
|
name = "archivebox"
|
||||||
|
version = "0.7.0"
|
||||||
|
description = "Self-hosted internet archiving solution."
|
||||||
|
authors = [
|
||||||
|
{name = "Nick Sweeting", email = "setup.py@archivebox.io"},
|
||||||
|
]
|
||||||
|
dependencies = [
|
||||||
|
"setuptools>=68.2.2",
|
||||||
|
"croniter>=0.3.34",
|
||||||
|
"dateparser>=1.0.0",
|
||||||
|
"django-extensions>=3.0.3",
|
||||||
|
"django>=3.1.3,<3.2",
|
||||||
|
"ipython>5.0.0",
|
||||||
|
"mypy-extensions>=0.4.3",
|
||||||
|
"python-crontab>=2.5.1",
|
||||||
|
"requests>=2.24.0",
|
||||||
|
"w3lib>=1.22.0",
|
||||||
|
# "youtube-dl>=2021.04.17",
|
||||||
|
"yt-dlp>=2021.4.11",
|
||||||
|
"playwright>=1.39.0",
|
||||||
|
]
|
||||||
|
requires-python = ">=3.9"
|
||||||
|
readme = "README.md"
|
||||||
|
license = {text = "MIT"}
|
||||||
|
classifiers = [
|
||||||
|
"Development Status :: 4 - Beta",
|
||||||
|
"Environment :: Console",
|
||||||
|
"Environment :: Web Environment",
|
||||||
|
"Framework :: Django",
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"Intended Audience :: Education",
|
||||||
|
"Intended Audience :: End Users/Desktop",
|
||||||
|
"Intended Audience :: Information Technology",
|
||||||
|
"Intended Audience :: Legal Industry",
|
||||||
|
"Intended Audience :: System Administrators",
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Natural Language :: English",
|
||||||
|
"Operating System :: OS Independent",
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3.7",
|
||||||
|
"Programming Language :: Python :: 3.8",
|
||||||
|
"Programming Language :: Python :: 3.9",
|
||||||
|
"Topic :: Internet :: WWW/HTTP",
|
||||||
|
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
||||||
|
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
|
||||||
|
"Topic :: Sociology :: History",
|
||||||
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
||||||
|
"Topic :: System :: Archiving",
|
||||||
|
"Topic :: System :: Archiving :: Backup",
|
||||||
|
"Topic :: System :: Recovery Tools",
|
||||||
|
"Topic :: Utilities",
|
||||||
|
"Typing :: Typed",
|
||||||
|
]
|
||||||
|
|
||||||
|
# pdm lock -G:all
|
||||||
|
# pdm install -G:all
|
||||||
|
[tool.pdm.dev-dependencies]
|
||||||
|
build = [
|
||||||
|
"pdm",
|
||||||
|
"bottle",
|
||||||
|
"setuptools",
|
||||||
|
"stdeb",
|
||||||
|
"twine",
|
||||||
|
"wheel",
|
||||||
|
]
|
||||||
|
lint = [
|
||||||
|
"flake8",
|
||||||
|
"mypy",
|
||||||
|
"django-stubs",
|
||||||
|
]
|
||||||
|
test = [
|
||||||
|
"pytest",
|
||||||
|
]
|
||||||
|
debug = [
|
||||||
|
"django-debug-toolbar",
|
||||||
|
"djdt_flamegraph",
|
||||||
|
"ipdb",
|
||||||
|
]
|
||||||
|
doc = [
|
||||||
|
"recommonmark",
|
||||||
|
"sphinx",
|
||||||
|
"sphinx-rtd-theme",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
sonic = [
|
||||||
|
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
|
||||||
|
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
|
||||||
|
"sonic-client>=0.0.5",
|
||||||
|
]
|
||||||
|
ldap = [
|
||||||
|
# apt install libldap2-dev libsasl2-dev
|
||||||
|
"django-auth-ldap>=4.1.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
archivebox = "archivebox.cli:main"
|
||||||
|
|
||||||
|
[tool.pdm.scripts]
|
||||||
|
lint = "./bin/lint.sh"
|
||||||
|
test = "./bin/test.sh"
|
||||||
|
# all = {composite = ["lint mypackage/", "test -v tests/"]}
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["pdm-backend"]
|
||||||
|
build-backend = "pdm.backend"
|
||||||
|
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://github.com/ArchiveBox/ArchiveBox"
|
||||||
|
Source = "https://github.com/ArchiveBox/ArchiveBox"
|
||||||
|
Documentation = "https://github.com/ArchiveBox/ArchiveBox/wiki"
|
||||||
|
"Bug Tracker" = "https://github.com/ArchiveBox/ArchiveBox/issues"
|
||||||
|
Changelog = "https://github.com/ArchiveBox/ArchiveBox/releases"
|
||||||
|
Roadmap = "https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap"
|
||||||
|
Community = "https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community"
|
||||||
|
Demo = "https://demo.archivebox.io"
|
||||||
|
Donate = "https://github.com/ArchiveBox/ArchiveBox/wiki/Donations"
|
||||||
|
|
||||||
|
|
265
setup.py
265
setup.py
|
@ -1,145 +1,150 @@
|
||||||
import json
|
#####################################################################################
|
||||||
import setuptools
|
# THIS FILE IS DEPRECATED AND WILL BE REMOVED EVENTUALLY
|
||||||
from setuptools.command.test import test
|
# ALL FUTURE CHANGES SHOULD HAPPEN IN pyproject.toml with pdm
|
||||||
|
#####################################################################################
|
||||||
|
|
||||||
from pathlib import Path
|
# import json
|
||||||
|
# import setuptools
|
||||||
|
# from setuptools.command.test import test
|
||||||
|
|
||||||
|
# from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
PKG_NAME = "archivebox"
|
# PKG_NAME = "archivebox"
|
||||||
DESCRIPTION = "Self-hosted internet archiving solution."
|
# DESCRIPTION = "Self-hosted internet archiving solution."
|
||||||
LICENSE = "MIT"
|
# LICENSE = "MIT"
|
||||||
AUTHOR = "Nick Sweeting"
|
# AUTHOR = "Nick Sweeting"
|
||||||
AUTHOR_EMAIL="git@nicksweeting.com"
|
# AUTHOR_EMAIL="setup.py@archivebox.io"
|
||||||
REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
|
# REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
|
||||||
PROJECT_URLS = {
|
# PROJECT_URLS = {
|
||||||
"Source": f"{REPO_URL}",
|
# "Source": f"{REPO_URL}",
|
||||||
"Documentation": f"{REPO_URL}/wiki",
|
# "Documentation": f"{REPO_URL}/wiki",
|
||||||
"Bug Tracker": f"{REPO_URL}/issues",
|
# "Bug Tracker": f"{REPO_URL}/issues",
|
||||||
"Changelog": f"{REPO_URL}/releases",
|
# "Changelog": f"{REPO_URL}/releases",
|
||||||
"Roadmap": f"{REPO_URL}/wiki/Roadmap",
|
# "Roadmap": f"{REPO_URL}/wiki/Roadmap",
|
||||||
"Community": f"{REPO_URL}/wiki/Web-Archiving-Community",
|
# "Community": f"{REPO_URL}/wiki/Web-Archiving-Community",
|
||||||
"Demo": f"https://demo.archivebox.io",
|
# "Demo": f"https://demo.archivebox.io",
|
||||||
"Donate": f"{REPO_URL}/wiki/Donations",
|
# "Donate": f"{REPO_URL}/wiki/Donations",
|
||||||
}
|
# }
|
||||||
|
|
||||||
ROOT_DIR = Path(__file__).parent.resolve()
|
# ROOT_DIR = Path(__file__).parent.resolve()
|
||||||
PACKAGE_DIR = ROOT_DIR / PKG_NAME
|
# PACKAGE_DIR = ROOT_DIR / PKG_NAME
|
||||||
|
|
||||||
README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
|
# README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
|
||||||
VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
|
# VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
|
||||||
|
|
||||||
PYTHON_REQUIRES = ">=3.7"
|
# class DisabledTestCommand(test):
|
||||||
SETUP_REQUIRES = ["wheel"]
|
# def run(self):
|
||||||
INSTALL_REQUIRES = [
|
# # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
|
||||||
# only add things here that have corresponding apt python3-packages available
|
# print('\n[X] Running tests via setup.py test is deprecated.')
|
||||||
# anything added here also needs to be added to our package dependencies in
|
# print(' Hint: Use the ./bin/test.sh script or pytest instead')
|
||||||
# stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
|
|
||||||
# if there is no apt python3-package equivalent, then vendor it instead in
|
|
||||||
# ./archivebox/vendor/
|
|
||||||
"requests>=2.24.0",
|
|
||||||
"mypy-extensions>=0.4.3",
|
|
||||||
"django>=3.1.3,<3.2",
|
|
||||||
"django-extensions>=3.0.3",
|
|
||||||
"dateparser>=1.0.0",
|
|
||||||
"youtube-dl>=2021.04.17",
|
|
||||||
"yt-dlp>=2021.4.11",
|
|
||||||
"python-crontab>=2.5.1",
|
|
||||||
"croniter>=0.3.34",
|
|
||||||
"w3lib>=1.22.0",
|
|
||||||
"ipython>5.0.0",
|
|
||||||
]
|
|
||||||
EXTRAS_REQUIRE = {
|
|
||||||
'sonic': [
|
|
||||||
"sonic-client>=0.0.5",
|
|
||||||
],
|
|
||||||
'dev': [
|
|
||||||
"setuptools",
|
|
||||||
"twine",
|
|
||||||
"wheel",
|
|
||||||
"flake8",
|
|
||||||
"ipdb",
|
|
||||||
"mypy",
|
|
||||||
"django-stubs",
|
|
||||||
"sphinx",
|
|
||||||
"sphinx-rtd-theme",
|
|
||||||
"recommonmark",
|
|
||||||
"pytest",
|
|
||||||
"bottle",
|
|
||||||
"stdeb",
|
|
||||||
"django-debug-toolbar",
|
|
||||||
"djdt_flamegraph",
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
# To see when setup.py gets called (uncomment for debugging):
|
# To see when setup.py gets called (uncomment for debugging):
|
||||||
# import sys
|
# import sys
|
||||||
# print(PACKAGE_DIR, f" (v{VERSION})")
|
# print(PACKAGE_DIR, f" (v{VERSION})")
|
||||||
# print('>', sys.executable, *sys.argv)
|
# print('>', sys.executable, *sys.argv)
|
||||||
|
|
||||||
|
# PYTHON_REQUIRES = ">=3.9"
|
||||||
|
# SETUP_REQUIRES = ["wheel"]
|
||||||
|
# INSTALL_REQUIRES = [
|
||||||
|
# # only add things here that have corresponding apt python3-packages available
|
||||||
|
# # anything added here also needs to be added to our package dependencies in
|
||||||
|
# # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
|
||||||
|
# # if there is no apt python3-package equivalent, then vendor it instead in
|
||||||
|
# # ./archivebox/vendor/
|
||||||
|
# "requests>=2.24.0",
|
||||||
|
# "mypy-extensions>=0.4.3",
|
||||||
|
# "django>=3.1.3,<3.2",
|
||||||
|
# "django-extensions>=3.0.3",
|
||||||
|
# "dateparser>=1.0.0",
|
||||||
|
# "youtube-dl>=2021.04.17",
|
||||||
|
# "yt-dlp>=2021.4.11",
|
||||||
|
# "python-crontab>=2.5.1",
|
||||||
|
# "croniter>=0.3.34",
|
||||||
|
# "w3lib>=1.22.0",
|
||||||
|
# "ipython>5.0.0",
|
||||||
|
# ]
|
||||||
|
# EXTRAS_REQUIRE = {
|
||||||
|
# 'sonic': [
|
||||||
|
# "sonic-client>=0.0.5",
|
||||||
|
# ],
|
||||||
|
# 'ldap': [
|
||||||
|
# "django-auth-ldap>=4.1.0",
|
||||||
|
# ],
|
||||||
|
# 'dev': [
|
||||||
|
# "setuptools",
|
||||||
|
# "twine",
|
||||||
|
# "wheel",
|
||||||
|
# "flake8",
|
||||||
|
# "ipdb",
|
||||||
|
# "mypy",
|
||||||
|
# "django-stubs",
|
||||||
|
# "sphinx",
|
||||||
|
# "sphinx-rtd-theme",
|
||||||
|
# "recommonmark",
|
||||||
|
# "pytest",
|
||||||
|
# "bottle",
|
||||||
|
# "stdeb",
|
||||||
|
# "django-debug-toolbar",
|
||||||
|
# "djdt_flamegraph",
|
||||||
|
# ],
|
||||||
|
# }
|
||||||
|
#
|
||||||
|
# setuptools.setup(
|
||||||
|
# name=PKG_NAME,
|
||||||
|
# version=VERSION,
|
||||||
|
# license=LICENSE,
|
||||||
|
# author=AUTHOR,
|
||||||
|
# author_email=AUTHOR_EMAIL,
|
||||||
|
# description=DESCRIPTION,
|
||||||
|
# long_description=README,
|
||||||
|
# long_description_content_type="text/markdown",
|
||||||
|
# url=REPO_URL,
|
||||||
|
# project_urls=PROJECT_URLS,
|
||||||
|
# python_requires=PYTHON_REQUIRES,
|
||||||
|
# setup_requires=SETUP_REQUIRES,
|
||||||
|
# install_requires=INSTALL_REQUIRES,
|
||||||
|
# extras_require=EXTRAS_REQUIRE,
|
||||||
|
# packages=[PKG_NAME],
|
||||||
|
# include_package_data=True, # see MANIFEST.in
|
||||||
|
# entry_points={
|
||||||
|
# "console_scripts": [
|
||||||
|
# f"{PKG_NAME} = {PKG_NAME}.cli:main",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# classifiers=[
|
||||||
|
# "License :: OSI Approved :: MIT License",
|
||||||
|
# "Natural Language :: English",
|
||||||
|
# "Operating System :: OS Independent",
|
||||||
|
# "Development Status :: 4 - Beta",
|
||||||
|
|
||||||
class DisabledTestCommand(test):
|
# "Topic :: Utilities",
|
||||||
def run(self):
|
# "Topic :: System :: Archiving",
|
||||||
# setup.py test is deprecated, disable it here by force so stdeb doesnt run it
|
# "Topic :: System :: Archiving :: Backup",
|
||||||
print()
|
# "Topic :: System :: Recovery Tools",
|
||||||
print('[X] Running tests via setup.py test is deprecated.')
|
# "Topic :: Sociology :: History",
|
||||||
print(' Hint: Use the ./bin/test.sh script or pytest instead')
|
# "Topic :: Internet :: WWW/HTTP",
|
||||||
|
# "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
||||||
|
# "Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
|
||||||
|
# "Topic :: Software Development :: Libraries :: Python Modules",
|
||||||
|
|
||||||
|
# "Intended Audience :: Developers",
|
||||||
setuptools.setup(
|
# "Intended Audience :: Education",
|
||||||
name=PKG_NAME,
|
# "Intended Audience :: End Users/Desktop",
|
||||||
version=VERSION,
|
# "Intended Audience :: Information Technology",
|
||||||
license=LICENSE,
|
# "Intended Audience :: Legal Industry",
|
||||||
author=AUTHOR,
|
# "Intended Audience :: System Administrators",
|
||||||
author_email=AUTHOR_EMAIL,
|
|
||||||
description=DESCRIPTION,
|
|
||||||
long_description=README,
|
|
||||||
long_description_content_type="text/markdown",
|
|
||||||
url=REPO_URL,
|
|
||||||
project_urls=PROJECT_URLS,
|
|
||||||
python_requires=PYTHON_REQUIRES,
|
|
||||||
setup_requires=SETUP_REQUIRES,
|
|
||||||
install_requires=INSTALL_REQUIRES,
|
|
||||||
extras_require=EXTRAS_REQUIRE,
|
|
||||||
packages=[PKG_NAME],
|
|
||||||
include_package_data=True, # see MANIFEST.in
|
|
||||||
entry_points={
|
|
||||||
"console_scripts": [
|
|
||||||
f"{PKG_NAME} = {PKG_NAME}.cli:main",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
classifiers=[
|
|
||||||
"License :: OSI Approved :: MIT License",
|
|
||||||
"Natural Language :: English",
|
|
||||||
"Operating System :: OS Independent",
|
|
||||||
"Development Status :: 4 - Beta",
|
|
||||||
|
|
||||||
"Topic :: Utilities",
|
|
||||||
"Topic :: System :: Archiving",
|
|
||||||
"Topic :: System :: Archiving :: Backup",
|
|
||||||
"Topic :: System :: Recovery Tools",
|
|
||||||
"Topic :: Sociology :: History",
|
|
||||||
"Topic :: Internet :: WWW/HTTP",
|
|
||||||
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
|
||||||
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
|
|
||||||
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
||||||
|
|
||||||
"Intended Audience :: Developers",
|
|
||||||
"Intended Audience :: Education",
|
|
||||||
"Intended Audience :: End Users/Desktop",
|
|
||||||
"Intended Audience :: Information Technology",
|
|
||||||
"Intended Audience :: Legal Industry",
|
|
||||||
"Intended Audience :: System Administrators",
|
|
||||||
|
|
||||||
"Environment :: Console",
|
# "Environment :: Console",
|
||||||
"Environment :: Web Environment",
|
# "Environment :: Web Environment",
|
||||||
"Programming Language :: Python :: 3",
|
# "Programming Language :: Python :: 3",
|
||||||
"Programming Language :: Python :: 3.7",
|
# "Programming Language :: Python :: 3.7",
|
||||||
"Programming Language :: Python :: 3.8",
|
# "Programming Language :: Python :: 3.8",
|
||||||
"Programming Language :: Python :: 3.9",
|
# "Programming Language :: Python :: 3.9",
|
||||||
"Framework :: Django",
|
# "Framework :: Django",
|
||||||
"Typing :: Typed",
|
# "Typing :: Typed",
|
||||||
],
|
# ],
|
||||||
cmdclass={
|
# cmdclass={
|
||||||
"test": DisabledTestCommand,
|
# "test": DisabledTestCommand,
|
||||||
},
|
# },
|
||||||
)
|
# )
|
||||||
|
|
|
@ -6,6 +6,6 @@ Suite: focal
|
||||||
Suite3: focal
|
Suite3: focal
|
||||||
Build-Depends: debhelper, dh-python, python3-all, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
|
Build-Depends: debhelper, dh-python, python3-all, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
|
||||||
Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, yt-dlp, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
|
Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, yt-dlp, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
|
||||||
X-Python3-Version: >= 3.7
|
X-Python3-Version: >= 3.9
|
||||||
XS-Python-Version: >= 3.7
|
XS-Python-Version: >= 3.9
|
||||||
Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck
|
Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue