Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-15 07:34:27 -04:00)

Commit ef856e8051: Merge branch 'dev' into issue1316

50 changed files with 1469 additions and 1694 deletions

.dockerignore

@@ -17,6 +17,11 @@ venv/
 .venv-old/
 .docker-venv/
 node_modules/
+chrome/
+chromeprofile/
+
+pdm.dev.lock
+pdm.lock

 docs/
 build/

.github/FUNDING.yml (vendored, 5 lines changed)

@@ -1,3 +1,2 @@
-github: pirate
-patreon: theSquashSH
-custom: ["https://donate.archivebox.io", "https://paypal.me/NicholasSweeting"]
+github: ["ArchiveBox", "pirate"]
+custom: ["https://hcb.hackclub.com/donations/start/archivebox", "https://paypal.me/NicholasSweeting"]

.github/workflows/docker.yml (vendored, 16 lines changed)

@@ -31,7 +31,7 @@ jobs:
       with:
         version: latest
         install: true
-        platforms: linux/amd64,linux/arm64,linux/arm/v7
+        platforms: linux/amd64,linux/arm64

     - name: Builder instance name
       run: echo ${{ steps.buildx.outputs.name }}

@@ -51,20 +51,26 @@ jobs:
       uses: docker/login-action@v3
       if: github.event_name != 'pull_request'
       with:
         username: ${{ secrets.DOCKER_USERNAME }}
         password: ${{ secrets.DOCKER_PASSWORD }}

     - name: Collect Docker tags
+      # https://github.com/docker/metadata-action
       id: docker_meta
       uses: docker/metadata-action@v5
       with:
         images: archivebox/archivebox,nikisweeting/archivebox
         tags: |
+          # :stable
           type=ref,event=branch
+          # :0.7.3
           type=semver,pattern={{version}}
+          # :0.7
           type=semver,pattern={{major}}.{{minor}}
+          # :sha-463ea54
           type=sha
-          type=raw,value=latest,enable={{is_default_branch}}
+          # :latest
+          type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'stable') }}

     - name: Build and push
       id: docker_build

@@ -77,7 +83,7 @@ jobs:
         tags: ${{ steps.docker_meta.outputs.tags }}
         cache-from: type=local,src=/tmp/.buildx-cache
         cache-to: type=local,dest=/tmp/.buildx-cache-new
-        platforms: linux/amd64,linux/arm64,linux/arm/v7
+        platforms: linux/amd64,linux/arm64

     - name: Image digest
       run: echo ${{ steps.docker_build.outputs.digest }}
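Reading the metadata-action config against its own inline comments: pushing a v0.7.3 git tag on the stable branch would produce image tags along the lines of archivebox/archivebox:stable, :0.7.3, :0.7, :sha-463ea54 (the sha in the comment is just an example), and :latest, with :latest now gated on the stable branch instead of the repo's default branch.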

.gitignore (vendored, 6 lines changed)

@@ -13,8 +13,9 @@ venv/
 node_modules/

 # Ignore dev lockfiles (should always be built fresh)
-requirements-dev.txt
+pdm.lock
 pdm.dev.lock
+requirements-dev.txt

 # Packaging artifacts
 .pdm-python

@@ -26,9 +27,6 @@ dist/

 # Data folders
 data/
-data1/
-data2/
-data3/
 data*/
 output/

Dockerfile (89 lines changed)

@@ -87,7 +87,9 @@ COPY --chown=root:root --chmod=755 package.json "$CODE_DIR/"
 RUN grep '"version": ' "${CODE_DIR}/package.json" | awk -F'"' '{print $4}' > /VERSION.txt

 # Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up Docker builds)
-RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache \
+RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \
+    && echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \
+    && echo 'APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/99no-intall-suggests \
    && rm -f /etc/apt/apt.conf.d/docker-clean

 # Print debug info about build and save it to disk, for human eyes only, not used by anything else
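(The keep-cache tweak works together with the --mount=type=cache,target=/var/cache/apt mounts used in the RUN steps below: Debian's stock /etc/apt/apt.conf.d/docker-clean hook deletes downloaded .deb files after every install, so removing it lets the BuildKit cache mount actually accumulate packages across rebuilds.)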
@@ -120,10 +122,10 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
 # Install system apt dependencies (adding backports to access more recent apt updates)
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
     echo "[+] Installing APT base system dependencies for $TARGETPLATFORM..." \
-    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
+    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \
    && mkdir -p /etc/apt/keyrings \
    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    && apt-get install -qq -y -t bookworm-backports \
        # 1. packaging dependencies
        apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
        # 2. docker and init system dependencies

@@ -134,27 +136,13 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
 ######### Language Environments ####################################

-# Install Node environment
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
-    echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
-    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
-    && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
-    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
-        nodejs libatomic1 python3-minimal \
-    && rm -rf /var/lib/apt/lists/* \
-    # Update NPM to latest version
-    && npm i -g npm --cache /root/.npm \
-    # Save version info
-    && ( \
-        which node && node --version \
-        && which npm && npm --version \
-        && echo -e '\n\n' \
-    ) | tee -a /VERSION.txt
-
 # Install Python environment
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
     echo "[+] Setting up Python $PYTHON_VERSION runtime..." \
+    # && apt-get update -qq \
+    # && apt-get install -qq -y -t bookworm-backports --no-upgrade \
+    #     python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip \
+    # && rm -rf /var/lib/apt/lists/* \
    # tell PDM to allow using global system python site packages
    # && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
    # create global virtual environment GLOBAL_VENV to use (better than using pip install --global)

@@ -171,13 +159,34 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt

+# Install Node environment
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
+    echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
+    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
+    && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
+    && apt-get update -qq \
+    && apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \
+    && apt-get install -y -t bookworm-backports --no-upgrade \
+        nodejs \
+    && rm -rf /var/lib/apt/lists/* \
+    # Update NPM to latest version
+    && npm i -g npm --cache /root/.npm \
+    # Save version info
+    && ( \
+        which node && node --version \
+        && which npm && npm --version \
+        && echo -e '\n\n' \
+    ) | tee -a /VERSION.txt

 ######### Extractor Dependencies ##################################

 # Install apt dependencies
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
     echo "[+] Installing APT extractor dependencies globally using apt..." \
    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    && apt-get install -qq -y -t bookworm-backports \
        curl wget git yt-dlp ffmpeg ripgrep \
        # Packages we have also needed in the past:
        # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \

@@ -196,25 +205,21 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
     echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    && apt-get install -qq -y -t bookworm-backports \
        fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
+       at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \
+       libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \
+       libxaw7 libxcomposite1 libxdamage1 libxfont2 \
+       libxkbfile1 libxmu6 libxpm4 libxt6 x11-xkb-utils xfonts-encodings \
+       # xfonts-scalable xfonts-utils xserver-common xvfb \
        # chrome can run without dbus/upower technically, it complains about missing dbus but should run ok anyway
        # libxss1 dbus dbus-x11 upower \
        # && service dbus start \
-    && if [[ "$TARGETPLATFORM" == *amd64* || "$TARGETPLATFORM" == *arm64* ]]; then \
-        # install Chromium using playwright
-        pip install playwright \
-        && cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
-        && playwright install --with-deps chromium \
-        && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
-    else \
-        # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
-        # apt-get install -qq -y -t bookworm-backports --no-install-recommends \
-        #     chromium \
-        # && export CHROME_BINARY="$(which chromium)"; \
-        echo 'armv7 no longer supported in versions after v0.7.3' \
-        exit 1; \
-    fi \
+    # install Chromium using playwright
+    && pip install playwright \
+    && cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
+    && playwright install chromium \
+    && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
    && rm -rf /var/lib/apt/lists/* \
    && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
    && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \

@@ -247,8 +252,8 @@ COPY --chown=root:root --chmod=755 "./pyproject.toml" "requirements.txt" "$CODE_
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
     echo "[+] Installing PIP ArchiveBox dependencies from requirements.txt for ${TARGETPLATFORM}..." \
    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
-        build-essential \
+    && apt-get install -qq -y -t bookworm-backports \
+        # build-essential \
        libssl-dev libldap2-dev libsasl2-dev \
        python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
    # && ln -s "$GLOBAL_VENV" "$APP_VENV" \

@@ -258,8 +263,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
    # && pdm export -o requirements.txt --without-hashes \
    # && source $GLOBAL_VENV/bin/activate \
    && pip install -r requirements.txt \
-    && apt-get purge -y \
-        build-essential \
+    # && apt-get purge -y \
+    #     build-essential \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

@@ -269,7 +274,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
     echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
    # && apt-get update -qq \
    # install C compiler to build deps on platforms that dont have 32-bit wheels available on pypi
-    # && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    # && apt-get install -qq -y -t bookworm-backports \
    #     build-essential \
    # INSTALL ARCHIVEBOX python package globally from CODE_DIR, with all optional dependencies
    && pip install -e "$CODE_DIR"[sonic,ldap] \

README.md

@@ -407,7 +407,7 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for usage examples using the CLI, W
 > *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*

 <ul>
-<li>TrueNAS: <a href="https://truecharts.org/charts/incubator/archivebox/">Official ArchiveBox TrueChart</a> / <a href="https://dev.to/finloop/setting-up-archivebox-on-truenas-scale-1788">Custom App Guide</a></li>
+<li>TrueNAS: <a href="https://truecharts.org/charts/stable/archivebox/">Official ArchiveBox TrueChart</a> / <a href="https://dev.to/finloop/setting-up-archivebox-on-truenas-scale-1788">Custom App Guide</a></li>
 <li><a href="https://unraid.net/community/apps?q=archivebox#r">UnRaid</a></li>
 <li><a href="https://github.com/YunoHost-Apps/archivebox_ynh">Yunohost</a></li>
 <li><a href="https://www.cloudron.io/store/io.archivebox.cloudronapp.html">Cloudron</a></li>

@@ -445,6 +445,9 @@ Other providers of paid ArchiveBox hosting (not officially endorsed):<br/>
 <li><a href="https://fly.io/">
 <img src="https://img.shields.io/badge/Unmanaged_App-Fly.io-%239a2de6.svg?style=flat" height="22px"/>
 </a> (USD $10-50+/mo, <a href="https://fly.io/docs/hands-on/start/">instructions</a>)</li>
+<li><a href="https://railway.app/template/2Vvhmy">
+<img src="https://img.shields.io/badge/Unmanaged_App-Railway-%23A11BE6.svg?style=flat" height="22px"/>
+</a> (USD $0-5+/mo)</li>
 <li><a href="https://aws.amazon.com/marketplace/pp/Linnovate-Open-Source-Innovation-Support-For-Archi/B08RVW6MJ2"><img src="https://img.shields.io/badge/Unmanaged_VPS-AWS-%23ee8135.svg?style=flat" height="22px"/></a> (USD $60-200+/mo)</li>
 <li><a href="https://azuremarketplace.microsoft.com/en-us/marketplace/apps/meanio.archivebox?ocid=gtmrewards_whatsnewblog_archivebox_vol118"><img src="https://img.shields.io/badge/Unmanaged_VPS-Azure-%237cb300.svg?style=flat" height="22px"/></a> (USD $60-200+/mo)</li>
 <br/>

archivebox/__init__.py

@@ -1 +1,7 @@
 __package__ = 'archivebox'
+
+
+# monkey patch django timezone to add back utc (it was removed in Django 5.0)
+import datetime
+from django.utils import timezone
+timezone.utc = datetime.timezone.utc
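A minimal sketch of what this shim restores (django.utils.timezone.utc was an alias for datetime.timezone.utc that was removed in Django 5.0 but which third-party code may still reference):

    # sketch: pre-5.0 call sites like this would raise AttributeError without the shim
    import datetime
    from django.utils import timezone

    timezone.utc = datetime.timezone.utc  # the monkey patch from above

    # old-style code keeps working:
    dt = datetime.datetime(2024, 4, 25, tzinfo=timezone.utc)
    assert dt.tzinfo is datetime.timezone.utc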

archivebox/api/__init__.py (new file)

@@ -0,0 +1 @@
+__package__ = 'archivebox.api'

archivebox/api/apps.py

@@ -1,3 +1,5 @@
+__package__ = 'archivebox.api'
+
 from django.apps import AppConfig


archivebox/api/archive.py — deleted (was 184 lines):

# archivebox_api.py
from typing import List, Optional
from enum import Enum
from pydantic import BaseModel
from ninja import Router
from main import (
    add,
    remove,
    update,
    list_all,
    ONLY_NEW,
)  # Assuming these functions are defined in main.py


# Schemas

class StatusChoices(str, Enum):
    indexed = 'indexed'
    archived = 'archived'
    unarchived = 'unarchived'
    present = 'present'
    valid = 'valid'
    invalid = 'invalid'
    duplicate = 'duplicate'
    orphaned = 'orphaned'
    corrupted = 'corrupted'
    unrecognized = 'unrecognized'


class AddURLSchema(BaseModel):
    urls: List[str]
    tag: str = ""
    depth: int = 0
    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
    update_all: bool = False
    index_only: bool = False
    overwrite: bool = False
    init: bool = False
    extractors: str = ""
    parser: str = "auto"


class RemoveURLSchema(BaseModel):
    yes: bool = False
    delete: bool = False
    before: Optional[float] = None
    after: Optional[float] = None
    filter_type: str = "exact"
    filter_patterns: Optional[List[str]] = None


class UpdateSchema(BaseModel):
    resume: Optional[float] = None
    only_new: Optional[bool] = None
    index_only: Optional[bool] = False
    overwrite: Optional[bool] = False
    before: Optional[float] = None
    after: Optional[float] = None
    status: Optional[StatusChoices] = None
    filter_type: Optional[str] = 'exact'
    filter_patterns: Optional[List[str]] = None
    extractors: Optional[str] = ""


class ListAllSchema(BaseModel):
    filter_patterns: Optional[List[str]] = None
    filter_type: str = 'exact'
    status: Optional[StatusChoices] = None
    after: Optional[float] = None
    before: Optional[float] = None
    sort: Optional[str] = None
    csv: Optional[str] = None
    json: bool = False
    html: bool = False
    with_headers: bool = False


# API Router
router = Router()


@router.post("/add", response={200: dict})
def api_add(request, payload: AddURLSchema):
    try:
        result = add(
            urls=payload.urls,
            tag=payload.tag,
            depth=payload.depth,
            update=payload.update,
            update_all=payload.update_all,
            index_only=payload.index_only,
            overwrite=payload.overwrite,
            init=payload.init,
            extractors=payload.extractors,
            parser=payload.parser,
        )
        # Currently the add function returns a list of ALL items in the DB, ideally only return new items
        return {
            "status": "success",
            "message": "URLs added successfully.",
            "result": str(result),
        }
    except Exception as e:
        # Handle exceptions raised by the add function or during processing
        return {"status": "error", "message": str(e)}


@router.post("/remove", response={200: dict})
def api_remove(request, payload: RemoveURLSchema):
    try:
        result = remove(
            yes=payload.yes,
            delete=payload.delete,
            before=payload.before,
            after=payload.after,
            filter_type=payload.filter_type,
            filter_patterns=payload.filter_patterns,
        )
        return {
            "status": "success",
            "message": "URLs removed successfully.",
            "result": result,
        }
    except Exception as e:
        # Handle exceptions raised by the remove function or during processing
        return {"status": "error", "message": str(e)}


@router.post("/update", response={200: dict})
def api_update(request, payload: UpdateSchema):
    try:
        result = update(
            resume=payload.resume,
            only_new=payload.only_new,
            index_only=payload.index_only,
            overwrite=payload.overwrite,
            before=payload.before,
            after=payload.after,
            status=payload.status,
            filter_type=payload.filter_type,
            filter_patterns=payload.filter_patterns,
            extractors=payload.extractors,
        )
        return {
            "status": "success",
            "message": "Archive updated successfully.",
            "result": result,
        }
    except Exception as e:
        # Handle exceptions raised by the update function or during processing
        return {"status": "error", "message": str(e)}


@router.post("/list_all", response={200: dict})
def api_list_all(request, payload: ListAllSchema):
    try:
        result = list_all(
            filter_patterns=payload.filter_patterns,
            filter_type=payload.filter_type,
            status=payload.status,
            after=payload.after,
            before=payload.before,
            sort=payload.sort,
            csv=payload.csv,
            json=payload.json,
            html=payload.html,
            with_headers=payload.with_headers,
        )
        # TODO: This is kind of bad, make the format a choice field
        if payload.json:
            return {"status": "success", "format": "json", "data": result}
        elif payload.html:
            return {"status": "success", "format": "html", "data": result}
        elif payload.csv:
            return {"status": "success", "format": "csv", "data": result}
        else:
            return {
                "status": "success",
                "message": "List generated successfully.",
                "data": result,
            }
    except Exception as e:
        # Handle exceptions raised by the list_all function or during processing
        return {"status": "error", "message": str(e)}

archivebox/api/auth.py — rewritten (48 lines → 107 lines)

Before:

from django.contrib.auth import authenticate
from ninja import Form, Router, Schema
from ninja.security import HttpBearer

from api.models import Token

router = Router()


class GlobalAuth(HttpBearer):
    def authenticate(self, request, token):
        try:
            return Token.objects.get(token=token).user
        except Token.DoesNotExist:
            pass


class AuthSchema(Schema):
    email: str
    password: str


@router.post("/authenticate", auth=None)  # overriding global auth
def get_token(request, auth_data: AuthSchema):
    user = authenticate(username=auth_data.email, password=auth_data.password)
    if user:
        # Assuming a user can have multiple tokens and you want to create a new one every time
        new_token = Token.objects.create(user=user)
        return {"token": new_token.token, "expires": new_token.expiry_as_iso8601}
    else:
        return {"error": "Invalid credentials"}


class TokenValidationSchema(Schema):
    token: str


@router.post("/validate_token", auth=None)  # No authentication required for this endpoint
def validate_token(request, token_data: TokenValidationSchema):
    try:
        # Attempt to authenticate using the provided token
        user = GlobalAuth().authenticate(request, token_data.token)
        if user:
            return {"status": "valid"}
        else:
            return {"status": "invalid"}
    except Token.DoesNotExist:
        return {"status": "invalid"}

After:

__package__ = 'archivebox.api'

from typing import Optional

from django.http import HttpRequest
from django.contrib.auth import login
from django.contrib.auth import authenticate
from django.contrib.auth.models import AbstractBaseUser

from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader, HttpBasicAuth, django_auth_superuser


def auth_using_token(token, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
    """Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
    from api.models import APIToken  # lazy import model to avoid loading it at urls.py import time

    user = None

    submitted_empty_form = token in ('string', '', None)
    if submitted_empty_form:
        user = request.user  # see if user is authed via django session and use that as the default
    else:
        try:
            token = APIToken.objects.get(token=token)
            if token.is_valid():
                user = token.user
        except APIToken.DoesNotExist:
            pass

    if not user:
        print('[❌] Failed to authenticate API user using API Key:', request)

    return None


def auth_using_password(username, password, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
    """Given a username and password, check if they are valid and return the corresponding user"""
    user = None

    submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
    if submitted_empty_form:
        user = request.user  # see if user is authed via django session and use that as the default
    else:
        user = authenticate(
            username=username,
            password=password,
        )

    if not user:
        print('[❌] Failed to authenticate API user using API Key:', request)

    return user


### Base Auth Types

class APITokenAuthCheck:
    """The base class for authentication methods that use an api.models.APIToken"""
    def authenticate(self, request: HttpRequest, key: Optional[str]=None) -> Optional[AbstractBaseUser]:
        user = auth_using_token(
            token=key,
            request=request,
        )
        if user is not None:
            login(request, user, backend='django.contrib.auth.backends.ModelBackend')
        return user

class UserPassAuthCheck:
    """The base class for authentication methods that use a username & password"""
    def authenticate(self, request: HttpRequest, username: Optional[str]=None, password: Optional[str]=None) -> Optional[AbstractBaseUser]:
        user = auth_using_password(
            username=username,
            password=password,
            request=request,
        )
        if user is not None:
            login(request, user, backend='django.contrib.auth.backends.ModelBackend')
        return user


### Django-Ninja-Provided Auth Methods

class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth):
    """Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
    pass

class QueryParamTokenAuth(APITokenAuthCheck, APIKeyQuery):
    """Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
    param_name = "api_key"

class HeaderTokenAuth(APITokenAuthCheck, APIKeyHeader):
    """Allow authenticating by passing X-API-Key=xyz as a request header"""
    param_name = "X-API-Key"

class BearerTokenAuth(APITokenAuthCheck, HttpBearer):
    """Allow authenticating by passing Bearer=xyz as a request header"""
    pass


### Enabled Auth Methods

API_AUTH_METHODS = [
    QueryParamTokenAuth(),
    HeaderTokenAuth(),
    BearerTokenAuth(),
    django_auth_superuser,
    UsernameAndPasswordAuth(),
]
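A hedged sketch of what these auth classes accept from a client's point of view (the base URL and port are assumptions for a local dev server; the /api/v1/cli/add path follows from the routers registered in v1_api.py below):

    import requests  # third-party HTTP client, used purely for illustration

    BASE = "http://127.0.0.1:8000/api/v1"       # assumed local dev server
    TOKEN = "0123456789abcdef0123456789abcdef"  # a 32-char APIToken.token value

    payload = {"urls": ["https://example.com"]}

    # any one of the three token methods works, per the classes above:
    requests.post(f"{BASE}/cli/add", params={"api_key": TOKEN}, json=payload)                     # QueryParamTokenAuth
    requests.post(f"{BASE}/cli/add", headers={"X-API-Key": TOKEN}, json=payload)                  # HeaderTokenAuth
    requests.post(f"{BASE}/cli/add", headers={"Authorization": f"Bearer {TOKEN}"}, json=payload)  # BearerTokenAuth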

archivebox/api/migrations/0001_initial.py

@@ -1,9 +1,10 @@
-# Generated by Django 3.1.14 on 2024-04-09 18:52
+# Generated by Django 4.2.11 on 2024-04-25 04:19

 import api.models
 from django.conf import settings
 from django.db import migrations, models
 import django.db.models.deletion
+import uuid


 class Migration(migrations.Migration):

@@ -16,13 +17,13 @@ class Migration(migrations.Migration):

     operations = [
         migrations.CreateModel(
-            name='Token',
+            name='APIToken',
             fields=[
-                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
-                ('token', models.CharField(default=auth.models.hex_uuid, max_length=32, unique=True)),
+                ('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
                 ('created', models.DateTimeField(auto_now_add=True)),
-                ('expiry', models.DateTimeField(blank=True, null=True)),
+                ('expires', models.DateTimeField(blank=True, null=True)),
-                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tokens', to=settings.AUTH_USER_MODEL)),
+                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
             ],
         ),
     ]

archivebox/api/migrations/0002_alter_apitoken_options.py (new file, 17 lines)

# Generated by Django 5.0.4 on 2024-04-26 05:28

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('api', '0001_initial'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='apitoken',
            options={'verbose_name': 'API Key', 'verbose_name_plural': 'API Keys'},
        ),
    ]

archivebox/api/models.py — rewritten (30 lines → 63 lines)

Before:

import uuid
from datetime import timedelta

from django.conf import settings
from django.db import models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _


def hex_uuid():
    return uuid.uuid4().hex


class Token(models.Model):
    user = models.ForeignKey(
        settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="tokens"
    )
    token = models.CharField(max_length=32, default=hex_uuid, unique=True)
    created = models.DateTimeField(auto_now_add=True)
    expiry = models.DateTimeField(null=True, blank=True)

    @property
    def expiry_as_iso8601(self):
        """Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
        expiry_date = (
            self.expiry if self.expiry else timezone.now() + timedelta(days=365 * 100)
        )
        return expiry_date.isoformat()

    def __str__(self):
        return self.token

After:

__package__ = 'archivebox.api'

import uuid
import secrets
from datetime import timedelta

from django.conf import settings
from django.db import models
from django.utils import timezone

from django_stubs_ext.db.models import TypedModelMeta


def generate_secret_token() -> str:
    # returns cryptographically secure string with len() == 32
    return secrets.token_hex(16)


class APIToken(models.Model):
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)

    user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
    token = models.CharField(max_length=32, default=generate_secret_token, unique=True)

    created = models.DateTimeField(auto_now_add=True)
    expires = models.DateTimeField(null=True, blank=True)

    class Meta(TypedModelMeta):
        verbose_name = "API Key"
        verbose_name_plural = "API Keys"

    def __str__(self) -> str:
        return self.token

    def __repr__(self) -> str:
        return f'<APIToken user={self.user.username} token=************{self.token[-4:]}>'

    def __json__(self) -> dict:
        return {
            "TYPE": "APIToken",
            "id": str(self.id),
            "user_id": str(self.user.id),
            "user_username": self.user.username,
            "token": self.token,
            "created": self.created.isoformat(),
            "expires": self.expires_as_iso8601,
        }

    @property
    def expires_as_iso8601(self):
        """Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
        expiry_date = self.expires or (timezone.now() + timedelta(days=365 * 100))
        return expiry_date.isoformat()

    def is_valid(self, for_date=None):
        for_date = for_date or timezone.now()

        if self.expires and self.expires < for_date:
            return False

        return True
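A quick usage sketch of the new model helpers (assumes a Django shell with the api app migrated, and that `user` is some existing User instance):

    from api.models import APIToken, generate_secret_token

    assert len(generate_secret_token()) == 32   # secrets.token_hex(16) -> 32 hex chars

    token = APIToken.objects.create(user=user)  # token string is auto-generated by the default
    print(repr(token))                          # '<APIToken user=... token=************abcd>'
    token.is_valid()                            # True until token.expires passes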

archivebox/api/tests.py

@@ -1,27 +1,30 @@
+__package__ = 'archivebox.api'
+
 from django.test import TestCase
 from ninja.testing import TestClient
-from archivebox.api.archive import router as archive_router

-class ArchiveBoxAPITestCase(TestCase):
+from .routes_cli import router
+
+class ArchiveBoxCLIAPITestCase(TestCase):
     def setUp(self):
-        self.client = TestClient(archive_router)
+        self.client = TestClient(router)

     def test_add_endpoint(self):
-        response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "test"})
+        response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"})
         self.assertEqual(response.status_code, 200)
-        self.assertEqual(response.json()["status"], "success")
+        self.assertTrue(response.json()["success"])

     def test_remove_endpoint(self):
         response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
         self.assertEqual(response.status_code, 200)
-        self.assertEqual(response.json()["status"], "success")
+        self.assertTrue(response.json()["success"])

     def test_update_endpoint(self):
         response = self.client.post("/update", json={})
         self.assertEqual(response.status_code, 200)
-        self.assertEqual(response.json()["status"], "success")
+        self.assertTrue(response.json()["success"])

     def test_list_all_endpoint(self):
         response = self.client.post("/list_all", json={})
         self.assertEqual(response.status_code, 200)
-        self.assertTrue("success" in response.json()["status"])
+        self.assertTrue(response.json()["success"])
archivebox/api/urls.py
Normal file
17
archivebox/api/urls.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
__package__ = 'archivebox.api'
|
||||||
|
|
||||||
|
from django.urls import path
|
||||||
|
from django.views.generic.base import RedirectView
|
||||||
|
|
||||||
|
from .v1_api import urls as v1_api_urls
|
||||||
|
|
||||||
|
urlpatterns = [
|
||||||
|
path("", RedirectView.as_view(url='/api/v1')),
|
||||||
|
|
||||||
|
path("v1/", v1_api_urls),
|
||||||
|
path("v1", RedirectView.as_view(url='/api/v1/docs')),
|
||||||
|
|
||||||
|
# ... v2 can be added here ...
|
||||||
|
# path("v2/", v2_api_urls),
|
||||||
|
# path("v2", RedirectView.as_view(url='/api/v2/docs')),
|
||||||
|
]
|

archivebox/api/v1_api.py (new file, 111 lines)

__package__ = 'archivebox.api'


from io import StringIO
from traceback import format_exception
from contextlib import redirect_stdout, redirect_stderr

from django.http import HttpRequest, HttpResponse
from django.core.exceptions import ObjectDoesNotExist, EmptyResultSet, PermissionDenied

from ninja import NinjaAPI, Swagger

# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/

from api.auth import API_AUTH_METHODS
from ..config import VERSION, COMMIT_HASH


COMMIT_HASH = COMMIT_HASH or 'unknown'

html_description=f'''
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
<br/>
<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
<br/>
<ul>
<li>⬅️ Manage your server: <a href="/admin/api/"><b>Setup API Keys</b></a>, <a href="/admin/">Go to your Server Admin UI</a>, <a href="/">Go to your Snapshots list</a>
<li>💬 Ask questions and get help here: <a href="https://zulip.archivebox.io">ArchiveBox Chat Forum</a></li>
<li>🐞 Report API bugs here: <a href="https://github.com/ArchiveBox/ArchiveBox/issues">Github Issues</a></li>
<li>📚 ArchiveBox Documentation: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Github Wiki</a></li>
<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
</ul>
<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
'''


def register_urls(api: NinjaAPI) -> NinjaAPI:
    api.add_router('/auth/', 'api.v1_auth.router')
    api.add_router('/core/', 'api.v1_core.router')
    api.add_router('/cli/', 'api.v1_cli.router')
    return api


class NinjaAPIWithIOCapture(NinjaAPI):
    def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
        stdout, stderr = StringIO(), StringIO()

        with redirect_stderr(stderr):
            with redirect_stdout(stdout):
                request.stdout = stdout
                request.stderr = stderr

                response = super().create_temporal_response(request)

        print('RESPONDING NOW', response)

        return response


api = NinjaAPIWithIOCapture(
    title='ArchiveBox API',
    description=html_description,
    version='1.0.0',
    csrf=False,
    auth=API_AUTH_METHODS,
    urls_namespace="api",
    docs=Swagger(settings={"persistAuthorization": True}),
    # docs_decorator=login_required,
    # renderer=ORJSONRenderer(),
)
api = register_urls(api)
urls = api.urls


@api.exception_handler(Exception)
def generic_exception_handler(request, err):
    status = 503
    if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
        status = 404

    print(''.join(format_exception(err)))

    return api.create_response(
        request,
        {
            "succeeded": False,
            "message": f'{err.__class__.__name__}: {err}',
            "errors": [
                ''.join(format_exception(err)),
                # or send simpler parent-only traceback:
                # *([str(err.__context__)] if getattr(err, '__context__', None) else []),
            ],
        },
        status=status,
    )



# import orjson
# from ninja.renderers import BaseRenderer
# class ORJSONRenderer(BaseRenderer):
#     media_type = "application/json"
#     def render(self, request, data, *, response_status):
#         return {
#             "success": True,
#             "errors": [],
#             "result": data,
#             "stdout": ansi_to_html(stdout.getvalue().strip()),
#             "stderr": ansi_to_html(stderr.getvalue().strip()),
#         }
#         return orjson.dumps(data)
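The NinjaAPIWithIOCapture subclass above is what makes request.stdout and request.stderr available to the CLI endpoints in v1_cli.py below: each CLI response echoes back whatever the underlying archivebox command printed, converted with ansi_to_html.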

archivebox/api/v1_auth.py (new file, 52 lines)

__package__ = 'archivebox.api'

from typing import Optional

from ninja import Router, Schema

from api.models import APIToken
from api.auth import auth_using_token, auth_using_password


router = Router(tags=['Authentication'])


class PasswordAuthSchema(Schema):
    """Schema for a /get_api_token request"""
    username: Optional[str] = None
    password: Optional[str] = None


@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)')  # auth=None because they are not authed yet
def get_api_token(request, auth_data: PasswordAuthSchema):
    user = auth_using_password(
        username=auth_data.username,
        password=auth_data.password,
        request=request,
    )

    if user:
        # TODO: support multiple tokens in the future, for now we just have one per user
        api_token, created = APIToken.objects.get_or_create(user=user)

        return api_token.__json__()

    return {"success": False, "errors": ["Invalid credentials"]}


class TokenAuthSchema(Schema):
    """Schema for a /check_api_token request"""
    token: str


@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired')  # auth=None because they are not authed yet
def check_api_token(request, token_data: TokenAuthSchema):
    user = auth_using_token(
        token=token_data.token,
        request=request,
    )
    if user:
        return {"success": True, "user_id": str(user.id)}

    return {"success": False, "user_id": None}
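A hedged end-to-end sketch of the token bootstrap flow these two endpoints enable (server URL and credentials are placeholders):

    import requests

    BASE = "http://127.0.0.1:8000/api/v1"  # assumed local dev server

    # 1. exchange a username/password for an API token (returns APIToken.__json__())
    resp = requests.post(f"{BASE}/auth/get_api_token",
                         json={"username": "admin", "password": "example-password"})
    token = resp.json()["token"]

    # 2. later, verify the token is still valid and non-expired
    check = requests.post(f"{BASE}/auth/check_api_token", json={"token": token})
    assert check.json()["success"] is True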
234
archivebox/api/v1_cli.py
Normal file
234
archivebox/api/v1_cli.py
Normal file
|
@ -0,0 +1,234 @@
|
||||||
|
__package__ = 'archivebox.api'
|
||||||
|
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from ninja import Router, Schema
|
||||||
|
|
||||||
|
from ..main import (
|
||||||
|
add,
|
||||||
|
remove,
|
||||||
|
update,
|
||||||
|
list_all,
|
||||||
|
schedule,
|
||||||
|
)
|
||||||
|
from ..util import ansi_to_html
|
||||||
|
from ..config import ONLY_NEW
|
||||||
|
|
||||||
|
|
||||||
|
# router for API that exposes archivebox cli subcommands as REST endpoints
|
||||||
|
router = Router(tags=['ArchiveBox CLI Sub-Commands'])
|
||||||
|
|
||||||
|
|
||||||
|
# Schemas
|
||||||
|
|
||||||
|
JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
|
||||||
|
|
||||||
|
class CLICommandResponseSchema(Schema):
|
||||||
|
success: bool
|
||||||
|
errors: List[str]
|
||||||
|
result: JSONType
|
||||||
|
stdout: str
|
||||||
|
stderr: str
|
||||||
|
|
||||||
|
class FilterTypeChoices(str, Enum):
|
||||||
|
exact = 'exact'
|
||||||
|
substring = 'substring'
|
||||||
|
regex = 'regex'
|
||||||
|
domain = 'domain'
|
||||||
|
tag = 'tag'
|
||||||
|
timestamp = 'timestamp'
|
||||||
|
|
||||||
|
class StatusChoices(str, Enum):
|
||||||
|
indexed = 'indexed'
|
||||||
|
archived = 'archived'
|
||||||
|
unarchived = 'unarchived'
|
||||||
|
present = 'present'
|
||||||
|
valid = 'valid'
|
||||||
|
invalid = 'invalid'
|
||||||
|
duplicate = 'duplicate'
|
||||||
|
orphaned = 'orphaned'
|
||||||
|
corrupted = 'corrupted'
|
||||||
|
unrecognized = 'unrecognized'
|
||||||
|
|
||||||
|
|
||||||
|
class AddCommandSchema(Schema):
|
||||||
|
urls: List[str]
|
||||||
|
tag: str = ""
|
||||||
|
depth: int = 0
|
||||||
|
update: bool = not ONLY_NEW # Default to the opposite of ONLY_NEW
|
||||||
|
update_all: bool = False
|
||||||
|
index_only: bool = False
|
||||||
|
overwrite: bool = False
|
||||||
|
init: bool = False
|
||||||
|
extractors: str = ""
|
||||||
|
parser: str = "auto"
|
||||||
|
|
||||||
|
class UpdateCommandSchema(Schema):
|
||||||
|
resume: Optional[float] = 0
|
||||||
|
only_new: bool = ONLY_NEW
|
||||||
|
index_only: bool = False
|
||||||
|
overwrite: bool = False
|
||||||
|
after: Optional[float] = 0
|
||||||
|
before: Optional[float] = 999999999999999
|
||||||
|
status: Optional[StatusChoices] = StatusChoices.unarchived
|
||||||
|
filter_type: Optional[str] = FilterTypeChoices.substring
|
||||||
|
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||||
|
extractors: Optional[str] = ""
|
||||||
|
|
||||||
|
class ScheduleCommandSchema(Schema):
|
||||||
|
import_path: Optional[str] = None
|
||||||
|
add: bool = False
|
||||||
|
every: Optional[str] = None
|
||||||
|
tag: str = ''
|
||||||
|
depth: int = 0
|
||||||
|
overwrite: bool = False
|
||||||
|
update: bool = not ONLY_NEW
|
||||||
|
clear: bool = False
|
||||||
|
|
||||||
|
class ListCommandSchema(Schema):
|
||||||
|
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||||
|
filter_type: str = FilterTypeChoices.substring
|
||||||
|
status: Optional[StatusChoices] = StatusChoices.indexed
|
||||||
|
after: Optional[float] = 0
|
||||||
|
before: Optional[float] = 999999999999999
|
||||||
|
sort: str = 'added'
|
||||||
|
as_json: bool = True
|
||||||
|
as_html: bool = False
|
||||||
|
as_csv: str | bool = 'timestamp,url'
|
||||||
|
with_headers: bool = False
|
||||||
|
|
||||||
|
class RemoveCommandSchema(Schema):
|
||||||
|
delete: bool = True
|
||||||
|
after: Optional[float] = 0
|
||||||
|
before: Optional[float] = 999999999999999
|
||||||
|
filter_type: str = FilterTypeChoices.exact
|
||||||
|
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
|
||||||
|
def cli_add(request, args: AddCommandSchema):
|
||||||
|
result = add(
|
||||||
|
urls=args.urls,
|
||||||
|
tag=args.tag,
|
||||||
|
depth=args.depth,
|
||||||
|
update=args.update,
|
||||||
|
update_all=args.update_all,
|
||||||
|
index_only=args.index_only,
|
||||||
|
overwrite=args.overwrite,
|
||||||
|
init=args.init,
|
||||||
|
extractors=args.extractors,
|
||||||
|
parser=args.parser,
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
|
||||||
|
def cli_update(request, args: UpdateCommandSchema):
|
||||||
|
result = update(
|
||||||
|
resume=args.resume,
|
||||||
|
only_new=args.only_new,
|
||||||
|
index_only=args.index_only,
|
||||||
|
overwrite=args.overwrite,
|
||||||
|
before=args.before,
|
||||||
|
after=args.after,
|
||||||
|
status=args.status,
|
||||||
|
filter_type=args.filter_type,
|
||||||
|
filter_patterns=args.filter_patterns,
|
||||||
|
extractors=args.extractors,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
|
||||||
|
def cli_schedule(request, args: ScheduleCommandSchema):
|
||||||
|
result = schedule(
|
||||||
|
import_path=args.import_path,
|
||||||
|
add=args.add,
|
||||||
|
show=args.show,
|
||||||
|
clear=args.clear,
|
||||||
|
every=args.every,
|
||||||
|
tag=args.tag,
|
||||||
|
depth=args.depth,
|
||||||
|
overwrite=args.overwrite,
|
||||||
|
update=args.update,
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns]')
|
||||||
|
def cli_list(request, args: ListCommandSchema):
|
||||||
|
result = list_all(
|
||||||
|
filter_patterns=args.filter_patterns,
|
||||||
|
filter_type=args.filter_type,
|
||||||
|
status=args.status,
|
||||||
|
after=args.after,
|
||||||
|
before=args.before,
|
||||||
|
sort=args.sort,
|
||||||
|
csv=args.as_csv,
|
||||||
|
json=args.as_json,
|
||||||
|
html=args.as_html,
|
||||||
|
with_headers=args.with_headers,
|
||||||
|
)
|
||||||
|
|
||||||
|
result_format = 'txt'
|
||||||
|
if args.as_json:
|
||||||
|
result_format = "json"
|
||||||
|
elif args.as_html:
|
||||||
|
result_format = "html"
|
||||||
|
elif args.as_csv:
|
||||||
|
result_format = "csv"
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"result_format": result_format,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
|
||||||
|
def cli_remove(request, args: RemoveCommandSchema):
|
||||||
|
result = remove(
|
||||||
|
yes=True, # no way to interactively ask for confirmation via API, so we force yes
|
||||||
|
delete=args.delete,
|
||||||
|
before=args.before,
|
||||||
|
after=args.after,
|
||||||
|
filter_type=args.filter_type,
|
||||||
|
filter_patterns=args.filter_patterns,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"errors": [],
|
||||||
|
"result": result,
|
||||||
|
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||||
|
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||||
|
}
|
||||||
|
|
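(Aside: a minimal sketch of driving one of the CLI endpoints above over HTTP; illustrative only, not part of this commit. The host, mount path, and auth header are assumptions: this branch moves the API mount from a hardcoded NinjaAPI in core/urls.py to include('api.urls'), so check your instance's /api routes and api/auth.py for the expected credentials before relying on any of these names.)

    import requests

    # hypothetical local instance; the '/api/archive/add' prefix mirrors the old
    # api.add_router("/archive/", ...) mount and may differ in your deployment
    resp = requests.post(
        'http://127.0.0.1:8000/api/archive/add',
        json={'urls': ['https://example.com'], 'tag': 'demo', 'depth': 0},
        headers={'Authorization': 'Bearer <api-token>'},  # placeholder; see api.auth.GlobalAuth
    )
    body = resp.json()
    print(body['success'], body['stdout'])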
210  archivebox/api/v1_core.py  (new file)

@@ -0,0 +1,210 @@
__package__ = 'archivebox.api'

from uuid import UUID
from typing import List, Optional
from datetime import datetime

from django.shortcuts import get_object_or_404

from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate

from core.models import Snapshot, ArchiveResult, Tag


router = Router(tags=['Core Models'])


### ArchiveResult #########################################################################

class ArchiveResultSchema(Schema):
    id: UUID

    snapshot_id: UUID
    snapshot_url: str
    snapshot_tags: str

    extractor: str
    cmd: List[str]
    pwd: str
    cmd_version: str
    output: str
    status: str

    created: datetime

    @staticmethod
    def resolve_id(obj):
        return obj.uuid

    @staticmethod
    def resolve_created(obj):
        return obj.start_ts

    @staticmethod
    def resolve_snapshot_url(obj):
        return obj.snapshot.url

    @staticmethod
    def resolve_snapshot_tags(obj):
        return obj.snapshot.tags_str()


class ArchiveResultFilterSchema(FilterSchema):
    id: Optional[UUID] = Field(None, q='uuid')

    search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains'])
    snapshot_id: Optional[UUID] = Field(None, q='snapshot_id')
    snapshot_url: Optional[str] = Field(None, q='snapshot__url')
    snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name')

    status: Optional[str] = Field(None, q='status')
    output: Optional[str] = Field(None, q='output__icontains')
    extractor: Optional[str] = Field(None, q='extractor__icontains')
    cmd: Optional[str] = Field(None, q='cmd__0__icontains')
    pwd: Optional[str] = Field(None, q='pwd__icontains')
    cmd_version: Optional[str] = Field(None, q='cmd_version')

    created: Optional[datetime] = Field(None, q='updated')
    created__gte: Optional[datetime] = Field(None, q='updated__gte')
    created__lt: Optional[datetime] = Field(None, q='updated__lt')


@router.get("/archiveresults", response=List[ArchiveResultSchema])
@paginate
def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
    qs = ArchiveResult.objects.all()
    results = filters.filter(qs)
    return results


@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
def get_archiveresult(request, archiveresult_id: str):
    archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
    return archiveresult


# @router.post("/archiveresult", response=ArchiveResultSchema)
# def create_archiveresult(request, payload: ArchiveResultSchema):
#     archiveresult = ArchiveResult.objects.create(**payload.dict())
#     return archiveresult
#
# @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
# def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
#     archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
#
#     for attr, value in payload.dict().items():
#         setattr(archiveresult, attr, value)
#     archiveresult.save()
#
#     return archiveresult
#
# @router.delete("/archiveresult/{archiveresult_id}")
# def delete_archiveresult(request, archiveresult_id: str):
#     archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
#     archiveresult.delete()
#     return {"success": True}


### Snapshot #########################################################################

class SnapshotSchema(Schema):
    id: UUID

    url: str
    tags: str
    title: Optional[str]
    timestamp: str
    bookmarked: datetime
    added: datetime
    updated: datetime
    archive_path: str

    archiveresults: List[ArchiveResultSchema]

    # @staticmethod
    # def resolve_id(obj):
    #     return str(obj.id)

    @staticmethod
    def resolve_tags(obj):
        return obj.tags_str()

    @staticmethod
    def resolve_archiveresults(obj, context):
        if context['request'].with_archiveresults:
            return obj.archiveresult_set.all().distinct()
        return ArchiveResult.objects.none()


class SnapshotFilterSchema(FilterSchema):
    id: Optional[UUID] = Field(None, q='id')

    search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains'])
    url: Optional[str] = Field(None, q='url')
    tag: Optional[str] = Field(None, q='tags__name')
    title: Optional[str] = Field(None, q='title__icontains')

    timestamp: Optional[str] = Field(None, q='timestamp__startswith')

    added: Optional[datetime] = Field(None, q='added')
    added__gte: Optional[datetime] = Field(None, q='added__gte')
    added__lt: Optional[datetime] = Field(None, q='added__lt')


@router.get("/snapshots", response=List[SnapshotSchema])
@paginate
def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True):
    request.with_archiveresults = with_archiveresults

    qs = Snapshot.objects.all()
    results = filters.filter(qs)
    return results


@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema)
def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
    request.with_archiveresults = with_archiveresults
    snapshot = get_object_or_404(Snapshot, id=snapshot_id)
    return snapshot


# @router.post("/snapshot", response=SnapshotSchema)
# def create_snapshot(request, payload: SnapshotSchema):
#     snapshot = Snapshot.objects.create(**payload.dict())
#     return snapshot
#
# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema)
# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema):
#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
#
#     for attr, value in payload.dict().items():
#         setattr(snapshot, attr, value)
#     snapshot.save()
#
#     return snapshot
#
# @router.delete("/snapshot/{snapshot_id}")
# def delete_snapshot(request, snapshot_id: str):
#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
#     snapshot.delete()
#     return {"success": True}


### Tag #########################################################################

class TagSchema(Schema):
    name: str
    slug: str


@router.get("/tags", response=List[TagSchema])
def list_tags(request):
    return Tag.objects.all()
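(Aside: a brief usage sketch for the read-only endpoints above; illustrative only, not part of this commit. The mount prefix and any required auth are assumptions that depend on how api.urls wires in this router.)

    import requests

    # query params map onto SnapshotFilterSchema fields (search, url, tag, title, ...)
    resp = requests.get(
        'http://127.0.0.1:8000/api/snapshots',   # hypothetical mount point
        params={'search': 'example.com', 'with_archiveresults': 'false'},
    )
    page = resp.json()
    for snapshot in page['items']:   # django-ninja's default paginator wraps results as {items, count}
        print(snapshot['timestamp'], snapshot['url'])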
archivebox/config.py

@@ -112,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
         'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
         'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
         'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
     },

     'ARCHIVE_METHOD_TOGGLES': {

@@ -265,7 +265,7 @@ CONFIG_ALIASES = {
     for key, default in section.items()
     for alias in default.get('aliases', ())
 }
-USER_CONFIG = {key for section in CONFIG_SCHEMA.values() for key in section.keys()}
+USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()}

 def get_real_name(key: str) -> str:
     """get the current canonical name for a given deprecated config key"""
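(Aside: the USER_CONFIG change above swaps a set comprehension for a dict comprehension. The old form, {key for ...}, builds a set of key names only, while the new form, {key: section[key] for ...}, keeps each key's schema entry, which is what callers like find_config_default's USER_CONFIG.get(key, {}).get('default') need. A standalone illustration, independent of ArchiveBox:)

    schema = {'SERVER_CONFIG': {'TIMEOUT': {'type': int, 'default': 60}}}

    as_set = {key for section in schema.values() for key in section}
    as_dict = {key: section[key] for section in schema.values() for key in section}

    assert as_set == {'TIMEOUT'}                   # only the names survive
    assert as_dict['TIMEOUT']['default'] == 60     # the full schema entries survive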
@@ -282,6 +282,7 @@ ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
 LOGS_DIR_NAME = 'logs'
 PERSONAS_DIR_NAME = 'personas'
+CRONTABS_DIR_NAME = 'crontabs'
 SQL_INDEX_FILENAME = 'index.sqlite3'
 JSON_INDEX_FILENAME = 'index.json'
 HTML_INDEX_FILENAME = 'index.html'

@@ -355,7 +356,7 @@ ALLOWED_IN_OUTPUT_DIR = {
     'static',
     'sonic',
     'search.sqlite3',
-    'crontabs',
+    CRONTABS_DIR_NAME,
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
@@ -598,7 +599,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {

     'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
     'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
-    'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
     'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
     'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
     'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
@@ -985,11 +985,6 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
             'enabled': True,
             'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
         },
-        'CUSTOM_TEMPLATES_DIR': {
-            'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
-            'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
-            'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
-        },
         # 'NODE_MODULES_DIR': {
         #     'path': ,
         #     'enabled': ,
@@ -997,50 +992,25 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
         # },
     }

-def get_external_locations(config: ConfigDict) -> ConfigValue:
-    abspath = lambda path: None if path is None else Path(path).resolve()
-    return {
-        'CHROME_USER_DATA_DIR': {
-            'path': abspath(config['CHROME_USER_DATA_DIR']),
-            'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
-            'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
-        },
-        'COOKIES_FILE': {
-            'path': abspath(config['COOKIES_FILE']),
-            'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
-            'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
-        },
-    }

 def get_data_locations(config: ConfigDict) -> ConfigValue:
     return {
+        # OLD: migrating to personas
+        # 'CHROME_USER_DATA_DIR': {
+        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
+        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
+        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
+        # },
+        # 'COOKIES_FILE': {
+        #     'path': os.path.abspath(config['COOKIES_FILE']),
+        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
+        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
+        # },
         'OUTPUT_DIR': {
             'path': config['OUTPUT_DIR'].resolve(),
             'enabled': True,
             'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
             'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
         },
-        'SOURCES_DIR': {
-            'path': config['SOURCES_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['SOURCES_DIR'].exists(),
-        },
-        'LOGS_DIR': {
-            'path': config['LOGS_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['LOGS_DIR'].exists(),
-        },
-        'PERSONAS_DIR': {
-            'path': config['PERSONAS_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['PERSONAS_DIR'].exists(),
-        },
-        'ARCHIVE_DIR': {
-            'path': config['ARCHIVE_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['ARCHIVE_DIR'].exists(),
-            'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
-        },
         'CONFIG_FILE': {
             'path': config['CONFIG_FILE'].resolve(),
             'enabled': True,
@@ -1052,6 +1022,38 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
             'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
             'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
         },
+        'ARCHIVE_DIR': {
+            'path': config['ARCHIVE_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['ARCHIVE_DIR'].exists(),
+            'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
+        },
+        'SOURCES_DIR': {
+            'path': config['SOURCES_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['SOURCES_DIR'].exists(),
+        },
+        'LOGS_DIR': {
+            'path': config['LOGS_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['LOGS_DIR'].exists(),
+        },
+        'CUSTOM_TEMPLATES_DIR': {
+            'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
+            'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
+            'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
+        },
+        'PERSONAS_DIR': {
+            'path': config['PERSONAS_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['PERSONAS_DIR'].exists(),
+        },
+        # managed by bin/docker_entrypoint.sh and python-crontab:
+        # 'CRONTABS_DIR': {
+        #     'path': config['CRONTABS_DIR'].resolve(),
+        #     'enabled': True,
+        #     'is_valid': config['CRONTABS_DIR'].exists(),
+        # },
     }

 def get_dependency_info(config: ConfigDict) -> ConfigValue:
@@ -1366,6 +1368,7 @@ def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=C
         stderr('        archivebox init')
         raise SystemExit(2)


 def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
     output_dir = out_dir or config['OUTPUT_DIR']
     from .index.sql import list_migrations
archivebox/core/admin.py

@@ -14,12 +14,17 @@ from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
 from django import forms

+from signal_webhooks.apps import DjangoSignalWebhooksConfig
+from signal_webhooks.admin import WebhookAdmin, WebhookModel
+
 from ..util import htmldecode, urldecode, ansi_to_html

 from core.models import Snapshot, ArchiveResult, Tag
 from core.forms import AddLinkForm

 from core.mixins import SearchResultsAdminMixin
+from api.models import APIToken

 from index.html import snapshot_icons
 from logging_util import printable_filesize
@@ -98,10 +103,32 @@ class ArchiveBoxAdmin(admin.AdminSite):

         return render(template_name='add.html', request=request, context=context)


+# monkey patch django-signals-webhooks to change how it shows up in Admin UI
+DjangoSignalWebhooksConfig.verbose_name = 'API'
+WebhookModel._meta.get_field('name').help_text = 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).'
+WebhookModel._meta.get_field('signal').help_text = 'The type of event the webhook should fire for (e.g. Create, Update, Delete).'
+WebhookModel._meta.get_field('ref').help_text = 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).'
+WebhookModel._meta.get_field('endpoint').help_text = 'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).'
+WebhookModel._meta.app_label = 'api'
+
+
 archivebox_admin = ArchiveBoxAdmin()
 archivebox_admin.register(get_user_model())
+archivebox_admin.register(APIToken)
+archivebox_admin.register(WebhookModel, WebhookAdmin)
 archivebox_admin.disable_action('delete_selected')
+
+
+# patch admin with methods to add data views
+from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
+
+archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
+
+
 class ArchiveResultInline(admin.TabularInline):
     model = ArchiveResult
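(Aside: the .__get__(archivebox_admin, ArchiveBoxAdmin) calls above use Python's descriptor protocol to bind plain functions as methods on the already-created admin instance. A standalone illustration of the same trick, independent of this commit:)

    class Greeter:
        pass

    def hello(self):
        return f'hello from {type(self).__name__}'

    g = Greeter()
    g.hello = hello.__get__(g, Greeter)   # bind the free function to this instance
    assert g.hello() == 'hello from Greeter'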
archivebox/core/apps.py

@@ -1,3 +1,5 @@
+__package__ = 'archivebox.core'
+
 from django.apps import AppConfig


@@ -5,6 +7,22 @@ class CoreConfig(AppConfig):
     name = 'core'

     def ready(self):
+        # register our custom admin as the primary django admin
+        from django.contrib import admin
+        from django.contrib.admin import sites
+        from core.admin import archivebox_admin
+
+        admin.site = archivebox_admin
+        sites.site = archivebox_admin
+
+
+        # register signal handlers
         from .auth import register_signals

         register_signals()
+
+
+
+# from django.contrib.admin.apps import AdminConfig
+# class CoreAdminConfig(AdminConfig):
+#     default_site = "core.admin.get_admin_site"
archivebox/core/auth.py

@@ -1,5 +1,6 @@
-import os
-from django.conf import settings
+__package__ = 'archivebox.core'

 from ..config import (
     LDAP
 )
archivebox/core/auth_ldap.py

@@ -1,10 +1,8 @@
-from django.conf import settings
 from ..config import (
     LDAP_CREATE_SUPERUSER
 )

 def create_user(sender, user=None, ldap_user=None, **kwargs):

     if not user.id and LDAP_CREATE_SUPERUSER:
         user.is_superuser = True
archivebox/core/settings.py

@@ -18,6 +18,7 @@ from ..config import (
     CUSTOM_TEMPLATES_DIR,
     SQL_INDEX_FILENAME,
     OUTPUT_DIR,
+    ARCHIVE_DIR,
     LOGS_DIR,
     TIMEZONE,

@@ -63,6 +64,9 @@ INSTALLED_APPS = [
     'core',
     'api',

+    'admin_data_views',
+    'signal_webhooks',
     'django_extensions',
 ]

@@ -173,6 +177,17 @@ if DEBUG_TOOLBAR:
     ]
     MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']

+# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
+# Must delete archivebox/templates/admin to use because it relies on some things we override
+# visit /__requests_tracker__/ to access
+DEBUG_REQUESTS_TRACKER = False
+if DEBUG_REQUESTS_TRACKER:
+    INSTALLED_APPS += ["requests_tracker"]
+    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
+    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
+
+
 ################################################################################
 ### Staticfile and Template Settings
 ################################################################################

@@ -242,6 +257,29 @@ CACHES = {
 EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'


+STORAGES = {
+    "default": {
+        "BACKEND": "django.core.files.storage.FileSystemStorage",
+    },
+    "staticfiles": {
+        "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
+    },
+    "archive": {
+        "BACKEND": "django.core.files.storage.FileSystemStorage",
+        "OPTIONS": {
+            "base_url": "/archive/",
+            "location": ARCHIVE_DIR,
+        },
+    },
+    # "personas": {
+    #     "BACKEND": "django.core.files.storage.FileSystemStorage",
+    #     "OPTIONS": {
+    #         "base_url": "/personas/",
+    #         "location": PERSONAS_DIR,
+    #     },
+    # },
+}

 ################################################################################
 ### Security Settings
 ################################################################################
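(Aside: a short sketch of how code elsewhere could resolve the new "archive" storage backend; illustrative only, not part of this commit. Django 4.2+ exposes the configured backends via django.core.files.storage.storages; the file path used here is hypothetical:)

    from django.core.files.storage import storages

    archive_storage = storages["archive"]   # FileSystemStorage rooted at ARCHIVE_DIR
    if archive_storage.exists('some/snapshot/index.html'):       # hypothetical path
        print(archive_storage.url('some/snapshot/index.html'))   # -> /archive/some/snapshot/index.html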
@@ -368,3 +406,32 @@ LOGGING = {
         }
     },
 }


+# Add default webhook configuration to the User model
+SIGNAL_WEBHOOKS = {
+    "HOOKS": {
+        "django.contrib.auth.models.User": ...,
+        "core.models.Snapshot": ...,
+        "core.models.ArchiveResult": ...,
+        "core.models.Tag": ...,
+        "api.models.APIToken": ...,
+    },
+}
+
+
+ADMIN_DATA_VIEWS = {
+    "NAME": "configuration",
+    "URLS": [
+        {
+            "route": "live/",
+            "view": "core.views.live_config_list_view",
+            "name": "live",
+            "items": {
+                "route": "<str:key>/",
+                "view": "core.views.live_config_value_view",
+                "name": "live_config_value",
+            },
+        },
+    ],
+}
archivebox/core/urls.py

@@ -1,4 +1,4 @@
-from .admin import archivebox_admin
+__package__ = 'archivebox.core'

 from django.urls import path, include
 from django.views import static

@@ -6,14 +6,9 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
 from django.conf import settings
 from django.views.generic.base import RedirectView

-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
-
-from ninja import NinjaAPI
-from api.auth import GlobalAuth
-
-api = NinjaAPI(auth=GlobalAuth())
-api.add_router("/auth/", "api.auth.router")
-api.add_router("/archive/", "api.archive.router")
+from .admin import archivebox_admin
+from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView

 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
 # from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE

@@ -43,10 +38,10 @@ urlpatterns = [
     path('accounts/', include('django.contrib.auth.urls')),
     path('admin/', archivebox_admin.urls),

-    path("api/", api.urls),
+    path("api/", include('api.urls')),

     path('health/', HealthCheckView.as_view(), name='healthcheck'),
-    path('error/', lambda _: 1/0),
+    path('error/', lambda *_: 1/0),

     # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django

@@ -57,10 +52,10 @@ urlpatterns = [
 urlpatterns += staticfiles_urlpatterns()

 if settings.DEBUG_TOOLBAR:
-    import debug_toolbar
-    urlpatterns += [
-        path('__debug__/', include(debug_toolbar.urls)),
-    ]
+    urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]
+
+if settings.DEBUG_REQUESTS_TRACKER:
+    urlpatterns += [path("__requests_tracker__/", include("requests_tracker.urls"))]


 # # Proposed FUTURE URLs spec
archivebox/core/views.py

@@ -1,10 +1,12 @@
 __package__ = 'archivebox.core'

+from typing import Callable
+
 from io import StringIO
 from contextlib import redirect_stdout

 from django.shortcuts import render, redirect
-from django.http import HttpResponse, Http404
+from django.http import HttpRequest, HttpResponse, Http404
 from django.utils.html import format_html, mark_safe
 from django.views import View, static
 from django.views.generic.list import ListView

@@ -14,6 +16,10 @@ from django.contrib.auth.mixins import UserPassesTestMixin
 from django.views.decorators.csrf import csrf_exempt
 from django.utils.decorators import method_decorator

+from admin_data_views.typing import TableContext, ItemContext
+from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
+
+
 from core.models import Snapshot
 from core.forms import AddLinkForm

@@ -26,6 +32,10 @@ from ..config import (
     COMMIT_HASH,
     FOOTER_INFO,
     SNAPSHOTS_PER_PAGE,
+    CONFIG,
+    CONFIG_SCHEMA,
+    DYNAMIC_CONFIG_SCHEMA,
+    USER_CONFIG,
 )
 from ..main import add
 from ..util import base_url, ansi_to_html

@@ -124,9 +134,9 @@ class SnapshotView(View):
                 '<center><br/><br/><br/>'
                 f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
                 '{}'
-                f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
-                'Maybe this resource type is not availabe for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
-                f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
+                f'</code></b> does not exist in the <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
+                'It\'s possible that this resource type is not available for the Snapshot,<br/>or that the archiving process has not completed yet.<br/>'
+                f'<pre><code># if interrupted, run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
                 '<div class="text-align: left; width: 100%; max-width: 400px">'
                 '<i><b>Next steps:</i></b><br/>'
                 f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
@@ -312,3 +322,124 @@ class HealthCheckView(View):
             content_type='text/plain',
             status=200
         )


+def find_config_section(key: str) -> str:
+    matching_sections = [
+        name for name, opts in CONFIG_SCHEMA.items() if key in opts
+    ]
+    section = matching_sections[0] if matching_sections else 'DYNAMIC'
+    return section
+
+def find_config_default(key: str) -> str:
+    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
+    if isinstance(default_val, Callable):
+        return None
+    else:
+        default_val = repr(default_val)
+    return default_val
+
+def find_config_type(key: str) -> str:
+    if key in USER_CONFIG:
+        return USER_CONFIG[key]['type'].__name__
+    elif key in DYNAMIC_CONFIG_SCHEMA:
+        return type(CONFIG[key]).__name__
+    return 'str'
+
+def key_is_safe(key: str) -> bool:
+    for term in ('key', 'password', 'secret', 'token'):
+        if term in key.lower():
+            return False
+    return True
+
+@render_with_table_view
+def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    rows = {
+        "Section": [],
+        "Key": [],
+        "Type": [],
+        "Value": [],
+        "Default": [],
+        # "Documentation": [],
+        "Aliases": [],
+    }
+
+    for section in CONFIG_SCHEMA.keys():
+        for key in CONFIG_SCHEMA[section].keys():
+            rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
+            rows['Key'].append(ItemLink(key, key=key))
+            rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
+            rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
+            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or 'See here...'}</code></a>'))
+            # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
+            rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
+
+    section = 'DYNAMIC'
+    for key in DYNAMIC_CONFIG_SCHEMA.keys():
+        rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
+        rows['Key'].append(ItemLink(key, key=key))
+        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
+        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
+        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or 'See here...'}</code></a>'))
+        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
+        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+
+    return TableContext(
+        title="Computed Configuration Values",
+        table=rows,
+    )
+
+@render_with_item_view
+def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+
+    return ItemContext(
+        slug=key,
+        title=key,
+        data=[
+            {
+                "name": mark_safe(f'data / ArchiveBox.conf [{find_config_section(key)}] <b><code style="color: lightgray">{key}</code></b>' if key in USER_CONFIG else f'[DYNAMIC CONFIG] <b><code style="color: lightgray">{key}</code></b> <small>(calculated at runtime)</small>'),
+                "description": None,
+                "fields": {
+                    'Key': key,
+                    'Type': find_config_type(key),
+                    'Value': CONFIG[key] if key_is_safe(key) else '********',
+                },
+                "help_texts": {
+                    'Key': mark_safe(f'''
+                        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a>
+                        <span style="display: {"inline" if aliases else "none"}">
+                            Aliases: {", ".join(aliases)}
+                        </span>
+                    '''),
+                    'Type': mark_safe(f'''
+                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
+                            See full definition in <code>archivebox/config.py</code>...
+                        </a>
+                    '''),
+                    'Value': mark_safe(f'''
+                        {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
+                        Default: <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
+                            <code>{find_config_default(key) or 'See here...'}</code>
+                        </a>
+                        <br/><br/>
+                        <p style="display: {"block" if key in USER_CONFIG else "none"}">
+                            <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
+                            <br/><br/>
+                            <code>archivebox config --set {key}="{
+                                val.strip("'")
+                                if (val := find_config_default(key)) else
+                                (repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
+                            }"</code>
+                        </p>
+                    '''),
+                },
+            },
+        ],
+    )
archivebox/index/schema.py

@@ -4,6 +4,7 @@ WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED.

 DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py

+These are the old types we used to use before ArchiveBox v0.4 (before we switched to Django).
 """

 __package__ = 'archivebox.index'
archivebox/logging_util.py

@@ -494,12 +494,12 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
     if delete:
         file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
         print(
-            f'    {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' +
+            f'    {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
             f'    ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
         )
     else:
         print(
-            '    Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' +
+            '    Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
             '    (Pass --delete if you also want to permanently delete the data folders)'
         )
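(Aside: the two changes above drop a trailing "+" without changing behavior, because adjacent string literals inside parentheses are concatenated implicitly by Python. A standalone illustration:)

    msg = (
        'first line\n'    # no "+" needed between adjacent literals
        'second line'
    )
    assert msg == 'first line\nsecond line'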
archivebox/main.py

@@ -104,7 +104,6 @@ from .config import (
     COMMIT_HASH,
     BUILD_TIME,
     CODE_LOCATIONS,
-    EXTERNAL_LOCATIONS,
     DATA_LOCATIONS,
     DEPENDENCIES,
     CHROME_BINARY,

@@ -231,7 +230,7 @@ def version(quiet: bool=False,
     p = platform.uname()
     print(
         'ArchiveBox v{}'.format(get_version(CONFIG)),
-        *((f'COMMIT_HASH={COMMIT_HASH[:7]}',) if COMMIT_HASH else ()),
+        f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
         f'BUILD_TIME={BUILD_TIME}',
     )
     print(

@@ -272,11 +271,6 @@ def version(quiet: bool=False,
     for name, path in CODE_LOCATIONS.items():
         print(printable_folder_status(name, path))

-    print()
-    print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
-    for name, path in EXTERNAL_LOCATIONS.items():
-        print(printable_folder_status(name, path))
-
     print()
     if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
         print('{white}[i] Data locations:{reset}'.format(**ANSI))

@@ -695,7 +689,7 @@ def add(urls: Union[str, List[str]],
     if CAN_UPGRADE:
         hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

-    return all_links
+    return new_links

 @enforce_types
 def remove(filter_str: Optional[str]=None,

@@ -1362,7 +1356,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
     if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
         stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
         stderr('    docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
-        stderr()
+        stderr('')

     execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
archivebox/manage.py

@@ -7,7 +7,7 @@ if __name__ == '__main__':
     # versions of ./manage.py commands whenever possible. When that's not possible
     # (e.g. makemigrations), you can comment out this check temporarily

-    if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
+    if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'startapp' in sys.argv):
         print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
         print()
         print('    Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')
archivebox/parsers/__init__.py

@@ -7,7 +7,6 @@ For examples of supported import formats see tests/.

 __package__ = 'archivebox.parsers'

-import re
 from io import StringIO

 from typing import IO, Tuple, List, Optional

@@ -28,7 +27,6 @@ from ..util import (
     htmldecode,
     download_url,
     enforce_types,
-    URL_REGEX,
 )
 from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved

@@ -202,54 +200,3 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
     log_source_saved(source_file=source_path)

     return source_path
-
-
-# Check that plain text regex URL parsing works as expected
-# this is last-line-of-defense to make sure the URL_REGEX isn't
-# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
-# the consequences of bad URL parsing could be disastrous and lead to many
-# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
-_test_url_strs = {
-    'example.com': 0,
-    '/example.com': 0,
-    '//example.com': 0,
-    ':/example.com': 0,
-    '://example.com': 0,
-    'htt://example8.com': 0,
-    '/htt://example.com': 0,
-    'https://example': 1,
-    'https://localhost/2345': 1,
-    'https://localhost:1234/123': 1,
-    '://': 0,
-    'https://': 0,
-    'http://': 0,
-    'ftp://': 0,
-    'ftp://example.com': 0,
-    'https://example.com': 1,
-    'https://example.com/': 1,
-    'https://a.example.com': 1,
-    'https://a.example.com/': 1,
-    'https://a.example.com/what/is/happening.html': 1,
-    'https://a.example.com/what/ís/happening.html': 1,
-    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
-    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
-    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
-    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
-    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
-    '<test>http://example7.com</test>': 1,
-    'https://<test>': 0,
-    'https://[test]': 0,
-    'http://"test"': 0,
-    'http://\'test\'': 0,
-    '[https://example8.com/what/is/this.php?what=1]': 1,
-    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
-    '<what>https://example10.com#and-thing=2 "</about>': 1,
-    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
-    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
-    '<or>http://examplehttp://15.badc</that>': 2,
-    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
-    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
-}
-for url_str, num_urls in _test_url_strs.items():
-    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
-        f'{url_str} does not contain {num_urls} urls')
archivebox/parsers/generic_html.py

@@ -10,7 +10,7 @@ from ..index.schema import Link
 from ..util import (
     htmldecode,
     enforce_types,
-    URL_REGEX,
+    find_all_urls,
 )
 from html.parser import HTMLParser
 from urllib.parse import urljoin

@@ -40,10 +40,22 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
         parser.feed(line)
     for url in parser.urls:
         if root_url:
-            # resolve relative urls /home.html -> https://example.com/home.html
-            url = urljoin(root_url, url)
+            url_is_absolute = (url.lower().startswith('http://') or url.lower().startswith('https://'))
+            # url = https://abc.com                     => True
+            # url = /page.php?next=https://example.com  => False
+
+            if not url_is_absolute:  # resolve it by joining it with root_url
+                relative_path = url
+
+                url = urljoin(root_url, relative_path)  # https://example.com/somepage.html + /home.html
+                                                        # => https://example.com/home.html
+
+                # special case to handle bug around // handling, crucial for urls that contain sub-urls
+                # e.g. https://web.archive.org/web/https://example.com
+                if did_urljoin_misbehave(root_url, relative_path, url):
+                    url = fix_urljoin_bug(url)

-        for archivable_url in re.findall(URL_REGEX, url):
+        for archivable_url in find_all_urls(url):
             yield Link(
                 url=htmldecode(archivable_url),
                 timestamp=str(datetime.now(timezone.utc).timestamp()),
@ -56,3 +68,74 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
|
||||||
KEY = 'html'
|
KEY = 'html'
|
||||||
NAME = 'Generic HTML'
|
NAME = 'Generic HTML'
|
||||||
PARSER = parse_generic_html_export
|
PARSER = parse_generic_html_export
|
||||||
|
|
||||||
|
|
||||||
|
#### WORKAROUND CODE FOR https://github.com/python/cpython/issues/96015 ####
|
||||||
|
|
||||||
|
def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
|
||||||
|
"""
|
||||||
|
Handle urljoin edge case bug where multiple slashes get turned into a single slash:
|
||||||
|
- https://github.com/python/cpython/issues/96015
|
||||||
|
- https://github.com/ArchiveBox/ArchiveBox/issues/1411
|
||||||
|
|
||||||
|
This workaround only fixes the most common case of a sub-URL inside an outer URL, e.g.:
|
||||||
|
https://web.archive.org/web/https://example.com/some/inner/url
|
||||||
|
|
||||||
|
But there are other valid URLs containing // that are not fixed by this workaround, e.g.:
|
||||||
|
https://example.com/drives/C//some/file
|
||||||
|
"""
|
||||||
|
|
||||||
|
# if relative path is actually an absolute url, cut off its own scheme so we check the path component only
|
||||||
|
relative_path = relative_path.lower()
|
||||||
|
if relative_path.startswith('http://') or relative_path.startswith('https://'):
|
||||||
|
relative_path = relative_path.split('://', 1)[-1]
|
||||||
|
|
||||||
|
# TODO: properly fix all double // getting stripped by urljoin, not just ://
|
||||||
|
original_path_had_suburl = '://' in relative_path
|
||||||
|
original_root_had_suburl = '://' in root_url[8:] # ignore first 8 chars because root always starts with https://
|
||||||
|
final_joined_has_suburl = '://' in final_url[8:] # ignore first 8 chars because final always starts with https://
|
||||||
|
|
||||||
|
urljoin_broke_suburls = (
|
||||||
|
(original_root_had_suburl or original_path_had_suburl)
|
||||||
|
and not final_joined_has_suburl
|
||||||
|
)
|
||||||
|
return urljoin_broke_suburls
|
||||||
|
|
||||||
|
|
||||||
|
def fix_urljoin_bug(url: str, nesting_limit=5):
|
||||||
|
"""
|
||||||
|
recursively replace broken suburls .../http:/... with .../http://...
|
||||||
|
|
||||||
|
basically equivalent to this for 99.9% of cases:
|
||||||
|
url = url.replace('/http:/', '/http://')
|
||||||
|
url = url.replace('/https:/', '/https://')
|
||||||
|
except this handles:
|
||||||
|
other schemes besides http/https (e.g. https://example.com/link/git+ssh://github.com/example)
|
||||||
|
other preceding separators besides / (e.g. https://example.com/login/?next=https://example.com/home)
|
||||||
|
fixing multiple suburls recursively
|
||||||
|
"""
|
||||||
|
input_url = url
|
||||||
|
for _ in range(nesting_limit):
|
||||||
|
url = re.sub(
|
||||||
|
r'(?P<root>.+?)' # https://web.archive.org/web
|
||||||
|
+ r'(?P<separator>[-=/_&+%$#@!*\(\\])' # /
|
||||||
|
+ r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/' # http:/
|
||||||
|
+ r'(?P<suburl>[^/\\]+)', # example.com
|
||||||
|
r"\1\2\3://\4",
|
||||||
|
input_url,
|
||||||
|
re.IGNORECASE | re.UNICODE,
|
||||||
|
)
|
||||||
|
if url == input_url:
|
||||||
|
break # nothing left to replace, all suburls are fixed
|
||||||
|
input_url = url
|
||||||
|
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
# sanity check to make sure workaround code works as expected and doesnt introduce *more* bugs
|
||||||
|
assert did_urljoin_misbehave('https://web.archive.org/web/https://example.com', 'abc.html', 'https://web.archive.org/web/https:/example.com/abc.html') == True
|
||||||
|
assert did_urljoin_misbehave('http://example.com', 'https://web.archive.org/web/http://example.com/abc.html', 'https://web.archive.org/web/http:/example.com/abc.html') == True
|
||||||
|
assert fix_urljoin_bug('https:/example.com') == 'https:/example.com' # should not modify original url's scheme, only sub-urls
|
||||||
|
assert fix_urljoin_bug('https://web.archive.org/web/https:/example.com/abc.html') == 'https://web.archive.org/web/https://example.com/abc.html'
|
||||||
|
assert fix_urljoin_bug('http://example.com/link/git+ssh:/github.com/example?next=ftp:/example.com') == 'http://example.com/link/git+ssh://github.com/example?next=ftp://example.com'
|
||||||
|
|
||||||
|
|
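For orientation, here is a minimal sketch of the calling pattern these two helpers support (it mirrors the call site in parse_generic_html_export above; the base URL is purely illustrative):

    from urllib.parse import urljoin

    root_url = 'https://web.archive.org/web/https://example.com'
    relative_path = 'abc.html'

    url = urljoin(root_url, relative_path)    # affected Python versions may collapse the inner '://' to ':/'
    if did_urljoin_misbehave(root_url, relative_path, url):
        url = fix_urljoin_bug(url)            # repairs '.../https:/...' back to '.../https://...'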
@@ -72,21 +72,13 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
     json_file.seek(0)

-    try:
-        links = json.load(json_file)
-        if type(links) != list:
-            raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
-    except json.decoder.JSONDecodeError:
-        # sometimes the first line is a comment or other junk, so try without
-        json_file.seek(0)
-        first_line = json_file.readline()
-        #print('    > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
-        links = json.load(json_file)
-        # we may fail again, which means we really don't know what to do
+    links = json.load(json_file)
+    if type(links) != list:
+        raise Exception('JSON parser expects list of objects, maybe this is JSONL?')

     for link in links:
         if link:
-            yield jsonObjectToLink(link,json_file.name)
+            yield jsonObjectToLink(link, json_file.name)

 KEY = 'json'
 NAME = 'Generic JSON'
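The parser now fails fast instead of silently retrying: a JSON array is accepted, anything else surfaces an error to the caller. A quick stdlib-only sketch of the behavior difference (jsonObjectToLink and file handling elided):

    import io
    import json

    good = io.StringIO('[{"url": "https://example.com"}]')
    assert isinstance(json.load(good), list)    # a list of objects -> accepted

    jsonl = io.StringIO('{"url": "https://example.com"}\n{"url": "https://example.org"}\n')
    try:
        json.load(jsonl)                        # JSONL is multiple JSON documents, not one
    except json.decoder.JSONDecodeError:
        pass                                    # now propagates instead of triggering a blind retry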
@@ -3,11 +3,9 @@ __package__ = 'archivebox.parsers'
 import json

 from typing import IO, Iterable
-from datetime import datetime, timezone

 from ..index.schema import Link
 from ..util import (
-    htmldecode,
     enforce_types,
 )
@@ -1,8 +1,6 @@
 __package__ = 'archivebox.parsers'
 __description__ = 'Plain Text'

-import re
-
 from typing import IO, Iterable
 from datetime import datetime, timezone
 from pathlib import Path

@@ -11,7 +9,7 @@ from ..index.schema import Link
 from ..util import (
     htmldecode,
     enforce_types,
-    URL_REGEX
+    find_all_urls,
 )

@@ -39,7 +37,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
         pass

     # otherwise look for anything that looks like a URL in the line
-    for url in re.findall(URL_REGEX, line):
+    for url in find_all_urls(line):
         yield Link(
             url=htmldecode(url),
             timestamp=str(datetime.now(timezone.utc).timestamp()),

@@ -48,17 +46,6 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
             sources=[text_file.name],
         )

-        # look inside the URL for any sub-urls, e.g. for archive.org links
-        # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
-        # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
-        for sub_url in re.findall(URL_REGEX, line[1:]):
-            yield Link(
-                url=htmldecode(sub_url),
-                timestamp=str(datetime.now(timezone.utc).timestamp()),
-                title=None,
-                tags=None,
-                sources=[text_file.name],
-            )

 KEY = 'txt'
 NAME = 'Generic TXT'
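The dedicated second pass over line[1:] could be deleted because find_all_urls already surfaces nested sub-URLs on its own (see URL_REGEX_TESTS below, where a URL embedding another URL counts as 2 matches). An illustrative check, assuming find_all_urls imported from ..util behaves per those tests:

    line = 'https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/'
    urls = list(find_all_urls(line))
    # both the outer web.archive.org URL and the inner www.reddit.com URL are expected to be yielded
    assert any('www.reddit.com' in url for url in urls)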
@@ -6,6 +6,7 @@
     <a href="/admin/core/tag/">Tags</a> |
     <a href="/admin/core/archiveresult/?o=-1">Log</a>
     <a href="{% url 'Docs' %}" target="_blank" rel="noopener noreferrer">Docs</a> |
+    <a href="/api">API</a> |
     <a href="{% url 'public-index' %}">Public</a> |
     <a href="/admin/">Admin</a>

@@ -16,7 +17,7 @@
 {% endblock %}
 {% block userlinks %}
     {% if user.has_usable_password %}
-        <a href="{% url 'admin:password_change' %}">Account</a> /
+        <a href="{% url 'admin:password_change' %}" title="Change your account password">Account</a> /
     {% endif %}
     <a href="{% url 'admin:logout' %}">{% trans 'Log out' %}</a>
 {% endblock %}
@@ -62,12 +62,12 @@ COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m
 # https://mathiasbynens.be/demo/url-regex
 URL_REGEX = re.compile(
-    r'(?=(' +
-    r'http[s]?://' +                    # start matching from allowed schemes
-    r'(?:[a-zA-Z]|[0-9]' +              # followed by allowed alphanum characters
-    r'|[-_$@.&+!*\(\),]' +              # or allowed symbols (keep hyphen first to match literal hyphen)
-    r'|[^\u0000-\u007F])+' +            # or allowed unicode bytes
-    r'[^\]\[<>"\'\s]+' +                # stop parsing at these symbols
+    r'(?=('
+    r'http[s]?://'                      # start matching from allowed schemes
+    r'(?:[a-zA-Z]|[0-9]'                # followed by allowed alphanum characters
+    r'|[-_$@.&+!*\(\),]'                # or allowed symbols (keep hyphen first to match literal hyphen)
+    r'|[^\u0000-\u007F])+'              # or allowed unicode bytes
+    r'[^\]\[<>"\'\s]+'                  # stop parsing at these symbols
     r'))',
     re.IGNORECASE | re.UNICODE,
 )
|
||||||
helpful to fix URLs parsed from markdown e.g.
|
helpful to fix URLs parsed from markdown e.g.
|
||||||
input: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
|
input: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
|
||||||
result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def
|
result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def
|
||||||
|
|
||||||
|
IMPORTANT ASSUMPTION: valid urls wont have unbalanced or incorrectly nested parentheses
|
||||||
|
e.g. this will fail the user actually wants to ingest a url like 'https://example.com/some_wei)(rd_url'
|
||||||
|
in that case it will return https://example.com/some_wei (truncated up to the first unbalanced paren)
|
||||||
|
This assumption is true 99.9999% of the time, and for the rare edge case the user can use url_list parser.
|
||||||
"""
|
"""
|
||||||
trimmed_url = url_str
|
trimmed_url = url_str
|
||||||
|
|
||||||
|
@@ -353,7 +358,8 @@ def chrome_cleanup():
     if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
         remove_file("/home/archivebox/.config/chromium/SingletonLock")

-def ansi_to_html(text):
+@enforce_types
+def ansi_to_html(text: str) -> str:
     """
     Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
     """
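If enforce_types works here the way annotation-enforcing decorators usually do (raising on arguments that don't match their annotations; an assumption, since its body isn't part of this diff), the change moves type errors to the call boundary. A hypothetical usage sketch:

    ansi_to_html('\x1b[32mOK\x1b[0m')     # str in -> str out, as annotated
    ansi_to_html(b'\x1b[32mOK\x1b[0m')    # bytes would now be rejected at the boundary (assumed behavior)
                                          # rather than failing somewhere inside the regex substitution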
@@ -439,11 +445,14 @@ class ExtendedEncoder(pyjson.JSONEncoder):

 ### URL PARSING TESTS / ASSERTIONS
-# they run at runtime because I like having them inline in this file,
-# I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
-# and these assertions are basically instant, so not a big performance cost to do it on startup

-assert fix_url_from_markdown('/a(b)c).x(y)z') == '/a(b)c'
+# Check that plain text regex URL parsing works as expected
+# this is a last line of defense to make sure the URL_REGEX isn't
+# misbehaving due to some OS-level or environment-level quirks (e.g. regex engine / cpython / locale differences)
+# the consequences of bad URL parsing could be disastrous and lead to many
+# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
+
+assert fix_url_from_markdown('http://example.com/a(b)c).x(y)z') == 'http://example.com/a(b)c'
 assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'
 URL_REGEX_TESTS = [

@@ -482,3 +491,50 @@ URL_REGEX_TESTS = [
 for urls_str, expected_url_matches in URL_REGEX_TESTS:
     url_matches = list(find_all_urls(urls_str))
     assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'
+
+
+# More test cases
+_test_url_strs = {
+    'example.com': 0,
+    '/example.com': 0,
+    '//example.com': 0,
+    ':/example.com': 0,
+    '://example.com': 0,
+    'htt://example8.com': 0,
+    '/htt://example.com': 0,
+    'https://example': 1,
+    'https://localhost/2345': 1,
+    'https://localhost:1234/123': 1,
+    '://': 0,
+    'https://': 0,
+    'http://': 0,
+    'ftp://': 0,
+    'ftp://example.com': 0,
+    'https://example.com': 1,
+    'https://example.com/': 1,
+    'https://a.example.com': 1,
+    'https://a.example.com/': 1,
+    'https://a.example.com/what/is/happening.html': 1,
+    'https://a.example.com/what/ís/happening.html': 1,
+    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
+    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
+    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
+    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
+    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
+    '<test>http://example7.com</test>': 1,
+    'https://<test>': 0,
+    'https://[test]': 0,
+    'http://"test"': 0,
+    'http://\'test\'': 0,
+    '[https://example8.com/what/is/this.php?what=1]': 1,
+    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
+    '<what>https://example10.com#and-thing=2 "</about>': 1,
+    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
+    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
+    '<or>http://examplehttp://15.badc</that>': 2,
+    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
+    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
+}
+for url_str, num_urls in _test_url_strs.items():
+    assert len(list(find_all_urls(url_str))) == num_urls, (
+        f'{url_str} does not contain {num_urls} urls')
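find_all_urls itself is defined elsewhere in util.py and isn't part of this diff; given URL_REGEX and fix_url_from_markdown above, a plausible sketch of its shape (hypothetical, not the committed body) would be:

    def find_all_urls(urls_str: str):
        # scan with the overlapping-match-friendly URL_REGEX, then trim
        # markdown residue like unbalanced trailing parens from each hit
        for url in re.findall(URL_REGEX, urls_str):
            yield fix_url_from_markdown(url)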
@@ -18,7 +18,7 @@ which docker > /dev/null || exit 1
 which jq > /dev/null || exit 1
 # which pdm > /dev/null || exit 1

-SUPPORTED_PLATFORMS="linux/amd64,linux/arm64,linux/arm/v7"
+SUPPORTED_PLATFORMS="linux/amd64,linux/arm64"

 TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
 VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"

@@ -80,20 +80,20 @@ echo "[+] Building archivebox:$VERSION docker image..."
 # docker build . --no-cache -t archivebox-dev \
 # replace --load with --push to deploy
 docker buildx build --platform "$SELECTED_PLATFORMS" --load . \
-    -t archivebox/archivebox \
+    # -t archivebox/archivebox \
     -t archivebox/archivebox:$TAG_NAME \
-    -t archivebox/archivebox:$VERSION \
-    -t archivebox/archivebox:$SHORT_VERSION \
+    # -t archivebox/archivebox:$VERSION \
+    # -t archivebox/archivebox:$SHORT_VERSION \
     -t archivebox/archivebox:$GIT_SHA \
-    -t archivebox/archivebox:latest \
-    -t nikisweeting/archivebox \
+    # -t archivebox/archivebox:latest \
+    # -t nikisweeting/archivebox \
     -t nikisweeting/archivebox:$TAG_NAME \
-    -t nikisweeting/archivebox:$VERSION \
-    -t nikisweeting/archivebox:$SHORT_VERSION \
+    # -t nikisweeting/archivebox:$VERSION \
+    # -t nikisweeting/archivebox:$SHORT_VERSION \
     -t nikisweeting/archivebox:$GIT_SHA \
-    -t nikisweeting/archivebox:latest \
+    # -t nikisweeting/archivebox:latest \
     -t ghcr.io/archivebox/archivebox/archivebox:$TAG_NAME \
-    -t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
-    -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
+    # -t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
+    # -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
     -t ghcr.io/archivebox/archivebox/archivebox:$GIT_SHA \
-    -t ghcr.io/archivebox/archivebox/archivebox:latest
+    # -t ghcr.io/archivebox/archivebox/archivebox:latest
@@ -18,6 +18,7 @@
 # https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
 # set -o xtrace
 # set -o nounset
+shopt -s nullglob
 set -o errexit
 set -o errtrace
 set -o pipefail
@@ -15,7 +15,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
 source "$DIR/.venv/bin/activate"

 echo "[*] Running flake8..."
-cd archivebox
+cd "$DIR/archivebox"
 flake8 . && echo "√ No errors found."

 echo
@@ -48,7 +48,7 @@ echo

 echo "[+] Generating dev & prod requirements.txt & pdm.lock from pyproject.toml..."
 pip install --upgrade pip setuptools
-pdm self update
+pdm self update >/dev/null 2>&1 || true
 pdm venv create 3.12
 echo
 echo "pyproject.toml: archivebox $(grep 'version = ' pyproject.toml | awk '{print $3}' | jq -r)"

@@ -73,7 +73,7 @@ cp ./pdm.dev.lock ./pip_dist/
 cp ./requirements-dev.txt ./pip_dist/

 echo
-echo "[+]] Generating package-lock.json from package.json..."
+echo "[+] Generating package-lock.json from package.json..."
 npm install -g npm
 echo
 echo "package.json: archivebox $(jq -r '.version' package.json)"
@@ -27,9 +27,9 @@ if (which docker-compose > /dev/null && docker pull archivebox/archivebox:latest
 if [ -f "./index.sqlite3" ]; then
     mv -i ~/archivebox/* ~/archivebox/data/
 fi
-curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/docker-compose.yml' > docker-compose.yml
+curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/docker-compose.yml' > docker-compose.yml
 mkdir -p ./etc
-curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > ./etc/sonic.cfg
+curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg' > ./etc/sonic.cfg
 docker compose run --rm archivebox init --setup
 echo
 echo "[+] Starting ArchiveBox server using: docker compose up -d..."
@@ -48,17 +48,17 @@ services:
     # $ docker compose restart archivebox_scheduler

     archivebox_scheduler:
         image: archivebox/archivebox:latest
         command: schedule --foreground --update --every=day
         environment:
             - TIMEOUT=120    # use a higher timeout than the main container to give slow tasks more time when retrying
             # - PUID=502     # set to your host user's UID & GID if you encounter permissions issues
             # - PGID=20
         volumes:
             - ./data:/data
         # cpus: 2            # uncomment / edit these values to limit scheduler container resource consumption
         # mem_limit: 2048m
         # restart: always


 ### This runs the optional Sonic full-text search backend (much faster than default rg backend).

@@ -72,7 +72,7 @@ services:
     # not needed after first run / if you already have ./etc/sonic.cfg present
     dockerfile_inline: |
         FROM quay.io/curl/curl:latest AS config_downloader
-        RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > /tmp/sonic.cfg
+        RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg' > /tmp/sonic.cfg
         FROM valeriansaliou/sonic:latest
         COPY --from=config_downloader /tmp/sonic.cfg /etc/sonic.cfg
     expose:
docs (submodule)
@@ -1 +1 @@
-Subproject commit a1b69c51ba9b249c0b2a6efd141dbb792fc36ad2
+Subproject commit f23abba9773b67ad9f2fd04d6f2e8e056dfa6521
package-lock.json (generated)
@@ -25,9 +25,9 @@
         }
     },
     "node_modules/@babel/runtime-corejs2": {
-        "version": "7.24.4",
-        "resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.24.4.tgz",
-        "integrity": "sha512-ZCKqyUKt/Coimg+3Kafu43yNetgYnTXzNbEGAgxc81J5sI0qFNbQ613w7PNny+SmijAmGVroL0GDvx5rG/JI5Q==",
+        "version": "7.24.5",
+        "resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.24.5.tgz",
+        "integrity": "sha512-cC9jiO6s/IN+xwCHYy1AGrcFJ4bwgIwb8HX1KaoEpRsznLlO4x9eBP6AX7RIeMSWlQqEj2WHox637OS8cDq6Ew==",
         "dependencies": {
             "core-js": "^2.6.12",
             "regenerator-runtime": "^0.14.0"

@@ -203,9 +203,9 @@
         "integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
     },
     "node_modules/@types/node": {
-        "version": "20.12.7",
-        "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz",
-        "integrity": "sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==",
+        "version": "20.12.8",
+        "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.8.tgz",
+        "integrity": "sha512-NU0rJLJnshZWdE/097cdCBbyW1h4hEg0xpovcoAQYHl8dnEyp/NAOiE45pvc+Bd1Dt+2r94v2eGFpQJ4R7g+2w==",
         "optional": true,
         "dependencies": {
             "undici-types": "~5.26.4"

@@ -713,9 +713,9 @@
         "integrity": "sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ=="
     },
     "node_modules/dompurify": {
-        "version": "3.1.0",
-        "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.1.0.tgz",
-        "integrity": "sha512-yoU4rhgPKCo+p5UrWWWNKiIq+ToGqmVVhk0PmMYBK4kRsR3/qhemNFL8f6CFmBd4gMwm3F4T7HBoydP5uY07fA=="
+        "version": "3.1.2",
+        "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.1.2.tgz",
+        "integrity": "sha512-hLGGBI1tw5N8qTELr3blKjAML/LY4ANxksbS612UiJyDfyf/2D092Pvm+S7pmeTGJRqvlJkFzBoHBQKgQlOQVg=="
     },
     "node_modules/domutils": {
         "version": "1.5.1",

@@ -1655,6 +1655,26 @@
             "node": ">=18"
         }
     },
+    "node_modules/puppeteer-core/node_modules/ws": {
+        "version": "8.16.0",
+        "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz",
+        "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==",
+        "engines": {
+            "node": ">=10.0.0"
+        },
+        "peerDependencies": {
+            "bufferutil": "^4.0.1",
+            "utf-8-validate": ">=5.0.2"
+        },
+        "peerDependenciesMeta": {
+            "bufferutil": {
+                "optional": true
+            },
+            "utf-8-validate": {
+                "optional": true
+            }
+        }
+    },
     "node_modules/qs": {
         "version": "6.5.3",
         "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.3.tgz",

@@ -2071,9 +2091,9 @@
         }
     },
     "node_modules/tough-cookie": {
-        "version": "4.1.3",
-        "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.3.tgz",
-        "integrity": "sha512-aX/y5pVRkfRnfmuX+OdbSdXvPe6ieKX/G2s7e98f4poJHnqH3281gDPm/metm6E/WRamfx7WC4HUqkWHfQHprw==",
+        "version": "4.1.4",
+        "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.4.tgz",
+        "integrity": "sha512-Loo5UUvLD9ScZ6jh8beX1T6sO1w2/MpCRpEP7V280GKMVUQ0Jzar2U3UJPsrdbziLEMMhu3Ujnq//rhiFuIeag==",
         "dependencies": {
             "psl": "^1.1.33",
             "punycode": "^2.1.1",

@@ -2276,9 +2296,9 @@
         "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="
     },
     "node_modules/ws": {
-        "version": "8.16.0",
-        "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz",
-        "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==",
+        "version": "8.17.0",
+        "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.0.tgz",
+        "integrity": "sha512-uJq6108EgZMAl20KagGkzCKfMEjxmKvZHG7Tlq0Z6nOky7YF7aq4mOx6xK8TJ/i1LeK4Qus7INktacctDgY8Ow==",
         "engines": {
             "node": ">=10.0.0"
         },
|
||||||
# pdm install
|
# pdm install
|
||||||
# pdm update --unconstrained
|
# pdm update --unconstrained
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
# Last Bumped: 2024-04-25
|
||||||
# Base Framework and Language Dependencies
|
# Base Framework and Language Dependencies
|
||||||
"setuptools>=69.5.1",
|
"setuptools>=69.5.1",
|
||||||
"django>=4.2.0,<5.0",
|
"django>=5.0.4,<6.0",
|
||||||
"django-ninja>=1.1.0",
|
"django-ninja>=1.1.0",
|
||||||
"django-extensions>=3.2.3",
|
"django-extensions>=3.2.3",
|
||||||
"mypy-extensions>=1.0.0",
|
"mypy-extensions>=1.0.0",
|
||||||
|
|
||||||
# Python Helper Libraries
|
# Python Helper Libraries
|
||||||
"requests>=2.31.0",
|
"requests>=2.31.0",
|
||||||
"dateparser>=1.0.0",
|
"dateparser>=1.0.0",
|
||||||
"feedparser>=6.0.11",
|
"feedparser>=6.0.11",
|
||||||
"w3lib>=1.22.0",
|
"w3lib>=2.1.2",
|
||||||
|
|
||||||
# Feature-Specific Dependencies
|
# Feature-Specific Dependencies
|
||||||
"python-crontab>=2.5.1", # for: archivebox schedule
|
"python-crontab>=3.0.0", # for: archivebox schedule
|
||||||
"croniter>=0.3.34", # for: archivebox schedule
|
"croniter>=2.0.5", # for: archivebox schedule
|
||||||
"ipython>5.0.0", # for: archivebox shell
|
"ipython>=8.23.0", # for: archivebox shell
|
||||||
|
|
||||||
# Extractor Dependencies
|
# Extractor Dependencies
|
||||||
"yt-dlp>=2024.4.9", # for: media
|
"yt-dlp>=2024.4.9", # for: media
|
||||||
"playwright>=1.43.0; platform_machine != 'armv7l'", # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
|
# "playwright>=1.43.0; platform_machine != 'armv7l'", # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
|
||||||
|
|
||||||
# TODO: add more extractors
|
# TODO: add more extractors
|
||||||
# - gallery-dl
|
# - gallery-dl
|
||||||
# - scihubdl
|
# - scihubdl
|
||||||
# - See Github issues for more...
|
# - See Github issues for more...
|
||||||
|
"django-signal-webhooks>=0.3.0",
|
||||||
|
"django-admin-data-views>=0.3.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
homepage = "https://github.com/ArchiveBox/ArchiveBox"
|
homepage = "https://github.com/ArchiveBox/ArchiveBox"
|
||||||
|
@ -59,9 +58,6 @@ classifiers = [
|
||||||
"Natural Language :: English",
|
"Natural Language :: English",
|
||||||
"Operating System :: OS Independent",
|
"Operating System :: OS Independent",
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"Programming Language :: Python :: 3.7",
|
|
||||||
"Programming Language :: Python :: 3.8",
|
|
||||||
"Programming Language :: Python :: 3.9",
|
|
||||||
"Programming Language :: Python :: 3.10",
|
"Programming Language :: Python :: 3.10",
|
||||||
"Programming Language :: Python :: 3.11",
|
"Programming Language :: Python :: 3.11",
|
||||||
"Programming Language :: Python :: 3.12",
|
"Programming Language :: Python :: 3.12",
|
||||||
|
@ -100,10 +96,10 @@ ldap = [
|
||||||
# pdm update --dev --unconstrained
|
# pdm update --dev --unconstrained
|
||||||
[tool.pdm.dev-dependencies]
|
[tool.pdm.dev-dependencies]
|
||||||
build = [
|
build = [
|
||||||
|
# "pdm", # usually installed by apt/brew, dont double-install with pip
|
||||||
"setuptools>=69.5.1",
|
"setuptools>=69.5.1",
|
||||||
"pip",
|
"pip",
|
||||||
"wheel",
|
"wheel",
|
||||||
"pdm",
|
|
||||||
"homebrew-pypi-poet>=0.10.0", # for: generating archivebox.rb brewfile list of python packages
|
"homebrew-pypi-poet>=0.10.0", # for: generating archivebox.rb brewfile list of python packages
|
||||||
]
|
]
|
||||||
docs = [
|
docs = [
|
||||||
|
@ -115,10 +111,11 @@ debug = [
|
||||||
"django-debug-toolbar",
|
"django-debug-toolbar",
|
||||||
"djdt_flamegraph",
|
"djdt_flamegraph",
|
||||||
"ipdb",
|
"ipdb",
|
||||||
|
"requests-tracker>=0.3.3",
|
||||||
]
|
]
|
||||||
test = [
|
test = [
|
||||||
"pdm[pytest]",
|
|
||||||
"pytest",
|
"pytest",
|
||||||
|
"bottle",
|
||||||
]
|
]
|
||||||
lint = [
|
lint = [
|
||||||
"flake8",
|
"flake8",
|
||||||
|
@ -126,6 +123,12 @@ lint = [
|
||||||
"django-stubs",
|
"django-stubs",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[tool.pdm.scripts]
|
||||||
|
lint = "./bin/lint.sh"
|
||||||
|
test = "./bin/test.sh"
|
||||||
|
# all = {composite = ["lint mypackage/", "test -v tests/"]}
|
||||||
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["pdm-backend"]
|
requires = ["pdm-backend"]
|
||||||
build-backend = "pdm.backend"
|
build-backend = "pdm.backend"
|
||||||
|
@ -134,11 +137,6 @@ build-backend = "pdm.backend"
|
||||||
archivebox = "archivebox.cli:main"
|
archivebox = "archivebox.cli:main"
|
||||||
|
|
||||||
|
|
||||||
[tool.pdm.scripts]
|
|
||||||
lint = "./bin/lint.sh"
|
|
||||||
test = "./bin/test.sh"
|
|
||||||
# all = {composite = ["lint mypackage/", "test -v tests/"]}
|
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
testpaths = [ "tests" ]
|
testpaths = [ "tests" ]
|
||||||
|
|
||||||
|
@ -154,6 +152,8 @@ explicit_package_bases = true
|
||||||
# exclude = "pdm/(pep582/|models/in_process/.+\\.py)"
|
# exclude = "pdm/(pep582/|models/in_process/.+\\.py)"
|
||||||
plugins = ["mypy_django_plugin.main"]
|
plugins = ["mypy_django_plugin.main"]
|
||||||
|
|
||||||
|
[tool.django-stubs]
|
||||||
|
django_settings_module = "core.settings"
|
||||||
|
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
|
|
|
@@ -2,54 +2,59 @@
 # Please do not edit it manually.

 annotated-types==0.6.0
+anyio==4.3.0
 asgiref==3.8.1
 asttokens==2.4.1
 brotli==1.1.0; implementation_name == "cpython"
 brotlicffi==1.1.0.0; implementation_name != "cpython"
 certifi==2024.2.2
-cffi==1.16.0; implementation_name != "cpython"
+cffi==1.16.0; platform_python_implementation != "PyPy" or implementation_name != "cpython"
 charset-normalizer==3.3.2
 colorama==0.4.6; sys_platform == "win32"
 croniter==2.0.5
+cryptography==42.0.7
 dateparser==1.2.0
 decorator==5.1.1
-django==4.2.11
+django==5.0.4
 django-auth-ldap==4.8.0
 django-extensions==3.2.3
 django-ninja==1.1.0
+django-settings-holder==0.1.2
+django-signal-webhooks==0.3.0
 exceptiongroup==1.2.1; python_version < "3.11"
 executing==2.0.1
 feedparser==6.0.11
-greenlet==3.0.3; platform_machine != "armv7l"
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
 idna==3.7
-ipython==8.23.0
+ipython==8.24.0
 jedi==0.19.1
 matplotlib-inline==0.1.7
 mutagen==1.47.0
 mypy-extensions==1.0.0
 parso==0.8.4
 pexpect==4.9.0; sys_platform != "win32" and sys_platform != "emscripten"
-playwright==1.43.0; platform_machine != "armv7l"
 prompt-toolkit==3.0.43
 ptyprocess==0.7.0; sys_platform != "win32" and sys_platform != "emscripten"
 pure-eval==0.2.2
 pyasn1==0.6.0
 pyasn1-modules==0.4.0
-pycparser==2.22; implementation_name != "cpython"
+pycparser==2.22; platform_python_implementation != "PyPy" or implementation_name != "cpython"
 pycryptodomex==3.20.0
 pydantic==2.7.1
 pydantic-core==2.18.2
-pyee==11.1.0; platform_machine != "armv7l"
-pygments==2.17.2
+pygments==2.18.0
 python-crontab==3.0.0
 python-dateutil==2.9.0.post0
 python-ldap==3.4.4
 pytz==2024.1
-regex==2024.4.16
+regex==2024.4.28
 requests==2.31.0
 setuptools==69.5.1
 sgmllib3k==1.0.0
 six==1.16.0
+sniffio==1.3.1
 sonic-client==1.0.0
 sqlparse==0.5.0
 stack-data==0.6.3