mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-09 12:21:57 -04:00
bring image back down to 700mb
Some checks are pending
CodeQL / Analyze (python) (push) Waiting to run
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Deploy static content to Pages / deploy (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Build GitHub Pages website / build (push) Waiting to run
Build GitHub Pages website / deploy (push) Blocked by required conditions
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
Some checks are pending
CodeQL / Analyze (python) (push) Waiting to run
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Deploy static content to Pages / deploy (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Build GitHub Pages website / build (push) Waiting to run
Build GitHub Pages website / deploy (push) Blocked by required conditions
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
This commit is contained in:
parent
47a7cabc68
commit
54d4d7f640
3 changed files with 102 additions and 52 deletions
|
@ -37,6 +37,7 @@ docker/
|
|||
website/
|
||||
typings/
|
||||
|
||||
tmp/
|
||||
data/
|
||||
data*/
|
||||
output/
|
||||
|
|
106
Dockerfile
106
Dockerfile
|
@ -28,8 +28,7 @@
|
|||
|
||||
#########################################################################################
|
||||
|
||||
FROM python:3.11-slim-bookworm
|
||||
# FROM debian:bookworm-backports # Tried using faster bookworm-backports but wasn't worth it due to more frequent breakages: https://packages.debian.org/bookworm-backports/
|
||||
FROM ubuntu:24.04
|
||||
|
||||
LABEL name="archivebox" \
|
||||
maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
|
||||
|
@ -55,7 +54,6 @@ ARG TARGETPLATFORM
|
|||
ARG TARGETOS
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
|
||||
######### Environment Variables #################################
|
||||
|
||||
# Global build-time and runtime environment constants + default pkg manager config
|
||||
|
@ -71,7 +69,7 @@ ENV TZ=UTC \
|
|||
npm_config_loglevel=error
|
||||
|
||||
# Language Version config
|
||||
ENV PYTHON_VERSION=3.11 \
|
||||
ENV PYTHON_VERSION=3.12 \
|
||||
NODE_VERSION=22
|
||||
|
||||
# Non-root User config
|
||||
|
@ -96,9 +94,6 @@ SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", "
|
|||
# Detect ArchiveBox version number by reading pyproject.toml (also serves to invalidate the entire build cache whenever pyproject.toml changes)
|
||||
WORKDIR "$CODE_DIR"
|
||||
|
||||
RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
|
||||
grep '^version = ' "/app/pyproject.toml" | awk -F'"' '{print $2}' > /VERSION.txt
|
||||
|
||||
# Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up back-to-back Docker builds)
|
||||
RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \
|
||||
&& echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \
|
||||
|
@ -106,7 +101,7 @@ RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d
|
|||
&& rm -f /etc/apt/apt.conf.d/docker-clean
|
||||
|
||||
# Print debug info about build and save it to disk, for human eyes only, not used by anything else
|
||||
RUN (echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \
|
||||
RUN (echo "[i] Docker build for ArchiveBox starting..." \
|
||||
&& echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
|
||||
&& echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \
|
||||
&& echo \
|
||||
|
@ -134,10 +129,9 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
|
|||
# Install system apt dependencies (adding backports to access more recent apt updates)
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] APT Installing base system dependencies for $TARGETPLATFORM..." \
|
||||
&& echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \
|
||||
&& mkdir -p /etc/apt/keyrings \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -qq -y -t bookworm-backports \
|
||||
&& apt-get install -qq -y \
|
||||
# 1. packaging dependencies
|
||||
apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
|
||||
# 2. docker and init system dependencies
|
||||
|
@ -147,6 +141,27 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
|
|||
# nano iputils-ping dnsutils htop procps jq yq
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install apt binary dependencies for extractors
|
||||
# COPY --from=selenium/ffmpeg:latest /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -qq -y --no-install-recommends \
|
||||
git ffmpeg ripgrep \
|
||||
# Packages we have also needed in the past:
|
||||
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
|
||||
# curl wget (already installed above)
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
# Save version info
|
||||
&& ( \
|
||||
which curl && curl --version | head -n1 \
|
||||
&& which wget && wget --version 2>&1 | head -n1 \
|
||||
&& which git && git --version 2>&1 | head -n1 \
|
||||
&& which ffmpeg && (ffmpeg --version 2>&1 | head -n1) || true \
|
||||
&& which rg && rg --version 2>&1 | head -n1 \
|
||||
&& echo -e '\n\n' \
|
||||
) | tee -a /VERSION.txt
|
||||
|
||||
# Install sonic search backend
|
||||
COPY --from=archivebox/sonic:1.4.9 /usr/local/bin/sonic /usr/local/bin/sonic
|
||||
COPY --chown=root:root --chmod=755 "etc/sonic.cfg" /etc/sonic.cfg
|
||||
|
@ -160,7 +175,7 @@ RUN (which sonic && sonic --version) | tee -a /VERSION.txt
|
|||
# --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
|
||||
# RUN echo "[+] APT Installing PYTHON $PYTHON_VERSION for $TARGETPLATFORM (skipped, provided by base image)..." \
|
||||
# && apt-get update -qq \
|
||||
# && apt-get install -qq -y -t bookworm-backports --no-upgrade \
|
||||
# && apt-get install -qq -y --no-upgrade \
|
||||
# python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip python${PYTHON_VERSION}-venv pipx \
|
||||
# && rm -rf /var/lib/apt/lists/* \
|
||||
# tell PDM to allow using global system python site packages
|
||||
|
@ -188,8 +203,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
|
|||
&& echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
|
||||
&& curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \
|
||||
&& apt-get install -y -t bookworm-backports --no-upgrade \
|
||||
&& apt-get install -qq -y --no-upgrade libatomic1 \
|
||||
&& apt-get install -y --no-upgrade \
|
||||
nodejs \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
# Update NPM to latest version
|
||||
|
@ -205,25 +220,23 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
|
|||
# Set up uv and main app /venv
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.5 /uv /uvx /bin/
|
||||
ENV UV_COMPILE_BYTECODE=1 \
|
||||
UV_PYTHON_PREFERENCE=only-system \
|
||||
UV_LINK_MODE=copy \
|
||||
UV_PROJECT_ENVIRONMENT=/venv \
|
||||
PATH="/venv/bin:$PATH"
|
||||
UV_PROJECT_ENVIRONMENT=/venv
|
||||
WORKDIR "$CODE_DIR"
|
||||
# COPY --chown=root:root --chmod=755 pyproject.toml "$CODE_DIR/"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] UV Creating /venv using python ${PYTHON_VERSION} for ${TARGETPLATFORM} (provided by base image)..." \
|
||||
&& uv venv \
|
||||
&& uv pip install setuptools pip \
|
||||
&& ln -s /venv "$CODE_DIR/.venv" \
|
||||
&& uv venv /venv
|
||||
ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH"
|
||||
RUN uv pip install setuptools pip \
|
||||
&& ( \
|
||||
which python3 && python3 --version | grep " $PYTHON_VERSION" \
|
||||
&& which pip && pip --version \
|
||||
which python3 && python3 --version \
|
||||
&& which uv && uv version \
|
||||
&& echo -e '\n\n' \
|
||||
) | tee -a /VERSION.txt
|
||||
|
||||
|
||||
|
||||
######### ArchiveBox & Extractor Dependencies ##################################
|
||||
|
||||
# Install ArchiveBox C-compiled/apt-installed Python dependencies in app /venv (currently only used for python-ldap)
|
||||
|
@ -233,41 +246,24 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
|
|||
#--mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] APT Installing + Compiling python3-ldap for PIP archivebox[ldap] on ${TARGETPLATFORM}..." \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
|
||||
&& apt-get install -qq -y --no-install-recommends \
|
||||
build-essential gcc \
|
||||
libssl-dev libldap2-dev libsasl2-dev python3-ldap \
|
||||
python3-dev libssl-dev libldap2-dev libsasl2-dev python3-ldap \
|
||||
python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
|
||||
&& uv pip install \
|
||||
"python-ldap>=3.4.3" \
|
||||
&& apt-get purge -y \
|
||||
build-essential gcc \
|
||||
python3-dev build-essential gcc \
|
||||
&& apt-get autoremove -y \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
# Install apt binary dependencies for extractors
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -qq -y -t bookworm-backports \
|
||||
curl wget git ffmpeg ripgrep pipx \
|
||||
# Packages we have also needed in the past:
|
||||
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
# Save version info
|
||||
&& ( \
|
||||
which curl && curl --version | head -n1 \
|
||||
&& which wget && wget --version 2>&1 | head -n1 \
|
||||
&& which git && git --version 2>&1 | head -n1 \
|
||||
&& which rg && rg --version 2>&1 | head -n1 \
|
||||
&& echo -e '\n\n' \
|
||||
) | tee -a /VERSION.txt
|
||||
|
||||
# Install apt font & rendering dependencies for chromium browser
|
||||
# TODO: figure out how much of this overlaps with `playwright install-deps chromium`
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] APT Installing CHROMIUM dependencies, fonts, and display libraries for $TARGETPLATFORM..." \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -qq -y -t bookworm-backports \
|
||||
&& apt-get install -qq -y \
|
||||
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
|
||||
at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \
|
||||
libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \
|
||||
|
@ -285,17 +281,20 @@ RUN --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=brows
|
|||
--mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] PIP Installing playwright into /venv and CHROMIUM binary into $PLAYWRIGHT_BROWSERS_PATH..." \
|
||||
&& uv pip install "playwright>=1.49.1" \
|
||||
&& uv run playwright install chromium --with-deps \
|
||||
&& uv run playwright install chromium --no-shell \
|
||||
# --with-deps \
|
||||
&& export CHROME_BINARY="$(uv run python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
|
||||
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
|
||||
&& mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
|
||||
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/home/${ARCHIVEBOX_USER}/.config" \
|
||||
&& mkdir -p "$PLAYWRIGHT_BROWSERS_PATH" \
|
||||
&& chown -R $ARCHIVEBOX_USER "$PLAYWRIGHT_BROWSERS_PATH" \
|
||||
# delete extra full copy of node that playwright installs (saves >100mb)
|
||||
&& rm -f /venv/lib/python$PYTHON_VERSION/site-packages/playwright/driver/node \
|
||||
# Save version info
|
||||
&& ( \
|
||||
uv pip show playwright \
|
||||
&& uv run playwright --version \
|
||||
# && uv run playwright --version \
|
||||
&& which chromium-browser && /usr/bin/chromium-browser --version || /usr/lib/chromium/chromium --version \
|
||||
&& echo -e '\n\n' \
|
||||
) | tee -a /VERSION.txt
|
||||
|
@ -304,15 +303,16 @@ RUN --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=brows
|
|||
ENV PATH="/home/$ARCHIVEBOX_USER/.npm/bin:$PATH"
|
||||
USER $ARCHIVEBOX_USER
|
||||
WORKDIR "/home/$ARCHIVEBOX_USER/.npm"
|
||||
RUN --mount=type=cache,target=/home/$ARCHIVEBOX_USER/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \
|
||||
echo "[+] NPM Installing extractor dependencies into /home/$ARCHIVEBOX_USER/.npm..." \
|
||||
RUN --mount=type=cache,target=/home/archivebox/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \
|
||||
echo "[+] NPM Installing node extractor dependencies into /home/$ARCHIVEBOX_USER/.npm..." \
|
||||
&& npm config set prefix "/home/$ARCHIVEBOX_USER/.npm" \
|
||||
&& npm install --global --prefer-offline --no-fund --no-audit --cache "/home/$ARCHIVEBOX_USER/.npm_cache" \
|
||||
"@postlight/parser@^2.2.3" \
|
||||
"readability-extractor@github:ArchiveBox/readability-extractor" \
|
||||
"single-file-cli@^1.1.54" \
|
||||
"puppeteer@^23.5.0" \
|
||||
"@puppeteer/browsers@^2.4.0"
|
||||
"@puppeteer/browsers@^2.4.0" \
|
||||
&& rm -Rf "/home/$ARCHIVEBOX_USER/.cache/puppeteer"
|
||||
USER root
|
||||
WORKDIR "$CODE_DIR"
|
||||
RUN ( \
|
||||
|
@ -328,13 +328,14 @@ RUN ( \
|
|||
######### Build Dependencies ####################################
|
||||
|
||||
|
||||
|
||||
# Install ArchiveBox Python venv dependencies from uv.lock
|
||||
COPY --chown=root:root --chmod=755 "pyproject.toml" "uv.lock" "$CODE_DIR"/
|
||||
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
|
||||
RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
|
||||
--mount=type=bind,source=uv.lock,target=/app/uv.lock \
|
||||
--mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[+] PIP Installing ArchiveBox dependencies from pyproject.toml and uv.lock..." \
|
||||
&& uv sync \
|
||||
--frozen \
|
||||
--inexact \
|
||||
--all-extras \
|
||||
--no-install-project \
|
||||
--no-install-workspace
|
||||
|
@ -345,8 +346,9 @@ COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
|
|||
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
|
||||
echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \
|
||||
&& uv sync \
|
||||
--all-extras \
|
||||
--frozen \
|
||||
--inexact \
|
||||
--all-extras \
|
||||
&& ( \
|
||||
uv tree \
|
||||
&& which archivebox \
|
||||
|
|
47
bin/docker_layers.sh
Executable file
47
bin/docker_layers.sh
Executable file
|
@ -0,0 +1,47 @@
|
|||
#!/usr/bin/env bash

# This script takes a single Docker image tag (e.g. "ubuntu:latest") as input
# and shows the contents of the filesystem for each layer in the image.
#
# Usage: bin/docker_layers.sh <image_tag>
#
# The image is saved to ./tmp/image.tar (relative to the current directory) and
# unpacked there. Every layer blob found in blobs/sha256 is then extracted into
# its own "<blob>.layer" directory and listed two levels deep.
#
# Requires: docker, tar. Optional: pv (progress display), tree (nicer listing).

# Without pipefail the exit status of `docker save | pv` is that of `pv` alone,
# so a failed `docker save` would go unnoticed.
set -o pipefail

if [ $# -ne 1 ]; then
    echo "Usage: $0 <image_tag>" >&2
    exit 1
fi

IMAGE="$1"

# Deliberately not named TMPDIR: if TMPDIR is already exported in the environment,
# reassigning it would also change the temp dir used by docker, tar, etc.
WORK_DIR="$PWD/tmp"
mkdir -p "${WORK_DIR}" || exit 1

# Write the image archive for tag $1 to file $2, showing progress via pv when available.
save_image() {
    if command -v pv > /dev/null 2>&1; then
        docker save "$1" | pv > "$2"
    else
        docker save "$1" > "$2"
    fi
}

# Save the Docker image to a tar archive
echo "Saving Docker image '$IMAGE'..."
if ! save_image "$IMAGE" "${WORK_DIR}/image.tar"; then
    echo "Failed to save image '$IMAGE'. Make sure the image exists and Docker is running." >&2
    # Only remove the partial archive: WORK_DIR is a shared ./tmp that may hold other files
    rm -f "${WORK_DIR}/image.tar"
    exit 1
fi

cd "${WORK_DIR}" || exit 1

# Extract the top-level metadata of the image tar
echo "Extracting image metadata..."
pwd
# No -z: the archive from `docker save` is normally an uncompressed tar, and GNU tar
# fails when gzip is forced on non-gzip input. Plain -x auto-detects the format.
tar -xf image.tar || exit 1
# Make sure the owner can read/traverse/delete what was unpacked (no world-writable bits)
chmod -R u+rwX .

# NOTE: this script expects the OCI image layout (layer blobs under blobs/sha256)
if ! cd blobs/sha256; then
    echo "Directory blobs/sha256 not found in the saved image archive." >&2
    exit 1
fi

# blobs/sha256 holds one file per blob. Layer blobs are tar archives; the other
# blobs (image manifest/config JSON) are not and are left untouched.
for BLOB in ./*; do
    # Skip directories, e.g. "<blob>.layer" output left over from a previous run
    [ -f "${BLOB}" ] || continue

    LAYER_DIR="${BLOB}.layer"
    mkdir -p "${LAYER_DIR}"

    if ! TAR_ERRORS="$(tar -xf "${BLOB}" -C "${LAYER_DIR}" 2>&1)"; then
        # rmdir only succeeds on an empty directory: nothing was extracted,
        # so this blob is not a layer archive
        if rmdir "${LAYER_DIR}" 2> /dev/null; then
            continue
        fi
        # Partially extracted layer: surface tar's messages but still list what we got
        echo "${TAR_ERRORS}" >&2
    fi

    # The blob has been unpacked; drop the archive to avoid keeping two copies on disk
    rm -f "${BLOB}"
    # Layers may contain read-only paths; keep them listable and removable by the owner
    chmod -R u+rwX "${LAYER_DIR}"

    echo "-----------------------------------------------------------------"
    echo "Contents of layer: ${BLOB%/}"
    echo "-----------------------------------------------------------------"
    # List the extracted layer filesystem, two levels deep
    if command -v tree > /dev/null 2>&1; then
        tree -L 2 "${LAYER_DIR}"
    else
        find "${LAYER_DIR}" -maxdepth 2
    fi
    echo
done
|
Loading…
Add table
Add a link
Reference in a new issue