bring image back down to 700mb
Some checks are pending
CodeQL / Analyze (python) (push) Waiting to run
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Deploy static content to Pages / deploy (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Build GitHub Pages website / build (push) Waiting to run
Build GitHub Pages website / deploy (push) Blocked by required conditions
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run

This commit is contained in:
Nick Sweeting 2024-12-18 04:43:53 -08:00
parent 47a7cabc68
commit 54d4d7f640
No known key found for this signature in database
3 changed files with 102 additions and 52 deletions

View file

@ -37,6 +37,7 @@ docker/
website/
typings/
tmp/
data/
data*/
output/

View file

@ -28,8 +28,7 @@
#########################################################################################
FROM python:3.11-slim-bookworm
# FROM debian:bookworm-backports # Tried using faster bookworm-backports but wasn't worth it due to more frequent breakages: https://packages.debian.org/bookworm-backports/
FROM ubuntu:24.04
LABEL name="archivebox" \
maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
@ -55,7 +54,6 @@ ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH
ARG TARGETVARIANT
######### Environment Variables #################################
# Global build-time and runtime environment constants + default pkg manager config
@ -71,7 +69,7 @@ ENV TZ=UTC \
npm_config_loglevel=error
# Language Version config
ENV PYTHON_VERSION=3.11 \
ENV PYTHON_VERSION=3.12 \
NODE_VERSION=22
# Non-root User config
@ -96,9 +94,6 @@ SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", "
# Detect ArchiveBox version number by reading pyproject.toml (also serves to invalidate the entire build cache whenever pyproject.toml changes)
WORKDIR "$CODE_DIR"
RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
grep '^version = ' "/app/pyproject.toml" | awk -F'"' '{print $2}' > /VERSION.txt
# Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up back-to-back Docker builds)
RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \
&& echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \
@ -106,7 +101,7 @@ RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d
&& rm -f /etc/apt/apt.conf.d/docker-clean
# Print debug info about build and save it to disk, for human eyes only, not used by anything else
RUN (echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \
RUN (echo "[i] Docker build for ArchiveBox starting..." \
&& echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
&& echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \
&& echo \
@ -134,10 +129,9 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
# Install system apt dependencies (adding backports to access more recent apt updates)
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
echo "[+] APT Installing base system dependencies for $TARGETPLATFORM..." \
&& echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \
&& mkdir -p /etc/apt/keyrings \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports \
&& apt-get install -qq -y \
# 1. packaging dependencies
apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
# 2. docker and init system dependencies
@ -147,6 +141,27 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
# nano iputils-ping dnsutils htop procps jq yq
&& rm -rf /var/lib/apt/lists/*
# Install apt binary dependencies for extractors (git, ffmpeg, ripgrep) and
# append the detected tool versions to /VERSION.txt.
# /var/cache/apt is a BuildKit cache mount, so downloaded .deb files are reused
# between builds without being stored in the image layer.
# COPY --from=selenium/ffmpeg:latest /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
    echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." \
    && apt-get update -qq \
    && apt-get install -qq -y --no-install-recommends \
        git ffmpeg ripgrep \
        # Packages we have also needed in the past:
        # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
        # curl wget (already installed above)
    # Drop the apt package index in the same layer that created it
    && rm -rf /var/lib/apt/lists/* \
    # Save version info
    && ( \
        which curl && curl --version | head -n1 \
        && which wget && wget --version 2>&1 | head -n1 \
        && which git && git --version 2>&1 | head -n1 \
        # ffmpeg only understands the single-dash form `-version`; `--version` is
        # rejected as an unrecognized option. The `|| true` keeps the report
        # best-effort up to this point (e.g. if head closing the pipe early makes
        # the pipeline exit non-zero under the pipefail SHELL).
        && which ffmpeg && (ffmpeg -version 2>&1 | head -n1) || true \
        && which rg && rg --version 2>&1 | head -n1 \
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt
# Install sonic search backend
COPY --from=archivebox/sonic:1.4.9 /usr/local/bin/sonic /usr/local/bin/sonic
COPY --chown=root:root --chmod=755 "etc/sonic.cfg" /etc/sonic.cfg
@ -160,7 +175,7 @@ RUN (which sonic && sonic --version) | tee -a /VERSION.txt
# --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
# RUN echo "[+] APT Installing PYTHON $PYTHON_VERSION for $TARGETPLATFORM (skipped, provided by base image)..." \
# && apt-get update -qq \
# && apt-get install -qq -y -t bookworm-backports --no-upgrade \
# && apt-get install -qq -y --no-upgrade \
# python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip python${PYTHON_VERSION}-venv pipx \
# && rm -rf /var/lib/apt/lists/* \
# tell PDM to allow using global system python site packages
@ -188,8 +203,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
&& echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
&& curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \
&& apt-get install -y -t bookworm-backports --no-upgrade \
&& apt-get install -qq -y --no-upgrade libatomic1 \
&& apt-get install -y --no-upgrade \
nodejs \
&& rm -rf /var/lib/apt/lists/* \
# Update NPM to latest version
@ -205,25 +220,23 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
# Set up uv and main app /venv
COPY --from=ghcr.io/astral-sh/uv:0.5 /uv /uvx /bin/
ENV UV_COMPILE_BYTECODE=1 \
UV_PYTHON_PREFERENCE=only-system \
UV_LINK_MODE=copy \
UV_PROJECT_ENVIRONMENT=/venv \
PATH="/venv/bin:$PATH"
UV_PROJECT_ENVIRONMENT=/venv
WORKDIR "$CODE_DIR"
# COPY --chown=root:root --chmod=755 pyproject.toml "$CODE_DIR/"
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
echo "[+] UV Creating /venv using python ${PYTHON_VERSION} for ${TARGETPLATFORM} (provided by base image)..." \
&& uv venv \
&& uv pip install setuptools pip \
&& ln -s /venv "$CODE_DIR/.venv" \
&& uv venv /venv
ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH"
RUN uv pip install setuptools pip \
&& ( \
which python3 && python3 --version | grep " $PYTHON_VERSION" \
&& which pip && pip --version \
which python3 && python3 --version \
&& which uv && uv version \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
######### ArchiveBox & Extractor Dependencies ##################################
# Install ArchiveBox C-compiled/apt-installed Python dependencies in app /venv (currently only used for python-ldap)
@ -233,41 +246,24 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
#--mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
echo "[+] APT Installing + Compiling python3-ldap for PIP archivebox[ldap] on ${TARGETPLATFORM}..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
&& apt-get install -qq -y --no-install-recommends \
build-essential gcc \
libssl-dev libldap2-dev libsasl2-dev python3-ldap \
python3-dev libssl-dev libldap2-dev libsasl2-dev python3-ldap \
python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
&& uv pip install \
"python-ldap>=3.4.3" \
&& apt-get purge -y \
build-essential gcc \
python3-dev build-essential gcc \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
# Install apt binary dependencies for extractors
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports \
curl wget git ffmpeg ripgrep pipx \
# Packages we have also needed in the past:
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
&& rm -rf /var/lib/apt/lists/* \
# Save version info
&& ( \
which curl && curl --version | head -n1 \
&& which wget && wget --version 2>&1 | head -n1 \
&& which git && git --version 2>&1 | head -n1 \
&& which rg && rg --version 2>&1 | head -n1 \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
# Install apt font & rendering dependencies for chromium browser
# TODO: figure out how much of this overlaps with `playwright install-deps chromium`
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
echo "[+] APT Installing CHROMIUM dependencies, fonts, and display libraries for $TARGETPLATFORM..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports \
&& apt-get install -qq -y \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \
libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \
@ -285,17 +281,20 @@ RUN --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=brows
--mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
echo "[+] PIP Installing playwright into /venv and CHROMIUM binary into $PLAYWRIGHT_BROWSERS_PATH..." \
&& uv pip install "playwright>=1.49.1" \
&& uv run playwright install chromium --with-deps \
&& uv run playwright install chromium --no-shell \
# --with-deps \
&& export CHROME_BINARY="$(uv run python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
&& mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/home/${ARCHIVEBOX_USER}/.config" \
&& mkdir -p "$PLAYWRIGHT_BROWSERS_PATH" \
&& chown -R $ARCHIVEBOX_USER "$PLAYWRIGHT_BROWSERS_PATH" \
# delete extra full copy of node that playwright installs (saves >100mb)
&& rm -f /venv/lib/python$PYTHON_VERSION/site-packages/playwright/driver/node \
# Save version info
&& ( \
uv pip show playwright \
&& uv run playwright --version \
# && uv run playwright --version \
&& which chromium-browser && /usr/bin/chromium-browser --version || /usr/lib/chromium/chromium --version \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
@ -304,15 +303,16 @@ RUN --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=brows
ENV PATH="/home/$ARCHIVEBOX_USER/.npm/bin:$PATH"
USER $ARCHIVEBOX_USER
WORKDIR "/home/$ARCHIVEBOX_USER/.npm"
RUN --mount=type=cache,target=/home/$ARCHIVEBOX_USER/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \
echo "[+] NPM Installing extractor dependencies into /home/$ARCHIVEBOX_USER/.npm..." \
RUN --mount=type=cache,target=/home/archivebox/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \
echo "[+] NPM Installing node extractor dependencies into /home/$ARCHIVEBOX_USER/.npm..." \
&& npm config set prefix "/home/$ARCHIVEBOX_USER/.npm" \
&& npm install --global --prefer-offline --no-fund --no-audit --cache "/home/$ARCHIVEBOX_USER/.npm_cache" \
"@postlight/parser@^2.2.3" \
"readability-extractor@github:ArchiveBox/readability-extractor" \
"single-file-cli@^1.1.54" \
"puppeteer@^23.5.0" \
"@puppeteer/browsers@^2.4.0"
"@puppeteer/browsers@^2.4.0" \
&& rm -Rf "/home/$ARCHIVEBOX_USER/.cache/puppeteer"
USER root
WORKDIR "$CODE_DIR"
RUN ( \
@ -328,13 +328,14 @@ RUN ( \
######### Build Dependencies ####################################
# Install ArchiveBox Python venv dependencies from uv.lock
COPY --chown=root:root --chmod=755 "pyproject.toml" "uv.lock" "$CODE_DIR"/
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
--mount=type=bind,source=uv.lock,target=/app/uv.lock \
--mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
echo "[+] PIP Installing ArchiveBox dependencies from pyproject.toml and uv.lock..." \
&& uv sync \
--frozen \
--inexact \
--all-extras \
--no-install-project \
--no-install-workspace
@ -345,8 +346,9 @@ COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \
&& uv sync \
--all-extras \
--frozen \
--inexact \
--all-extras \
&& ( \
uv tree \
&& which archivebox \

47
bin/docker_layers.sh Executable file
View file

@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Show the filesystem contents of every layer in a Docker image.
#
# Usage: docker_layers.sh <image_tag>        (e.g. "ubuntu:latest")
#
# The image is exported with `docker save` into ./tmp (relative to the current
# directory) and unpacked there. Each layer blob under blobs/sha256 is then
# extracted into its own "<digest>.layer" directory and listed with `tree`.
#
# Requires: docker, tar, tree. `pv` is used as a progress meter when available.
# Exit status: 1 on bad usage, missing tools, or a failed save/extract.

# Without pipefail, `docker save ... | pv` would report only the status of the
# last command in the pipeline and a failed `docker save` would go unnoticed.
set -o pipefail

if [ $# -ne 1 ]; then
    echo "Usage: $0 <image_tag>"
    exit 1
fi

IMAGE=$1

for REQUIRED_CMD in docker tar tree; do
    if ! command -v "${REQUIRED_CMD}" > /dev/null 2>&1; then
        echo "Missing required command: ${REQUIRED_CMD}" >&2
        exit 1
    fi
done

# Use a dedicated variable instead of overriding TMPDIR, which other tools
# (e.g. mktemp) read from the environment.
WORK_DIR="$PWD/tmp"
mkdir -p "${WORK_DIR}" || exit 1

# The progress meter is optional; fall back to a plain pass-through.
if command -v pv > /dev/null 2>&1; then
    PROGRESS_CMD=pv
else
    PROGRESS_CMD=cat
fi

# Save the Docker image to a tar archive
echo "Saving Docker image '$IMAGE'..."
if ! docker save "$IMAGE" | "${PROGRESS_CMD}" > "${WORK_DIR}/image.tar"; then
    echo "Failed to save image '$IMAGE'. Make sure the image exists and Docker is running."
    # Remove only the partial archive: ./tmp may contain unrelated files.
    rm -f "${WORK_DIR}/image.tar"
    exit 1
fi

cd "${WORK_DIR}" || exit 1

# Extract the top-level metadata of the image tar
echo "Extracting image metadata..."
pwd
# `docker save` writes a plain, uncompressed tar, so -z (force gzip) must not be
# used here; tar detects any compression on its own when reading an archive.
if ! tar -xf image.tar; then
    echo "Failed to extract ${WORK_DIR}/image.tar" >&2
    exit 1
fi
# Make sure the current user can read and traverse everything that was unpacked.
chmod -R u+rwX .

# This script expects the OCI image layout, where every blob (layers, but
# possibly also JSON manifest/config documents) is a file named by its digest
# under blobs/sha256.
if ! cd blobs/sha256; then
    echo "No blobs/sha256 directory found in the saved image (unsupported archive layout)." >&2
    exit 1
fi

for LAYERFILE in ./*; do
    # Skip directories, e.g. "<digest>.layer" output from a previous run.
    [ -f "${LAYERFILE}" ] || continue

    # Skip blobs that are not tar archives instead of feeding them to tar.
    if ! tar -tf "${LAYERFILE}" > /dev/null 2>&1; then
        continue
    fi

    # Extract each layer into its own directory so the listing shows only what
    # this layer contains, not the union of every layer extracted so far.
    LAYERDIR="${LAYERFILE}.layer"
    mkdir -p "${LAYERDIR}"
    if ! tar -xf "${LAYERFILE}" -C "${LAYERDIR}"; then
        echo "Warning: some entries of ${LAYERFILE} could not be extracted" >&2
    fi
    rm "${LAYERFILE}"

    echo "-----------------------------------------------------------------"
    echo "Contents of layer: ${LAYERFILE%/}"
    echo "-----------------------------------------------------------------"

    # List the top two directory levels of the extracted layer
    tree -L 2 "${LAYERDIR}"
    echo
done