From 54d4d7f640a8d5076690003f3d93bc7f32e45102 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 18 Dec 2024 04:43:53 -0800 Subject: [PATCH] bring image back down to 700mb --- .dockerignore | 1 + Dockerfile | 106 ++++++++++++++++++++++--------------------- bin/docker_layers.sh | 47 +++++++++++++++++++ 3 files changed, 102 insertions(+), 52 deletions(-) create mode 100755 bin/docker_layers.sh diff --git a/.dockerignore b/.dockerignore index 24cc1d5c..fac517b4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -37,6 +37,7 @@ docker/ website/ typings/ +tmp/ data/ data*/ output/ diff --git a/Dockerfile b/Dockerfile index 8120211e..3abdb6d7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,8 +28,7 @@ ######################################################################################### -FROM python:3.11-slim-bookworm -# FROM debian:bookworm-backports # Tried using faster bookworm-backports but wasn't worth it due to more frequent breakages: https://packages.debian.org/bookworm-backports/ +FROM ubuntu:24.04 LABEL name="archivebox" \ maintainer="Nick Sweeting " \ @@ -55,7 +54,6 @@ ARG TARGETPLATFORM ARG TARGETOS ARG TARGETARCH ARG TARGETVARIANT - ######### Environment Variables ################################# # Global built-time and runtime environment constants + default pkg manager config @@ -71,7 +69,7 @@ ENV TZ=UTC \ npm_config_loglevel=error # Language Version config -ENV PYTHON_VERSION=3.11 \ +ENV PYTHON_VERSION=3.12 \ NODE_VERSION=22 # Non-root User config @@ -96,9 +94,6 @@ SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", " # Detect ArchiveBox version number by reading pyproject.toml (also serves to invalidate the entire build cache whenever pyproject.toml changes) WORKDIR "$CODE_DIR" -RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \ - grep '^version = ' "/app/pyproject.toml" | awk -F'"' '{print $2}' > /VERSION.txt - # Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up 
back-to-back Docker builds) RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \ && echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \ @@ -106,7 +101,7 @@ RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d && rm -f /etc/apt/apt.conf.d/docker-clean # Print debug info about build and save it to disk, for human eyes only, not used by anything else -RUN (echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \ +RUN (echo "[i] Docker build for ArchiveBox starting..." \ && echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \ && echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \ && echo \ @@ -134,10 +129,9 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \ # Install system apt dependencies (adding backports to access more recent apt updates) RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ echo "[+] APT Installing base system dependencies for $TARGETPLATFORM..." \ - && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \ && mkdir -p /etc/apt/keyrings \ && apt-get update -qq \ - && apt-get install -qq -y -t bookworm-backports \ + && apt-get install -qq -y \ # 1. packaging dependencies apt-transport-https ca-certificates apt-utils gnupg2 curl wget \ # 2. 
docker and init system dependencies @@ -147,6 +141,27 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T # nano iputils-ping dnsutils htop procps jq yq && rm -rf /var/lib/apt/lists/* +# Install apt binary dependencies for exractors +# COPY --from=selenium/ffmpeg:latest /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." \ + && apt-get update -qq \ + && apt-get install -qq -y --no-install-recommends \ + git ffmpeg ripgrep \ + # Packages we have also needed in the past: + # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \ + # curl wget (already installed above) + && rm -rf /var/lib/apt/lists/* \ + # Save version info + && ( \ + which curl && curl --version | head -n1 \ + && which wget && wget --version 2>&1 | head -n1 \ + && which git && git --version 2>&1 | head -n1 \ + && which ffmpeg && (ffmpeg --version 2>&1 | head -n1) || true \ + && which rg && rg --version 2>&1 | head -n1 \ + && echo -e '\n\n' \ + ) | tee -a /VERSION.txt + # Install sonic search backend COPY --from=archivebox/sonic:1.4.9 /usr/local/bin/sonic /usr/local/bin/sonic COPY --chown=root:root --chmod=755 "etc/sonic.cfg" /etc/sonic.cfg @@ -160,7 +175,7 @@ RUN (which sonic && sonic --version) | tee -a /VERSION.txt # --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \ # RUN echo "[+] APT Installing PYTHON $PYTHON_VERSION for $TARGETPLATFORM (skipped, provided by base image)..." 
\ # && apt-get update -qq \ - # && apt-get install -qq -y -t bookworm-backports --no-upgrade \ + # && apt-get install -qq -y --no-upgrade \ # python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip python${PYTHON_VERSION}-venv pipx \ # && rm -rf /var/lib/apt/lists/* \ # tell PDM to allow using global system python site packages @@ -188,8 +203,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \ && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \ && apt-get update -qq \ - && apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \ - && apt-get install -y -t bookworm-backports --no-upgrade \ + && apt-get install -qq -y --no-upgrade libatomic1 \ + && apt-get install -y --no-upgrade \ nodejs \ && rm -rf /var/lib/apt/lists/* \ # Update NPM to latest version @@ -205,25 +220,23 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T # Set up uv and main app /venv COPY --from=ghcr.io/astral-sh/uv:0.5 /uv /uvx /bin/ ENV UV_COMPILE_BYTECODE=1 \ + UV_PYTHON_PREFERENCE=only-system \ UV_LINK_MODE=copy \ - UV_PROJECT_ENVIRONMENT=/venv \ - PATH="/venv/bin:$PATH" + UV_PROJECT_ENVIRONMENT=/venv WORKDIR "$CODE_DIR" # COPY --chown=root:root --chmod=755 pyproject.toml "$CODE_DIR/" RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ echo "[+] UV Creating /venv using python ${PYTHON_VERSION} for ${TARGETPLATFORM} (provided by base image)..." 
\ - && uv venv \ - && uv pip install setuptools pip \ - && ln -s /venv "$CODE_DIR/.venv" \ + && uv venv /venv +ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH" +RUN uv pip install setuptools pip \ && ( \ - which python3 && python3 --version | grep " $PYTHON_VERSION" \ - && which pip && pip --version \ + which python3 && python3 --version \ && which uv && uv version \ && echo -e '\n\n' \ ) | tee -a /VERSION.txt - ######### ArchiveBox & Extractor Dependencies ################################## # Install ArchiveBox C-compiled/apt-installed Python dependencies in app /venv (currently only used for python-ldap) @@ -233,41 +246,24 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T #--mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \ echo "[+] APT Installing + Compiling python3-ldap for PIP archivebox[ldap] on ${TARGETPLATFORM}..." \ && apt-get update -qq \ - && apt-get install -qq -y -t bookworm-backports --no-install-recommends \ + && apt-get install -qq -y --no-install-recommends \ build-essential gcc \ - libssl-dev libldap2-dev libsasl2-dev python3-ldap \ + python3-dev libssl-dev libldap2-dev libsasl2-dev python3-ldap \ python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \ && uv pip install \ "python-ldap>=3.4.3" \ && apt-get purge -y \ - build-essential gcc \ + python3-dev build-essential gcc \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* -# Install apt binary dependencies for exractors -RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ - echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." 
\ - && apt-get update -qq \ - && apt-get install -qq -y -t bookworm-backports \ - curl wget git ffmpeg ripgrep pipx \ - # Packages we have also needed in the past: - # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \ - && rm -rf /var/lib/apt/lists/* \ - # Save version info - && ( \ - which curl && curl --version | head -n1 \ - && which wget && wget --version 2>&1 | head -n1 \ - && which git && git --version 2>&1 | head -n1 \ - && which rg && rg --version 2>&1 | head -n1 \ - && echo -e '\n\n' \ - ) | tee -a /VERSION.txt - # Install apt font & rendering dependencies for chromium browser +# TODO: figure out how much of this overlaps with `playwright install-deps chromium` RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ echo "[+] APT Installing CHROMIUM dependencies, fonts, and display libraries for $TARGETPLATFORM..." \ && apt-get update -qq \ - && apt-get install -qq -y -t bookworm-backports \ + && apt-get install -qq -y \ fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \ at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \ libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \ @@ -285,17 +281,20 @@ RUN --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=brows --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ echo "[+] PIP Installing playwright into /venv and CHROMIUM binary into $PLAYWRIGHT_BROWSERS_PATH..." 
\ && uv pip install "playwright>=1.49.1" \ - && uv run playwright install chromium --with-deps \ + && uv run playwright install chromium --no-shell \ + # --with-deps \ && export CHROME_BINARY="$(uv run python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \ && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \ && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \ && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/home/${ARCHIVEBOX_USER}/.config" \ && mkdir -p "$PLAYWRIGHT_BROWSERS_PATH" \ && chown -R $ARCHIVEBOX_USER "$PLAYWRIGHT_BROWSERS_PATH" \ + # delete extra full copy of node that playwright installs (saves >100mb) + && rm -f /venv/lib/python$PYTHON_VERSION/site-packages/playwright/driver/node \ # Save version info && ( \ uv pip show playwright \ - && uv run playwright --version \ + # && uv run playwright --version \ && which chromium-browser && /usr/bin/chromium-browser --version || /usr/lib/chromium/chromium --version \ && echo -e '\n\n' \ ) | tee -a /VERSION.txt @@ -304,15 +303,16 @@ RUN --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=brows ENV PATH="/home/$ARCHIVEBOX_USER/.npm/bin:$PATH" USER $ARCHIVEBOX_USER WORKDIR "/home/$ARCHIVEBOX_USER/.npm" -RUN --mount=type=cache,target=/home/$ARCHIVEBOX_USER/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \ - echo "[+] NPM Installing extractor dependencies into /home/$ARCHIVEBOX_USER/.npm..." \ +RUN --mount=type=cache,target=/home/archivebox/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \ + echo "[+] NPM Installing node extractor dependencies into /home/$ARCHIVEBOX_USER/.npm..." 
\ && npm config set prefix "/home/$ARCHIVEBOX_USER/.npm" \ && npm install --global --prefer-offline --no-fund --no-audit --cache "/home/$ARCHIVEBOX_USER/.npm_cache" \ "@postlight/parser@^2.2.3" \ "readability-extractor@github:ArchiveBox/readability-extractor" \ "single-file-cli@^1.1.54" \ "puppeteer@^23.5.0" \ - "@puppeteer/browsers@^2.4.0" + "@puppeteer/browsers@^2.4.0" \ + && rm -Rf "/home/$ARCHIVEBOX_USER/.cache/puppeteer" USER root WORKDIR "$CODE_DIR" RUN ( \ @@ -328,13 +328,14 @@ RUN ( \ ######### Build Dependencies #################################### - # Install ArchiveBox Python venv dependencies from uv.lock -COPY --chown=root:root --chmod=755 "pyproject.toml" "uv.lock" "$CODE_DIR"/ -RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ +RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \ + --mount=type=bind,source=uv.lock,target=/app/uv.lock \ + --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ echo "[+] PIP Installing ArchiveBox dependencies from pyproject.toml and uv.lock..." \ && uv sync \ --frozen \ + --inexact \ --all-extras \ --no-install-project \ --no-install-workspace @@ -345,8 +346,9 @@ COPY --chown=root:root --chmod=755 "." "$CODE_DIR/" RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \ && uv sync \ - --all-extras \ --frozen \ + --inexact \ + --all-extras \ && ( \ uv tree \ && which archivebox \ diff --git a/bin/docker_layers.sh b/bin/docker_layers.sh new file mode 100755 index 00000000..be849820 --- /dev/null +++ b/bin/docker_layers.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +# This script takes a single Docker image tag (e.g. "ubuntu:latest") as input +# and shows the contents of the filesystem for each layer in the image. 
+
+# Make `docker save | pv` report failure when docker fails; without pipefail
+# the pipeline's exit status is pv's alone and a failed save goes unnoticed.
+set -o pipefail
+
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 IMAGE[:TAG]" >&2
+    exit 1
+fi
+
+# Bail out early with a clear message if a required tool is missing.
+for TOOL in docker pv tar tree; do
+    if ! command -v "$TOOL" > /dev/null 2>&1; then
+        echo "Required command '$TOOL' was not found in PATH." >&2
+        exit 1
+    fi
+done
+
+IMAGE=$1
+# Work inside ./tmp so the extracted files stay around for inspection.
+# A dedicated variable is used rather than overwriting TMPDIR, which other
+# tools consult to locate their own temporary directory.
+WORK_DIR="$PWD/tmp"
+mkdir -p "$WORK_DIR"
+
+# Save the Docker image to a tar archive
+echo "Saving Docker image '$IMAGE'..."
+if ! docker save "$IMAGE" | pv > "${WORK_DIR}/image.tar"; then
+    echo "Failed to save image '$IMAGE'. Make sure the image exists and Docker is running." >&2
+    # Remove only the partial archive: ./tmp may contain files from earlier runs.
+    rm -f "${WORK_DIR}/image.tar"
+    exit 1
+fi
+
+cd "${WORK_DIR}" || exit 1
+
+# Extract the top-level metadata of the image tar.
+# `docker save` emits an uncompressed tar, so -z must not be forced;
+# plain -x lets tar detect the archive format by itself.
+echo "Extracting image metadata..."
+pwd
+if ! tar -xf image.tar; then
+    echo "Failed to extract ${WORK_DIR}/image.tar." >&2
+    exit 1
+fi
+# Owner gets full access, everyone else read-only (no world-writable files).
+chmod -R u+rwX,go+rX .
+if ! cd blobs/sha256; then
+    echo "No blobs/sha256 directory found; the image was not saved in OCI layout." >&2
+    exit 1
+fi
+
+# In the OCI layout every regular file here is a blob named after its digest:
+# layer tarballs as well as JSON manifests and configs.
+for LAYERFILE in ./*; do
+    [ -f "${LAYERFILE}" ] || continue
+    # Leave blobs that are not tar archives (manifest / config JSON) untouched.
+    tar -tf "${LAYERFILE}" > /dev/null 2>&1 || continue
+    # Unpack every layer into its own directory so that the listing below
+    # shows this layer only, not the files of previously extracted layers.
+    LAYERDIR="${LAYERFILE}.d"
+    mkdir -p "${LAYERDIR}"
+    tar -xf "${LAYERFILE}" -C "${LAYERDIR}"
+    # The extracted copy replaces the blob, as in the original workflow.
+    rm "${LAYERFILE}"
+    echo "-----------------------------------------------------------------"
+    echo "Contents of layer: ${LAYERFILE%/}"
+    echo "-----------------------------------------------------------------"
+    tree -L 2 "${LAYERDIR}"
+    echo
+done