From 1fc5d7c5c8aa9075ee05d7f7a7e2c8dc1d23fcd0 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 18 Mar 2024 14:39:09 -0700 Subject: [PATCH 1/4] add USER_AGENT config option to set all USER_AGENTs at once --- archivebox/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index abc83f79..74e7ee58 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -142,9 +142,10 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, - 'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'}, - 'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'}, - 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'}, + 'USER_AGENT': {'type': str, 'default': None}, + 'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'}, + 'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'}, + 'CHROME_USER_AGENT': 
{'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'}, 'COOKIES_FILE': {'type': str, 'default': None}, 'CHROME_USER_DATA_DIR': {'type': str, 'default': None}, From c5bb99dce1cfc9f5f873f3d6a63bc1a92295690f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 18 Mar 2024 14:40:40 -0700 Subject: [PATCH 2/4] explicitly use Default profile inside user data dir --- archivebox/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 3647d538..3814c23f 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -299,10 +299,11 @@ def chrome_args(**options) -> List[str]: if options['CHROME_USER_DATA_DIR']: cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - + cmd_args.append('--profile-directory=Default') return dedupe(cmd_args) + def chrome_cleanup(): """ Cleans up any state or runtime files that chrome leaves behind when killed by From c0b5dbcecb3bc5c9ea6690d79ac43e60335202b7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 18 Mar 2024 14:41:39 -0700 Subject: [PATCH 3/4] create new data/personas dir to hold cookies and chrome profiles --- archivebox/config.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/archivebox/config.py b/archivebox/config.py index 74e7ee58..8b2f3a7e 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -281,6 +281,7 @@ TEMPLATES_DIR_NAME = 'templates' ARCHIVE_DIR_NAME = 'archive' SOURCES_DIR_NAME = 'sources' LOGS_DIR_NAME = 'logs' +PERSONAS_DIR_NAME = 'personas' SQL_INDEX_FILENAME = 'index.sqlite3' JSON_INDEX_FILENAME = 'index.json' HTML_INDEX_FILENAME = 'index.html' @@ -357,6 +358,7 @@ ALLOWED_IN_OUTPUT_DIR = { ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, + PERSONAS_DIR_NAME, SQL_INDEX_FILENAME, f'{SQL_INDEX_FILENAME}-wal', 
f'{SQL_INDEX_FILENAME}-shm', @@ -507,6 +509,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME}, 'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME}, 'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME}, + 'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME}, 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME}, 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()}, 'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None}, @@ -1026,6 +1029,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'enabled': True, 'is_valid': config['LOGS_DIR'].exists(), }, + 'PERSONAS_DIR': { + 'path': config['PERSONAS_DIR'].resolve(), + 'enabled': True, + 'is_valid': config['PERSONAS_DIR'].exists(), + }, 'ARCHIVE_DIR': { 'path': config['ARCHIVE_DIR'].resolve(), 'enabled': True, @@ -1373,6 +1381,8 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True) (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True) + (Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True) + (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True) From 8007e97c3f93dc763c95737e1452af95ba73ff5c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 18 Mar 2024 14:41:57 -0700 Subject: [PATCH 4/4] point archivebox to novnc display container by default --- Dockerfile | 9 ++++++++- docker-compose.yml | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 541c338a..82647329 100644 --- a/Dockerfile +++ b/Dockerfile @@ -266,7 +266,14 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T # Setup ArchiveBox runtime 
config WORKDIR "$DATA_DIR" -ENV IN_DOCKER=True +ENV IN_DOCKER=True \ + DISPLAY=novnc:0.0 \ + CUSTOM_TEMPLATES_DIR=/data/templates \ + CHROME_USER_DATA_DIR=/data/personas/Default/chromium \ + GOOGLE_API_KEY=no \ + GOOGLE_DEFAULT_CLIENT_ID=no \ + GOOGLE_DEFAULT_CLIENT_SECRET=no \ + ALLOWED_HOSTS=* ## No need to set explicitly, these values will be autodetected by archivebox in docker: # CHROME_SANDBOX=False \ # WGET_BINARY="wget" \ diff --git a/docker-compose.yml b/docker-compose.yml index d8342216..ea3d3ab7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -124,6 +124,21 @@ services: # - ./data:/var/www + ### Example: Watch the ArchiveBox browser in realtime as it archives things, + # or remote control it to set up logins and credentials for sites you want to archive. + # https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile + + novnc: + image: theasp/novnc:latest + environment: + - DISPLAY_WIDTH=1920 + - DISPLAY_HEIGHT=1080 + - RUN_XTERM=no + ports: + # to view/control ArchiveBox's browser, visit: http://localhost:8080/vnc.html + - "8080:8080" + + ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel # wireguard: