mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
Merge branch 'dev' into plugins-browsertrix
This commit is contained in:
commit
c22df0b63a
4 changed files with 38 additions and 5 deletions
|
@ -267,7 +267,13 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
|
||||||
# Setup ArchiveBox runtime config
|
# Setup ArchiveBox runtime config
|
||||||
WORKDIR "$DATA_DIR"
|
WORKDIR "$DATA_DIR"
|
||||||
ENV IN_DOCKER=True \
|
ENV IN_DOCKER=True \
|
||||||
CUSTOM_TEMPLATES_DIR=/data/templates
|
DISPLAY=novnc:0.0 \
|
||||||
|
CUSTOM_TEMPLATES_DIR=/data/templates \
|
||||||
|
CHROME_USER_DATA_DIR=/data/personas/Default/chromium \
|
||||||
|
GOOGLE_API_KEY=no \
|
||||||
|
GOOGLE_DEFAULT_CLIENT_ID=no \
|
||||||
|
GOOGLE_DEFAULT_CLIENT_SECRET=no \
|
||||||
|
ALLOWED_HOSTS=*
|
||||||
## No need to set explicitly, these values will be autodetected by archivebox in docker:
|
## No need to set explicitly, these values will be autodetected by archivebox in docker:
|
||||||
# CHROME_SANDBOX=False \
|
# CHROME_SANDBOX=False \
|
||||||
# WGET_BINARY="wget" \
|
# WGET_BINARY="wget" \
|
||||||
|
|
|
@ -142,9 +142,10 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
|
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
|
||||||
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
|
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
|
||||||
|
|
||||||
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
|
'USER_AGENT': {'type': str, 'default': None},
|
||||||
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
|
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
|
||||||
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
|
||||||
|
'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
||||||
|
|
||||||
'COOKIES_FILE': {'type': str, 'default': None},
|
'COOKIES_FILE': {'type': str, 'default': None},
|
||||||
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
|
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
|
||||||
|
@ -280,6 +281,7 @@ TEMPLATES_DIR_NAME = 'templates'
|
||||||
ARCHIVE_DIR_NAME = 'archive'
|
ARCHIVE_DIR_NAME = 'archive'
|
||||||
SOURCES_DIR_NAME = 'sources'
|
SOURCES_DIR_NAME = 'sources'
|
||||||
LOGS_DIR_NAME = 'logs'
|
LOGS_DIR_NAME = 'logs'
|
||||||
|
PERSONAS_DIR_NAME = 'personas'
|
||||||
SQL_INDEX_FILENAME = 'index.sqlite3'
|
SQL_INDEX_FILENAME = 'index.sqlite3'
|
||||||
JSON_INDEX_FILENAME = 'index.json'
|
JSON_INDEX_FILENAME = 'index.json'
|
||||||
HTML_INDEX_FILENAME = 'index.html'
|
HTML_INDEX_FILENAME = 'index.html'
|
||||||
|
@ -356,6 +358,7 @@ ALLOWED_IN_OUTPUT_DIR = {
|
||||||
ARCHIVE_DIR_NAME,
|
ARCHIVE_DIR_NAME,
|
||||||
SOURCES_DIR_NAME,
|
SOURCES_DIR_NAME,
|
||||||
LOGS_DIR_NAME,
|
LOGS_DIR_NAME,
|
||||||
|
PERSONAS_DIR_NAME,
|
||||||
SQL_INDEX_FILENAME,
|
SQL_INDEX_FILENAME,
|
||||||
f'{SQL_INDEX_FILENAME}-wal',
|
f'{SQL_INDEX_FILENAME}-wal',
|
||||||
f'{SQL_INDEX_FILENAME}-shm',
|
f'{SQL_INDEX_FILENAME}-shm',
|
||||||
|
@ -506,6 +509,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
|
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
|
||||||
'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
|
'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
|
||||||
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
|
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
|
||||||
|
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
|
||||||
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
|
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
|
||||||
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
|
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
|
||||||
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
|
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
|
||||||
|
@ -1035,6 +1039,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
|
||||||
'enabled': True,
|
'enabled': True,
|
||||||
'is_valid': config['LOGS_DIR'].exists(),
|
'is_valid': config['LOGS_DIR'].exists(),
|
||||||
},
|
},
|
||||||
|
'PERSONAS': {
|
||||||
|
'path': config['PERSONAS'].resolve(),
|
||||||
|
'enabled': True,
|
||||||
|
'is_valid': config['PERSONAS'].exists(),
|
||||||
|
},
|
||||||
'ARCHIVE_DIR': {
|
'ARCHIVE_DIR': {
|
||||||
'path': config['ARCHIVE_DIR'].resolve(),
|
'path': config['ARCHIVE_DIR'].resolve(),
|
||||||
'enabled': True,
|
'enabled': True,
|
||||||
|
@ -1382,6 +1391,8 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
|
||||||
|
|
||||||
(Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
|
(Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
|
||||||
(Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
|
(Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
|
||||||
|
(Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
|
||||||
|
(Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -303,10 +303,11 @@ def chrome_args(**options) -> List[str]:
|
||||||
|
|
||||||
if options['CHROME_USER_DATA_DIR']:
|
if options['CHROME_USER_DATA_DIR']:
|
||||||
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
|
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
|
||||||
|
cmd_args.append('--profile-directory=Default')
|
||||||
|
|
||||||
return dedupe(cmd_args)
|
return dedupe(cmd_args)
|
||||||
|
|
||||||
|
|
||||||
def chrome_cleanup():
|
def chrome_cleanup():
|
||||||
"""
|
"""
|
||||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
Cleans up any state or runtime files that chrome leaves behind when killed by
|
||||||
|
|
|
@ -135,6 +135,21 @@ services:
|
||||||
# - ./data:/var/www
|
# - ./data:/var/www
|
||||||
|
|
||||||
|
|
||||||
|
### Example: Watch the ArchiveBox browser in realtime as it archives things,
|
||||||
|
# or remote control it to set up logins and credentials for sites you want to archive.
|
||||||
|
# https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile
|
||||||
|
|
||||||
|
novnc:
|
||||||
|
image: theasp/novnc:latest
|
||||||
|
environment:
|
||||||
|
- DISPLAY_WIDTH=1920
|
||||||
|
- DISPLAY_HEIGHT=1080
|
||||||
|
- RUN_XTERM=no
|
||||||
|
ports:
|
||||||
|
# to view/control ArchiveBox's browser, visit: http://localhost:8080/vnc.html
|
||||||
|
- "8080:8080"
|
||||||
|
|
||||||
|
|
||||||
### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
|
### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
|
||||||
|
|
||||||
# wireguard:
|
# wireguard:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue