make chrome binary and configs directly runnable and make extractor use external bin

Nick Sweeting 2024-12-06 02:04:05 -08:00
parent a572db307b
commit ac53fdf677
7 changed files with 316 additions and 83 deletions

View file

@@ -3,11 +3,12 @@ This file provides the Django ABIDField and ABIDModel base model to inherit from
 """
-from typing import Any, Dict, Union, List, Set, cast
+import json
 
 from uuid import uuid4
 from functools import partial
 from pathlib import Path
+from typing import Any, Dict, Union, List, Set, cast
 
 from charidfield import CharIDField  # type: ignore[import-untyped]
 from django.contrib import admin
@@ -27,6 +28,7 @@ from django_stubs_ext.db.models import TypedModelMeta
 from archivebox.index.json import to_json
+from archivebox.misc.hashing import get_dir_info
 
 from .abid import (
     ABID,
@@ -590,18 +592,20 @@ class ModelWithOutputDir(ABIDModel):
         """Write the ./.index.merkle file to the output dir"""
         # write self.generate_merkle_tree() to self.output_dir / '.index.merkle'
         print(f'{type(self).__name__}[{self.ABID}].save_merkle_index()')
+        dir_info = get_dir_info(self.OUTPUT_DIR, max_depth=6)
+        with open(self.OUTPUT_DIR / '.hashes.json', 'w') as f:
+            json.dump(dir_info, f)
         pass
 
     def save_html_index(self, **kwargs) -> None:
         # write self.as_html() to self.output_dir / 'index.html'
         print(f'{type(self).__name__}[{self.ABID}].save_html_index()')
-        pass
+        (self.OUTPUT_DIR / 'index.html').write_text(self.as_html())
 
     def save_json_index(self, **kwargs) -> None:
         print(f'{type(self).__name__}[{self.ABID}].save_json_index()')
         # write self.as_json() to self.output_dir / 'index.json'
         (self.OUTPUT_DIR / 'index.json').write_text(to_json(self.as_json()))
-        pass
 
     def save_symlinks_index(self) -> None:
         print(f'{type(self).__name__}[{self.ABID}].save_symlinks_index()')
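
The new `save_merkle_index()` body above snapshots directory metadata into a `.hashes.json` sidecar. A hedged sketch of what that file plausibly contains, inferred only from the `dir_info['.']['num_subpaths']` / `num_bytes` / `hash_sha256` keys this same commit reads back elsewhere; the exact schema returned by `get_dir_info()` is an assumption:

```python
# hypothetical .hashes.json contents (keys inferred from usage in this diff)
{
    ".":          {"num_subpaths": 3, "num_bytes": 123456, "hash_sha256": "5994471abb01..."},
    "index.html": {"num_subpaths": 0, "num_bytes": 2048,   "hash_sha256": "9f86d081884c..."},
}
```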
@@ -610,26 +614,26 @@ class ModelWithOutputDir(ABIDModel):
         # ln -s self.output_dir data/archive/1453452234234.21445
         pass
 
-    def as_json(self) -> dict:
+    def as_json(self, *keys) -> dict:
         """Get the object's properties as a dict"""
-        # dump the object's properties to a json-ready dict
         return {
             'TYPE': self.TYPE,
-            'id': self.id,
+            'id': str(self.id),
             'abid': str(self.ABID),
             'str': str(self),
-            'modified_at': self.modified_at,
-            'created_at': self.created_at,
             'created_by_id': self.created_by_id,
+            'created_at': self.created_at,
+            'modified_at': self.modified_at,
             'status': getattr(self, 'status', None),
             'retry_at': getattr(self, 'retry_at', None),
             'notes': getattr(self, 'notes', None),
+            **{key: getattr(self, key) for key in keys},
         }
 
     def as_html(self) -> str:
         """Get the object's properties as a html string"""
         # render snapshot_detail.html template with self as context and return html string
-        return ''
+        return str(self)
 
 ####################################################
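
The widened `as_json(*keys)` signature above merges any extra attribute names into the base dict, which is what `save_json_index()` then serializes to `index.json`. A minimal sketch of the intended call pattern (the `snapshot` instance and its `url`/`timestamp` attributes are illustrative assumptions, not part of this diff):

```python
# hypothetical usage of the new as_json(*keys) / save_json_index() pair
snapshot.as_json()                    # -> {'TYPE': ..., 'id': ..., 'abid': ..., 'status': ..., ...}
snapshot.as_json('url', 'timestamp')  # -> same dict plus 'url' and 'timestamp' keys
snapshot.save_json_index()            # writes to_json(snapshot.as_json()) to OUTPUT_DIR / 'index.json'
```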

View file

@@ -104,6 +104,7 @@ SERVER_CONFIG = ServerConfig()
 class ArchivingConfig(BaseConfigSet):
     ONLY_NEW: bool = Field(default=True)
+    OVERWRITE: bool = Field(default=False)
 
     TIMEOUT: int = Field(default=60)
     MEDIA_TIMEOUT: int = Field(default=3600)

View file

@@ -16,7 +16,7 @@ from django.utils.text import slugify
 from django.utils import timezone
 from django.core.cache import cache
 from django.urls import reverse, reverse_lazy
-from django.db.models import Case, When, Value, IntegerField
+from django.db.models import Case, When, IntegerField
 from django.contrib import admin
 from django.conf import settings
@@ -25,7 +25,8 @@ import abx
 from archivebox.config import CONSTANTS
 from archivebox.misc.system import get_dir_size
-from archivebox.misc.util import parse_date, base_url
+from archivebox.misc.util import parse_date, base_url, domain as url_domain
+from archivebox.misc.hashing import get_dir_info
 from archivebox.index.schema import Link
 from archivebox.index.html import snapshot_icons
 from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
@@ -142,8 +143,20 @@ def validate_timestamp(value):
     assert value.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{value}"'
 
 class SnapshotManager(models.Manager):
+    def filter(self, *args, **kwargs):
+        """add support for .filter(domain='example.com') to Snapshot queryset"""
+        domain = kwargs.pop('domain', None)
+        qs = super().filter(*args, **kwargs)
+        if domain:
+            qs = qs.filter(url__icontains=f'://{domain}')
+        return qs
+
     def get_queryset(self):
-        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')    # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
+        return (
+            super().get_queryset()
+            .prefetch_related('tags', 'archiveresult_set')
+            # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
+        )
 
 class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithStateMachine, ABIDModel):
     abid_prefix = 'snp_'
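
The new `SnapshotManager.filter()` treats `domain=` as sugar for a URL substring match rather than a real database column. A sketch of two equivalent queries (assumes a populated Snapshot table; note that `icontains` will also match longer hostnames that merely contain the domain):

```python
# both querysets match the same rows; domain= is rewritten internally
Snapshot.objects.filter(domain='example.com')
Snapshot.objects.filter(url__icontains='://example.com')
```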
@@ -257,6 +270,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithStateMachine, ABIDModel):
             self.crawl.urls += f'\n{self.url}'
             self.crawl.save()
 
+    def output_dir_parent(self) -> str:
+        return 'archive'
+
+    def output_dir_name(self) -> str:
+        return str(self.timestamp)
+
     def archive(self, overwrite=False, methods=None):
         result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
         return result
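
With `output_dir_parent()`/`output_dir_name()` defined on Snapshot (and the matching properties added to ArchiveResult further down), output paths compose hierarchically. A sketch, assuming `OUTPUT_DIR` is built as `DATA_DIR / output_dir_parent / output_dir_name` (that composition rule itself is not shown in this diff):

```python
# hypothetical path composition under the stated assumption
snapshot_dir = DATA_DIR / 'archive' / str(snapshot.timestamp)  # e.g. data/archive/1453452234234.21445
result_dir = snapshot_dir / archiveresult.extractor            # e.g. data/archive/1453452234234.21445/wget
```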
@@ -339,6 +359,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithStateMachine, ABIDModel):
         # TODO: remove this
         return self.bookmarked
 
+    @cached_property
+    def domain(self) -> str:
+        return url_domain(self.url)
+
     @cached_property
     def is_archived(self):
         return self.as_link().is_archived
@@ -659,7 +683,8 @@ class ArchiveResult(ModelWithConfig, ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this ArchiveResult should have')
 
     # the network interface that was used to download this result
-    # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
+    # machine = models.ForeignKey(Machine, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Machine Used')
+    # network = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
 
     objects = ArchiveResultManager()
@@ -742,8 +767,7 @@ class ArchiveResult(ModelWithConfig, ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
         return None
 
     def legacy_output_path(self):
-        link = self.snapshot.as_link()
-        return link.canonical_outputs().get(f'{self.extractor}_path')
+        return self.canonical_outputs().get(f'{self.extractor}_path')
 
     def output_exists(self) -> bool:
         output_path = Path(self.snapshot_dir) / self.extractor
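
`legacy_output_path()` now resolves against ArchiveResult's own `canonical_outputs()` (added in the next hunk) instead of round-tripping through `snapshot.as_link()`. A sketch of the equivalence, assuming an ArchiveResult whose extractor is `'screenshot'`:

```python
# hypothetical lookup under the new code path
result.canonical_outputs()['screenshot_path']  # -> 'screenshot.png'
result.legacy_output_path()                    # same value, keyed by f'{result.extractor}_path'
```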
@@ -761,6 +785,89 @@ class ArchiveResult(ModelWithConfig, ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
             for key in args
         }
 
+    def canonical_outputs(self) -> Dict[str, Optional[str]]:
+        """Predict the expected output paths that should be present after archiving"""
+        # You'll need to implement the actual logic based on your requirements
+        # TODO: banish this awful duplication from the codebase and import these
+        # from their respective extractor files
+        from abx_plugin_favicon.config import FAVICON_CONFIG
+        canonical = {
+            'index_path': 'index.html',
+            'favicon_path': 'favicon.ico',
+            'google_favicon_path': FAVICON_CONFIG.FAVICON_PROVIDER.format(self.domain),
+            'wget_path': f'warc/{self.timestamp}',
+            'warc_path': 'warc/',
+            'singlefile_path': 'singlefile.html',
+            'readability_path': 'readability/content.html',
+            'mercury_path': 'mercury/content.html',
+            'htmltotext_path': 'htmltotext.txt',
+            'pdf_path': 'output.pdf',
+            'screenshot_path': 'screenshot.png',
+            'dom_path': 'output.html',
+            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
+            'git_path': 'git/',
+            'media_path': 'media/',
+            'headers_path': 'headers.json',
+        }
+        if self.is_static:
+            static_path = f'warc/{self.timestamp}'
+            canonical.update({
+                'title': self.basename,
+                'wget_path': static_path,
+                'pdf_path': static_path,
+                'screenshot_path': static_path,
+                'dom_path': static_path,
+                'singlefile_path': static_path,
+                'readability_path': static_path,
+                'mercury_path': static_path,
+                'htmltotext_path': static_path,
+            })
+        return canonical
+
+    @property
+    def output_dir_name(self) -> str:
+        return self.extractor
+
+    @property
+    def output_dir_parent(self) -> str:
+        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
+
+    @cached_property
+    def output_files(self) -> dict[str, dict]:
+        dir_info = get_dir_info(self.OUTPUT_DIR, max_depth=6)
+        with open(self.OUTPUT_DIR / '.hashes.json', 'w') as f:
+            json.dump(dir_info, f)
+        return dir_info
+
+    def announce_event(self, output_type: str, event: dict):
+        event = {
+            **event,
+            'type': output_type,
+        }
+
+        # if event references a file, make sure it exists on disk
+        if 'path' in event:
+            file_path = Path(self.OUTPUT_DIR) / event['path']
+            assert file_path.exists(), f'ArchiveResult[{self.ABID}].announce_event(): File does not exist: {file_path} ({event})'
+
+        with open(self.OUTPUT_DIR / '.events.jsonl', 'a') as f:
+            f.write(json.dumps(event, sort_keys=True, default=str) + '\n')
+
+    def events(self, filter_type: str | None=None) -> list[dict]:
+        events = []
+        try:
+            with open(self.OUTPUT_DIR / '.events.jsonl', 'r') as f:
+                for line in f:
+                    event = json.loads(line)
+                    if filter_type is None or event['type'] == filter_type:
+                        events.append(event)
+        except FileNotFoundError:
+            pass
+        return events
+
     def write_indexes(self):
         """Write the ArchiveResult json, html, and merkle indexes to output dir, and pass searchable text to the search backend"""
         super().write_indexes()
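
The `announce_event()`/`events()` pair above gives each ArchiveResult a tiny append-only event log in `.events.jsonl`. A hedged round-trip sketch (the `result` instance and the pre-existing `screenshot.png` are assumptions; note `announce_event()` asserts that any referenced path already exists on disk):

```python
# hypothetical event round-trip using the new methods
result.announce_event('screenshot', {'path': 'screenshot.png', 'status': 'succeeded'})
result.events()                          # -> all events parsed from .events.jsonl
result.events(filter_type='screenshot')  # -> only events whose 'type' == 'screenshot'
```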

View file

@@ -2,12 +2,15 @@ import hashlib
 import mimetypes
 import os
+import subprocess
 
 from typing import ClassVar
 from datetime import timedelta
 from zipfile import Path
 
 from django.utils import timezone
 
+from archivebox.misc.hashing import get_dir_info
+
 from core.models import ArchiveResult
 
 import abx
@@ -206,8 +209,6 @@ class Extractor:
     def after_extract(self, error: Exception | None=None):
         status, retry_at = self.determine_status()
 
-        self.archiveresult.outputs = []
-
         self.archiveresult.error = f'{type(error).__name__}: {error}' if error else None
         self.archiveresult.status = self.archiveresult.StatusChoices.FAILED if error else self.archiveresult.StatusChoices.SUCCEEDED

View file

@@ -1,67 +1,110 @@
-# import mimetypes
-# import uuid
-
-# from django.db import models
-# from django.conf import settings
-# from django.utils import timezone
-
-# from archivebox import DATA_DIR
-# from archivebox.misc.hashing import get_dir_info, hash_file
-# from base_models.abid import DEFAULT_ABID_URI_SALT
-# from base_models.models import ABIDModel, ABIDField, get_or_create_system_user_pk
-
-# class File(ABIDModel):
-#     abid_prefix = 'fil_'
-#     abid_ts_src = 'self.created_at'
-#     abid_uri_src = 'self.path'
-#     abid_subtype_src = 'self.mime_type'
-#     abid_rand_src = 'self.id'
-#     abid_salt: str = DEFAULT_ABID_URI_SALT       # combined with self.uri to anonymize hashes on a per-install basis (default is shared globally with all users, means everyone will hash ABC to -> 123 the same around the world, makes it easy to share ABIDs across installs and see if they are for the same URI. Change this if you dont want your hashes to be guessable / in the same hash space as all other users)
-#     abid_drift_allowed: bool = False
-
-#     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, null=False)
-#     abid = ABIDField(prefix=abid_prefix)
-
-#     created_at = models.DateTimeField(default=timezone.now, null=False)
-#     modified_at = models.DateTimeField(default=timezone.now, null=False)
-#     created_by = models.ForeignKey(settings.USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
-
-#     path = models.FilePathField(path=str(DATA_DIR), recursive=True, allow_files=True, allow_folders=True, db_index=True, unique=True)
-
-#     basename = models.CharField(max_length=255, default=None, null=False)    # e.g. 'index'
-#     extension = models.CharField(max_length=63, default='', null=False)      # e.g. 'html'
-#     mime_type = models.CharField(max_length=63, default=None, null=False, db_index=True)  # e.g. 'inode/directory' or 'text/html'
-#     num_subpaths = models.IntegerField(default=None, null=False)             # e.g. 3
-#     num_bytes = models.IntegerField(default=None, null=False)                # e.g. 123456
-
-#     hash_sha256 = models.CharField(max_length=64, default=None, null=False, db_index=True)  # e.g. '5994471abb01112afcc1815994471abb01112afcc1815994471abb01112afcc181'
-#     # hash_blake3 = models.CharField(max_length=64, default=None, null=False, db_index=True)  # e.g. '5994471abb01112afcc1815994471abb01112afcc1815994471abb01112afcc181'
-
-#     DIR = 'inode/directory'
-
-#     @property
-#     def parent(self) -> 'File':
-#         return File.objects.get(path=self.path.parent) or File(path=self.path.parent)
-
-#     def save(self, *args, **kwargs):
-#         assert self.path.exists()
-
-#         if self.path.is_dir():
-#             self.basename = self.path.name
-#             self.extension = ''
-#             self.mime_type = self.DIR
-#             dir_info = get_dir_info(self.path)
-#             self.num_subpaths = dir_info['.']['num_subpaths']
-#             self.num_bytes = dir_info['.']['num_bytes']
-#             self.hash_sha256 = dir_info['.']['hash_sha256']
-#             # TODO: hash_blake3 = dir_info['.']['hash_blake3']
-#         else:
-#             self.basename = self.path.name
-#             self.extension = self.path.suffix
-#             self.mime_type = mimetypes.guess_type(self.path)[0]
-#             self.num_bytes = self.path.stat().st_size
-#             self.hash_sha256, self.hash_blake3 = hash_file(self.path)
-#         super().save(*args, **kwargs)
+import mimetypes
+import uuid
+from datetime import timedelta
+from pathlib import Path
+
+from django.db import models
+from django.conf import settings
+from django.utils import timezone
+
+from archivebox import DATA_DIR
+from archivebox.misc.hashing import get_dir_info, hash_file
+from base_models.abid import DEFAULT_ABID_URI_SALT
+from base_models.models import ABIDModel, ABIDField, get_or_create_system_user_pk
+
+
+class File(ABIDModel):
+    abid_prefix = 'fil_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.path'
+    abid_subtype_src = 'self.mime_type'
+    abid_rand_src = 'self.id'
+    abid_salt: str = DEFAULT_ABID_URI_SALT       # combined with self.uri to anonymize hashes on a per-install basis (default is shared globally with all users, means everyone will hash ABC to -> 123 the same around the world, makes it easy to share ABIDs across installs and see if they are for the same URI. Change this if you dont want your hashes to be guessable / in the same hash space as all other users)
+    abid_drift_allowed: bool = False
+
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, null=False)
+    abid = ABIDField(prefix=abid_prefix)
+
+    created_at = models.DateTimeField(default=timezone.now, null=False)
+    modified_at = models.DateTimeField(default=timezone.now, null=False)
+    created_by = models.ForeignKey(settings.USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
+
+    class StatusChoices(models.TextChoices):
+        UNLOCKED = 'unlocked'
+        LOCKED = 'locked'
+
+    status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.UNLOCKED, null=False)
+    retry_at = models.DateTimeField(default=None, null=True)
+    version = models.CharField(max_length=16, default='unknown', null=False)
+
+    file = models.FileField(null=False)
+
+    basename = models.CharField(max_length=255, default=None, null=False)     # e.g. 'index'
+    extension = models.CharField(max_length=63, default='', null=False)       # e.g. 'html'
+    mime_type = models.CharField(max_length=63, default=None, null=False, db_index=True)   # e.g. 'inode/directory' or 'text/html'
+    num_subpaths = models.IntegerField(default=None, null=False)              # e.g. 3
+    num_bytes = models.IntegerField(default=None, null=False)                 # e.g. 123456
+
+    sha256 = models.CharField(max_length=64, default=None, null=False, db_index=True)      # e.g. '5994471abb01112afcc1815994471abb01112afcc1815994471abb01112afcc181'
+    # blake3 = models.CharField(max_length=64, default=None, null=False, db_index=True)    # e.g. '5994471abb01112afcc1815994471abb01112afcc1815994471abb01112afcc181'
+
+    DIR = 'inode/directory'
+
+    @classmethod
+    def release_expired_locks(cls):
+        cls.objects.filter(status='locked', retry_at__lt=timezone.now()).update(status='unlocked', retry_at=None)
+
+    @property
+    def parent(self) -> 'File':
+        return File.objects.get(path=str(self.PATH.parent)) or File(path=str(self.PATH.parent))
+
+    @property
+    def relpath(self) -> Path:
+        return Path(self.file.name)
+
+    @property
+    def abspath(self) -> Path:
+        return DATA_DIR / self.file.name
+
+    def save(self, *args, **kwargs):
+        assert self.abspath.exists()
+
+        if self.abspath.is_dir():
+            self.basename = self.relpath.name
+            self.extension = ''
+            self.mime_type = self.DIR
+            dir_info = get_dir_info(self.abspath)
+            self.num_subpaths = dir_info['.']['num_subpaths']
+            self.num_bytes = dir_info['.']['num_bytes']
+            self.hash_sha256 = dir_info['.']['hash_sha256']
+            # TODO: hash_blake3 = dir_info['.']['hash_blake3']
+        else:
+            self.basename = self.relpath.name
+            self.extension = self.relpath.suffix
+            self.mime_type = mimetypes.guess_type(self.abspath)[0]
+            self.num_bytes = self.abspath.stat().st_size
+            self.hash_sha256, self.hash_blake3 = hash_file(self.abspath)
+        super().save(*args, **kwargs)
+
+    def acquire_lock(self, timeout_seconds: int = 60):
+        self.status = 'locked'
+        self.retry_at = timezone.now() + timedelta(seconds=timeout_seconds)
+        self.save()
+
+    def release_lock(self):
+        self.status = 'unlocked'
+        self.retry_at = None
+        self.save()
+
+    def move_to(self, new_path: Path):
+        if str(new_path).startswith(str(DATA_DIR)):
+            new_relpath = new_path.relative_to(DATA_DIR)
+            new_abspath = new_path
+        else:
+            new_relpath = new_path
+            new_abspath = DATA_DIR / new_path
+
+        new_abspath.parent.mkdir(parents=True, exist_ok=True)
+        self.abspath.rename(new_abspath)
+        self.file.name = new_relpath
+        self.save()
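
The un-commented File model above doubles as a cooperative lock around files on disk. A sketch of the intended lifecycle (the example path is hypothetical; `save()` requires the file to already exist under `DATA_DIR`):

```python
# hypothetical File lifecycle under the stated assumptions
f = File(file='archive/1453452234.0/screenshot.png')
f.save()                            # fills basename/extension/mime_type/num_bytes/hashes from disk
f.acquire_lock(timeout_seconds=60)  # status='locked', retry_at = now + 60s
f.move_to(Path('archive/1453452234.0/chrome/screenshot.png'))
f.release_lock()

File.release_expired_locks()        # unlocks any rows whose retry_at has passed
```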

View file

@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+__package__ = 'abx_plugin_chrome'
+
 import os
 import platform
 from pathlib import Path
@@ -147,3 +150,7 @@ class ChromeBinary(Binary):
 
 CHROME_BINARY = ChromeBinary()
+
+if __name__ == '__main__':
+    binary = CHROME_BINARY.load()
+    print(binary.version, '   ', binary.abspath)
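
Per the commit title, the binary module is now directly runnable, so an external extractor process can shell out to it instead of importing it. A sketch (the script path is an assumption; the `__main__` block above prints the version and abspath separated by whitespace):

```python
import subprocess

# hypothetical discovery of the chrome binary from a separate process
out = subprocess.check_output(['python', 'abx_plugin_chrome/binaries.py'], text=True)
chrome_version, chrome_abspath = out.split()[0], out.split()[-1]
```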

View file

@@ -1,5 +1,8 @@
+#!/usr/bin/env python3
+
 import os
 from pathlib import Path
+import sys
 from typing import List, Optional
 
 from pydantic import Field
@@ -79,16 +82,67 @@ class ChromeConfig(BaseConfigSet):
 
     # Chrome Binary
     CHROME_BINARY: str = Field(default='chrome')
     CHROME_DEFAULT_ARGS: List[str] = Field(default=[
+        "--disable-sync",
+        "--no-pings",
         "--no-first-run",    # dont show any first run ui / setup prompts
-        '--virtual-time-budget=15000',   # accellerate any animations on the page by 15s into the future
-        '--disable-features=DarkMode',   # disable dark mode for archiving
-        "--run-all-compositor-stages-before-draw",   # dont draw partially rendered content, wait until everything is ready
-        "--hide-scrollbars",             # hide scrollbars to prevent layout shift / scrollbar visible in screenshots
-        "--autoplay-policy=no-user-gesture-required",   # allow media autoplay without user gesture (e.g. on mobile)
-        "--use-fake-ui-for-media-stream",   # provide fake camera if site tries to request camera access
+        "--no-default-browser-check",
+        "--disable-default-apps",
+        "--ash-no-nudges",
+        "--disable-infobars",
+        "--disable-blink-features=AutomationControlled",
+        "--js-flags=--random-seed=1157259159",
+        "--deterministic-mode",
+        "--deterministic-fetch",
+        "--start-maximized",
+        "--test-type=gpu",
+        "--disable-search-engine-choice-screen",
+        "--disable-session-crashed-bubble",
+        "--hide-crash-restore-bubble",
+        "--suppress-message-center-popups",
+        "--disable-client-side-phishing-detection",
+        "--disable-domain-reliability",
+        "--disable-component-update",
+        "--disable-datasaver-prompt",
+        "--disable-hang-monitor",
+        "--disable-session-crashed-bubble",
+        "--disable-speech-synthesis-api",
+        "--disable-speech-api",
+        "--disable-print-preview",
+        "--safebrowsing-disable-auto-update",
+        "--deny-permission-prompts",
+        "--disable-external-intent-requests",
+        "--disable-notifications",
+        "--disable-desktop-notifications",
+        "--noerrdialogs",
+        "--disable-popup-blocking",
+        "--disable-prompt-on-repost",
+        "--silent-debugger-extension-api",
+        "--block-new-web-contents",
+        "--metrics-recording-only",
+        "--disable-breakpad",
+        "--run-all-compositor-stages-before-draw",
         "--use-fake-device-for-media-stream",   # provide fake camera if site tries to request camera access
-        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",   # ignore chrome updates
+        "--simulate-outdated-no-au=Tue, 31 Dec 2099 23:59:59 GMT",   # ignore chrome updates
         "--force-gpu-mem-available-mb=4096",   # allows for longer full page screenshots https://github.com/puppeteer/puppeteer/issues/5530
+        "--password-store=basic",
+        "--use-mock-keychain",
+        "--disable-cookie-encryption",
+        "--allow-legacy-extension-manifests",
+        "--disable-gesture-requirement-for-media-playback",
+        "--font-render-hinting=none",
+        "--force-color-profile=srgb",
+        "--disable-partial-raster",
+        "--disable-skia-runtime-opts",
+        "--disable-2d-canvas-clip-aa",
+        "--disable-lazy-loading",
+        "--disable-renderer-backgrounding",
+        "--disable-background-networking",
+        "--disable-background-timer-throttling",
+        "--disable-backgrounding-occluded-windows",
+        "--disable-ipc-flooding-protection",
+        "--disable-extensions-http-throttling",
+        "--disable-field-trial-config",
+        "--disable-back-forward-cache",
     ])
 
     CHROME_EXTRA_ARGS: List[str] = Field(default=[])
@@ -99,6 +153,7 @@ class ChromeConfig(BaseConfigSet):
     CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
     CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+
     # Cookies & Auth
     CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
     CHROME_USER_DATA_DIR: Path | None = Field(default=CONSTANTS.PERSONAS_DIR / 'Default' / 'chrome_profile')
@@ -109,6 +164,8 @@ class ChromeConfig(BaseConfigSet):
     SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM')
     SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF')
+
+    OVERWRITE: bool = Field(default=lambda: ARCHIVING_CONFIG.OVERWRITE)
 
     def validate(self):
         from archivebox.config.paths import create_and_chown_dir
@ -147,7 +204,11 @@ class ChromeConfig(BaseConfigSet):
self.update_in_place(CHROME_USER_DATA_DIR=None) self.update_in_place(CHROME_USER_DATA_DIR=None)
@property
def CHROME_ARGS(self) -> str:
# import shlex
# return '\n'.join(shlex.quote(arg) for arg in self.chrome_args())
return '\n'.join(self.chrome_args())
def chrome_args(self, **options) -> List[str]: def chrome_args(self, **options) -> List[str]:
"""helper to build up a chrome shell command with arguments""" """helper to build up a chrome shell command with arguments"""
@ -157,8 +218,8 @@ class ChromeConfig(BaseConfigSet):
cmd_args = [*options.CHROME_DEFAULT_ARGS, *options.CHROME_EXTRA_ARGS] cmd_args = [*options.CHROME_DEFAULT_ARGS, *options.CHROME_EXTRA_ARGS]
if options.CHROME_HEADLESS: # if options.CHROME_HEADLESS:
cmd_args += ["--headless=new"] # expects chrome version >= 111 # cmd_args += ["--headless"] # expects chrome version >= 111
if not options.CHROME_SANDBOX: if not options.CHROME_SANDBOX:
# assume this means we are running inside a docker container # assume this means we are running inside a docker container
@@ -205,3 +266,12 @@ class ChromeConfig(BaseConfigSet):
 
 CHROME_CONFIG = ChromeConfig()
+
+
+if __name__ == '__main__':
+    if len(sys.argv) > 1:
+        result = getattr(CHROME_CONFIG, sys.argv[1], '')
+        if callable(result):
+            result = result()
+        print(result)
+    else:
+        print(CHROME_CONFIG.model_dump_json(indent=4))
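
Likewise, the config module now answers attribute queries from the command line, which is presumably how an out-of-process extractor picks up `CHROME_ARGS` (newline-separated, per the new property above). A sketch; the script path and the bare `chrome` binary name are assumptions:

```python
import subprocess

# hypothetical: ask the runnable config for the newline-separated arg list
args = subprocess.check_output(
    ['python', 'abx_plugin_chrome/config.py', 'CHROME_ARGS'],
    text=True,
).strip().splitlines()

# then launch the external chrome binary with those args
subprocess.run(['chrome', *args, 'https://example.com'])
```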