diff --git a/.gitmodules b/.gitmodules
index 196c9a92..7b72ad6c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -26,3 +26,6 @@
[submodule "archivebox/vendor/python-atomicwrites"]
path = archivebox/vendor/python-atomicwrites
url = https://github.com/untitaker/python-atomicwrites
+[submodule "archivebox/vendor/pydantic-pkgr"]
+ path = archivebox/vendor/pydantic-pkgr
+ url = https://github.com/ArchiveBox/pydantic-pkgr
diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py
index e5502cea..4e25ac0a 100644
--- a/archivebox/abid_utils/models.py
+++ b/archivebox/abid_utils/models.py
@@ -61,6 +61,11 @@ def get_or_create_system_user_pk(username='system'):
return user.pk
+class AutoDateTimeField(models.DateTimeField):
+ def pre_save(self, model_instance, add):
+ return timezone.now()
+
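+# NOTE: pre_save() above ignores the `add` flag, so the field is re-stamped with
+# timezone.now() on every save (inserts *and* updates), not only at row creation.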
+
class ABIDModel(models.Model):
"""
Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface.
@@ -76,13 +81,16 @@ class ABIDModel(models.Model):
abid = ABIDField(prefix=abid_prefix)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
- created = models.DateTimeField(auto_now_add=True)
+ created = AutoDateTimeField(default=timezone.now, db_index=True)
modified = models.DateTimeField(auto_now=True)
class Meta(TypedModelMeta):
abstract = True
def save(self, *args: Any, **kwargs: Any) -> None:
+ if self._state.adding or not self.created:
+ self.created = timezone.now()
+
# when first creating a row, self.ABID is the source of truth
# overwrite default prefilled self.id & self.abid with generated self.ABID value
if self._state.adding or not self.id:
@@ -93,6 +101,7 @@ class ABIDModel(models.Model):
super().save(*args, **kwargs)
assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}'
assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}'
+ assert str(self.uuid) == str(self.ABID.uuid), f'self.uuid ({self.uuid}) does not match .ABID.uuid ({self.ABID.uuid})'
@property
def abid_values(self) -> Dict[str, Any]:
@@ -186,6 +195,14 @@ class ABIDModel(models.Model):
Get a uuid.UUID (v4) representation of the object's ABID.
"""
return self.ABID.uuid
+
+ @property
+ def uuid(self) -> str:
+ """
+ Get a str uuid.UUID (v4) representation of the object's ABID.
+ """
+ assert str(self.id) == str(self.ABID.uuid)
+ return str(self.id)
@property
def TypeID(self) -> TypeID:
diff --git a/archivebox/builtin_plugins/__init__.py b/archivebox/builtin_plugins/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/builtin_plugins/base/__init__.py b/archivebox/builtin_plugins/base/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/builtin_plugins/base/admin.py b/archivebox/builtin_plugins/base/admin.py
new file mode 100644
index 00000000..8c38f3f3
--- /dev/null
+++ b/archivebox/builtin_plugins/base/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/archivebox/builtin_plugins/base/apps.py b/archivebox/builtin_plugins/base/apps.py
new file mode 100644
index 00000000..291bbe50
--- /dev/null
+++ b/archivebox/builtin_plugins/base/apps.py
@@ -0,0 +1,83 @@
+import sys
+import inspect
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
+import django
+from django.apps import AppConfig
+from django.core.checks import Tags, Warning, register
+from django.db.backends.sqlite3.base import Database as sqlite3
+
+from pydantic import (
+ Field,
+ SerializeAsAny,
+)
+
+from pydantic_pkgr import SemVer, BinProvider, BinProviderName, ProviderLookupDict, BinName, Binary, EnvProvider, NpmProvider
+
+from plugantic.extractors import Extractor, ExtractorName
+from plugantic.plugins import Plugin
+from plugantic.configs import ConfigSet, ConfigSectionName
+from plugantic.replayers import Replayer
+
+
+class PythonBinary(Binary):
+ name: BinName = 'python'
+
+ providers_supported: List[BinProvider] = [EnvProvider()]
+ provider_overrides: Dict[str, Any] = {
+ 'env': {
+ 'subdeps': lambda: 'python3 python3-minimal python3-pip python3-virtualenv',
+ 'abspath': lambda: sys.executable,
+ 'version': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
+ },
+ }
+
+class SqliteBinary(Binary):
+ name: BinName = 'sqlite'
+ providers_supported: List[BinProvider] = [EnvProvider()]
+ provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+ 'env': {
+ 'abspath': lambda: Path(inspect.getfile(sqlite3)),
+ 'version': lambda: SemVer(sqlite3.version),
+ },
+ }
+
+
+class DjangoBinary(Binary):
+ name: BinName = 'django'
+
+ providers_supported: List[BinProvider] = [EnvProvider()]
+ provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+ 'env': {
+ 'abspath': lambda: inspect.getfile(django),
+ 'version': lambda: django.VERSION[:3],
+ },
+ }
+
+
+class BasicReplayer(Replayer):
+ name: str = 'basic'
+
+
+class BasePlugin(Plugin):
+ name: str = 'base'
+ configs: List[SerializeAsAny[ConfigSet]] = []
+ binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
+ extractors: List[SerializeAsAny[Extractor]] = []
+ replayers: List[SerializeAsAny[Replayer]] = [BasicReplayer()]
+
+
+PLUGINS = [BasePlugin()]
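+
+# Rough usage sketch (assuming the vendored pydantic_pkgr API used elsewhere in this
+# PR, e.g. Binary.load() resolving .abspath/.version through the overrides above):
+#
+# python = PythonBinary().load()
+# print(python.abspath, python.version) # e.g. /usr/bin/python3 3.11.9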
+
+
+class BaseConfig(AppConfig):
+ default_auto_field = 'django.db.models.BigAutoField'
+ name = 'builtin_plugins.base'
diff --git a/archivebox/builtin_plugins/base/migrations/__init__.py b/archivebox/builtin_plugins/base/migrations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/builtin_plugins/base/models.py b/archivebox/builtin_plugins/base/models.py
new file mode 100644
index 00000000..71a83623
--- /dev/null
+++ b/archivebox/builtin_plugins/base/models.py
@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
diff --git a/archivebox/builtin_plugins/base/tests.py b/archivebox/builtin_plugins/base/tests.py
new file mode 100644
index 00000000..7ce503c2
--- /dev/null
+++ b/archivebox/builtin_plugins/base/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/archivebox/builtin_plugins/base/views.py b/archivebox/builtin_plugins/base/views.py
new file mode 100644
index 00000000..91ea44a2
--- /dev/null
+++ b/archivebox/builtin_plugins/base/views.py
@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.
diff --git a/archivebox/builtin_plugins/singlefile/__init__.py b/archivebox/builtin_plugins/singlefile/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/builtin_plugins/singlefile/apps.py b/archivebox/builtin_plugins/singlefile/apps.py
new file mode 100644
index 00000000..1d40e8a7
--- /dev/null
+++ b/archivebox/builtin_plugins/singlefile/apps.py
@@ -0,0 +1,113 @@
+from typing import List, Optional, Dict
+from pathlib import Path
+
+from django.apps import AppConfig
+from django.core.checks import Tags, Warning, register
+
+from pydantic import (
+ Field,
+ SerializeAsAny,
+)
+
+from pydantic_pkgr import BinProvider, BinName, Binary, EnvProvider, NpmProvider
+from pydantic_pkgr.binprovider import bin_abspath
+from pydantic_pkgr.binary import BinProviderName, ProviderLookupDict
+
+from plugantic.extractors import Extractor, ExtractorName
+from plugantic.plugins import Plugin
+from plugantic.configs import ConfigSet, ConfigSectionName
+
+from pkg.settings import env
+
+
+###################### Config ##########################
+
+class SinglefileToggleConfig(ConfigSet):
+ section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES'
+
+ SAVE_SINGLEFILE: bool = True
+
+
+class SinglefileDependencyConfig(ConfigSet):
+ section: ConfigSectionName = 'DEPENDENCY_CONFIG'
+
+ SINGLEFILE_BINARY: str = Field(default='single-file')
+ SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
+ SINGLEFILE_EXTRA_ARGS: List[str] = []
+ SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
+
+class SinglefileOptionsConfig(ConfigSet):
+ section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS'
+
+ # loaded from shared config
+ SINGLEFILE_USER_AGENT: str = Field(default='', alias='USER_AGENT')
+ SINGLEFILE_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
+ SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
+ SINGLEFILE_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
+ SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
+
+
+
+DEFAULT_CONFIG = {
+ 'CHECK_SSL_VALIDITY': False,
+ 'SAVE_SINGLEFILE': True,
+ 'TIMEOUT': 120,
+}
+
+PLUGIN_CONFIG = [
+ SinglefileToggleConfig(**DEFAULT_CONFIG),
+ SinglefileDependencyConfig(**DEFAULT_CONFIG),
+ SinglefileOptionsConfig(**DEFAULT_CONFIG),
+]
+
+###################### Binaries ############################
+
+min_version: str = "1.1.54"
+max_version: str = "2.0.0"
+
+class SinglefileBinary(Binary):
+ name: BinName = 'single-file'
+ providers_supported: List[BinProvider] = [NpmProvider()]
+
+
+ provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+ 'env': {
+ 'abspath': lambda: bin_abspath('single-file-node.js', PATH=env.PATH) or bin_abspath('single-file', PATH=env.PATH),
+ },
+ 'npm': {
+ # 'abspath': lambda: bin_abspath('single-file', PATH=NpmProvider().PATH) or bin_abspath('single-file', PATH=env.PATH),
+ 'subdeps': lambda: f'single-file-cli@>={min_version} <{max_version}',
+ },
+ }
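+ # 'env' above resolves an already-installed single-file / single-file-node.js from
+ # $PATH; 'npm' installs single-file-cli pinned to the semver range built from
+ # min_version/max_version defined above.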
+
+
+###################### Extractors ##########################
+
+class SinglefileExtractor(Extractor):
+ name: ExtractorName = 'singlefile'
+ binary: Binary = SinglefileBinary()
+
+ def get_output_path(self, snapshot) -> Path:
+ return Path(snapshot.link_dir) / 'singlefile.html'
+
+
+###################### Plugins #############################
+
+
+class SinglefilePlugin(Plugin):
+ name: str = 'singlefile'
+ configs: List[SerializeAsAny[ConfigSet]] = [*PLUGIN_CONFIG]
+ binaries: List[SerializeAsAny[Binary]] = [SinglefileBinary()]
+ extractors: List[SerializeAsAny[Extractor]] = [SinglefileExtractor()]
+
+PLUGINS = [SinglefilePlugin()]
+
+###################### Django Apps #########################
+
+class SinglefileConfig(AppConfig):
+ name = 'builtin_plugins.singlefile'
+ verbose_name = 'SingleFile'
+
+ def ready(self):
+ pass
+ # print('Loaded singlefile plugin')
diff --git a/archivebox/builtin_plugins/singlefile/config.yaml b/archivebox/builtin_plugins/singlefile/config.yaml
new file mode 100644
index 00000000..b4d80f06
--- /dev/null
+++ b/archivebox/builtin_plugins/singlefile/config.yaml
@@ -0,0 +1,66 @@
+name: singlefile
+plugin_version: '0.0.1'
+plugin_spec: '0.0.1'
+
+binaries:
+ singlefile:
+ providers:
+ - env
+ - npm
+
+commands:
+ - singlefile.exec
+ - singlefile.extract
+ - singlefile.should_extract
+ - singlefile.get_output_path
+
+extractors:
+ singlefile:
+ binary: singlefile
+ test: singlefile.should_extract
+ extract: singlefile.extract
+ output_files:
+ - singlefile.html
+
+configs:
+ ARCHIVE_METHOD_TOGGLES:
+ SAVE_SINGLEFILE:
+ type: bool
+ default: true
+
+ DEPENDENCY_CONFIG:
+ SINGLEFILE_BINARY:
+ type: str
+ default: single-file
+ SINGLEFILE_ARGS:
+ type: Optional[List[str]]
+ default: null
+ SINGLEFILE_EXTRA_ARGS:
+ type: List[str]
+ default: []
+ SINGLEFILE_DEFAULT_ARGS:
+ type: List[str]
+ default:
+ - "--timeout={TIMEOUT-10}"
+
+ ARCHIVE_METHOD_OPTIONS:
+ SINGLEFILE_USER_AGENT:
+ type: str
+ default: ""
+ alias: USER_AGENT
+ SINGLEFILE_TIMEOUT:
+ type: int
+ default: 60
+ alias: TIMEOUT
+ SINGLEFILE_CHECK_SSL_VALIDITY:
+ type: bool
+ default: true
+ alias: CHECK_SSL_VALIDITY
+ SINGLEFILE_RESTRICT_FILE_NAMES:
+ type: str
+ default: windows
+ alias: RESTRICT_FILE_NAMES
+ SINGLEFILE_COOKIES_FILE:
+ type: Optional[Path]
+ default: null
+ alias: COOKIES_FILE
diff --git a/archivebox/builtin_plugins/singlefile/tests.py b/archivebox/builtin_plugins/singlefile/tests.py
new file mode 100644
index 00000000..7ce503c2
--- /dev/null
+++ b/archivebox/builtin_plugins/singlefile/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/archivebox/config.py b/archivebox/config.py
index de086304..8d4a0695 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -31,8 +31,6 @@ import getpass
import platform
import shutil
import requests
-import django
-from sqlite3 import dbapi2 as sqlite3
from hashlib import md5
from pathlib import Path
@@ -43,6 +41,11 @@ from configparser import ConfigParser
from collections import defaultdict
import importlib.metadata
+from pydantic_pkgr import SemVer
+
+import django
+from django.db.backends.sqlite3.base import Database as sqlite3
+
from .config_stubs import (
AttrDict,
SimpleConfigValueDict,
@@ -52,6 +55,11 @@ from .config_stubs import (
ConfigDefaultDict,
)
+# load fallback libraries from vendor dir
+from .vendor import load_vendored_libs
+load_vendored_libs()
+
+
############################### Config Schema ##################################
@@ -89,13 +97,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SECRET_KEY': {'type': str, 'default': None},
'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
'ALLOWED_HOSTS': {'type': str, 'default': '*'}, # e.g. archivebox.example.com,archivebox2.example.com
- 'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': ''}, # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
+ 'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c['BIND_ADDR'])}, # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
'DEBUG': {'type': bool, 'default': False},
'PUBLIC_INDEX': {'type': bool, 'default': True},
'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
- 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
+ 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 100},
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
'TIME_ZONE': {'type': str, 'default': 'UTC'},
'TIMEZONE': {'type': str, 'default': 'UTC'},
@@ -565,7 +573,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
- 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
+ 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},
'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
@@ -902,16 +910,9 @@ def bin_version(binary: Optional[str], cmd: Optional[str]=None) -> Optional[str]
version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT).stdout.strip().decode()
# take first 3 columns of first line of version info
- version_ptn = re.compile(r"\d+?\.\d+?\.?\d*", re.MULTILINE)
- try:
- version_nums = version_ptn.findall(version_str.split('\n')[0])[0]
- if version_nums:
- return version_nums
- else:
- raise IndexError
- except IndexError:
- # take first 3 columns of first line of version info
- return ' '.join(version_str.split('\n')[0].strip().split()[:3])
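+ # SemVer.parse() (from the vendored pydantic_pkgr) is assumed to pull the first
+ # x.y.z-style token out of arbitrary --version output, e.g. 'GNU Wget 1.21.2 ...' -> 1.21.2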
+ semver = SemVer.parse(version_str)
+ if semver:
+ return str(semver)
except OSError:
pass
# stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
@@ -1524,5 +1525,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
assert sql_index_path.exists(), (
f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
+
+ # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
+ if settings.DEBUG_LOGFIRE:
+ from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
+ SQLite3Instrumentor().instrument()
+
+ import logfire
+
+ logfire.configure()
+ logfire.instrument_django(is_sql_commentor_enabled=True)
+ logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+
except KeyboardInterrupt:
raise SystemExit(2)
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index b87f6874..29463623 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -10,12 +10,15 @@ from datetime import datetime, timezone
from typing import Dict, Any
from django.contrib import admin
-from django.db.models import Count, Q
-from django.urls import path, reverse
+from django.db.models import Count, Q, Prefetch
+from django.urls import path, reverse, resolve
+from django.utils import timezone
+from django.utils.functional import cached_property
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
+from django.core.paginator import Paginator
from django.core.exceptions import ValidationError
from django.conf import settings
from django import forms
@@ -126,22 +129,99 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
+class AcceleratedPaginator(Paginator):
+ """
+ Accelerated Paginator that ignores DISTINCT when counting the total number of rows.
+ Speeds up SELECT COUNT(*) on admin views by >20x.
+ https://hakibenita.com/optimizing-the-django-admin-paginator
+ """
+
+ @cached_property
+ def count(self):
+ if self.object_list._has_filters(): # type: ignore
+ # fallback to normal count method on filtered queryset
+ return super().count
+ else:
+ # otherwise count total rows in a separate fast query
+ return self.object_list.model.objects.count()
+
+ # Alternative approach for PostgreSQL: fallback count takes > 200ms
+ # from django.db import connection, transaction, OperationalError
+ # with transaction.atomic(), connection.cursor() as cursor:
+ # cursor.execute('SET LOCAL statement_timeout TO 200;')
+ # try:
+ # return super().count
+ # except OperationalError:
+ # return 9999999999999
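+
+ # Usage: assign `paginator = AcceleratedPaginator` on a ModelAdmin (as SnapshotAdmin
+ # does below); Django instantiates it for each changelist, so no other wiring is needed.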
+
+
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
model = ArchiveResult
+ parent_model = Snapshot
# fk_name = 'snapshot'
- extra = 1
- readonly_fields = ('result_id', 'start_ts', 'end_ts', 'extractor', 'command', 'cmd_version')
- fields = ('id', *readonly_fields, 'status', 'output')
+ extra = 0
+ sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
+ readonly_fields = ('result_id', 'completed', 'extractor', 'command', 'version')
+ fields = ('id', 'start_ts', 'end_ts', *readonly_fields, 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output')
+ # exclude = ('id',)
+ ordering = ('end_ts',)
show_change_link = True
# # classes = ['collapse']
# # list_display_links = ['abid']
+ def get_parent_object_from_request(self, request):
+ resolved = resolve(request.path_info)
+ return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
+
+ @admin.display(
+ description='Completed',
+ ordering='end_ts',
+ )
+ def completed(self, obj):
+ return format_html('<p>{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
+
def result_id(self, obj):
- return format_html('<a href="{}">[{}]</a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
+ return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
def command(self, obj):
return format_html('<code>{}</code>', " ".join(obj.cmd or []))
+
+ def version(self, obj):
+ return format_html('<code>{}</code>', obj.cmd_version or '-')
+
+ def get_formset(self, request, obj=None, **kwargs):
+ formset = super().get_formset(request, obj, **kwargs)
+ snapshot = self.get_parent_object_from_request(request)
+
+ # import ipdb; ipdb.set_trace()
+ formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
+
+ # default values for new entries
+ formset.form.base_fields['status'].initial = 'succeeded'
+ formset.form.base_fields['start_ts'].initial = timezone.now()
+ formset.form.base_fields['end_ts'].initial = timezone.now()
+ formset.form.base_fields['cmd_version'].initial = '-'
+ formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
+ formset.form.base_fields['created_by'].initial = request.user
+ formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
+ formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
+
+ if obj is not None:
+ # hidden values for existing entries and new entries
+ formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
+ formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
+ formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
+ formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
+ formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
+ formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
+ return formset
+
+ def get_readonly_fields(self, request, obj=None):
+ if obj is not None:
+ return self.readonly_fields
+ else:
+ return []
class TagInline(admin.TabularInline):
@@ -222,25 +302,22 @@ def get_abid_info(self, obj):
@admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
- class Meta:
- model = Snapshot
-
list_display = ('added', 'title_str', 'files', 'size', 'url_str')
- # list_editable = ('title',)
sort_fields = ('title_str', 'url_str', 'added', 'files')
- readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
+ readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name')
- list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags')
+ list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'created_by', 'title', *readonly_fields)
ordering = ['-added']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
- autocomplete_fields = ['tags']
inlines = [TagInline, ArchiveResultInline]
- list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
+ list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000)
action_form = SnapshotActionForm
+ paginator = AcceleratedPaginator
save_on_top = True
+ show_full_result_count = False
def changelist_view(self, request, extra_context=None):
extra_context = extra_context or {}
@@ -286,12 +363,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
]
return custom_urls + urls
- def get_queryset(self, request):
- self.request = request
- return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult'))
+ # def get_queryset(self, request):
+ # # tags_qs = SnapshotTag.objects.all().select_related('tag')
+ # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
+
+ # self.request = request
+ # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
def tag_list(self, obj):
- return ', '.join(obj.tags.values_list('name', flat=True))
+ return ', '.join(tag.name for tag in obj.tags.all())
# TODO: figure out a different way to do this, you cant nest forms so this doenst work
# def action(self, obj):
@@ -360,21 +440,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
ordering='title',
)
def title_str(self, obj):
- canon = obj.as_link().canonical_outputs()
tags = ''.join(
- format_html('{} ', tag.id, tag)
+ format_html('{} ', tag.pk, tag.name)
for tag in obj.tags.all()
- if str(tag).strip()
+ if str(tag.name).strip()
)
return format_html(
'<a href="/{}">'
- '<img src="/{}/{}" class="favicon" onerror="this.remove()">'
+ '<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
'</a>'
'<a href="/{}/index.html">'
'<b class="status-{}">{}</b>'
'</a>',
obj.archive_path,
- obj.archive_path, canon['favicon_path'],
+ obj.archive_path,
obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
@@ -382,14 +461,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
@admin.display(
description='Files Saved',
- ordering='archiveresult_count',
+ # ordering='archiveresult_count',
)
def files(self, obj):
return snapshot_icons(obj)
@admin.display(
- ordering='archiveresult_count'
+ # ordering='archiveresult_count'
)
def size(self, obj):
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
@@ -536,6 +615,8 @@ class TagAdmin(ABIDModelAdmin):
actions = ['delete_selected']
ordering = ['-created']
+ paginator = AcceleratedPaginator
+
def API(self, obj):
try:
return get_abid_info(self, obj)
@@ -574,6 +655,8 @@ class ArchiveResultAdmin(ABIDModelAdmin):
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
+
+ paginator = AcceleratedPaginator
@admin.display(
description='Snapshot Info'
diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py
index 193c0d05..3a64eb45 100644
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@@ -4,7 +4,7 @@ from django import forms
from ..util import URL_REGEX
from ..parsers import PARSERS
-from ..vendor.taggit_utils import edit_string_for_tags, parse_tags
+from taggit.utils import edit_string_for_tags, parse_tags
PARSER_CHOICES = [
(parser_key, parser[0])
diff --git a/archivebox/core/migrations/0027_update_snapshot_ids.py b/archivebox/core/migrations/0027_update_snapshot_ids.py
index ad197c04..6b8dcf4a 100644
--- a/archivebox/core/migrations/0027_update_snapshot_ids.py
+++ b/archivebox/core/migrations/0027_update_snapshot_ids.py
@@ -52,7 +52,7 @@ def update_snapshot_ids(apps, schema_editor):
Snapshot = apps.get_model("core", "Snapshot")
num_total = Snapshot.objects.all().count()
print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
- for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
+ for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
assert snapshot.abid
snapshot.abid_prefix = 'snp_'
snapshot.abid_ts_src = 'self.added'
@@ -72,7 +72,7 @@ def update_archiveresult_ids(apps, schema_editor):
ArchiveResult = apps.get_model("core", "ArchiveResult")
num_total = ArchiveResult.objects.all().count()
print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
- for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
+ for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
assert result.abid
result.abid_prefix = 'res_'
result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
diff --git a/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py
index 121a2154..dd6da1f5 100644
--- a/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py
+++ b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py
@@ -11,7 +11,7 @@ def update_archiveresult_ids(apps, schema_editor):
ArchiveResult = apps.get_model("core", "ArchiveResult")
num_total = ArchiveResult.objects.all().count()
print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
- for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
+ for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
assert result.abid
result.uuid = ABID.parse(result.abid).uuid
result.save(update_fields=["uuid"])
diff --git a/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py b/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py
index ddb7afbb..9866f69c 100644
--- a/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py
+++ b/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py
@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
SnapshotTag = apps.get_model("core", "SnapshotTag")
num_total = SnapshotTag.objects.all().count()
print(f' Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
- for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()):
+ for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
assert snapshottag.snapshot_old_id
snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
snapshottag.snapshot_id = snapshot.id
diff --git a/archivebox/core/migrations/0059_tag_id.py b/archivebox/core/migrations/0059_tag_id.py
index f09e9ffb..a81e022f 100644
--- a/archivebox/core/migrations/0059_tag_id.py
+++ b/archivebox/core/migrations/0059_tag_id.py
@@ -49,7 +49,7 @@ def update_archiveresult_ids(apps, schema_editor):
Tag = apps.get_model("core", "Tag")
num_total = Tag.objects.all().count()
print(f' Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
- for idx, tag in enumerate(Tag.objects.all().iterator()):
+ for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
if not tag.slug:
tag.slug = tag.name.lower().replace(' ', '_')
if not tag.name:
diff --git a/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py b/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py
index 6c574669..bb067acf 100644
--- a/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py
+++ b/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py
@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
SnapshotTag = apps.get_model("core", "SnapshotTag")
num_total = SnapshotTag.objects.all().count()
print(f' Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
- for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()):
+ for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
assert snapshottag.old_tag_id
tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
snapshottag.tag_id = tag.id
diff --git a/archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py b/archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py
new file mode 100644
index 00000000..fcc9b0aa
--- /dev/null
+++ b/archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py
@@ -0,0 +1,35 @@
+# Generated by Django 5.1 on 2024-08-28 09:40
+
+import abid_utils.models
+import django.utils.timezone
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0068_alter_archiveresult_options'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='created',
+ field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='added',
+ field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='created',
+ field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='tag',
+ name='created',
+ field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index c9266bd9..a362bdae 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -12,6 +12,7 @@ from uuid import uuid4
from pathlib import Path
from django.db import models
+from django.utils import timezone
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.core.cache import cache
@@ -19,7 +20,7 @@ from django.urls import reverse, reverse_lazy
from django.db.models import Case, When, Value, IntegerField
from django.conf import settings
-from abid_utils.models import ABIDModel, ABIDField
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
from ..system import get_dir_size
from ..util import parse_date, base_url
@@ -50,7 +51,7 @@ class Tag(ABIDModel):
Based on django-taggit model + ABID base.
"""
abid_prefix = 'tag_'
- abid_ts_src = 'self.created' # TODO: add created/modified time
+ abid_ts_src = 'self.created'
abid_uri_src = 'self.slug'
abid_subtype_src = '"03"'
abid_rand_src = 'self.old_id'
@@ -60,7 +61,6 @@ class Tag(ABIDModel):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
abid = ABIDField(prefix=abid_prefix)
-
name = models.CharField(unique=True, blank=False, max_length=100)
slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
# slug is autoset on save from name, never set it manually
@@ -125,6 +125,12 @@ class SnapshotTag(models.Model):
db_table = 'core_snapshot_tags'
unique_together = [('snapshot', 'tag')]
+
+class SnapshotManager(models.Manager):
+ def get_queryset(self):
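+ # eagerly prefetch tags + archiveresults on every Snapshot queryset; tags_str(),
+ # snapshot_icons(), and the admin changelist check _prefetched_objects_cache for
+ # these instead of issuing per-row queries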
+ return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
+
+
class Snapshot(ABIDModel):
abid_prefix = 'snp_'
abid_ts_src = 'self.added'
@@ -143,16 +149,15 @@ class Snapshot(ABIDModel):
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
- added = models.DateTimeField(auto_now_add=True, db_index=True)
+ added = AutoDateTimeField(default=timezone.now, db_index=True)
updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
archiveresult_set: models.Manager['ArchiveResult']
- @property
- def uuid(self):
- return self.id
+ objects = SnapshotManager()
+
def __repr__(self) -> str:
title = (self.title_stripped or '-')[:64]
@@ -162,13 +167,6 @@ class Snapshot(ABIDModel):
title = (self.title_stripped or '-')[:64]
return f'[{self.timestamp}] {self.url[:64]} ({title})'
- def save(self, *args, **kwargs):
- super().save(*args, **kwargs)
- try:
- assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
- except AssertionError as e:
- print(e)
-
@classmethod
def from_json(cls, info: dict):
info = {k: v for k, v in info.items() if k in cls.keys}
@@ -177,8 +175,7 @@ class Snapshot(ABIDModel):
def as_json(self, *args) -> dict:
args = args or self.keys
return {
- key: getattr(self, key)
- if key != 'tags' else self.tags_str()
+ key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
for key in args
}
@@ -190,8 +187,14 @@ class Snapshot(ABIDModel):
return load_link_details(self.as_link())
def tags_str(self, nocache=True) -> str | None:
+ calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
- calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
+
+ if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
+ # tags are already prefetched on this instance, use them directly (always fresh from the db, and costs no extra query)
+ tags_str = calc_tags_str()
+ return tags_str
+
if nocache:
tags_str = calc_tags_str()
cache.set(cache_key, tags_str)
@@ -234,7 +237,10 @@ class Snapshot(ABIDModel):
@cached_property
def num_outputs(self) -> int:
- return self.archiveresult_set.filter(status='succeeded').count()
+ # DON'T do this: it triggers a separate COUNT query for every snapshot
+ # return self.archiveresult_set.filter(status='succeeded').count()
+ # counting over the prefetched archiveresult_set in Python avoids the N+1 queries:
+ return sum(1 for result in self.archiveresult_set.all() if result.status == 'succeeded')
@cached_property
def base_url(self):
@@ -262,10 +268,21 @@ class Snapshot(ABIDModel):
@cached_property
def thumbnail_url(self) -> Optional[str]:
- result = self.archiveresult_set.filter(
- extractor='screenshot',
- status='succeeded'
- ).only('output').last()
+ if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+ result = (sorted(
+ (
+ result
+ for result in self.archiveresult_set.all()
if result.extractor == 'screenshot' and result.status == 'succeeded' and result.output
+ ),
+ key=lambda result: result.created,
+ ) or [None])[-1]
+ else:
+ result = self.archiveresult_set.filter(
+ extractor='screenshot',
+ status='succeeded'
+ ).only('output').last()
+
if result:
return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
return None
@@ -292,6 +309,21 @@ class Snapshot(ABIDModel):
if self.title:
return self.title # whoopdedoo that was easy
+ # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
+ if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+ try:
+ return (sorted(
+ (
+ result.output.strip()
+ for result in self.archiveresult_set.all()
+ if result.extractor == 'title' and result.status == 'succeeded' and result.output
+ ),
+ key=lambda title: len(title),
+ ) or [None])[-1]
+ except IndexError:
+ pass
+
+
try:
# take longest successful title from ArchiveResult db history
return sorted(
@@ -355,12 +387,23 @@ class Snapshot(ABIDModel):
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
+ """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
+
INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
- qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
+ qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')
if sorted:
- precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
- qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+ precedence = [
+ When(extractor=method, then=Value(precedence))
+ for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
+ ]
+ qs = qs.annotate(
+ indexing_precedence=Case(
+ *precedence,
+ default=Value(1000),
+ output_field=IntegerField()
+ )
+ ).order_by('indexing_precedence')
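+ # e.g. ArchiveResult.objects.indexable() returns succeeded results ordered so that
+ # extractors ranked higher in ARCHIVE_METHODS_INDEXING_PRECEDENCE come back first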
return qs
class ArchiveResult(ABIDModel):
@@ -418,17 +461,6 @@ class ArchiveResult(ABIDModel):
def __str__(self):
return self.extractor
- def save(self, *args, **kwargs):
- super().save(*args, **kwargs)
- try:
- assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
- except AssertionError as e:
- print(e)
-
- @property
- def uuid(self):
- return self.id
-
@cached_property
def snapshot_dir(self):
return Path(self.snapshot.link_dir)
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 1321bd52..707e17a1 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -4,7 +4,9 @@ import os
import sys
import re
import logging
+import inspect
import tempfile
+from typing import Any, Dict
from pathlib import Path
from django.utils.crypto import get_random_string
@@ -33,22 +35,20 @@ APPEND_SLASH = True
DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
-# add plugins folders to system path, and load plugins in installed_apps
-BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'plugins'
-USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'plugins'
-sys.path.insert(0, str(BUILTIN_PLUGINS_DIR))
-sys.path.insert(0, str(USER_PLUGINS_DIR))
+BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'builtin_plugins'
+USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'user_plugins'
-def find_plugins(plugins_dir):
- return {
- # plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA
- plugin_entrypoint.parent.name: plugin_entrypoint.parent
+def find_plugins(plugins_dir, prefix: str) -> Dict[str, Any]:
+ plugins = {
+ f'{prefix}.{plugin_entrypoint.parent.name}': plugin_entrypoint.parent
for plugin_entrypoint in plugins_dir.glob('*/apps.py')
}
+ # print(f'Found {prefix} plugins:\n', '\n '.join(plugins.keys()))
+ return plugins
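+
+# e.g. {'builtin_plugins.base': Path('.../archivebox/builtin_plugins/base'),
+# 'builtin_plugins.singlefile': Path('.../archivebox/builtin_plugins/singlefile'), ...}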
INSTALLED_PLUGINS = {
- **find_plugins(BUILTIN_PLUGINS_DIR),
- **find_plugins(USER_PLUGINS_DIR),
+ **find_plugins(BUILTIN_PLUGINS_DIR, prefix='builtin_plugins'),
+ **find_plugins(USER_PLUGINS_DIR, prefix='user_plugins'),
}
@@ -66,11 +66,11 @@ INSTALLED_APPS = [
'plugantic',
'core',
'api',
+ 'pkg',
*INSTALLED_PLUGINS.keys(),
'admin_data_views',
-
'django_extensions',
]
@@ -144,64 +144,6 @@ if CONFIG.LDAP:
# sys.exit(1)
-################################################################################
-### Debug Settings
-################################################################################
-
-# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
-DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
-if DEBUG_TOOLBAR:
- try:
- import debug_toolbar # noqa
- DEBUG_TOOLBAR = True
- except ImportError:
- DEBUG_TOOLBAR = False
-
-if DEBUG_TOOLBAR:
- INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
- INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
- DEBUG_TOOLBAR_CONFIG = {
- "SHOW_TOOLBAR_CALLBACK": lambda request: True,
- "RENDER_PANELS": True,
- }
- DEBUG_TOOLBAR_PANELS = [
- 'debug_toolbar.panels.history.HistoryPanel',
- 'debug_toolbar.panels.versions.VersionsPanel',
- 'debug_toolbar.panels.timer.TimerPanel',
- 'debug_toolbar.panels.settings.SettingsPanel',
- 'debug_toolbar.panels.headers.HeadersPanel',
- 'debug_toolbar.panels.request.RequestPanel',
- 'debug_toolbar.panels.sql.SQLPanel',
- 'debug_toolbar.panels.staticfiles.StaticFilesPanel',
- # 'debug_toolbar.panels.templates.TemplatesPanel',
- 'debug_toolbar.panels.cache.CachePanel',
- 'debug_toolbar.panels.signals.SignalsPanel',
- 'debug_toolbar.panels.logging.LoggingPanel',
- 'debug_toolbar.panels.redirects.RedirectsPanel',
- 'debug_toolbar.panels.profiling.ProfilingPanel',
- 'djdt_flamegraph.FlamegraphPanel',
- ]
- MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
-
-if DEBUG:
- from django_autotyping.typing import AutotypingSettingsDict
-
- INSTALLED_APPS += ['django_autotyping']
- AUTOTYPING: AutotypingSettingsDict = {
- "STUBS_GENERATION": {
- "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
- }
- }
-
-# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
-# Must delete archivebox/templates/admin to use because it relies on some things we override
-# visit /__requests_tracker__/ to access
-DEBUG_REQUESTS_TRACKER = False
-if DEBUG_REQUESTS_TRACKER:
- INSTALLED_APPS += ["requests_tracker"]
- MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
- INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
-
################################################################################
### Staticfile and Template Settings
@@ -317,13 +259,15 @@ STORAGES = {
SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
-CSRF_TRUSTED_ORIGINS = CONFIG.CSRF_TRUSTED_ORIGINS.split(',')
+CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
# automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
# but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
-if CONFIG.ALLOWED_HOSTS != '*' and (not CSRF_TRUSTED_ORIGINS):
- for hostname in ALLOWED_HOSTS:
- CSRF_TRUSTED_ORIGINS.append(f'https://{hostname}')
+for hostname in ALLOWED_HOSTS:
+ https_endpoint = f'https://{hostname}'
+ if hostname != '*' and https_endpoint not in CSRF_TRUSTED_ORIGINS:
+ print(f'[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS')
+ CSRF_TRUSTED_ORIGINS.append(https_endpoint)
SECURE_BROWSER_XSS_FILTER = True
SECURE_CONTENT_TYPE_NOSNIFF = True
@@ -345,6 +289,8 @@ AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]
+DATA_UPLOAD_MAX_NUMBER_FIELDS = None
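+# (None disables Django's TooManyFieldsSent guard, so bulk admin actions that POST
+# thousands of selected rows aren't rejected with a 400)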
+
################################################################################
### Shell Settings
################################################################################
@@ -385,6 +331,10 @@ IGNORABLE_404_URLS = [
re.compile(r'robots\.txt$'),
re.compile(r'.*\.(css|js)\.map$'),
]
+IGNORABLE_200_URLS = [
+ re.compile(r'^"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M),
+ re.compile(r'^"GET /admin/jsi18n/ HTTP/.*" (200|30.) .+', re.I | re.M),
+]
class NoisyRequestsFilter(logging.Filter):
def filter(self, record) -> bool:
@@ -396,19 +346,26 @@ class NoisyRequestsFilter(logging.Filter):
if ignorable_log_pattern.match(logline):
return False
- # ignore staticfile requests that 200 or 30*
- ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
- if ignoreable_200_log_pattern.match(logline):
- return False
+ ignorable_log_pattern = re.compile(f'^Not Found: /.*/?{ignorable_url_pattern.pattern}', re.I | re.M)
+ if ignorable_log_pattern.match(logline):
+ return False
+ # ignore staticfile requests that 200 or 30*
+ for ignorable_url_pattern in IGNORABLE_200_URLS:
+ if ignorable_url_pattern.match(logline):
+ return False
+
return True
+
+ERROR_LOG = tempfile.NamedTemporaryFile().name
+
if CONFIG.LOGS_DIR.exists():
ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log')
else:
# historically too many edge cases here around creating log dir w/ correct permissions early on
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
- ERROR_LOG = tempfile.NamedTemporaryFile().name
+ print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
LOGGING = {
'version': 1,
@@ -445,6 +402,10 @@ LOGGING = {
}
+################################################################################
+### REST API Outbound Webhooks settings
+################################################################################
+
# Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
SIGNAL_WEBHOOKS = {
@@ -458,7 +419,9 @@ SIGNAL_WEBHOOKS = {
},
}
-DATA_UPLOAD_MAX_NUMBER_FIELDS = None
+################################################################################
+### Admin Data View Settings
+################################################################################
ADMIN_DATA_VIEWS = {
"NAME": "Environment",
@@ -495,3 +458,86 @@ ADMIN_DATA_VIEWS = {
},
],
}
+
+
+################################################################################
+### Debug Settings
+################################################################################
+
+# only enable debug toolbar when in DEBUG mode with --nothreading (it doesn't work in multithreaded mode)
+DEBUG_TOOLBAR = False
+DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
+if DEBUG_TOOLBAR:
+ try:
+ import debug_toolbar # noqa
+ DEBUG_TOOLBAR = True
+ except ImportError:
+ DEBUG_TOOLBAR = False
+
+if DEBUG_TOOLBAR:
+ INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
+ INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
+ DEBUG_TOOLBAR_CONFIG = {
+ "SHOW_TOOLBAR_CALLBACK": lambda request: True,
+ "RENDER_PANELS": True,
+ }
+ DEBUG_TOOLBAR_PANELS = [
+ 'debug_toolbar.panels.history.HistoryPanel',
+ 'debug_toolbar.panels.versions.VersionsPanel',
+ 'debug_toolbar.panels.timer.TimerPanel',
+ 'debug_toolbar.panels.settings.SettingsPanel',
+ 'debug_toolbar.panels.headers.HeadersPanel',
+ 'debug_toolbar.panels.request.RequestPanel',
+ 'debug_toolbar.panels.sql.SQLPanel',
+ 'debug_toolbar.panels.staticfiles.StaticFilesPanel',
+ # 'debug_toolbar.panels.templates.TemplatesPanel',
+ 'debug_toolbar.panels.cache.CachePanel',
+ 'debug_toolbar.panels.signals.SignalsPanel',
+ 'debug_toolbar.panels.logging.LoggingPanel',
+ 'debug_toolbar.panels.redirects.RedirectsPanel',
+ 'debug_toolbar.panels.profiling.ProfilingPanel',
+ 'djdt_flamegraph.FlamegraphPanel',
+ ]
+ MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
+
+if DEBUG:
+ from django_autotyping.typing import AutotypingSettingsDict
+
+ INSTALLED_APPS += ['django_autotyping']
+ AUTOTYPING: AutotypingSettingsDict = {
+ "STUBS_GENERATION": {
+ "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
+ }
+ }
+
+# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
+# Must delete archivebox/templates/admin to use because it relies on some things we override
+# visit /__requests_tracker__/ to access
+DEBUG_REQUESTS_TRACKER = True
+DEBUG_REQUESTS_TRACKER = DEBUG_REQUESTS_TRACKER and DEBUG
+if DEBUG_REQUESTS_TRACKER:
+ import requests_tracker
+
+ INSTALLED_APPS += ["requests_tracker"]
+ MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
+ INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
+
+ TEMPLATE_DIRS.insert(0, str(Path(inspect.getfile(requests_tracker)).parent / "templates"))
+
+ REQUESTS_TRACKER_CONFIG = {
+ "TRACK_SQL": True,
+ "ENABLE_STACKTRACES": False,
+ "IGNORE_PATHS_PATTERNS": (
+ r".*/favicon\.ico",
+ r".*\.png",
+ r"/admin/jsi18n/",
+ ),
+ "IGNORE_SQL_PATTERNS": (
+ r"^SELECT .* FROM django_migrations WHERE app = 'requests_tracker'",
+ r"^SELECT .* FROM django_migrations WHERE app = 'auth'",
+ ),
+ }
+
+# https://docs.pydantic.dev/logfire/integrations/django/ (similar to DataDog / NewRelic / etc.)
+DEBUG_LOGFIRE = False
+DEBUG_LOGFIRE = DEBUG_LOGFIRE and (Path(CONFIG.OUTPUT_DIR) / '.logfire').is_dir()
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 036ff73c..22d6a405 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -218,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
if type(all_links) is QuerySet:
num_links: int = all_links.count()
get_link = lambda x: x.as_link_with_details()
- all_links = all_links.iterator()
+ all_links = all_links.iterator(chunk_size=500)
else:
num_links: int = len(all_links)
get_link = lambda x: x
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index cd72be4e..c97b2f28 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -197,7 +197,7 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
@enforce_types
-def wget_output_path(link: Link) -> Optional[str]:
+def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
@@ -245,6 +245,15 @@ def wget_output_path(link: Link) -> Optional[str]:
# https://example.com/abc/test/?v=zzVa_tX1OiI
# > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
+ cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path'
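+ # the key embeds link.updated, so re-archiving a snapshot changes the key and any
+ # previously cached output_path is simply never read again (no explicit invalidation)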
+
+ if not nocache:
+ from django.core.cache import cache
+ cached_result = cache.get(cache_key)
+ if cached_result:
+ return cached_result
+
+
# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
# unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
@@ -271,6 +280,8 @@ def wget_output_path(link: Link) -> Optional[str]:
output_path = None
if output_path:
+ if not nocache:
+ cache.set(cache_key, output_path)
return output_path
# fallback to just the domain dir
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 1bc5a104..1edd3caf 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
- links = (snapshot.as_link() for snapshot in snapshots.iterator())
+ links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
return {
link.link_dir: link
for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
- links = (snapshot.as_link() for snapshot in snapshots.iterator())
+ links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
return {
link.link_dir: link
for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
- links = (snapshot.as_link() for snapshot in snapshots.iterator())
+ links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
return {
link.link_dir: link
for link in filter(is_unarchived, links)
@@ -448,7 +448,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs with a valid index matched to the main index and archived content"""
- links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+ links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
return {
link.link_dir: link
for link in filter(is_valid, links)
@@ -475,7 +475,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
)
- for path in chain(snapshots.iterator(), data_folders):
+ for path in chain(snapshots.iterator(chunk_size=500), data_folders):
link = None
if type(path) is not str:
path = path.as_link().link_dir
@@ -518,7 +518,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that don't contain a valid index and aren't listed in the main index"""
corrupted = {}
- for snapshot in snapshots.iterator():
+ for snapshot in snapshots.iterator(chunk_size=500):
link = snapshot.as_link()
if is_corrupt(link):
corrupted[link.link_dir] = link
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index 339f9429..2e5d18bc 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -124,7 +124,15 @@ def snapshot_icons(snapshot) -> str:
from core.models import ArchiveResult
# start = datetime.now(timezone.utc)
- archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+ if hasattr(snapshot, '_prefetched_objects_cache') and 'archiveresult_set' in snapshot._prefetched_objects_cache:
+ archive_results = [
+ result
+ for result in snapshot.archiveresult_set.all()
+ if result.status == "succeeded" and result.output
+ ]
+ else:
+ archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+
link = snapshot.as_link()
path = link.archive_path
canon = link.canonical_outputs()
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index 10c1525d..97058590 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -37,9 +37,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
@enforce_types
def write_link_to_sql_index(link: Link, created_by_id: int | None=None):
from core.models import Snapshot, ArchiveResult
+ from abid_utils.models import get_or_create_system_user_pk
+
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
- info['created_by_id'] = created_by_id
+ info['created_by_id'] = created_by_id or get_or_create_system_user_pk()
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
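The tag_list expression above splits on the separator pattern and dedupes with dict.fromkeys, which preserves first-seen order (unlike set()). For example, with an assumed separator pattern of commas/semicolons:

    import re

    TAG_SEPARATOR_PATTERN = r'[,;]'   # assumed here; the real pattern lives in config
    tags = 'news, tech; news, archive'
    tag_list = list(dict.fromkeys(tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, tags)))
    assert tag_list == ['news', 'tech', 'archive']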
diff --git a/archivebox/main.py b/archivebox/main.py
index b36fb3dd..5ab175bb 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -960,7 +960,8 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
run_subcommand('init', stdin=None, pwd=out_dir)
setup_django(out_dir=out_dir, check_db=True)
- from core.models import User
+ from django.contrib.auth import get_user_model
+ User = get_user_model()
if not User.objects.filter(is_superuser=True).exists():
stderr('\n[+] Creating new admin user for the Web UI...', color='green')
@@ -979,16 +980,16 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
'--upgrade',
'--no-cache-dir',
'--no-warn-script-location',
- 'youtube_dl',
+ 'yt-dlp',
], capture_output=False, cwd=out_dir)
pkg_path = run_shell([
PYTHON_BINARY, '-m', 'pip',
'show',
- 'youtube_dl',
+ 'yt-dlp',
], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
- NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'youtube_dl' / '__main__.py'
+ NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt_dlp' / '__main__.py'  # the yt-dlp pip package installs its module dir as yt_dlp
os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
- assert NEW_YOUTUBEDL_BINARY.exists(), f'youtube_dl must exist inside {pkg_path}'
+ assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
config(f'YOUTUBEDL_BINARY={NEW_YOUTUBEDL_BINARY}', set=True, out_dir=out_dir)
except BaseException as e: # lgtm [py/catch-base-exception]
stderr(f'[X] Failed to install python packages: {e}', color='red')
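The yt-dlp swap above parses `pip show` output to find the site-packages directory; note that the installed module directory is yt_dlp (underscore) even though the PyPI package is named yt-dlp. A sketch with hypothetical output:

    from pathlib import Path

    stdout = 'Name: yt-dlp\nVersion: 2024.8.6\nLocation: /usr/lib/python3.10/site-packages\n'  # hypothetical
    pkg_path = stdout.split('Location: ')[-1].split('\n', 1)[0]
    binary = Path(pkg_path) / 'yt_dlp' / '__main__.py'   # runnable via `python -m yt_dlp`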
diff --git a/archivebox/package-lock.json b/archivebox/package-lock.json
index 0645c468..abcb8192 100644
--- a/archivebox/package-lock.json
+++ b/archivebox/package-lock.json
@@ -11,7 +11,7 @@
"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
- "single-file-cli": "^1.1.54"
+ "single-file-cli": "^2.0.58"
}
},
"node_modules/@asamuzakjp/dom-selector": {
@@ -236,9 +236,9 @@
"license": "MIT"
},
"node_modules/@types/node": {
- "version": "22.5.0",
- "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
- "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
+ "version": "22.5.1",
+ "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
+ "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
"license": "MIT",
"optional": true,
"dependencies": {
@@ -353,9 +353,9 @@
}
},
"node_modules/aws4": {
- "version": "1.13.1",
- "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
- "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
+ "version": "1.13.2",
+ "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
+ "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
"license": "MIT"
},
"node_modules/b4a": {
@@ -2376,9 +2376,9 @@
}
},
"node_modules/tslib": {
- "version": "2.6.3",
- "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
- "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
+ "version": "2.7.0",
+ "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
+ "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
"license": "0BSD"
},
"node_modules/turndown": {
diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py
index eec4d73b..3415f35e 100644
--- a/archivebox/parsers/pocket_api.py
+++ b/archivebox/parsers/pocket_api.py
@@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional
from configparser import ConfigParser
from pathlib import Path
-from ..vendor.pocket import Pocket
+from pocket import Pocket
from ..index.schema import Link
from ..util import enforce_types
diff --git a/archivebox/pkg/__init__.py b/archivebox/pkg/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/pkg/admin.py b/archivebox/pkg/admin.py
new file mode 100644
index 00000000..8c38f3f3
--- /dev/null
+++ b/archivebox/pkg/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/archivebox/pkg/apps.py b/archivebox/pkg/apps.py
new file mode 100644
index 00000000..fa8a6913
--- /dev/null
+++ b/archivebox/pkg/apps.py
@@ -0,0 +1,14 @@
+__package__ = 'archivebox.pkg'
+
+from django.apps import AppConfig
+
+
+class PkgsConfig(AppConfig):
+ default_auto_field = 'django.db.models.BigAutoField'
+ name = 'pkg'
+
+ def ready(self):
+ from .settings import LOADED_DEPENDENCIES
+
+ # print(LOADED_DEPENDENCIES)
+
\ No newline at end of file
diff --git a/archivebox/pkg/management/__init__.py b/archivebox/pkg/management/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/pkg/management/commands/__init__.py b/archivebox/pkg/management/commands/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/pkg/management/commands/pkg.py b/archivebox/pkg/management/commands/pkg.py
new file mode 100644
index 00000000..7cbf795a
--- /dev/null
+++ b/archivebox/pkg/management/commands/pkg.py
@@ -0,0 +1,75 @@
+__package__ = 'archivebox.pkg.management.commands'
+
+from django.core.management.base import BaseCommand
+from django.conf import settings
+
+from pydantic_pkgr import Binary
+
+from plugantic.plugins import LOADED_PLUGINS
+
+from pkg.settings import env
+
+
+class Command(BaseCommand):
+ def handle(self, *args, method, **options):
+ method(*args, **options)
+
+ def add_arguments(self, parser):
+ subparsers = parser.add_subparsers(title="sub-commands", required=True)
+
+ list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
+ list_parser.set_defaults(method=self.list)
+
+ install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
+ install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
+ install_parser.add_argument("package_names", nargs="+", type=str)
+ install_parser.set_defaults(method=self.install)
+
+ def list(self, *args, **options):
+ self.stdout.write('################# PLUGINS ####################')
+ for plugin in LOADED_PLUGINS:
+ self.stdout.write(f'{plugin.name}:')
+ for binary in plugin.binaries:
+ binary = binary.install()
+ self.stdout.write(f' {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}')
+
+ self.stdout.write('\n################# LEGACY ####################')
+ for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
+ bin_name = settings.CONFIG[bin_key]
+
+ self.stdout.write(f'{bin_key}: {bin_name}')
+
+ def install(self, *args, **options):
+ for package_name in options["package_names"]:
+ binary = Binary(name=package_name, providers=[env])
+ print(binary)
+
+ try:
+ loaded_bin = binary.load()
+ self.stdout.write(
+ self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+ )
+ except Exception as e:
+ self.stderr.write(
+ self.style.ERROR(f"Error loading {package_name}: {e}")
+ )
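A hedged sketch of driving the new pkg command programmatically (call_command with subparser-based commands works on recent Django; older versions may need the manage.py CLI instead):

    from django.core.management import call_command

    call_command('pkg', 'list')                       # plugin binaries + legacy DEPENDENCIES
    call_command('pkg', 'install', 'wget', 'yt-dlp')  # resolve/install via the env provider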
diff --git a/archivebox/pkg/migrations/__init__.py b/archivebox/pkg/migrations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/pkg/models.py b/archivebox/pkg/models.py
new file mode 100644
index 00000000..71a83623
--- /dev/null
+++ b/archivebox/pkg/models.py
@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
diff --git a/archivebox/pkg/settings.py b/archivebox/pkg/settings.py
new file mode 100644
index 00000000..7f13d125
--- /dev/null
+++ b/archivebox/pkg/settings.py
@@ -0,0 +1,86 @@
+__package__ = 'archivebox.pkg'
+
+import os
+import sys
+import shutil
+import inspect
+from pathlib import Path
+
+import django
+from django.conf import settings
+from django.db.backends.sqlite3.base import Database as sqlite3
+
+from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
+from pydantic_pkgr.binprovider import bin_abspath
+
+from ..config import NODE_BIN_PATH, bin_path
+
+env = EnvProvider(PATH=NODE_BIN_PATH + ':' + os.environ.get('PATH', '/bin'))
+
+
+LOADED_DEPENDENCIES = {}
+
+for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
+ # 'PYTHON_BINARY': {
+ # 'path': bin_path(config['PYTHON_BINARY']),
+ # 'version': config['PYTHON_VERSION'],
+ # 'hash': bin_hash(config['PYTHON_BINARY']),
+ # 'enabled': True,
+ # 'is_valid': bool(config['PYTHON_VERSION']),
+ # },
+
+
+ bin_name = settings.CONFIG[bin_key]
+
+ if bin_name.endswith('django/__init__.py'):
+ binary_spec = Binary(name='django', providers=[env], provider_overrides={
+ 'env': {
+ 'abspath': lambda: Path(inspect.getfile(django)),
+ 'version': lambda: SemVer('{}.{}.{} {} ({})'.format(*django.VERSION)),
+ }
+ })
+ elif bin_name.endswith('sqlite3/dbapi2.py'):
+ binary_spec = Binary(name='sqlite3', providers=[env], provider_overrides={
+ 'env': {
+ 'abspath': lambda: Path(inspect.getfile(sqlite3)),
+ 'version': lambda: SemVer(sqlite3.version),
+ }
+ })
+ elif bin_name.endswith('archivebox'):
+ binary_spec = Binary(name='archivebox', providers=[env], provider_overrides={
+ 'env': {
+ 'abspath': lambda: shutil.which(str(Path('archivebox').expanduser())),
+ 'version': lambda: settings.CONFIG.VERSION,
+ }
+ })
+ elif bin_name.endswith('postlight/parser/cli.js'):
+ binary_spec = Binary(name='postlight-parser', providers=[env], provider_overrides={
+ 'env': {
+ 'abspath': lambda: bin_path('postlight-parser'),
+ 'version': lambda: SemVer('1.0.0'),
+ }
+ })
+ else:
+ binary_spec = Binary(name=bin_name, providers=[env])
+
+ try:
+ binary = binary_spec.load()
+ except Exception as e:
+ # print(f"- ❌ Binary {bin_name} failed to load with error: {e}")
+ continue
+
+ assert isinstance(binary.loaded_version, SemVer)
+
+ try:
+ assert str(binary.loaded_version) == dependency['version'], f"Expected {bin_name} version {dependency['version']}, got {binary.loaded_version}"
+ assert str(binary.loaded_respath) == str(bin_abspath(dependency['path']).resolve()), f"Expected {bin_name} abspath {bin_abspath(dependency['path']).resolve()}, got {binary.loaded_respath}"
+ assert binary.is_valid == dependency['is_valid'], f"Expected {bin_name} is_valid={dependency['is_valid']}, got {binary.is_valid}"
+ except Exception as e:
+ pass
+ # print(f"WARNING: Error loading {bin_name}: {e}")
+ # import ipdb; ipdb.set_trace()
+
+ # print(f"- ✅ Binary {bin_name} loaded successfully")
+ LOADED_DEPENDENCIES[bin_key] = binary
+
+
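The settings module above wraps every legacy DEPENDENCIES entry in a pydantic_pkgr Binary. A minimal sketch of that flow in isolation (assumes wget is on $PATH):

    from pydantic_pkgr import Binary, EnvProvider

    wget = Binary(name='wget', providers=[EnvProvider()]).load()
    print(wget.loaded_version, wget.loaded_abspath)   # e.g. 1.24.5 /usr/bin/wget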
diff --git a/archivebox/pkg/tests.py b/archivebox/pkg/tests.py
new file mode 100644
index 00000000..7ce503c2
--- /dev/null
+++ b/archivebox/pkg/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/archivebox/pkg/views.py b/archivebox/pkg/views.py
new file mode 100644
index 00000000..91ea44a2
--- /dev/null
+++ b/archivebox/pkg/views.py
@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.
diff --git a/archivebox/plugantic/__init__.py b/archivebox/plugantic/__init__.py
index 244d084f..c8f37e05 100644
--- a/archivebox/plugantic/__init__.py
+++ b/archivebox/plugantic/__init__.py
@@ -1,6 +1,5 @@
__package__ = 'archivebox.plugantic'
-from .binproviders import BinProvider
from .binaries import Binary
from .extractors import Extractor
from .replayers import Replayer
diff --git a/archivebox/plugantic/apps.py b/archivebox/plugantic/apps.py
index c0f1ce71..57d57cd8 100644
--- a/archivebox/plugantic/apps.py
+++ b/archivebox/plugantic/apps.py
@@ -1,6 +1,17 @@
+import importlib
from django.apps import AppConfig
class PluganticConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'plugantic'
+
+ def ready(self) -> None:
+ from django.conf import settings
+ from .plugins import PLUGINS
+
+ for plugin_name in settings.INSTALLED_PLUGINS.keys():
+ lib = importlib.import_module(f'{plugin_name}.apps')
+ if hasattr(lib, 'PLUGINS'):
+ for plugin_instance in lib.PLUGINS:
+ PLUGINS.append(plugin_instance)
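ready() above expects each entry in settings.INSTALLED_PLUGINS to be an importable package whose apps module exposes a PLUGINS list. A hypothetical plugin module, for illustration only (other Plugin fields may be required by the real model):

    # builtin_plugins/example/apps.py  (hypothetical)
    from plugantic.plugins import Plugin

    PLUGINS = [
        Plugin(name='example'),   # binaries, extractors, replayers, configs omitted
    ]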
diff --git a/archivebox/plugantic/binaries.py b/archivebox/plugantic/binaries.py
index 4788c361..76bd63ac 100644
--- a/archivebox/plugantic/binaries.py
+++ b/archivebox/plugantic/binaries.py
@@ -10,285 +10,17 @@ from typing import Any, Optional, Dict, List
from typing_extensions import Self
from subprocess import run, PIPE
+from pydantic_pkgr import Binary, SemVer, BinName, BinProvider, EnvProvider, AptProvider, BrewProvider, PipProvider, BinProviderName, ProviderLookupDict
-from pydantic_core import ValidationError
-
-from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer
-
-from .binproviders import (
- SemVer,
- BinName,
- BinProviderName,
- HostBinPath,
- BinProvider,
- EnvProvider,
- AptProvider,
- BrewProvider,
- PipProvider,
- ProviderLookupDict,
- bin_name,
- bin_abspath,
- path_is_script,
- path_is_executable,
-)
-
-
-class Binary(BaseModel):
- name: BinName
- description: str = Field(default='')
-
- providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers')
- provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides')
-
- loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider')
- loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath')
- loaded_version: Optional[SemVer] = Field(default=None, alias='version')
-
- # bin_filename: see below
- # is_executable: see below
- # is_script
- # is_valid: see below
-
-
- @model_validator(mode='after')
- def validate(self):
- self.loaded_abspath = bin_abspath(self.name) or self.name
- self.description = self.description or self.name
-
- assert self.providers_supported, f'No providers were given for package {self.name}'
-
- # pull in any overrides from the binproviders
- for provider in self.providers_supported:
- overrides_by_provider = provider.get_providers_for_bin(self.name)
- if overrides_by_provider:
- self.provider_overrides[provider.name] = {
- **overrides_by_provider,
- **self.provider_overrides.get(provider.name, {}),
- }
- return self
-
- @field_validator('loaded_abspath', mode='before')
- def parse_abspath(cls, value: Any):
- return bin_abspath(value)
-
- @field_validator('loaded_version', mode='before')
- def parse_version(cls, value: Any):
- return value and SemVer(value)
-
- @field_serializer('provider_overrides', when_used='json')
- def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]:
- return {
- provider_name: {
- key: str(val)
- for key, val in overrides.items()
- }
- for provider_name, overrides in provider_overrides.items()
- }
-
- @computed_field # type: ignore[misc] # see mypy issue #1362
- @property
- def bin_filename(self) -> BinName:
- if self.is_script:
- # e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite
- name = self.name
- elif self.loaded_abspath:
- # e.g. '/opt/homebrew/bin/wget' -> wget
- name = bin_name(self.loaded_abspath)
- else:
- # e.g. 'ytdlp' -> 'yt-dlp'
- name = bin_name(self.name)
- return name
-
- @computed_field # type: ignore[misc] # see mypy issue #1362
- @property
- def is_executable(self) -> bool:
- try:
- assert self.loaded_abspath and path_is_executable(self.loaded_abspath)
- return True
- except (ValidationError, AssertionError):
- return False
-
- @computed_field # type: ignore[misc] # see mypy issue #1362
- @property
- def is_script(self) -> bool:
- try:
- assert self.loaded_abspath and path_is_script(self.loaded_abspath)
- return True
- except (ValidationError, AssertionError):
- return False
-
- @computed_field # type: ignore[misc] # see mypy issue #1362
- @property
- def is_valid(self) -> bool:
- return bool(
- self.name
- and self.loaded_abspath
- and self.loaded_version
- and (self.is_executable or self.is_script)
- )
-
- @validate_call
- def install(self) -> Self:
- if not self.providers_supported:
- return self
-
- exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
- for provider in self.providers_supported:
- try:
- installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name))
- if installed_bin:
- # print('INSTALLED', self.name, installed_bin)
- return self.model_copy(update={
- 'loaded_provider': provider.name,
- 'loaded_abspath': installed_bin.abspath,
- 'loaded_version': installed_bin.version,
- })
- except Exception as err:
- print(err)
- exc = err
- raise exc
-
- @validate_call
- def load(self, cache=True) -> Self:
- if self.is_valid:
- return self
-
- if not self.providers_supported:
- return self
-
- exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
- for provider in self.providers_supported:
- try:
- installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name))
- if installed_bin:
- # print('LOADED', provider, self.name, installed_bin)
- return self.model_copy(update={
- 'loaded_provider': provider.name,
- 'loaded_abspath': installed_bin.abspath,
- 'loaded_version': installed_bin.version,
- })
- except Exception as err:
- print(err)
- exc = err
- raise exc
-
- @validate_call
- def load_or_install(self, cache=True) -> Self:
- if self.is_valid:
- return self
-
- if not self.providers_supported:
- return self
-
- exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
- for provider in self.providers_supported:
- try:
- installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache)
- if installed_bin:
- # print('LOADED_OR_INSTALLED', self.name, installed_bin)
- return self.model_copy(update={
- 'loaded_provider': provider.name,
- 'loaded_abspath': installed_bin.abspath,
- 'loaded_version': installed_bin.version,
- })
- except Exception as err:
- print(err)
- exc = err
- raise exc
-
- @validate_call
- def exec(self, args=(), pwd='.'):
- assert self.loaded_abspath
- assert self.loaded_version
- return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, pwd=pwd)
+import django
+from django.db.backends.sqlite3.base import Database as sqlite3
-class SystemPythonHelpers:
- @staticmethod
- def get_subdeps() -> str:
- return 'python3 python3-minimal python3-pip python3-virtualenv'
-
- @staticmethod
- def get_abspath() -> str:
- return sys.executable
-
- @staticmethod
- def get_version() -> str:
- return '{}.{}.{}'.format(*sys.version_info[:3])
-
-
-class SqliteHelpers:
- @staticmethod
- def get_abspath() -> Path:
- import sqlite3
- importlib.reload(sqlite3)
- return Path(inspect.getfile(sqlite3))
-
- @staticmethod
- def get_version() -> SemVer:
- import sqlite3
- importlib.reload(sqlite3)
- version = sqlite3.version
- assert version
- return SemVer(version)
-
-class DjangoHelpers:
- @staticmethod
- def get_django_abspath() -> str:
- import django
- return inspect.getfile(django)
-
-
- @staticmethod
- def get_django_version() -> str:
- import django
- return '{}.{}.{} {} ({})'.format(*django.VERSION)
-
-class YtdlpHelpers:
- @staticmethod
- def get_ytdlp_subdeps() -> str:
- return 'yt-dlp ffmpeg'
-
- @staticmethod
- def get_ytdlp_version() -> str:
- import yt_dlp
- importlib.reload(yt_dlp)
-
- version = yt_dlp.version.__version__
- assert version
- return version
-
-class PythonBinary(Binary):
- name: BinName = 'python'
-
- providers_supported: List[BinProvider] = [
- EnvProvider(
- subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'},
- abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'},
- version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'},
- ),
- ]
-
-class SqliteBinary(Binary):
- name: BinName = 'sqlite'
- providers_supported: List[BinProvider] = [
- EnvProvider(
- version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'},
- abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'},
- ),
- ]
-
-class DjangoBinary(Binary):
- name: BinName = 'django'
- providers_supported: List[BinProvider] = [
- EnvProvider(
- abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'},
- version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'},
- ),
- ]
-
+def get_ytdlp_version() -> str:
+ import yt_dlp
+ return yt_dlp.version.__version__
@@ -296,16 +28,26 @@ class DjangoBinary(Binary):
class YtdlpBinary(Binary):
name: BinName = 'yt-dlp'
providers_supported: List[BinProvider] = [
- # EnvProvider(),
- PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}),
- BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}),
- # AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}),
+ EnvProvider(),
+ PipProvider(),
+ BrewProvider(),
+ AptProvider(),
]
-
+ provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+ 'pip': {
+ 'version': get_ytdlp_version,
+ },
+ 'brew': {
+ 'subdeps': lambda: 'yt-dlp ffmpeg',
+ },
+ 'apt': {
+ 'subdeps': lambda: 'yt-dlp ffmpeg',
+ }
+ }
class WgetBinary(Binary):
name: BinName = 'wget'
- providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()]
+ providers_supported: List[BinProvider] = [EnvProvider(), AptProvider(), BrewProvider()]
# if __name__ == '__main__':
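With this rewrite, per-provider behavior moves out of the provider constructors and into a provider_overrides dict keyed by provider name. A hedged sketch of resolving the binary:

    ytdlp = YtdlpBinary().load_or_install()
    print(ytdlp.loaded_provider, ytdlp.loaded_abspath, ytdlp.loaded_version)
    # pip reports its version via get_ytdlp_version(); brew/apt install 'yt-dlp ffmpeg'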
diff --git a/archivebox/plugantic/binproviders.py b/archivebox/plugantic/binproviders.py
deleted file mode 100644
index 1c9933ea..00000000
--- a/archivebox/plugantic/binproviders.py
+++ /dev/null
@@ -1,561 +0,0 @@
-__package__ = 'archivebox.plugantic'
-
-import os
-import shutil
-import operator
-
-from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING
-from typing_extensions import Self
-from abc import ABC, abstractmethod
-from collections import namedtuple
-from pathlib import Path
-from subprocess import run, PIPE
-
-from pydantic_core import core_schema, ValidationError
-from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler
-
-
-
-def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool:
- """returns True if a lambda func takes args/kwargs of any kind, otherwise false if it's pure/argless"""
- code = lambda_func.__code__
- has_args = code.co_argcount > 0
- has_varargs = code.co_flags & 0x04 != 0
- has_varkw = code.co_flags & 0x08 != 0
- return has_args or has_varargs or has_varkw
-
-
-def is_semver_str(semver: Any) -> bool:
- if isinstance(semver, str):
- return (semver.count('.') == 2 and semver.replace('.', '').isdigit())
- return False
-
-def semver_to_str(semver: tuple[int, int, int] | str) -> str:
- if isinstance(semver, (list, tuple)):
- return '.'.join(str(chunk) for chunk in semver)
- if is_semver_str(semver):
- return semver
- raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver))
-
-
-SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0))
-SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int]
-
-class SemVer(SemVerTuple):
- major: int
- minor: int = 0
- patch: int = 0
-
- if TYPE_CHECKING:
- full_text: str | None = ''
-
- def __new__(cls, *args, full_text=None, **kwargs):
- # '1.1.1'
- if len(args) == 1 and is_semver_str(args[0]):
- result = SemVer.parse(args[0])
-
- # ('1', '2', '3')
- elif len(args) == 1 and isinstance(args[0], (tuple, list)):
- result = SemVer.parse(args[0])
-
- # (1, '2', None)
- elif not all(isinstance(arg, (int, type(None))) for arg in args):
- result = SemVer.parse(args)
-
- # (None)
- elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())):
- result = None
-
- # 1, 2, 3
- else:
- result = SemVerTuple.__new__(cls, *args, **kwargs)
-
- if result is not None:
- # add first line as extra hidden metadata so it can be logged without having to re-run version cmd
- result.full_text = full_text or str(result)
- return result
-
- @classmethod
- def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None:
- """
- parses a version tag string formatted like into (major, minor, patch) ints
- 'Google Chrome 124.0.6367.208' -> (124, 0, 6367)
- 'GNU Wget 1.24.5 built on darwin23.2.0.' -> (1, 24, 5)
- 'curl 8.4.0 (x86_64-apple-darwin23.0) ...' -> (8, 4, 0)
- '2024.04.09' -> (2024, 4, 9)
-
- """
- # print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout)
-
- if isinstance(version_stdout, (tuple, list)):
- version_stdout = '.'.join(str(chunk) for chunk in version_stdout)
- elif isinstance(version_stdout, bytes):
- version_stdout = version_stdout.decode()
- elif not isinstance(version_stdout, str):
- version_stdout = str(version_stdout)
-
- # no text to work with, return None immediately
- if not version_stdout.strip():
- # raise Exception('Tried to parse semver from empty version output (is binary installed and available?)')
- return None
-
- just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0]
- contains_semver = lambda col: (
- col.count('.') in (1, 2, 3)
- and all(chunk.isdigit() for chunk in col.split('.')[:3]) # first 3 chunks can only be nums
- )
-
- full_text = version_stdout.split('\n')[0].strip()
- first_line_columns = full_text.split()[:4]
- version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns)))
-
- # could not find any column of first line that looks like a version number, despite there being some text
- if not version_columns:
- # raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns)))
- return None
-
- # take first col containing a semver, and truncate it to 3 chunks (e.g. 2024.04.09.91) -> (2024, 04, 09)
- first_version_tuple = version_columns[0].split('.', 3)[:3]
-
- # print('FINAL_VALUE', first_version_tuple)
-
- return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text)
-
- def __str__(self):
- return '.'.join(str(chunk) for chunk in self)
-
- # @classmethod
- # def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema:
- # default_schema = handler(source)
- # return core_schema.no_info_after_validator_function(
- # cls.parse,
- # default_schema,
- # serialization=core_schema.plain_serializer_function_ser_schema(
- # lambda semver: str(semver),
- # info_arg=False,
- # return_schema=core_schema.str_schema(),
- # ),
- # )
-
-assert SemVer(None) == None
-assert SemVer('') == None
-assert SemVer.parse('') == None
-assert SemVer(1) == (1, 0, 0)
-assert SemVer(1, 2) == (1, 2, 0)
-assert SemVer('1.2+234234') == (1, 2, 0)
-assert SemVer((1, 2, 3)) == (1, 2, 3)
-assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3'
-assert SemVer(('1', '2', '3')) == (1, 2, 3)
-assert SemVer.parse('5.6.7') == (5, 6, 7)
-assert SemVer.parse('124.0.6367.208') == (124, 0, 6367)
-assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0)
-assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367)
-assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367)
-assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 234.234.123'
-assert SemVer.parse('Google Chrome') == None
-
-@validate_call
-def bin_name(bin_path_or_name: str | Path) -> str:
- name = Path(bin_path_or_name).name
- assert len(name) > 1
- assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), (
- f'Binary name can only contain a-Z0-9-_.: {name}')
- return name
-
-BinName = Annotated[str, AfterValidator(bin_name)]
-
-@validate_call
-def path_is_file(path: Path | str) -> Path:
- path = Path(path) if isinstance(path, str) else path
- assert path.is_file(), f'Path is not a file: {path}'
- return path
-
-HostExistsPath = Annotated[Path, AfterValidator(path_is_file)]
-
-@validate_call
-def path_is_executable(path: HostExistsPath) -> HostExistsPath:
- assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})'
- return path
-
-@validate_call
-def path_is_script(path: HostExistsPath) -> HostExistsPath:
- SCRIPT_EXTENSIONS = ('.py', '.js', '.sh')
- assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS))
- return path
-
-HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)]
-
-@validate_call
-def path_is_abspath(path: Path) -> Path:
- return path.resolve()
-
-HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)]
-HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)]
-
-
-@validate_call
-def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None:
- assert bin_path_or_name
-
- if str(bin_path_or_name).startswith('/'):
- # already a path, get its absolute form
- abspath = Path(bin_path_or_name).resolve()
- else:
- # not a path yet, get path using os.which
- binpath = shutil.which(bin_path_or_name)
- if not binpath:
- return None
- abspath = Path(binpath).resolve()
-
- try:
- return TypeAdapter(HostBinPath).validate_python(abspath)
- except ValidationError:
- return None
-
-
-@validate_call
-def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None:
- return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode())
-
-
-class InstalledBin(BaseModel):
- abspath: HostBinPath
- version: SemVer
-
-
-def is_valid_install_string(pkgs_str: str) -> str:
- """Make sure a string is a valid install string for a package manager, e.g. 'yt-dlp ffmpeg'"""
- assert pkgs_str
- assert all(len(pkg) > 1 for pkg in pkgs_str.split(' '))
- return pkgs_str
-
-def is_valid_python_dotted_import(import_str: str) -> str:
- assert import_str and import_str.replace('.', '').replace('_', '').isalnum()
- return import_str
-
-InstallStr = Annotated[str, AfterValidator(is_valid_install_string)]
-
-LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)]
-
-ProviderHandler = Callable[..., Any] | Callable[[], Any] # must take no args [], or [bin_name: str, **kwargs]
-#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
-ProviderHandlerRef = LazyImportStr | ProviderHandler
-ProviderLookupDict = Dict[str, LazyImportStr]
-ProviderType = Literal['abspath', 'version', 'subdeps', 'install']
-
-
-# class Host(BaseModel):
-# machine: str
-# system: str
-# platform: str
-# in_docker: bool
-# in_qemu: bool
-# python: str
-
-BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor']
-
-
-class BinProvider(ABC, BaseModel):
- name: BinProviderName
-
- abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True)
- version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True)
- subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True)
- install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True)
-
- _abspath_cache: ClassVar = {}
- _version_cache: ClassVar = {}
- _install_cache: ClassVar = {}
-
- # def provider_version(self) -> SemVer | None:
- # """Version of the actual underlying package manager (e.g. pip v20.4.1)"""
- # if self.name in ('env', 'vendor'):
- # return SemVer('0.0.0')
- # installer_binpath = Path(shutil.which(self.name)).resolve()
- # return bin_version(installer_binpath)
-
- # def provider_host(self) -> Host:
- # """Information about the host env, archictecture, and OS needed to select & build packages"""
- # p = platform.uname()
- # return Host(
- # machine=p.machine,
- # system=p.system,
- # platform=platform.platform(),
- # python=sys.implementation.name,
- # in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true',
- # in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true',
- # )
-
- def get_default_providers(self):
- return self.get_providers_for_bin('*')
-
- def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None:
- if provider_func is None:
- return None
-
- # if provider_func is a dotted path to a function on self, swap it for the actual function
- if isinstance(provider_func, str) and provider_func.startswith('self.'):
- provider_func = getattr(self, provider_func.split('self.', 1)[-1])
-
- # if provider_func is a dot-formatted import string, import the function
- if isinstance(provider_func, str):
- from django.utils.module_loading import import_string
-
- package_name, module_name, classname, path = provider_func.split('.', 3) # -> abc, def, ghi.jkl
-
- # get .ghi.jkl nested attr present on module abc.def
- imported_module = import_string(f'{package_name}.{module_name}.{classname}')
- provider_func = operator.attrgetter(path)(imported_module)
-
- # # abc.def.ghi.jkl -> 1, 2, 3
- # for idx in range(1, len(path)):
- # parent_path = '.'.join(path[:-idx]) # abc.def.ghi
- # try:
- # parent_module = import_string(parent_path)
- # provider_func = getattr(parent_module, path[-idx])
- # except AttributeError, ImportError:
- # continue
-
- assert TypeAdapter(ProviderHandler).validate_python(provider_func), (
- f'{self.__class__.__name__} provider func for {bin_name} was not a function or dotted-import path: {provider_func}')
-
- return provider_func
-
- @validate_call
- def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict:
- providers_for_bin = {
- 'abspath': self.abspath_provider.get(bin_name),
- 'version': self.version_provider.get(bin_name),
- 'subdeps': self.subdeps_provider.get(bin_name),
- 'install': self.install_provider.get(bin_name),
- }
- only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None}
-
- return only_set_providers_for_bin
-
- @validate_call
- def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler:
- """
- Get the provider func for a given key + Dict of provider callbacks + fallback default provider.
- e.g. get_provider_for_action(bin_name='yt-dlp', 'install', default_provider=self.on_install, ...) -> Callable
- """
-
- provider_func_ref = (
- (overrides or {}).get(provider_type)
- or self.get_providers_for_bin(bin_name).get(provider_type)
- or self.get_default_providers().get(provider_type)
- or default_provider
- )
- # print('getting provider for action', bin_name, provider_type, provider_func)
-
- provider_func = self.resolve_provider_func(provider_func_ref)
-
- assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.'
-
- return provider_func
-
- @validate_call
- def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any:
- provider_func: ProviderHandler = self.get_provider_for_action(
- bin_name=bin_name,
- provider_type=provider_type,
- default_provider=default_provider,
- overrides=overrides,
- )
- if not func_takes_args_or_kwargs(provider_func):
- # if it's a pure argless lambdas, dont pass bin_path and other **kwargs
- provider_func_without_args = cast(Callable[[], Any], provider_func)
- return provider_func_without_args()
-
- provider_func = cast(Callable[..., Any], provider_func)
- return provider_func(bin_name, **kwargs)
-
-
-
- def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None:
- print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...')
- try:
- return bin_abspath(bin_name)
- except ValidationError:
- return None
-
- def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None:
- abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name)
- if not abspath: return None
-
- print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...')
- try:
- return bin_version(abspath)
- except ValidationError:
- return None
-
- def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr:
- print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}')
- # ... subdependency calculation logic here
- return TypeAdapter(InstallStr).validate_python(bin_name)
-
- @abstractmethod
- def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
- subdeps = subdeps or self.get_subdeps(bin_name)
- print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
- # ... install logic here
- assert True
-
-
- @validate_call
- def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None:
- abspath = self.call_provider_for_action(
- bin_name=bin_name,
- provider_type='abspath',
- default_provider=self.on_get_abspath,
- overrides=overrides,
- )
- if not abspath:
- return None
- result = TypeAdapter(HostBinPath).validate_python(abspath)
- self._abspath_cache[bin_name] = result
- return result
-
- @validate_call
- def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None:
- version = self.call_provider_for_action(
- bin_name=bin_name,
- provider_type='version',
- default_provider=self.on_get_version,
- overrides=overrides,
- abspath=abspath,
- )
- if not version:
- return None
- result = SemVer(version)
- self._version_cache[bin_name] = result
- return result
-
- @validate_call
- def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr:
- subdeps = self.call_provider_for_action(
- bin_name=bin_name,
- provider_type='subdeps',
- default_provider=self.on_get_subdeps,
- overrides=overrides,
- )
- if not subdeps:
- subdeps = bin_name
- result = TypeAdapter(InstallStr).validate_python(subdeps)
- return result
-
- @validate_call
- def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None:
- subdeps = self.get_subdeps(bin_name, overrides=overrides)
-
- self.call_provider_for_action(
- bin_name=bin_name,
- provider_type='install',
- default_provider=self.on_install,
- overrides=overrides,
- subdeps=subdeps,
- )
-
- installed_abspath = self.get_abspath(bin_name)
- assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}'
-
- installed_version = self.get_version(bin_name, abspath=installed_abspath)
- assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}'
-
- result = InstalledBin(abspath=installed_abspath, version=installed_version)
- self._install_cache[bin_name] = result
- return result
-
- @validate_call
- def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None:
- installed_abspath = None
- installed_version = None
-
- if cache:
- installed_bin = self._install_cache.get(bin_name)
- if installed_bin:
- return installed_bin
- installed_abspath = self._abspath_cache.get(bin_name)
- installed_version = self._version_cache.get(bin_name)
-
-
- installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides)
- if not installed_abspath:
- return None
-
- installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides)
- if not installed_version:
- return None
-
- return InstalledBin(abspath=installed_abspath, version=installed_version)
-
- @validate_call
- def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None:
- installed = self.load(bin_name, overrides=overrides, cache=cache)
- if not installed:
- installed = self.install(bin_name, overrides=overrides)
- return installed
-
-
-class PipProvider(BinProvider):
- name: BinProviderName = 'pip'
-
- def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
- subdeps = subdeps or self.on_get_subdeps(bin_name)
- print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-
- proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
-
- if proc.returncode != 0:
- print(proc.stdout.strip().decode())
- print(proc.stderr.strip().decode())
- raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
-
-
-class AptProvider(BinProvider):
- name: BinProviderName = 'apt'
-
- subdeps_provider: ProviderLookupDict = {
- 'yt-dlp': lambda: 'yt-dlp ffmpeg',
- }
-
- def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
- subdeps = subdeps or self.on_get_subdeps(bin_name)
- print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-
- run(['apt-get', 'update', '-qq'])
- proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
-
- if proc.returncode != 0:
- print(proc.stdout.strip().decode())
- print(proc.stderr.strip().decode())
- raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
-
-class BrewProvider(BinProvider):
- name: BinProviderName = 'brew'
-
- def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
- subdeps = subdeps or self.on_get_subdeps(bin_name)
- print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-
- proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
-
- if proc.returncode != 0:
- print(proc.stdout.strip().decode())
- print(proc.stderr.strip().decode())
- raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
-
-
-class EnvProvider(BinProvider):
- name: BinProviderName = 'env'
-
- abspath_provider: ProviderLookupDict = {
- # 'python': lambda: Path('/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'),
- }
- version_provider: ProviderLookupDict = {
- # 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
- }
-
- def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
- """The env provider is ready-only and does not install any packages, so this is a no-op"""
- pass
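Everything deleted here (SemVer, BinProvider, and the env/pip/apt/brew providers) now comes from the vendored pydantic-pkgr submodule instead. The parsing behavior carries over; a quick sketch against the new import, mirroring the deleted docstring examples:

    from pydantic_pkgr import SemVer

    assert SemVer.parse('GNU Wget 1.24.5 built on darwin23.2.0.') == (1, 24, 5)
    assert SemVer.parse('Google Chrome') is None   # no version column found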
diff --git a/archivebox/plugantic/extractors.py b/archivebox/plugantic/extractors.py
index 3befa5b5..56d594f3 100644
--- a/archivebox/plugantic/extractors.py
+++ b/archivebox/plugantic/extractors.py
@@ -31,7 +31,7 @@ def no_empty_args(args: List[str]) -> List[str]:
assert all(len(arg) for arg in args)
return args
-ExtractorName = Literal['wget', 'warc', 'media']
+ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)]
diff --git a/archivebox/plugantic/plugins.py b/archivebox/plugantic/plugins.py
index c34c4703..d213fced 100644
--- a/archivebox/plugantic/plugins.py
+++ b/archivebox/plugantic/plugins.py
@@ -14,9 +14,6 @@ from pydantic import (
from .binaries import (
Binary,
- PythonBinary,
- SqliteBinary,
- DjangoBinary,
WgetBinary,
YtdlpBinary,
)
@@ -28,7 +25,6 @@ from .extractors import (
)
from .replayers import (
Replayer,
- GENERIC_REPLAYER,
MEDIA_REPLAYER,
)
from .configs import (
@@ -80,12 +76,6 @@ class Plugin(BaseModel):
})
-class CorePlugin(Plugin):
- name: str = 'core'
- configs: List[SerializeAsAny[ConfigSet]] = []
- binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
- extractors: List[SerializeAsAny[Extractor]] = []
- replayers: List[SerializeAsAny[Replayer]] = [GENERIC_REPLAYER]
class YtdlpPlugin(Plugin):
name: str = 'ytdlp'
@@ -101,11 +91,9 @@ class WgetPlugin(Plugin):
extractors: List[SerializeAsAny[Extractor]] = [WgetExtractor(), WarcExtractor()]
-CORE_PLUGIN = CorePlugin()
YTDLP_PLUGIN = YtdlpPlugin()
WGET_PLUGIN = WgetPlugin()
PLUGINS = [
- CORE_PLUGIN,
YTDLP_PLUGIN,
WGET_PLUGIN,
]
diff --git a/archivebox/plugantic/replayers.py b/archivebox/plugantic/replayers.py
index 12ade623..08f1cd88 100644
--- a/archivebox/plugantic/replayers.py
+++ b/archivebox/plugantic/replayers.py
@@ -22,5 +22,4 @@ class Replayer(BaseModel):
# thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
-GENERIC_REPLAYER = Replayer(name='generic')
MEDIA_REPLAYER = Replayer(name='media')
diff --git a/archivebox/plugantic/views.py b/archivebox/plugantic/views.py
index b29a8cf5..24f256de 100644
--- a/archivebox/plugantic/views.py
+++ b/archivebox/plugantic/views.py
@@ -1,5 +1,8 @@
__package__ = 'archivebox.plugantic'
+import inspect
+from typing import Any
+
from django.http import HttpRequest
from django.utils.html import format_html, mark_safe
@@ -10,6 +13,44 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
from plugantic.plugins import LOADED_PLUGINS
from django.conf import settings
+def obj_to_yaml(obj: Any, indent: int=0) -> str:
+ indent_str = " " * indent
+
+ if isinstance(obj, dict):
+ if not obj:
+ return "{}"
+ result = "\n"
+ for key, value in obj.items():
+ result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n"
+ return result
+
+ elif isinstance(obj, list):
+ if not obj:
+ return "[]"
+ result = "\n"
+ for item in obj:
+ result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n"
+ return result.rstrip()
+
+ elif isinstance(obj, str):
+ if "\n" in obj:
+ return f" |\n{indent_str} " + obj.replace("\n", f"\n{indent_str} ")
+ else:
+ return f" {obj}"
+
+ elif isinstance(obj, (int, float, bool)):
+ return f" {str(obj)}"
+
+ elif callable(obj):
+ source = '\n'.join(
+ '' if 'def ' in line else line
+ for line in inspect.getsource(obj).split('\n')
+ if line.strip()
+ ).split('lambda: ')[-1].rstrip(',')
+ return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ")
+
+ else:
+ return f" {str(obj)}"
@render_with_table_view
def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@@ -18,13 +59,13 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows = {
"Binary": [],
- "From Plugin": [],
"Found Version": [],
+ "From Plugin": [],
"Provided By": [],
"Found Abspath": [],
"Related Configuration": [],
"Overrides": [],
- "Description": [],
+ # "Description": [],
}
relevant_configs = {
@@ -38,8 +79,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
binary = binary.load_or_install()
rows['Binary'].append(ItemLink(binary.name, key=binary.name))
- rows['From Plugin'].append(plugin.name)
rows['Found Version'].append(binary.loaded_version)
+ rows['From Plugin'].append(plugin.name)
rows['Provided By'].append(binary.loaded_provider)
rows['Found Abspath'].append(binary.loaded_abspath)
rows['Related Configuration'].append(mark_safe(', '.join(
@@ -48,8 +89,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
if binary.name.lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
# or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
)))
- rows['Overrides'].append(str(binary.provider_overrides))
- rows['Description'].append(binary.description)
+ rows['Overrides'].append(obj_to_yaml(binary.provider_overrides))
+ # rows['Description'].append(binary.description)
return TableContext(
title="Binaries",
@@ -85,8 +126,8 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
'binprovider': binary.loaded_provider,
'abspath': binary.loaded_abspath,
'version': binary.loaded_version,
- 'overrides': str(binary.provider_overrides),
- 'providers': str(binary.providers_supported),
+ 'overrides': obj_to_yaml(binary.provider_overrides),
+ 'providers': obj_to_yaml(binary.providers_supported),
},
"help_texts": {
# TODO
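obj_to_yaml() above is a small hand-rolled pretty-printer (not a YAML library); following its branches, a dict of overrides renders roughly like:

    print(obj_to_yaml({'pip': {'version': 'get_ytdlp_version'}, 'apt': {'subdeps': 'yt-dlp ffmpeg'}}))
    # pip:
    #  version: get_ytdlp_version
    # apt:
    #  subdeps: yt-dlp ffmpeg
    # (nested dicts open with a newline, so blank lines appear between top-level keys)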
diff --git a/archivebox/system.py b/archivebox/system.py
index bced0bac..58571000 100644
--- a/archivebox/system.py
+++ b/archivebox/system.py
@@ -11,13 +11,12 @@ from typing import Optional, Union, Set, Tuple
from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
from crontab import CronTab
-from .vendor.atomicwrites import atomic_write as lib_atomic_write
+from atomicwrites import atomic_write as lib_atomic_write
from .util import enforce_types, ExtendedEncoder
from .config import PYTHON_BINARY, OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
-
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
"""Patched of subprocess.run to kill forked child subprocesses and fix blocking io making timeout=innefective
Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py
diff --git a/archivebox/util.py b/archivebox/util.py
index d9dd4dbf..c96c1d1a 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -16,7 +16,7 @@ from datetime import datetime, timezone
from dateparser import parse as dateparser
from requests.exceptions import RequestException, ReadTimeout
-from .vendor.base32_crockford import encode as base32_encode # type: ignore
+from base32_crockford import encode as base32_encode # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
from os.path import lexists
from os import remove as remove_file
@@ -273,8 +273,8 @@ def get_headers(url: str, timeout: int=None) -> str:
{
'URL': url,
'Status-Code': response.status_code,
- 'Elapsed': response.elapsed,
- 'Encoding': response.encoding,
+ 'Elapsed': response.elapsed.total_seconds()*1000,
+ 'Encoding': str(response.encoding),
'Apparent-Encoding': response.apparent_encoding,
**dict(response.headers),
},
@@ -304,11 +304,7 @@ def chrome_args(**options) -> List[str]:
cmd_args += CHROME_EXTRA_ARGS
if options['CHROME_HEADLESS']:
- chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
- if chrome_major_version >= 111:
- cmd_args += ("--headless=new",)
- else:
- cmd_args += ('--headless',)
+ cmd_args += ("--headless=new",) # expects chrome version >= 111
if not options['CHROME_SANDBOX']:
# assume this means we are running inside a docker container
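The get_headers() tweak earlier in this file serializes Elapsed as float milliseconds and Encoding as str so the dict is JSON-safe; response.elapsed is a timedelta:

    from datetime import timedelta

    elapsed = timedelta(seconds=0.25)        # stand-in for response.elapsed
    print(elapsed.total_seconds() * 1000)    # 250.0, a JSON-serializable float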
diff --git a/archivebox/vendor/__init__.py b/archivebox/vendor/__init__.py
index e69de29b..e19c45af 100644
--- a/archivebox/vendor/__init__.py
+++ b/archivebox/vendor/__init__.py
@@ -0,0 +1,34 @@
+import sys
+import inspect
+import importlib
+from pathlib import Path
+
+VENDOR_DIR = Path(__file__).parent
+
+VENDORED_LIBS = {
+ # sys.path dir: library name
+ 'python-atomicwrites': 'atomicwrites',
+ 'django-taggit': 'taggit',
+ 'pydantic-pkgr': 'pydantic_pkgr',
+ 'pocket': 'pocket',
+ 'base32-crockford': 'base32_crockford',
+}
+
+def load_vendored_libs():
+ for lib_subdir, lib_name in VENDORED_LIBS.items():
+ lib_dir = VENDOR_DIR / lib_subdir
+ assert lib_dir.is_dir(), f'Expected vendor library {lib_name} to exist in {lib_dir}'
+
+ try:
+ lib = importlib.import_module(lib_name)
+ # print(f"Successfully imported lib from environment {lib_name}: {inspect.getfile(lib)}")
+ except ImportError:
+ sys.path.append(str(lib_dir))
+ try:
+ lib = importlib.import_module(lib_name)
+ # print(f"Successfully imported lib from vendored fallback {lib_name}: {inspect.getfile(lib)}")
+ except ImportError as e:
+ print(f"Failed to import lib from environment or vendored fallback {lib_name}: {e}", file=sys.stderr)
+ sys.exit(1)
+
+
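load_vendored_libs() prefers a pip-installed copy of each library and only falls back to the vendored submodule by appending its directory to sys.path. A usage sketch (call site assumed; presumably invoked once at startup):

    from archivebox.vendor import load_vendored_libs

    load_vendored_libs()
    import pydantic_pkgr   # resolved from the env, else from archivebox/vendor/pydantic-pkgr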
diff --git a/archivebox/vendor/atomicwrites.py b/archivebox/vendor/atomicwrites.py
deleted file mode 120000
index 73abfe4c..00000000
--- a/archivebox/vendor/atomicwrites.py
+++ /dev/null
@@ -1 +0,0 @@
-python-atomicwrites/atomicwrites/__init__.py
\ No newline at end of file
diff --git a/archivebox/vendor/base32_crockford.py b/archivebox/vendor/base32_crockford.py
deleted file mode 120000
index a5d9c64f..00000000
--- a/archivebox/vendor/base32_crockford.py
+++ /dev/null
@@ -1 +0,0 @@
-base32-crockford/base32_crockford.py
\ No newline at end of file
diff --git a/archivebox/vendor/package-lock.json b/archivebox/vendor/package-lock.json
deleted file mode 120000
index 322001ae..00000000
--- a/archivebox/vendor/package-lock.json
+++ /dev/null
@@ -1 +0,0 @@
-../../package-lock.json
\ No newline at end of file
diff --git a/archivebox/vendor/package.json b/archivebox/vendor/package.json
deleted file mode 120000
index 138a42cd..00000000
--- a/archivebox/vendor/package.json
+++ /dev/null
@@ -1 +0,0 @@
-../../package.json
\ No newline at end of file
diff --git a/archivebox/vendor/pocket.py b/archivebox/vendor/pocket.py
deleted file mode 120000
index 37352d27..00000000
--- a/archivebox/vendor/pocket.py
+++ /dev/null
@@ -1 +0,0 @@
-pocket/pocket.py
\ No newline at end of file
diff --git a/archivebox/vendor/pydantic-pkgr b/archivebox/vendor/pydantic-pkgr
new file mode 160000
index 00000000..2cd84453
--- /dev/null
+++ b/archivebox/vendor/pydantic-pkgr
@@ -0,0 +1 @@
+Subproject commit 2cd844533d888ce29b9bf32b8363510dd0d76166
diff --git a/archivebox/vendor/taggit_utils.py b/archivebox/vendor/taggit_utils.py
deleted file mode 120000
index f36776db..00000000
--- a/archivebox/vendor/taggit_utils.py
+++ /dev/null
@@ -1 +0,0 @@
-django-taggit/taggit/utils.py
\ No newline at end of file
diff --git a/package-lock.json b/package-lock.json
index 0645c468..7f5a2969 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -236,9 +236,9 @@
"license": "MIT"
},
"node_modules/@types/node": {
- "version": "22.5.0",
- "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
- "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
+ "version": "22.5.1",
+ "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
+ "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
"license": "MIT",
"optional": true,
"dependencies": {
@@ -353,9 +353,9 @@
}
},
"node_modules/aws4": {
- "version": "1.13.1",
- "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
- "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
+ "version": "1.13.2",
+ "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
+ "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
"license": "MIT"
},
"node_modules/b4a": {
@@ -2376,9 +2376,9 @@
}
},
"node_modules/tslib": {
- "version": "2.6.3",
- "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
- "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
+ "version": "2.7.0",
+ "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
+ "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
"license": "0BSD"
},
"node_modules/turndown": {
diff --git a/pdm.lock b/pdm.lock
index d31f5ca2..4e719c8e 100644
--- a/pdm.lock
+++ b/pdm.lock
@@ -5,7 +5,7 @@
groups = ["default", "ldap", "sonic"]
strategy = ["inherit_metadata"]
lock_version = "4.5.0"
-content_hash = "sha256:f2f7ca01f2e18a1ef07d59b7a8985d89785a4b8a2a4e66452f1f9e8e8ad529ad"
+content_hash = "sha256:c6aa1f436032d18d079a4c2e9d9b95a5110579eb96a449751bfaf4d472eba401"
[[metadata.targets]]
requires_python = "==3.10.*"
@@ -78,6 +78,29 @@ files = [
{file = "asttokens-2.4.1.tar.gz", hash = "sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0"},
]
+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+summary = "Atomic file writes."
+groups = ["default"]
+marker = "python_version == \"3.10\""
+files = [
+ {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
+ {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
+]
+
+[[package]]
+name = "base32-crockford"
+version = "0.3.0"
+summary = "A Python implementation of Douglas Crockford's base32 encoding scheme"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+files = [
+ {file = "base32-crockford-0.3.0.tar.gz", hash = "sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969"},
+ {file = "base32_crockford-0.3.0-py2.py3-none-any.whl", hash = "sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"},
+]
+
[[package]]
name = "brotli"
version = "1.1.0"
@@ -407,6 +430,21 @@ files = [
{file = "django_stubs_ext-5.0.4.tar.gz", hash = "sha256:85da065224204774208be29c7d02b4482d5a69218a728465c2fbe41725fdc819"},
]
+[[package]]
+name = "django-taggit"
+version = "1.3.0"
+requires_python = ">=3.5"
+summary = "django-taggit is a reusable Django application for simple tagging."
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+ "Django>=1.11",
+]
+files = [
+ {file = "django-taggit-1.3.0.tar.gz", hash = "sha256:4a833bf71f4c2deddd9745924eee53be1c075d7f0020a06f12e29fa3d752732d"},
+ {file = "django_taggit-1.3.0-py3-none-any.whl", hash = "sha256:609b0223d8a652f3fae088b7fd29f294fdadaca2d7931d45c27d6c59b02fdf31"},
+]
+
[[package]]
name = "exceptiongroup"
version = "1.2.2"
@@ -479,7 +517,7 @@ files = [
[[package]]
name = "httpx"
-version = "0.27.0"
+version = "0.27.2"
requires_python = ">=3.8"
summary = "The next generation HTTP client."
groups = ["default"]
@@ -492,20 +530,20 @@ dependencies = [
"sniffio",
]
files = [
- {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"},
- {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"},
+ {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"},
+ {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"},
]
[[package]]
name = "idna"
-version = "3.7"
-requires_python = ">=3.5"
+version = "3.8"
+requires_python = ">=3.6"
summary = "Internationalized Domain Names in Applications (IDNA)"
groups = ["default"]
marker = "python_version == \"3.10\""
files = [
- {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
- {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
+ {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"},
+ {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"},
]
[[package]]
@@ -613,6 +651,32 @@ files = [
{file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
]
+[[package]]
+name = "pocket"
+version = "0.3.7"
+git = "https://github.com/tapanpandita/pocket.git"
+ref = "v0.3.7"
+revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1"
+summary = "api wrapper for getpocket.com"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+ "requests",
+]
+
+[[package]]
+name = "pocket"
+version = "0.3.7"
+git = "https://github.com/tapanpandita/pocket.git"
+ref = "v0.3.7"
+revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1"
+summary = "api wrapper for getpocket.com"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+ "requests",
+]
+
[[package]]
name = "prompt-toolkit"
version = "3.0.47"
@@ -739,6 +803,23 @@ files = [
{file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"},
]
+[[package]]
+name = "pydantic-pkgr"
+version = "0.1.4"
+requires_python = ">=3.10"
+summary = "System package manager APIs in strongly typed Python"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+ "pydantic-core>=2.18.2",
+ "pydantic>=2.7.1",
+ "typing-extensions>=4.11.0",
+]
+files = [
+ {file = "pydantic_pkgr-0.1.4-py3-none-any.whl", hash = "sha256:bd9ddfa8eeb4d361257c4d3d8d36ba44a72515b497ee52cf0763240c66006417"},
+ {file = "pydantic_pkgr-0.1.4.tar.gz", hash = "sha256:e0422022dd83341f1e869a54da9aca903a6407a983ece0735f69493841b0fbb8"},
+]
+
[[package]]
name = "pygments"
version = "2.18.0"
@@ -841,14 +922,14 @@ files = [
[[package]]
name = "setuptools"
-version = "73.0.1"
+version = "74.0.0"
requires_python = ">=3.8"
summary = "Easily download, build, install, upgrade, and uninstall Python packages"
groups = ["default"]
marker = "python_version == \"3.10\""
files = [
- {file = "setuptools-73.0.1-py3-none-any.whl", hash = "sha256:b208925fcb9f7af924ed2dc04708ea89791e24bde0d3020b27df0e116088b34e"},
- {file = "setuptools-73.0.1.tar.gz", hash = "sha256:d59a3e788ab7e012ab2c4baed1b376da6366883ee20d7a5fc426816e3d7b1193"},
+ {file = "setuptools-74.0.0-py3-none-any.whl", hash = "sha256:0274581a0037b638b9fc1c6883cc71c0210865aaa76073f7882376b641b84e8f"},
+ {file = "setuptools-74.0.0.tar.gz", hash = "sha256:a85e96b8be2b906f3e3e789adec6a9323abf79758ecfa3065bd740d81158b11e"},
]
[[package]]
diff --git a/pyproject.toml b/pyproject.toml
index ebeccd59..6d3f8521 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,12 +29,9 @@ dependencies = [
"croniter>=2.0.5", # for: archivebox schedule
"ipython>=8.23.0", # for: archivebox shell
# Extractor Dependencies
- "yt-dlp>=2024.4.9", # for: media
+ "yt-dlp>=2024.8.6", # for: media
# "playwright>=1.43.0; platform_machine != 'armv7l'", # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
- # TODO: add more extractors
- # - gallery-dl
- # - scihubdl
- # - See Github issues for more...
+
"django-signal-webhooks>=0.3.0",
"django-admin-data-views>=0.3.1",
"ulid-py>=1.1.0",
@@ -43,6 +40,14 @@ dependencies = [
"django-pydantic-field>=0.3.9",
"django-jsonform>=2.22.0",
"django-stubs>=5.0.2",
+
+ # these can be safely omitted when installation subsystem does not provide these as packages (e.g. apt/debian)
+ # archivebox will automatically load fallback vendored copies bundled via archivebox/vendor/__init__.py
+ "pydantic-pkgr>=0.1.4",
+ "atomicwrites==1.4.0",
+ "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7",
+ "django-taggit==1.3.0",
+ "base32-crockford==0.3.0",
]
homepage = "https://github.com/ArchiveBox/ArchiveBox"
@@ -139,7 +144,7 @@ exclude = [
"**/migrations",
"archivebox/vendor",
]
-stubPath = "./typings"
+stubPath = "./archivebox/typings"
venvPath = "."
venv = ".venv"
# ignore = ["src/oldstuff"]
@@ -169,6 +174,9 @@ debug = [
"djdt_flamegraph",
"ipdb",
"requests-tracker>=0.3.3",
+ "logfire[django]>=0.51.0",
+ "opentelemetry-instrumentation-django>=0.47b0",
+ "opentelemetry-instrumentation-sqlite3>=0.47b0",
]
test = [
"pytest",
@@ -177,8 +185,6 @@ test = [
lint = [
"flake8",
"mypy",
-]
-dev = [
"django-autotyping>=0.5.1",
]
diff --git a/requirements.txt b/requirements.txt
index c464bf68..0ee4c4f8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,8 @@ annotated-types==0.7.0; python_version == "3.10"
anyio==4.4.0; python_version == "3.10"
asgiref==3.8.1; python_version == "3.10"
asttokens==2.4.1; python_version == "3.10"
+atomicwrites==1.4.0; python_version == "3.10"
+base32-crockford==0.3.0; python_version == "3.10"
brotli==1.1.0; implementation_name == "cpython" and python_version == "3.10"
brotlicffi==1.1.0.0; implementation_name != "cpython" and python_version == "3.10"
certifi==2024.7.4; python_version == "3.10"
@@ -26,13 +28,14 @@ django-settings-holder==0.1.2; python_version == "3.10"
django-signal-webhooks==0.3.0; python_version == "3.10"
django-stubs==5.0.4; python_version == "3.10"
django-stubs-ext==5.0.4; python_version == "3.10"
+django-taggit==1.3.0; python_version == "3.10"
exceptiongroup==1.2.2; python_version == "3.10"
executing==2.0.1; python_version == "3.10"
feedparser==6.0.11; python_version == "3.10"
h11==0.14.0; python_version == "3.10"
httpcore==1.0.5; python_version == "3.10"
-httpx==0.27.0; python_version == "3.10"
-idna==3.7; python_version == "3.10"
+httpx==0.27.2; python_version == "3.10"
+idna==3.8; python_version == "3.10"
ipython==8.26.0; python_version == "3.10"
jedi==0.19.1; python_version == "3.10"
matplotlib-inline==0.1.7; python_version == "3.10"
@@ -40,6 +43,7 @@ mutagen==1.47.0; python_version == "3.10"
mypy-extensions==1.0.0; python_version == "3.10"
parso==0.8.4; python_version == "3.10"
pexpect==4.9.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
+pocket @ git+https://github.com/tapanpandita/pocket.git@5a144438cc89bfc0ec94db960718ccf1f76468c1 ; python_version == "3.10"
prompt-toolkit==3.0.47; python_version == "3.10"
ptyprocess==0.7.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
pure-eval==0.2.3; python_version == "3.10"
@@ -49,6 +53,7 @@ pycparser==2.22; platform_python_implementation != "PyPy" and python_version ==
pycryptodomex==3.20.0; python_version == "3.10"
pydantic==2.8.2; python_version == "3.10"
pydantic-core==2.20.1; python_version == "3.10"
+pydantic-pkgr==0.1.4; python_version == "3.10"
pygments==2.18.0; python_version == "3.10"
python-crontab==3.2.0; python_version == "3.10"
python-dateutil==2.9.0.post0; python_version == "3.10"
@@ -56,7 +61,7 @@ python-ldap==3.4.4; python_version == "3.10"
pytz==2024.1; python_version == "3.10"
regex==2024.7.24; python_version == "3.10"
requests==2.32.3; python_version == "3.10"
-setuptools==73.0.1; python_version == "3.10"
+setuptools==74.0.0; python_version == "3.10"
sgmllib3k==1.0.0; python_version == "3.10"
six==1.16.0; python_version == "3.10"
sniffio==1.3.1; python_version == "3.10"