diff --git a/.gitmodules b/.gitmodules index 196c9a92..7b72ad6c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -26,3 +26,6 @@ [submodule "archivebox/vendor/python-atomicwrites"] path = archivebox/vendor/python-atomicwrites url = https://github.com/untitaker/python-atomicwrites +[submodule "archivebox/vendor/pydantic-pkgr"] + path = archivebox/vendor/pydantic-pkgr + url = https://github.com/ArchiveBox/pydantic-pkgr diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py index e5502cea..4e25ac0a 100644 --- a/archivebox/abid_utils/models.py +++ b/archivebox/abid_utils/models.py @@ -61,6 +61,14 @@ def get_or_create_system_user_pk(username='system'): return user.pk +class AutoDateTimeField(models.DateTimeField): + def pre_save(self, model_instance, add): + # only auto-set the timestamp on the initial INSERT, never overwrite it on UPDATE + if add: + return timezone.now() + return super().pre_save(model_instance, add) + + class ABIDModel(models.Model): """ Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface. """ @@ -76,13 +81,16 @@ class ABIDModel(models.Model): abid = ABIDField(prefix=abid_prefix) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk) - created = models.DateTimeField(auto_now_add=True) + created = AutoDateTimeField(default=timezone.now, db_index=True) modified = models.DateTimeField(auto_now=True) class Meta(TypedModelMeta): abstract = True def save(self, *args: Any, **kwargs: Any) -> None: + if self._state.adding or not self.created: + self.created = timezone.now() + # when first creating a row, self.ABID is the source of truth # overwrite default prefilled self.id & self.abid with generated self.ABID value if self._state.adding or not self.id: @@ -93,6 +101,7 @@ class ABIDModel(models.Model): super().save(*args, **kwargs) assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}' assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}' + assert str(self.uuid) == str(self.ABID.uuid), f'self.uuid ({self.uuid}) does not match .ABID.uuid ({self.ABID.uuid})' @property def abid_values(self) -> Dict[str, Any]: @@ -186,6 +195,14 @@ class ABIDModel(models.Model): Get a uuid.UUID (v4) representation of the object's ABID. """ return self.ABID.uuid + + @property + def uuid(self) -> str: + """ + Get the uuid.UUID (v4) representation of the object's ABID, as a str. + """ + assert str(self.id) == str(self.ABID.uuid) + return str(self.id) @property def TypeID(self) -> TypeID: diff --git a/archivebox/builtin_plugins/__init__.py b/archivebox/builtin_plugins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/builtin_plugins/base/__init__.py b/archivebox/builtin_plugins/base/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/builtin_plugins/base/admin.py b/archivebox/builtin_plugins/base/admin.py new file mode 100644 index 00000000..8c38f3f3 --- /dev/null +++ b/archivebox/builtin_plugins/base/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here.
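Note on the `created = AutoDateTimeField(default=timezone.now, db_index=True)` change above: Django's `auto_now_add=True` silently discards any value passed in at creation time and marks the field non-editable, which makes it impossible to import records with historical timestamps. A `default=timezone.now` field keeps the auto-population while still honoring explicit values. A minimal sketch of the difference, using a hypothetical `Example` model that is not part of this diff:

```python
from django.db import models
from django.utils import timezone


class Example(models.Model):
    # auto_now_add replaces whatever value was passed in with now() on INSERT:
    stamped = models.DateTimeField(auto_now_add=True)
    # a plain default is only used when no value is supplied, so explicit
    # (e.g. imported/historical) timestamps are preserved:
    created = models.DateTimeField(default=timezone.now, db_index=True)

# Example.objects.create(stamped=old_dt, created=old_dt)
#   -> stamped == now()   (old_dt is silently ignored)
#   -> created == old_dt  (honored)
```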
diff --git a/archivebox/builtin_plugins/base/apps.py b/archivebox/builtin_plugins/base/apps.py new file mode 100644 index 00000000..291bbe50 --- /dev/null +++ b/archivebox/builtin_plugins/base/apps.py @@ -0,0 +1,76 @@ +import sys +import inspect +from typing import List, Dict, Any, Optional +from pathlib import Path + +import django +from django.apps import AppConfig +from django.core.checks import Tags, Warning, register +from django.db.backends.sqlite3.base import Database as sqlite3 + +from pydantic import ( + Field, + SerializeAsAny, ) + +from pydantic_pkgr import SemVer, BinProvider, BinProviderName, ProviderLookupDict, BinName, Binary, EnvProvider, NpmProvider + +from plugantic.extractors import Extractor, ExtractorName +from plugantic.plugins import Plugin +from plugantic.configs import ConfigSet, ConfigSectionName +from plugantic.replayers import Replayer + + +class PythonBinary(Binary): + name: BinName = 'python' + + providers_supported: List[BinProvider] = [EnvProvider()] + provider_overrides: Dict[BinProviderName, ProviderLookupDict] = { + 'env': { + 'subdeps': lambda: 'python3 python3-minimal python3-pip python3-virtualenv', + 'abspath': lambda: sys.executable, + 'version': lambda: '{}.{}.{}'.format(*sys.version_info[:3]), + }, + } + +class SqliteBinary(Binary): + name: BinName = 'sqlite' + providers_supported: List[BinProvider] = [EnvProvider()] + provider_overrides: Dict[BinProviderName, ProviderLookupDict] = { + 'env': { + 'abspath': lambda: Path(inspect.getfile(sqlite3)), + 'version': lambda: SemVer(sqlite3.version), + }, + } + + +class DjangoBinary(Binary): + name: BinName = 'django' + + providers_supported: List[BinProvider] = [EnvProvider()] + provider_overrides: Dict[BinProviderName, ProviderLookupDict] = { + 'env': { + 'abspath': lambda: inspect.getfile(django), + 'version': lambda: '{}.{}.{}'.format(*django.VERSION[:3]), + }, + } + + +class BasicReplayer(Replayer): + name: str = 'basic' + + +class BasePlugin(Plugin): + name: str = 'base' + configs: List[SerializeAsAny[ConfigSet]] = [] + binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()] + extractors: List[SerializeAsAny[Extractor]] = [] + replayers: List[SerializeAsAny[Replayer]] = [BasicReplayer()] + + +PLUGINS = [BasePlugin()] + + +class BaseConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'builtin_plugins.base' diff --git a/archivebox/builtin_plugins/base/migrations/__init__.py b/archivebox/builtin_plugins/base/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/builtin_plugins/base/models.py b/archivebox/builtin_plugins/base/models.py new file mode 100644 index 00000000..71a83623 --- /dev/null +++ b/archivebox/builtin_plugins/base/models.py @@ -0,0 +1,3 @@ +from django.db import models + +# Create your models here. diff --git a/archivebox/builtin_plugins/base/tests.py b/archivebox/builtin_plugins/base/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/archivebox/builtin_plugins/base/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/archivebox/builtin_plugins/base/views.py b/archivebox/builtin_plugins/base/views.py new file mode 100644 index 00000000..91ea44a2 --- /dev/null +++ b/archivebox/builtin_plugins/base/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here.
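The three `Binary` subclasses above all follow the same pydantic-pkgr pattern: declare which `BinProvider`s can supply the binary, then override individual lookups per provider with zero-argument callables that are evaluated lazily. A minimal sketch of declaring one more binary the same way (`GitBinary` is hypothetical and not part of this diff; how the resolved binary later gets used or installed is out of scope here):

```python
import shutil
from typing import Dict, List

from pydantic_pkgr import (
    Binary,
    BinName,
    BinProvider,
    BinProviderName,
    EnvProvider,
    ProviderLookupDict,
)


class GitBinary(Binary):
    name: BinName = 'git'

    # only look for git on $PATH; other plugins also use NpmProvider, etc.
    providers_supported: List[BinProvider] = [EnvProvider()]

    # per-provider overrides: each value is a zero-arg callable, so nothing
    # runs at import time, only when the lookup is actually performed
    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
        'env': {
            'abspath': lambda: shutil.which('git'),
        },
    }
```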
diff --git a/archivebox/builtin_plugins/singlefile/__init__.py b/archivebox/builtin_plugins/singlefile/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/builtin_plugins/singlefile/apps.py b/archivebox/builtin_plugins/singlefile/apps.py new file mode 100644 index 00000000..1d40e8a7 --- /dev/null +++ b/archivebox/builtin_plugins/singlefile/apps.py @@ -0,0 +1,113 @@ +from typing import List, Optional, Dict +from pathlib import Path + +from django.apps import AppConfig +from django.core.checks import Tags, Warning, register + +from pydantic import ( + Field, + SerializeAsAny, ) + +from pydantic_pkgr import BinProvider, BinName, Binary, EnvProvider, NpmProvider +from pydantic_pkgr.binprovider import bin_abspath +from pydantic_pkgr.binary import BinProviderName, ProviderLookupDict + +from plugantic.extractors import Extractor, ExtractorName +from plugantic.plugins import Plugin +from plugantic.configs import ConfigSet, ConfigSectionName + +from pkg.settings import env + + +###################### Config ########################## + +class SinglefileToggleConfig(ConfigSet): + section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES' + + SAVE_SINGLEFILE: bool = True + + +class SinglefileDependencyConfig(ConfigSet): + section: ConfigSectionName = 'DEPENDENCY_CONFIG' + + SINGLEFILE_BINARY: str = Field(default='single-file') + SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None) + SINGLEFILE_EXTRA_ARGS: List[str] = [] + SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] + +class SinglefileOptionsConfig(ConfigSet): + section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS' + + # loaded from shared config + SINGLEFILE_USER_AGENT: str = Field(default='', alias='USER_AGENT') + SINGLEFILE_TIMEOUT: int = Field(default=60, alias='TIMEOUT') + SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY') + SINGLEFILE_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES') + SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE') + + +DEFAULT_CONFIG = { + 'CHECK_SSL_VALIDITY': False, + 'SAVE_SINGLEFILE': True, + 'TIMEOUT': 120, +} + +PLUGIN_CONFIG = [ + SinglefileToggleConfig(**DEFAULT_CONFIG), + SinglefileDependencyConfig(**DEFAULT_CONFIG), + SinglefileOptionsConfig(**DEFAULT_CONFIG), +] + +###################### Binaries ############################ + +min_version: str = "1.1.54" +max_version: str = "2.0.0" + +class SinglefileBinary(Binary): + name: BinName = 'single-file' + providers_supported: List[BinProvider] = [NpmProvider()] + + + provider_overrides: Dict[BinProviderName, ProviderLookupDict] = { + 'env': { + 'abspath': lambda: bin_abspath('single-file-node.js', PATH=env.PATH) or bin_abspath('single-file', PATH=env.PATH), + }, + 'npm': { + # 'abspath': lambda: bin_abspath('single-file', PATH=NpmProvider().PATH) or bin_abspath('single-file', PATH=env.PATH), + 'subdeps': lambda: f'single-file-cli@>={min_version} <{max_version}', + }, + } + + +###################### Extractors ########################## + +class SinglefileExtractor(Extractor): + name: ExtractorName = 'singlefile' + binary: Binary = SinglefileBinary() + + def get_output_path(self, snapshot) -> Path: + return Path(snapshot.link_dir) / 'singlefile.html' + + +###################### Plugins ############################# + + +class SinglefilePlugin(Plugin): + name: str = 'singlefile' + configs: List[SerializeAsAny[ConfigSet]] = [*PLUGIN_CONFIG] + binaries: List[SerializeAsAny[Binary]] = [SinglefileBinary()] + extractors: 
List[SerializeAsAny[Extractor]] = [SinglefileExtractor()] + +PLUGINS = [SinglefilePlugin()] + +###################### Django Apps ######################### + +class SinglefileConfig(AppConfig): + name = 'builtin_plugins.singlefile' + verbose_name = 'SingleFile' + + def ready(self): + pass + # print('Loaded singlefile plugin') diff --git a/archivebox/builtin_plugins/singlefile/config.yaml b/archivebox/builtin_plugins/singlefile/config.yaml new file mode 100644 index 00000000..b4d80f06 --- /dev/null +++ b/archivebox/builtin_plugins/singlefile/config.yaml @@ -0,0 +1,66 @@ +name: singlefile +plugin_version: '0.0.1' +plugin_spec: '0.0.1' + +binaries: + singlefile: + providers: + - env + - npm + +commands: + - singlefile.exec + - singlefile.extract + - singlefile.should_extract + - singlefile.get_output_path + +extractors: + singlefile: + binary: singlefile + test: singlefile.should_extract + extract: singlefile.extract + output_files: + - singlefile.html + +configs: + ARCHIVE_METHOD_TOGGLES: + SAVE_SINGLEFILE: + type: bool + default: true + + DEPENDENCY_CONFIG: + SINGLEFILE_BINARY: + type: str + default: single-file + SINGLEFILE_ARGS: + type: Optional[List[str]] + default: null + SINGLEFILE_EXTRA_ARGS: + type: List[str] + default: [] + SINGLEFILE_DEFAULT_ARGS: + type: List[str] + default: + - "--timeout={TIMEOUT-10}" + + ARCHIVE_METHOD_OPTIONS: + SINGLEFILE_USER_AGENT: + type: str + default: "" + alias: USER_AGENT + SINGLEFILE_TIMEOUT: + type: int + default: 60 + alias: TIMEOUT + SINGLEFILE_CHECK_SSL_VALIDITY: + type: bool + default: true + alias: CHECK_SSL_VALIDITY + SINGLEFILE_RESTRICT_FILE_NAMES: + type: str + default: windows + alias: RESTRICT_FILE_NAMES + SINGLEFILE_COOKIES_FILE: + type: Optional[Path] + default: null + alias: COOKIES_FILE diff --git a/archivebox/builtin_plugins/singlefile/tests.py b/archivebox/builtin_plugins/singlefile/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/archivebox/builtin_plugins/singlefile/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/archivebox/config.py b/archivebox/config.py index de086304..8d4a0695 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -31,8 +31,6 @@ import getpass import platform import shutil import requests -import django -from sqlite3 import dbapi2 as sqlite3 from hashlib import md5 from pathlib import Path @@ -43,6 +41,11 @@ from configparser import ConfigParser from collections import defaultdict import importlib.metadata +from pydantic_pkgr import SemVer + +import django +from django.db.backends.sqlite3.base import Database as sqlite3 + from .config_stubs import ( AttrDict, SimpleConfigValueDict, @@ -52,6 +55,11 @@ from .config_stubs import ( ConfigDefaultDict, ) + +# load fallback libraries from vendor dir +from .vendor import load_vendored_libs +load_vendored_libs() + + ############################### Config Schema ################################## @@ -89,13 +97,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SECRET_KEY': {'type': str, 'default': None}, 'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]}, 'ALLOWED_HOSTS': {'type': str, 'default': '*'}, # e.g. archivebox.example.com,archivebox2.example.com - 'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': ''}, # e.g. 
https://archivebox.example.com,https://archivebox2.example.com:8080 + 'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c['BIND_ADDR'])}, # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080 'DEBUG': {'type': bool, 'default': False}, 'PUBLIC_INDEX': {'type': bool, 'default': True}, 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'}, - 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, + 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 100}, 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}, 'TIME_ZONE': {'type': str, 'default': 'UTC'}, 'TIMEZONE': {'type': str, 'default': 'UTC'}, @@ -565,7 +573,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])}, 'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)}, - 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)}, + 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])}, 'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)}, 'SQLITE_VERSION': {'default': lambda c: sqlite3.version}, @@ -902,16 +910,9 @@ def bin_version(binary: Optional[str], cmd: Optional[str]=None) -> Optional[str] version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT).stdout.strip().decode() # take first 3 columns of first line of version info - version_ptn = re.compile(r"\d+?\.\d+?\.?\d*", re.MULTILINE) - try: - version_nums = version_ptn.findall(version_str.split('\n')[0])[0] - if version_nums: - return version_nums - else: - raise IndexError - except IndexError: - # take first 3 columns of first line of version info - return ' '.join(version_str.split('\n')[0].strip().split()[:3]) + semver = SemVer.parse(version_str) + if semver: + return str(semver) except OSError: pass # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red') @@ -1524,5 +1525,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, assert sql_index_path.exists(), ( f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)') + + # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging + if settings.DEBUG_LOGFIRE: + from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor + SQLite3Instrumentor().instrument() + + import logfire + + logfire.configure() + logfire.instrument_django(is_sql_commentor_enabled=True) + logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv) + except KeyboardInterrupt: raise SystemExit(2) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index b87f6874..29463623 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -10,12 +10,15 @@ from datetime import datetime, timezone from typing import Dict, Any from django.contrib import admin -from django.db.models import Count, Q -from django.urls import path, reverse +from django.db.models import Count, Q, Prefetch +from django.urls import path, reverse, resolve +from django.utils import timezone +from django.utils.functional import cached_property from django.utils.html import format_html from django.utils.safestring import mark_safe from 
django.shortcuts import render, redirect from django.contrib.auth import get_user_model +from django.core.paginator import Paginator from django.core.exceptions import ValidationError from django.conf import settings from django import forms @@ -126,22 +129,99 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin) +class AccelleratedPaginator(Paginator): + """ + Accelerated Paginator that ignores DISTINCT when counting the total number of rows. + Speeds up SELECT Count(*) on Admin views by >20x. + https://hakibenita.com/optimizing-the-django-admin-paginator + """ + + @cached_property + def count(self): + if self.object_list._has_filters(): # type: ignore + # fallback to normal count method on filtered queryset + return super().count + else: + # otherwise count total rows in a separate fast query + return self.object_list.model.objects.count() + + # Alternative approach for PostgreSQL: fallback count takes > 200ms + # from django.db import connection, transaction, OperationalError + # with transaction.atomic(), connection.cursor() as cursor: + # cursor.execute('SET LOCAL statement_timeout TO 200;') + # try: + # return super().count + # except OperationalError: + # return 9999999999999 + + class ArchiveResultInline(admin.TabularInline): name = 'Archive Results Log' model = ArchiveResult + parent_model = Snapshot # fk_name = 'snapshot' - extra = 1 - readonly_fields = ('result_id', 'start_ts', 'end_ts', 'extractor', 'command', 'cmd_version') - fields = ('id', *readonly_fields, 'status', 'output') + extra = 0 + sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version') + readonly_fields = ('result_id', 'completed', 'extractor', 'command', 'version') + fields = ('id', 'start_ts', 'end_ts', *readonly_fields, 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output') + # exclude = ('id',) + ordering = ('end_ts',) show_change_link = True # # classes = ['collapse'] # # list_display_links = ['abid'] + def get_parent_object_from_request(self, request): + resolved = resolve(request.path_info) + return self.parent_model.objects.get(pk=resolved.kwargs['object_id']) + + @admin.display( + description='Completed', + ordering='end_ts', + ) + def completed(self, obj): + return format_html('
{}
', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S')) + def result_id(self, obj): - return format_html('[{}]', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid) + return format_html('[{}]', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid) def command(self, obj): return format_html('{}', " ".join(obj.cmd or [])) + + def version(self, obj): + return format_html('{}', obj.cmd_version or '-') + + def get_formset(self, request, obj=None, **kwargs): + formset = super().get_formset(request, obj, **kwargs) + snapshot = self.get_parent_object_from_request(request) + + # import ipdb; ipdb.set_trace() + formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget() + + # default values for new entries + formset.form.base_fields['status'].initial = 'succeeded' + formset.form.base_fields['start_ts'].initial = timezone.now() + formset.form.base_fields['end_ts'].initial = timezone.now() + formset.form.base_fields['cmd_version'].initial = '-' + formset.form.base_fields['pwd'].initial = str(snapshot.link_dir) + formset.form.base_fields['created_by'].initial = request.user + formset.form.base_fields['cmd'] = forms.JSONField(initial=['-']) + formset.form.base_fields['output'].initial = 'Manually recorded cmd output...' + + if obj is not None: + # hidden values for existing entries and new entries + formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget() + formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget() + formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget() + formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget() + formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget() + formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget() + return formset + + def get_readonly_fields(self, request, obj=None): + if obj is not None: + return self.readonly_fields + else: + return [] class TagInline(admin.TabularInline): @@ -222,25 +302,22 @@ def get_abid_info(self, obj): @admin.register(Snapshot, site=archivebox_admin) class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): - class Meta: - model = Snapshot - list_display = ('added', 'title_str', 'files', 'size', 'url_str') - # list_editable = ('title',) sort_fields = ('title_str', 'url_str', 'added', 'files') - readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir') + readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir') search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name') - list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags') + list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name') fields = ('url', 'created_by', 'title', *readonly_fields) ordering = ['-added'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] - autocomplete_fields = ['tags'] inlines = [TagInline, ArchiveResultInline] - list_per_page = CONFIG.SNAPSHOTS_PER_PAGE + list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000) action_form = SnapshotActionForm + paginator = AccelleratedPaginator save_on_top = True + 
show_full_result_count = False def changelist_view(self, request, extra_context=None): extra_context = extra_context or {} @@ -286,12 +363,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): ] return custom_urls + urls - def get_queryset(self, request): - self.request = request - return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult')) + # def get_queryset(self, request): + # # tags_qs = SnapshotTag.objects.all().select_related('tag') + # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs) + + # self.request = request + # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult')) def tag_list(self, obj): - return ', '.join(obj.tags.values_list('name', flat=True)) + return ', '.join(tag.name for tag in obj.tags.all()) # TODO: figure out a different way to do this, you cant nest forms so this doenst work # def action(self, obj): @@ -360,21 +440,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): ordering='title', ) def title_str(self, obj): - canon = obj.as_link().canonical_outputs() tags = ''.join( - format_html('{} ', tag.id, tag) + format_html('{} ', tag.pk, tag.name) for tag in obj.tags.all() - if str(tag).strip() + if str(tag.name).strip() ) return format_html( '' - '' + '' '' '' '{}' '', obj.archive_path, - obj.archive_path, canon['favicon_path'], + obj.archive_path, obj.archive_path, 'fetched' if obj.latest_title or obj.title else 'pending', urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...' @@ -382,14 +461,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): @admin.display( description='Files Saved', - ordering='archiveresult_count', + # ordering='archiveresult_count', ) def files(self, obj): return snapshot_icons(obj) @admin.display( - ordering='archiveresult_count' + # ordering='archiveresult_count' ) def size(self, obj): archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size @@ -536,6 +615,8 @@ class TagAdmin(ABIDModelAdmin): actions = ['delete_selected'] ordering = ['-created'] + paginator = AccelleratedPaginator + def API(self, obj): try: return get_abid_info(self, obj) @@ -574,6 +655,8 @@ class ArchiveResultAdmin(ABIDModelAdmin): list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') ordering = ['-start_ts'] list_per_page = CONFIG.SNAPSHOTS_PER_PAGE + + paginator = AccelleratedPaginator @admin.display( description='Snapshot Info' diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 193c0d05..3a64eb45 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -4,7 +4,7 @@ from django import forms from ..util import URL_REGEX from ..parsers import PARSERS -from ..vendor.taggit_utils import edit_string_for_tags, parse_tags +from taggit.utils import edit_string_for_tags, parse_tags PARSER_CHOICES = [ (parser_key, parser[0]) diff --git a/archivebox/core/migrations/0027_update_snapshot_ids.py b/archivebox/core/migrations/0027_update_snapshot_ids.py index ad197c04..6b8dcf4a 100644 --- a/archivebox/core/migrations/0027_update_snapshot_ids.py +++ b/archivebox/core/migrations/0027_update_snapshot_ids.py @@ -52,7 +52,7 @@ def update_snapshot_ids(apps, schema_editor): Snapshot = apps.get_model("core", "Snapshot") num_total = Snapshot.objects.all().count() print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...') - for idx, snapshot in 
enumerate(Snapshot.objects.all().only('abid').iterator()): + for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)): assert snapshot.abid snapshot.abid_prefix = 'snp_' snapshot.abid_ts_src = 'self.added' @@ -72,7 +72,7 @@ def update_archiveresult_ids(apps, schema_editor): ArchiveResult = apps.get_model("core", "ArchiveResult") num_total = ArchiveResult.objects.all().count() print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)') - for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()): + for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)): assert result.abid result.abid_prefix = 'res_' result.snapshot = Snapshot.objects.get(pk=result.snapshot_id) diff --git a/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py index 121a2154..dd6da1f5 100644 --- a/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py +++ b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py @@ -11,7 +11,7 @@ def update_archiveresult_ids(apps, schema_editor): ArchiveResult = apps.get_model("core", "ArchiveResult") num_total = ArchiveResult.objects.all().count() print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)') - for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()): + for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)): assert result.abid result.uuid = ABID.parse(result.abid).uuid result.save(update_fields=["uuid"]) diff --git a/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py b/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py index ddb7afbb..9866f69c 100644 --- a/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py +++ b/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py @@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor): SnapshotTag = apps.get_model("core", "SnapshotTag") num_total = SnapshotTag.objects.all().count() print(f' Updating {num_total} SnapshotTag.snapshot_id values in place... 
(may take an hour or longer for large collections...)') - for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()): + for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)): assert snapshottag.snapshot_old_id snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id) snapshottag.snapshot_id = snapshot.id diff --git a/archivebox/core/migrations/0059_tag_id.py b/archivebox/core/migrations/0059_tag_id.py index f09e9ffb..a81e022f 100644 --- a/archivebox/core/migrations/0059_tag_id.py +++ b/archivebox/core/migrations/0059_tag_id.py @@ -49,7 +49,7 @@ def update_archiveresult_ids(apps, schema_editor): Tag = apps.get_model("core", "Tag") num_total = Tag.objects.all().count() print(f' Updating {num_total} Tag.id, ArchiveResult.uuid values in place...') - for idx, tag in enumerate(Tag.objects.all().iterator()): + for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)): if not tag.slug: tag.slug = tag.name.lower().replace(' ', '_') if not tag.name: diff --git a/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py b/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py index 6c574669..bb067acf 100644 --- a/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py +++ b/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py @@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor): SnapshotTag = apps.get_model("core", "SnapshotTag") num_total = SnapshotTag.objects.all().count() print(f' Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)') - for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()): + for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)): assert snapshottag.old_tag_id tag = Tag.objects.get(old_id=snapshottag.old_tag_id) snapshottag.tag_id = tag.id diff --git a/archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py b/archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py new file mode 100644 index 00000000..fcc9b0aa --- /dev/null +++ b/archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py @@ -0,0 +1,35 @@ +# Generated by Django 5.1 on 2024-08-28 09:40 + +import abid_utils.models +import django.utils.timezone +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0068_alter_archiveresult_options'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='created', + field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='snapshot', + name='added', + field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='snapshot', + name='created', + field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='tag', + name='created', + field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index c9266bd9..a362bdae 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -12,6 +12,7 @@ from uuid 
import uuid4 from pathlib import Path from django.db import models +from django.utils import timezone from django.utils.functional import cached_property from django.utils.text import slugify from django.core.cache import cache @@ -19,7 +20,7 @@ from django.urls import reverse, reverse_lazy from django.db.models import Case, When, Value, IntegerField from django.conf import settings -from abid_utils.models import ABIDModel, ABIDField +from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField from ..system import get_dir_size from ..util import parse_date, base_url @@ -50,7 +51,7 @@ class Tag(ABIDModel): Based on django-taggit model + ABID base. """ abid_prefix = 'tag_' - abid_ts_src = 'self.created' # TODO: add created/modified time + abid_ts_src = 'self.created' abid_uri_src = 'self.slug' abid_subtype_src = '"03"' abid_rand_src = 'self.old_id' @@ -60,7 +61,6 @@ class Tag(ABIDModel): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True) abid = ABIDField(prefix=abid_prefix) - name = models.CharField(unique=True, blank=False, max_length=100) slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False) # slug is autoset on save from name, never set it manually @@ -125,6 +125,12 @@ class SnapshotTag(models.Model): db_table = 'core_snapshot_tags' unique_together = [('snapshot', 'tag')] + +class SnapshotManager(models.Manager): + def get_queryset(self): + return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct() + + class Snapshot(ABIDModel): abid_prefix = 'snp_' abid_ts_src = 'self.added' @@ -143,16 +149,15 @@ class Snapshot(ABIDModel): tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag')) - added = models.DateTimeField(auto_now_add=True, db_index=True) + added = AutoDateTimeField(default=timezone.now, db_index=True) updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True) keys = ('url', 'timestamp', 'title', 'tags', 'updated') archiveresult_set: models.Manager['ArchiveResult'] - @property - def uuid(self): - return self.id + objects = SnapshotManager() + def __repr__(self) -> str: title = (self.title_stripped or '-')[:64] @@ -162,13 +167,6 @@ class Snapshot(ABIDModel): title = (self.title_stripped or '-')[:64] return f'[{self.timestamp}] {self.url[:64]} ({title})' - def save(self, *args, **kwargs): - super().save(*args, **kwargs) - try: - assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})' - except AssertionError as e: - print(e) - @classmethod def from_json(cls, info: dict): info = {k: v for k, v in info.items() if k in cls.keys} @@ -177,8 +175,7 @@ class Snapshot(ABIDModel): def as_json(self, *args) -> dict: args = args or self.keys return { - key: getattr(self, key) - if key != 'tags' else self.tags_str() + key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False) for key in args } @@ -190,8 +187,14 @@ class Snapshot(ABIDModel): return load_link_details(self.as_link()) def tags_str(self, nocache=True) -> str | None: + calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all())) cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags' - calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) + + if hasattr(self, '_prefetched_objects_cache') and 'tags' in 
self._prefetched_objects_cache: + # tags are pre-fetched already, use them directly (best because db is always freshest) + tags_str = calc_tags_str() + return tags_str + if nocache: tags_str = calc_tags_str() cache.set(cache_key, tags_str) @@ -234,7 +237,10 @@ class Snapshot(ABIDModel): @cached_property def num_outputs(self) -> int: - return self.archiveresult_set.filter(status='succeeded').count() + # DONT DO THIS: it will trigger a separate query for every snapshot + # return self.archiveresult_set.filter(status='succeeded').count() + # this is better: + return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded')) @cached_property def base_url(self): @@ -262,10 +268,21 @@ class Snapshot(ABIDModel): @cached_property def thumbnail_url(self) -> Optional[str]: - result = self.archiveresult_set.filter( - extractor='screenshot', - status='succeeded' - ).only('output').last() + if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache: + result = (sorted( + ( + result + for result in self.archiveresult_set.all() + if result.extractor == 'screenshot' and result.status =='succeeded' and result.output + ), + key=lambda result: result.created, + ) or [None])[-1] + else: + result = self.archiveresult_set.filter( + extractor='screenshot', + status='succeeded' + ).only('output').last() + if result: return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}']) return None @@ -292,6 +309,21 @@ class Snapshot(ABIDModel): if self.title: return self.title # whoopdedoo that was easy + # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again + if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache: + try: + return (sorted( + ( + result.output.strip() + for result in self.archiveresult_set.all() + if result.extractor == 'title' and result.status =='succeeded' and result.output + ), + key=lambda title: len(title), + ) or [None])[-1] + except IndexError: + pass + + try: # take longest successful title from ArchiveResult db history return sorted( @@ -355,12 +387,23 @@ class Snapshot(ABIDModel): class ArchiveResultManager(models.Manager): def indexable(self, sorted: bool = True): + """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)""" + INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] - qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded') + qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded') if sorted: - precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] - qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence') + precedence = [ + When(extractor=method, then=Value(precedence)) + for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE + ] + qs = qs.annotate( + indexing_precedence=Case( + *precedence, + default=Value(1000), + output_field=IntegerField() + ) + ).order_by('indexing_precedence') return qs class ArchiveResult(ABIDModel): @@ -418,17 +461,6 @@ class ArchiveResult(ABIDModel): def __str__(self): return self.extractor - def save(self, *args, **kwargs): - super().save(*args, **kwargs) - try: - assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match 
.ABID.uuid ({self.ABID.uuid})' - except AssertionError as e: - print(e) - - @property - def uuid(self): - return self.id - @cached_property def snapshot_dir(self): return Path(self.snapshot.link_dir) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 1321bd52..707e17a1 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -4,7 +4,9 @@ import os import sys import re import logging +import inspect import tempfile +from typing import Any, Dict from pathlib import Path from django.utils.crypto import get_random_string @@ -33,22 +35,20 @@ APPEND_SLASH = True DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv) -# add plugins folders to system path, and load plugins in installed_apps -BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'plugins' -USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'plugins' -sys.path.insert(0, str(BUILTIN_PLUGINS_DIR)) -sys.path.insert(0, str(USER_PLUGINS_DIR)) +BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'builtin_plugins' +USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'user_plugins' -def find_plugins(plugins_dir): - return { - # plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA - plugin_entrypoint.parent.name: plugin_entrypoint.parent +def find_plugins(plugins_dir, prefix: str) -> Dict[str, Any]: + plugins = { + f'{prefix}.{plugin_entrypoint.parent.name}': plugin_entrypoint.parent for plugin_entrypoint in plugins_dir.glob('*/apps.py') } + # print(f'Found {prefix} plugins:\n', '\n '.join(plugins.keys())) + return plugins INSTALLED_PLUGINS = { - **find_plugins(BUILTIN_PLUGINS_DIR), - **find_plugins(USER_PLUGINS_DIR), + **find_plugins(BUILTIN_PLUGINS_DIR, prefix='builtin_plugins'), + **find_plugins(USER_PLUGINS_DIR, prefix='user_plugins'), } @@ -66,11 +66,11 @@ INSTALLED_APPS = [ 'plugantic', 'core', 'api', + 'pkg', *INSTALLED_PLUGINS.keys(), 'admin_data_views', - 'django_extensions', ] @@ -144,64 +144,6 @@ if CONFIG.LDAP: # sys.exit(1) -################################################################################ -### Debug Settings -################################################################################ - -# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode) -DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv) -if DEBUG_TOOLBAR: - try: - import debug_toolbar # noqa - DEBUG_TOOLBAR = True - except ImportError: - DEBUG_TOOLBAR = False - -if DEBUG_TOOLBAR: - INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar'] - INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*'] - DEBUG_TOOLBAR_CONFIG = { - "SHOW_TOOLBAR_CALLBACK": lambda request: True, - "RENDER_PANELS": True, - } - DEBUG_TOOLBAR_PANELS = [ - 'debug_toolbar.panels.history.HistoryPanel', - 'debug_toolbar.panels.versions.VersionsPanel', - 'debug_toolbar.panels.timer.TimerPanel', - 'debug_toolbar.panels.settings.SettingsPanel', - 'debug_toolbar.panels.headers.HeadersPanel', - 'debug_toolbar.panels.request.RequestPanel', - 'debug_toolbar.panels.sql.SQLPanel', - 'debug_toolbar.panels.staticfiles.StaticFilesPanel', - # 'debug_toolbar.panels.templates.TemplatesPanel', - 'debug_toolbar.panels.cache.CachePanel', - 'debug_toolbar.panels.signals.SignalsPanel', - 'debug_toolbar.panels.logging.LoggingPanel', - 'debug_toolbar.panels.redirects.RedirectsPanel', - 'debug_toolbar.panels.profiling.ProfilingPanel', - 'djdt_flamegraph.FlamegraphPanel', - ] - MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware'] - -if DEBUG: - from django_autotyping.typing import 
AutotypingSettingsDict - - INSTALLED_APPS += ['django_autotyping'] - AUTOTYPING: AutotypingSettingsDict = { - "STUBS_GENERATION": { - "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings", - } - } - -# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar) -# Must delete archivebox/templates/admin to use because it relies on some things we override -# visit /__requests_tracker__/ to access -DEBUG_REQUESTS_TRACKER = False -if DEBUG_REQUESTS_TRACKER: - INSTALLED_APPS += ["requests_tracker"] - MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"] - INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"] - ################################################################################ ### Staticfile and Template Settings @@ -317,13 +259,15 @@ STORAGES = { SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_') ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',') -CSRF_TRUSTED_ORIGINS = CONFIG.CSRF_TRUSTED_ORIGINS.split(',') +CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(','))) # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com) # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS -if CONFIG.ALLOWED_HOSTS != '*' and (not CSRF_TRUSTED_ORIGINS): - for hostname in ALLOWED_HOSTS: - CSRF_TRUSTED_ORIGINS.append(f'https://{hostname}') +for hostname in ALLOWED_HOSTS: + https_endpoint = f'https://{hostname}' + if hostname != '*' and https_endpoint not in CSRF_TRUSTED_ORIGINS: + print(f'[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS') + CSRF_TRUSTED_ORIGINS.append(https_endpoint) SECURE_BROWSER_XSS_FILTER = True SECURE_CONTENT_TYPE_NOSNIFF = True @@ -345,6 +289,8 @@ AUTH_PASSWORD_VALIDATORS = [ {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'}, ] +DATA_UPLOAD_MAX_NUMBER_FIELDS = None + ################################################################################ ### Shell Settings ################################################################################ @@ -385,6 +331,10 @@ IGNORABLE_404_URLS = [ re.compile(r'robots\.txt$'), re.compile(r'.*\.(css|js)\.map$'), ] +IGNORABLE_200_URLS = [ + re.compile(r'^"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M), + re.compile(r'^"GET /admin/jsi18n/ HTTP/.*" (200|30.) .+', re.I | re.M), +] class NoisyRequestsFilter(logging.Filter): def filter(self, record) -> bool: @@ -396,19 +346,26 @@ class NoisyRequestsFilter(logging.Filter): if ignorable_log_pattern.match(logline): return False - # ignore staticfile requests that 200 or 30* - ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M) - if ignoreable_200_log_pattern.match(logline): - return False + ignorable_log_pattern = re.compile(f'^Not Found: /.*/?{ignorable_url_pattern.pattern}', re.I | re.M) + if ignorable_log_pattern.match(logline): + return False + # ignore staticfile requests that 200 or 30* + for ignorable_url_pattern in IGNORABLE_200_URLS: + if ignorable_log_pattern.match(logline): + return False + return True + +ERROR_LOG = tempfile.NamedTemporaryFile().name + if CONFIG.LOGS_DIR.exists(): ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log') else: # historically too many edge cases here around creating log dir w/ correct permissions early on # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr - ERROR_LOG = tempfile.NamedTemporaryFile().name + print(f'[!] 
WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}') LOGGING = { 'version': 1, @@ -445,6 +402,10 @@ LOGGING = { } +################################################################################ +### REST API Outbound Webhooks settings +################################################################################ + # Add default webhook configuration to the User model SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook' SIGNAL_WEBHOOKS = { @@ -458,7 +419,9 @@ SIGNAL_WEBHOOKS = { }, } -DATA_UPLOAD_MAX_NUMBER_FIELDS = None +################################################################################ +### Admin Data View Settings +################################################################################ ADMIN_DATA_VIEWS = { "NAME": "Environment", @@ -495,3 +458,86 @@ ADMIN_DATA_VIEWS = { }, ], } + + +################################################################################ +### Debug Settings +################################################################################ + +# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode) +DEBUG_TOOLBAR = False +DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv) +if DEBUG_TOOLBAR: + try: + import debug_toolbar # noqa + DEBUG_TOOLBAR = True + except ImportError: + DEBUG_TOOLBAR = False + +if DEBUG_TOOLBAR: + INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar'] + INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*'] + DEBUG_TOOLBAR_CONFIG = { + "SHOW_TOOLBAR_CALLBACK": lambda request: True, + "RENDER_PANELS": True, + } + DEBUG_TOOLBAR_PANELS = [ + 'debug_toolbar.panels.history.HistoryPanel', + 'debug_toolbar.panels.versions.VersionsPanel', + 'debug_toolbar.panels.timer.TimerPanel', + 'debug_toolbar.panels.settings.SettingsPanel', + 'debug_toolbar.panels.headers.HeadersPanel', + 'debug_toolbar.panels.request.RequestPanel', + 'debug_toolbar.panels.sql.SQLPanel', + 'debug_toolbar.panels.staticfiles.StaticFilesPanel', + # 'debug_toolbar.panels.templates.TemplatesPanel', + 'debug_toolbar.panels.cache.CachePanel', + 'debug_toolbar.panels.signals.SignalsPanel', + 'debug_toolbar.panels.logging.LoggingPanel', + 'debug_toolbar.panels.redirects.RedirectsPanel', + 'debug_toolbar.panels.profiling.ProfilingPanel', + 'djdt_flamegraph.FlamegraphPanel', + ] + MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware'] + +if DEBUG: + from django_autotyping.typing import AutotypingSettingsDict + + INSTALLED_APPS += ['django_autotyping'] + AUTOTYPING: AutotypingSettingsDict = { + "STUBS_GENERATION": { + "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings", + } + } + +# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar) +# Must delete archivebox/templates/admin to use because it relies on some things we override +# visit /__requests_tracker__/ to access +DEBUG_REQUESTS_TRACKER = True +DEBUG_REQUESTS_TRACKER = DEBUG_REQUESTS_TRACKER and DEBUG +if DEBUG_REQUESTS_TRACKER: + import requests_tracker + + INSTALLED_APPS += ["requests_tracker"] + MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"] + INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"] + + TEMPLATE_DIRS.insert(0, str(Path(inspect.getfile(requests_tracker)).parent / "templates")) + + REQUESTS_TRACKER_CONFIG = { + "TRACK_SQL": True, + "ENABLE_STACKTRACES": False, + "IGNORE_PATHS_PATTERNS": ( + r".*/favicon\.ico", + r".*\.png", + r"/admin/jsi18n/", + ), + "IGNORE_SQL_PATTERNS": ( + 
r"^SELECT .* FROM django_migrations WHERE app = 'requests_tracker'", + r"^SELECT .* FROM django_migrations WHERE app = 'auth'", + ), + } + +# https://docs.pydantic.dev/logfire/integrations/django/ (similar to DataDog / NewRelic / etc.) +DEBUG_LOGFIRE = False +DEBUG_LOGFIRE = DEBUG_LOGFIRE and (Path(CONFIG.OUTPUT_DIR) / '.logfire').is_dir() diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 036ff73c..22d6a405 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -218,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa if type(all_links) is QuerySet: num_links: int = all_links.count() get_link = lambda x: x.as_link_with_details() - all_links = all_links.iterator() + all_links = all_links.iterator(chunk_size=500) else: num_links: int = len(all_links) get_link = lambda x: x diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index cd72be4e..c97b2f28 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -197,7 +197,7 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]: @enforce_types -def wget_output_path(link: Link) -> Optional[str]: +def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]: """calculate the path to the wgetted .html file, since wget may adjust some paths to be different than the base_url path. @@ -245,6 +245,15 @@ def wget_output_path(link: Link) -> Optional[str]: # https://example.com/abc/test/?v=zzVa_tX1OiI # > example.com/abc/test/index.html@v=zzVa_tX1OiI.html + cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path' + + if not nocache: + from django.core.cache import cache + cached_result = cache.get(cache_key) + if cached_result: + return cached_result + + # There's also lots of complexity around how the urlencoding and renaming # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc, # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than @@ -271,6 +280,8 @@ def wget_output_path(link: Link) -> Optional[str]: output_path = None if output_path: + if not nocache: + cache.set(cache_key, output_path) return output_path # fallback to just the domain dir diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 1bc5a104..1edd3caf 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """indexed links without checking archive status or data directory validity""" - links = (snapshot.as_link() for snapshot in snapshots.iterator()) + links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500)) return { link.link_dir: link for link in links @@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """indexed links that are archived with a valid data directory""" - links = (snapshot.as_link() for snapshot in snapshots.iterator()) + links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500)) return { link.link_dir: link for link in filter(is_archived, links) @@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio def 
get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """indexed links that are unarchived with no data directory or an empty data directory""" - links = (snapshot.as_link() for snapshot in snapshots.iterator()) + links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500)) return { link.link_dir: link for link in filter(is_unarchived, links) @@ -448,7 +448,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """dirs with a valid index matched to the main index and archived content""" - links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] + links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)] return { link.link_dir: link for link in filter(is_valid, links) @@ -475,7 +475,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists() ) - for path in chain(snapshots.iterator(), data_folders): + for path in chain(snapshots.iterator(chunk_size=500), data_folders): link = None if type(path) is not str: path = path.as_link().link_dir @@ -518,7 +518,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: """dirs that don't contain a valid index and aren't listed in the main index""" corrupted = {} - for snapshot in snapshots.iterator(): + for snapshot in snapshots.iterator(chunk_size=500): link = snapshot.as_link() if is_corrupt(link): corrupted[link.link_dir] = link diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 339f9429..2e5d18bc 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -124,7 +124,15 @@ def snapshot_icons(snapshot) -> str: from core.models import ArchiveResult # start = datetime.now(timezone.utc) - archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) + if hasattr(snapshot, '_prefetched_objects_cache') and 'archiveresult_set' in snapshot._prefetched_objects_cache: + archive_results = [ + result + for result in snapshot.archiveresult_set.all() + if result.status == "succeeded" and result.output + ] + else: + archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) + link = snapshot.as_link() path = link.archive_path canon = link.canonical_outputs() diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 10c1525d..97058590 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -37,9 +37,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: @enforce_types def write_link_to_sql_index(link: Link, created_by_id: int | None=None): from core.models import Snapshot, ArchiveResult + from abid_utils.models import get_or_create_system_user_pk + info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys} - info['created_by_id'] = created_by_id + info['created_by_id'] = created_by_id or get_or_create_system_user_pk() tag_list = list(dict.fromkeys( tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '') diff --git a/archivebox/main.py b/archivebox/main.py index b36fb3dd..5ab175bb 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -960,7 +960,8 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None: run_subcommand('init', stdin=None, 
pwd=out_dir) setup_django(out_dir=out_dir, check_db=True) - from core.models import User + from django.contrib.auth import get_user_model + User = get_user_model() if not User.objects.filter(is_superuser=True).exists(): stderr('\n[+] Creating new admin user for the Web UI...', color='green') @@ -979,16 +980,16 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None: '--upgrade', '--no-cache-dir', '--no-warn-script-location', - 'youtube_dl', + 'yt-dlp', ], capture_output=False, cwd=out_dir) pkg_path = run_shell([ PYTHON_BINARY, '-m', 'pip', 'show', - 'youtube_dl', + 'yt-dlp', ], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0] - NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'youtube_dl' / '__main__.py' + NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt_dlp' / '__main__.py' os.chmod(NEW_YOUTUBEDL_BINARY, 0o777) - assert NEW_YOUTUBEDL_BINARY.exists(), f'youtube_dl must exist inside {pkg_path}' + assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}' config(f'YOUTUBEDL_BINARY={NEW_YOUTUBEDL_BINARY}', set=True, out_dir=out_dir) except BaseException as e: # lgtm [py/catch-base-exception] stderr(f'[X] Failed to install python packages: {e}', color='red') diff --git a/archivebox/package-lock.json b/archivebox/package-lock.json index 0645c468..abcb8192 100644 --- a/archivebox/package-lock.json +++ b/archivebox/package-lock.json @@ -11,7 +11,7 @@ "dependencies": { "@postlight/parser": "^2.2.3", "readability-extractor": "github:ArchiveBox/readability-extractor", - "single-file-cli": "^1.1.54" + "single-file-cli": "^2.0.58" } }, "node_modules/@asamuzakjp/dom-selector": { @@ -236,9 +236,9 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "22.5.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz", - "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==", + "version": "22.5.1", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz", + "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==", "license": "MIT", "optional": true, "dependencies": { @@ -353,9 +353,9 @@ } }, "node_modules/aws4": { - "version": "1.13.1", - "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz", - "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==", + "version": "1.13.2", + "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz", + "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==", "license": "MIT" }, "node_modules/b4a": { @@ -2376,9 +2376,9 @@ } }, "node_modules/tslib": { - "version": "2.6.3", - "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz", - "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==", + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz", + "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==", "license": "0BSD" }, "node_modules/turndown": { diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py index eec4d73b..3415f35e 100644 --- a/archivebox/parsers/pocket_api.py +++ b/archivebox/parsers/pocket_api.py @@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional from configparser import ConfigParser from pathlib import Path -from ..vendor.pocket import Pocket +from pocket
import Pocket from ..index.schema import Link from ..util import enforce_types diff --git a/archivebox/pkg/__init__.py b/archivebox/pkg/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/pkg/admin.py b/archivebox/pkg/admin.py new file mode 100644 index 00000000..8c38f3f3 --- /dev/null +++ b/archivebox/pkg/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/archivebox/pkg/apps.py b/archivebox/pkg/apps.py new file mode 100644 index 00000000..fa8a6913 --- /dev/null +++ b/archivebox/pkg/apps.py @@ -0,0 +1,14 @@ +__package__ = 'archivebox.pkg' + +from django.apps import AppConfig + + +class PkgsConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'pkg' + + def ready(self): + from .settings import LOADED_DEPENDENCIES + + # print(LOADED_DEPENDENCIES) + \ No newline at end of file diff --git a/archivebox/pkg/management/__init__.py b/archivebox/pkg/management/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/pkg/management/commands/__init__.py b/archivebox/pkg/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/pkg/management/commands/pkg.py b/archivebox/pkg/management/commands/pkg.py new file mode 100644 index 00000000..7cbf795a --- /dev/null +++ b/archivebox/pkg/management/commands/pkg.py @@ -0,0 +1,75 @@ +__package__ = 'archivebox.pkg.management.commands' + +from django.core.management.base import BaseCommand +from django.conf import settings + +from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer +from pydantic_pkgr.binprovider import bin_abspath + +from ....config import NODE_BIN_PATH, bin_path + +from plugantic.plugins import LOADED_PLUGINS + +from pkg.settings import env + + +class Command(BaseCommand): + def handle(self, *args, method, **options): + method(*args, **options) + + def add_arguments(self, parser): + subparsers = parser.add_subparsers(title="sub-commands", required=True) + + list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.") + list_parser.set_defaults(method=self.list) + + install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.") + install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.") + install_parser.add_argument("package_names", nargs="+", type=str) + install_parser.set_defaults(method=self.install) + + def list(self, *args, **options): + self.stdout.write('################# PLUGINS ####################') + for plugin in LOADED_PLUGINS: + self.stdout.write(f'{plugin.name}:') + for binary in plugin.binaries: + try: + binary = binary.install() + except Exception as e: + # import ipdb; ipdb.set_trace() + raise + self.stdout.write(f' {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}') + + self.stdout.write('\n################# LEGACY ####################') + for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items(): + bin_name = settings.CONFIG[bin_key] + + self.stdout.write(f'{bin_key}: {bin_name}') + + # binary = Binary(name=package_name, providers=[env]) + # print(binary) + + # try: + # loaded_bin = binary.load() + # self.stdout.write( + # self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin) + # ) + # except Exception as e: + # self.stderr.write( + # self.style.ERROR(f"Error loading {package_name}: {e}") + # ) + + def install(self, 
*args, **options): + for package_name in options["package_names"]: + binary = Binary(name=package_name, providers=[env]) + print(binary) + + try: + loaded_bin = binary.load() + self.stdout.write( + self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin) + ) + except Exception as e: + self.stderr.write( + self.style.ERROR(f"Error loading {package_name}: {e}") + ) diff --git a/archivebox/pkg/migrations/__init__.py b/archivebox/pkg/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/pkg/models.py b/archivebox/pkg/models.py new file mode 100644 index 00000000..71a83623 --- /dev/null +++ b/archivebox/pkg/models.py @@ -0,0 +1,3 @@ +from django.db import models + +# Create your models here. diff --git a/archivebox/pkg/settings.py b/archivebox/pkg/settings.py new file mode 100644 index 00000000..7f13d125 --- /dev/null +++ b/archivebox/pkg/settings.py @@ -0,0 +1,86 @@ +__package__ = 'archivebox.pkg' + +import os +import sys +import shutil +import inspect +from pathlib import Path + +import django +from django.conf import settings +from django.db.backends.sqlite3.base import Database as sqlite3 + +from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer +from pydantic_pkgr.binprovider import bin_abspath + +from ..config import NODE_BIN_PATH, bin_path + +env = EnvProvider(PATH=NODE_BIN_PATH + ':' + os.environ.get('PATH', '/bin')) + + +LOADED_DEPENDENCIES = {} + +for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items(): + # 'PYTHON_BINARY': { + # 'path': bin_path(config['PYTHON_BINARY']), + # 'version': config['PYTHON_VERSION'], + # 'hash': bin_hash(config['PYTHON_BINARY']), + # 'enabled': True, + # 'is_valid': bool(config['PYTHON_VERSION']), + # }, + + + bin_name = settings.CONFIG[bin_key] + + if bin_name.endswith('django/__init__.py'): + binary_spec = Binary(name='django', providers=[env], provider_overrides={ + 'env': { + 'abspath': lambda: Path(inspect.getfile(django)), + 'version': lambda: SemVer('{}.{}.{} {} ({})'.format(*django.VERSION)), + } + }) + elif bin_name.endswith('sqlite3/dbapi2.py'): + binary_spec = Binary(name='sqlite3', providers=[env], provider_overrides={ + 'env': { + 'abspath': lambda: Path(inspect.getfile(sqlite3)), + 'version': lambda: SemVer(sqlite3.version), + } + }) + elif bin_name.endswith('archivebox'): + binary_spec = Binary(name='archivebox', providers=[env], provider_overrides={ + 'env': { + 'abspath': lambda: shutil.which(str(Path('archivebox').expanduser())), + 'version': lambda: settings.CONFIG.VERSION, + } + }) + elif bin_name.endswith('postlight/parser/cli.js'): + binary_spec = Binary(name='postlight-parser', providers=[env], provider_overrides={ + 'env': { + 'abspath': lambda: bin_path('postlight-parser'), + 'version': lambda: SemVer('1.0.0'), + } + }) + else: + binary_spec = Binary(name=bin_name, providers=[env]) + + try: + binary = binary_spec.load() + except Exception as e: + # print(f"- ❌ Binary {bin_name} failed to load with error: {e}") + continue + + assert isinstance(binary.loaded_version, SemVer) + + try: + assert str(binary.loaded_version) == dependency['version'], f"Expected {bin_name} version {dependency['version']}, got {binary.loaded_version}" + assert str(binary.loaded_respath) == str(bin_abspath(dependency['path']).resolve()), f"Expected {bin_name} abspath {bin_abspath(dependency['path']).resolve()}, got {binary.loaded_respath}" + assert binary.is_valid == dependency['is_valid'], f"Expected {bin_name} is_valid={dependency['is_valid']}, got
{binary.is_valid}" + except Exception as e: + pass + # print(f"WARNING: Error loading {bin_name}: {e}") + # import ipdb; ipdb.set_trace() + + # print(f"- ✅ Binary {bin_name} loaded successfully") + LOADED_DEPENDENCIES[bin_key] = binary + + diff --git a/archivebox/pkg/tests.py b/archivebox/pkg/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/archivebox/pkg/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/archivebox/pkg/views.py b/archivebox/pkg/views.py new file mode 100644 index 00000000..91ea44a2 --- /dev/null +++ b/archivebox/pkg/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. diff --git a/archivebox/plugantic/__init__.py b/archivebox/plugantic/__init__.py index 244d084f..c8f37e05 100644 --- a/archivebox/plugantic/__init__.py +++ b/archivebox/plugantic/__init__.py @@ -1,6 +1,5 @@ __package__ = 'archivebox.plugantic' -from .binproviders import BinProvider from .binaries import Binary from .extractors import Extractor from .replayers import Replayer diff --git a/archivebox/plugantic/apps.py b/archivebox/plugantic/apps.py index c0f1ce71..57d57cd8 100644 --- a/archivebox/plugantic/apps.py +++ b/archivebox/plugantic/apps.py @@ -1,6 +1,17 @@ +import importlib from django.apps import AppConfig class PluganticConfig(AppConfig): default_auto_field = 'django.db.models.BigAutoField' name = 'plugantic' + + def ready(self) -> None: + from django.conf import settings + from .plugins import PLUGINS + + for plugin_name in settings.INSTALLED_PLUGINS.keys(): + lib = importlib.import_module(f'{plugin_name}.apps') + if hasattr(lib, 'PLUGINS'): + for plugin_instance in lib.PLUGINS: + PLUGINS.append(plugin_instance) diff --git a/archivebox/plugantic/binaries.py b/archivebox/plugantic/binaries.py index 4788c361..76bd63ac 100644 --- a/archivebox/plugantic/binaries.py +++ b/archivebox/plugantic/binaries.py @@ -10,285 +10,17 @@ from typing import Any, Optional, Dict, List from typing_extensions import Self from subprocess import run, PIPE +from pydantic_pkgr import Binary, SemVer, BinName, BinProvider, EnvProvider, AptProvider, BrewProvider, PipProvider, BinProviderName, ProviderLookupDict -from pydantic_core import ValidationError - -from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer - -from .binproviders import ( - SemVer, - BinName, - BinProviderName, - HostBinPath, - BinProvider, - EnvProvider, - AptProvider, - BrewProvider, - PipProvider, - ProviderLookupDict, - bin_name, - bin_abspath, - path_is_script, - path_is_executable, -) - - -class Binary(BaseModel): - name: BinName - description: str = Field(default='') - - providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers') - provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides') - - loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider') - loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath') - loaded_version: Optional[SemVer] = Field(default=None, alias='version') - - # bin_filename: see below - # is_executable: see below - # is_script - # is_valid: see below - - - @model_validator(mode='after') - def validate(self): - self.loaded_abspath = bin_abspath(self.name) or self.name - self.description = self.description or self.name - - assert self.providers_supported, f'No providers were given for package {self.name}' - - # pull in any overrides 
from the binproviders - for provider in self.providers_supported: - overrides_by_provider = provider.get_providers_for_bin(self.name) - if overrides_by_provider: - self.provider_overrides[provider.name] = { - **overrides_by_provider, - **self.provider_overrides.get(provider.name, {}), - } - return self - - @field_validator('loaded_abspath', mode='before') - def parse_abspath(cls, value: Any): - return bin_abspath(value) - - @field_validator('loaded_version', mode='before') - def parse_version(cls, value: Any): - return value and SemVer(value) - - @field_serializer('provider_overrides', when_used='json') - def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]: - return { - provider_name: { - key: str(val) - for key, val in overrides.items() - } - for provider_name, overrides in provider_overrides.items() - } - - @computed_field # type: ignore[misc] # see mypy issue #1362 - @property - def bin_filename(self) -> BinName: - if self.is_script: - # e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite - name = self.name - elif self.loaded_abspath: - # e.g. '/opt/homebrew/bin/wget' -> wget - name = bin_name(self.loaded_abspath) - else: - # e.g. 'ytdlp' -> 'yt-dlp' - name = bin_name(self.name) - return name - - @computed_field # type: ignore[misc] # see mypy issue #1362 - @property - def is_executable(self) -> bool: - try: - assert self.loaded_abspath and path_is_executable(self.loaded_abspath) - return True - except (ValidationError, AssertionError): - return False - - @computed_field # type: ignore[misc] # see mypy issue #1362 - @property - def is_script(self) -> bool: - try: - assert self.loaded_abspath and path_is_script(self.loaded_abspath) - return True - except (ValidationError, AssertionError): - return False - - @computed_field # type: ignore[misc] # see mypy issue #1362 - @property - def is_valid(self) -> bool: - return bool( - self.name - and self.loaded_abspath - and self.loaded_version - and (self.is_executable or self.is_script) - ) - - @validate_call - def install(self) -> Self: - if not self.providers_supported: - return self - - exc = Exception('No providers were able to install binary', self.name, self.providers_supported) - for provider in self.providers_supported: - try: - installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name)) - if installed_bin: - # print('INSTALLED', self.name, installed_bin) - return self.model_copy(update={ - 'loaded_provider': provider.name, - 'loaded_abspath': installed_bin.abspath, - 'loaded_version': installed_bin.version, - }) - except Exception as err: - print(err) - exc = err - raise exc - - @validate_call - def load(self, cache=True) -> Self: - if self.is_valid: - return self - - if not self.providers_supported: - return self - - exc = Exception('No providers were able to install binary', self.name, self.providers_supported) - for provider in self.providers_supported: - try: - installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name)) - if installed_bin: - # print('LOADED', provider, self.name, installed_bin) - return self.model_copy(update={ - 'loaded_provider': provider.name, - 'loaded_abspath': installed_bin.abspath, - 'loaded_version': installed_bin.version, - }) - except Exception as err: - print(err) - exc = err - raise exc - - @validate_call - def load_or_install(self, cache=True) -> Self: - if self.is_valid: - return self - - if not 
self.providers_supported: - return self - - exc = Exception('No providers were able to install binary', self.name, self.providers_supported) - for provider in self.providers_supported: - try: - installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache) - if installed_bin: - # print('LOADED_OR_INSTALLED', self.name, installed_bin) - return self.model_copy(update={ - 'loaded_provider': provider.name, - 'loaded_abspath': installed_bin.abspath, - 'loaded_version': installed_bin.version, - }) - except Exception as err: - print(err) - exc = err - raise exc - - @validate_call - def exec(self, args=(), pwd='.'): - assert self.loaded_abspath - assert self.loaded_version - return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, pwd=pwd) +import django +from django.db.backends.sqlite3.base import Database as sqlite3 -class SystemPythonHelpers: - @staticmethod - def get_subdeps() -> str: - return 'python3 python3-minimal python3-pip python3-virtualenv' - - @staticmethod - def get_abspath() -> str: - return sys.executable - - @staticmethod - def get_version() -> str: - return '{}.{}.{}'.format(*sys.version_info[:3]) - - -class SqliteHelpers: - @staticmethod - def get_abspath() -> Path: - import sqlite3 - importlib.reload(sqlite3) - return Path(inspect.getfile(sqlite3)) - - @staticmethod - def get_version() -> SemVer: - import sqlite3 - importlib.reload(sqlite3) - version = sqlite3.version - assert version - return SemVer(version) - -class DjangoHelpers: - @staticmethod - def get_django_abspath() -> str: - import django - return inspect.getfile(django) - - - @staticmethod - def get_django_version() -> str: - import django - return '{}.{}.{} {} ({})'.format(*django.VERSION) - -class YtdlpHelpers: - @staticmethod - def get_ytdlp_subdeps() -> str: - return 'yt-dlp ffmpeg' - - @staticmethod - def get_ytdlp_version() -> str: - import yt_dlp - importlib.reload(yt_dlp) - - version = yt_dlp.version.__version__ - assert version - return version - -class PythonBinary(Binary): - name: BinName = 'python' - - providers_supported: List[BinProvider] = [ - EnvProvider( - subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'}, - abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'}, - version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'}, - ), - ] - -class SqliteBinary(Binary): - name: BinName = 'sqlite' - providers_supported: List[BinProvider] = [ - EnvProvider( - version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'}, - abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'}, - ), - ] - -class DjangoBinary(Binary): - name: BinName = 'django' - providers_supported: List[BinProvider] = [ - EnvProvider( - abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'}, - version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'}, - ), - ] - +def get_ytdlp_version() -> str: + import yt_dlp + return yt_dlp.version.__version__ @@ -296,16 +28,26 @@ class DjangoBinary(Binary): class YtdlpBinary(Binary): name: BinName = 'yt-dlp' providers_supported: List[BinProvider] = [ - # EnvProvider(), - PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}), - BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}), - # AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}), + EnvProvider(), + PipProvider(), + 
BrewProvider(), + AptProvider(), ] - + provider_overrides: Dict[BinProviderName, ProviderLookupDict] = { + 'pip': { + 'version': get_ytdlp_version, + }, + 'brew': { + 'subdeps': lambda: 'yt-dlp ffmpeg', + }, + 'apt': { + 'subdeps': lambda: 'yt-dlp ffmpeg', + } + } class WgetBinary(Binary): name: BinName = 'wget' - providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()] + providers_supported: List[BinProvider] = [EnvProvider(), AptProvider(), BrewProvider()] # if __name__ == '__main__': diff --git a/archivebox/plugantic/binproviders.py b/archivebox/plugantic/binproviders.py deleted file mode 100644 index 1c9933ea..00000000 --- a/archivebox/plugantic/binproviders.py +++ /dev/null @@ -1,561 +0,0 @@ -__package__ = 'archivebox.plugantic' - -import os -import shutil -import operator - -from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING -from typing_extensions import Self -from abc import ABC, abstractmethod -from collections import namedtuple -from pathlib import Path -from subprocess import run, PIPE - -from pydantic_core import core_schema, ValidationError -from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler - - - -def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool: - """returns True if a lambda func takes args/kwargs of any kind, otherwise false if it's pure/argless""" - code = lambda_func.__code__ - has_args = code.co_argcount > 0 - has_varargs = code.co_flags & 0x04 != 0 - has_varkw = code.co_flags & 0x08 != 0 - return has_args or has_varargs or has_varkw - - -def is_semver_str(semver: Any) -> bool: - if isinstance(semver, str): - return (semver.count('.') == 2 and semver.replace('.', '').isdigit()) - return False - -def semver_to_str(semver: tuple[int, int, int] | str) -> str: - if isinstance(semver, (list, tuple)): - return '.'.join(str(chunk) for chunk in semver) - if is_semver_str(semver): - return semver - raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver)) - - -SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0)) -SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int] - -class SemVer(SemVerTuple): - major: int - minor: int = 0 - patch: int = 0 - - if TYPE_CHECKING: - full_text: str | None = '' - - def __new__(cls, *args, full_text=None, **kwargs): - # '1.1.1' - if len(args) == 1 and is_semver_str(args[0]): - result = SemVer.parse(args[0]) - - # ('1', '2', '3') - elif len(args) == 1 and isinstance(args[0], (tuple, list)): - result = SemVer.parse(args[0]) - - # (1, '2', None) - elif not all(isinstance(arg, (int, type(None))) for arg in args): - result = SemVer.parse(args) - - # (None) - elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())): - result = None - - # 1, 2, 3 - else: - result = SemVerTuple.__new__(cls, *args, **kwargs) - - if result is not None: - # add first line as extra hidden metadata so it can be logged without having to re-run version cmd - result.full_text = full_text or str(result) - return result - - @classmethod - def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None: - """ - parses a version tag string formatted like into (major, minor, patch) ints - 'Google Chrome 124.0.6367.208' -> (124, 0, 6367) - 'GNU Wget 1.24.5 built on darwin23.2.0.' -> (1, 24, 5) - 'curl 8.4.0 (x86_64-apple-darwin23.0) ...' 
-> (8, 4, 0) - '2024.04.09' -> (2024, 4, 9) - - """ - # print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout) - - if isinstance(version_stdout, (tuple, list)): - version_stdout = '.'.join(str(chunk) for chunk in version_stdout) - elif isinstance(version_stdout, bytes): - version_stdout = version_stdout.decode() - elif not isinstance(version_stdout, str): - version_stdout = str(version_stdout) - - # no text to work with, return None immediately - if not version_stdout.strip(): - # raise Exception('Tried to parse semver from empty version output (is binary installed and available?)') - return None - - just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0] - contains_semver = lambda col: ( - col.count('.') in (1, 2, 3) - and all(chunk.isdigit() for chunk in col.split('.')[:3]) # first 3 chunks can only be nums - ) - - full_text = version_stdout.split('\n')[0].strip() - first_line_columns = full_text.split()[:4] - version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns))) - - # could not find any column of first line that looks like a version number, despite there being some text - if not version_columns: - # raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns))) - return None - - # take first col containing a semver, and truncate it to 3 chunks (e.g. 2024.04.09.91) -> (2024, 04, 09) - first_version_tuple = version_columns[0].split('.', 3)[:3] - - # print('FINAL_VALUE', first_version_tuple) - - return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text) - - def __str__(self): - return '.'.join(str(chunk) for chunk in self) - - # @classmethod - # def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema: - # default_schema = handler(source) - # return core_schema.no_info_after_validator_function( - # cls.parse, - # default_schema, - # serialization=core_schema.plain_serializer_function_ser_schema( - # lambda semver: str(semver), - # info_arg=False, - # return_schema=core_schema.str_schema(), - # ), - # ) - -assert SemVer(None) == None -assert SemVer('') == None -assert SemVer.parse('') == None -assert SemVer(1) == (1, 0, 0) -assert SemVer(1, 2) == (1, 2, 0) -assert SemVer('1.2+234234') == (1, 2, 0) -assert SemVer((1, 2, 3)) == (1, 2, 3) -assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3' -assert SemVer(('1', '2', '3')) == (1, 2, 3) -assert SemVer.parse('5.6.7') == (5, 6, 7) -assert SemVer.parse('124.0.6367.208') == (124, 0, 6367) -assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0) -assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367) -assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367) -assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 
234.234.123' -assert SemVer.parse('Google Chrome') == None - -@validate_call -def bin_name(bin_path_or_name: str | Path) -> str: - name = Path(bin_path_or_name).name - assert len(name) > 1 - assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), ( - f'Binary name can only contain a-Z0-9-_.: {name}') - return name - -BinName = Annotated[str, AfterValidator(bin_name)] - -@validate_call -def path_is_file(path: Path | str) -> Path: - path = Path(path) if isinstance(path, str) else path - assert path.is_file(), f'Path is not a file: {path}' - return path - -HostExistsPath = Annotated[Path, AfterValidator(path_is_file)] - -@validate_call -def path_is_executable(path: HostExistsPath) -> HostExistsPath: - assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})' - return path - -@validate_call -def path_is_script(path: HostExistsPath) -> HostExistsPath: - SCRIPT_EXTENSIONS = ('.py', '.js', '.sh') - assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS)) - return path - -HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)] - -@validate_call -def path_is_abspath(path: Path) -> Path: - return path.resolve() - -HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)] -HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)] - - -@validate_call -def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None: - assert bin_path_or_name - - if str(bin_path_or_name).startswith('/'): - # already a path, get its absolute form - abspath = Path(bin_path_or_name).resolve() - else: - # not a path yet, get path using os.which - binpath = shutil.which(bin_path_or_name) - if not binpath: - return None - abspath = Path(binpath).resolve() - - try: - return TypeAdapter(HostBinPath).validate_python(abspath) - except ValidationError: - return None - - -@validate_call -def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None: - return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode()) - - -class InstalledBin(BaseModel): - abspath: HostBinPath - version: SemVer - - -def is_valid_install_string(pkgs_str: str) -> str: - """Make sure a string is a valid install string for a package manager, e.g. 
'yt-dlp ffmpeg'""" - assert pkgs_str - assert all(len(pkg) > 1 for pkg in pkgs_str.split(' ')) - return pkgs_str - -def is_valid_python_dotted_import(import_str: str) -> str: - assert import_str and import_str.replace('.', '').replace('_', '').isalnum() - return import_str - -InstallStr = Annotated[str, AfterValidator(is_valid_install_string)] - -LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)] - -ProviderHandler = Callable[..., Any] | Callable[[], Any] # must take no args [], or [bin_name: str, **kwargs] -#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))] -ProviderHandlerRef = LazyImportStr | ProviderHandler -ProviderLookupDict = Dict[str, LazyImportStr] -ProviderType = Literal['abspath', 'version', 'subdeps', 'install'] - - -# class Host(BaseModel): -# machine: str -# system: str -# platform: str -# in_docker: bool -# in_qemu: bool -# python: str - -BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor'] - - -class BinProvider(ABC, BaseModel): - name: BinProviderName - - abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True) - version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True) - subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True) - install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True) - - _abspath_cache: ClassVar = {} - _version_cache: ClassVar = {} - _install_cache: ClassVar = {} - - # def provider_version(self) -> SemVer | None: - # """Version of the actual underlying package manager (e.g. pip v20.4.1)""" - # if self.name in ('env', 'vendor'): - # return SemVer('0.0.0') - # installer_binpath = Path(shutil.which(self.name)).resolve() - # return bin_version(installer_binpath) - - # def provider_host(self) -> Host: - # """Information about the host env, archictecture, and OS needed to select & build packages""" - # p = platform.uname() - # return Host( - # machine=p.machine, - # system=p.system, - # platform=platform.platform(), - # python=sys.implementation.name, - # in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true', - # in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true', - # ) - - def get_default_providers(self): - return self.get_providers_for_bin('*') - - def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None: - if provider_func is None: - return None - - # if provider_func is a dotted path to a function on self, swap it for the actual function - if isinstance(provider_func, str) and provider_func.startswith('self.'): - provider_func = getattr(self, provider_func.split('self.', 1)[-1]) - - # if provider_func is a dot-formatted import string, import the function - if isinstance(provider_func, str): - from django.utils.module_loading import import_string - - package_name, module_name, classname, path = provider_func.split('.', 3) # -> abc, def, ghi.jkl - - # get .ghi.jkl nested attr present on module abc.def - imported_module = import_string(f'{package_name}.{module_name}.{classname}') - provider_func = operator.attrgetter(path)(imported_module) - - # # abc.def.ghi.jkl -> 1, 2, 3 - # for idx in range(1, len(path)): - # parent_path = '.'.join(path[:-idx]) # abc.def.ghi - # try: - # parent_module = import_string(parent_path) - # provider_func = getattr(parent_module, path[-idx]) - # except AttributeError, ImportError: - # continue - - assert 
TypeAdapter(ProviderHandler).validate_python(provider_func), ( - f'{self.__class__.__name__} provider func for {bin_name} was not a function or dotted-import path: {provider_func}') - - return provider_func - - @validate_call - def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict: - providers_for_bin = { - 'abspath': self.abspath_provider.get(bin_name), - 'version': self.version_provider.get(bin_name), - 'subdeps': self.subdeps_provider.get(bin_name), - 'install': self.install_provider.get(bin_name), - } - only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None} - - return only_set_providers_for_bin - - @validate_call - def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler: - """ - Get the provider func for a given key + Dict of provider callbacks + fallback default provider. - e.g. get_provider_for_action(bin_name='yt-dlp', 'install', default_provider=self.on_install, ...) -> Callable - """ - - provider_func_ref = ( - (overrides or {}).get(provider_type) - or self.get_providers_for_bin(bin_name).get(provider_type) - or self.get_default_providers().get(provider_type) - or default_provider - ) - # print('getting provider for action', bin_name, provider_type, provider_func) - - provider_func = self.resolve_provider_func(provider_func_ref) - - assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.' - - return provider_func - - @validate_call - def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any: - provider_func: ProviderHandler = self.get_provider_for_action( - bin_name=bin_name, - provider_type=provider_type, - default_provider=default_provider, - overrides=overrides, - ) - if not func_takes_args_or_kwargs(provider_func): - # if it's a pure argless lambdas, dont pass bin_path and other **kwargs - provider_func_without_args = cast(Callable[[], Any], provider_func) - return provider_func_without_args() - - provider_func = cast(Callable[..., Any], provider_func) - return provider_func(bin_name, **kwargs) - - - - def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None: - print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...') - try: - return bin_abspath(bin_name) - except ValidationError: - return None - - def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None: - abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name) - if not abspath: return None - - print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...') - try: - return bin_version(abspath) - except ValidationError: - return None - - def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr: - print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}') - # ... subdependency calculation logic here - return TypeAdapter(InstallStr).validate_python(bin_name) - - @abstractmethod - def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_): - subdeps = subdeps or self.get_subdeps(bin_name) - print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})') - # ... 
install logic here - assert True - - - @validate_call - def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None: - abspath = self.call_provider_for_action( - bin_name=bin_name, - provider_type='abspath', - default_provider=self.on_get_abspath, - overrides=overrides, - ) - if not abspath: - return None - result = TypeAdapter(HostBinPath).validate_python(abspath) - self._abspath_cache[bin_name] = result - return result - - @validate_call - def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None: - version = self.call_provider_for_action( - bin_name=bin_name, - provider_type='version', - default_provider=self.on_get_version, - overrides=overrides, - abspath=abspath, - ) - if not version: - return None - result = SemVer(version) - self._version_cache[bin_name] = result - return result - - @validate_call - def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr: - subdeps = self.call_provider_for_action( - bin_name=bin_name, - provider_type='subdeps', - default_provider=self.on_get_subdeps, - overrides=overrides, - ) - if not subdeps: - subdeps = bin_name - result = TypeAdapter(InstallStr).validate_python(subdeps) - return result - - @validate_call - def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None: - subdeps = self.get_subdeps(bin_name, overrides=overrides) - - self.call_provider_for_action( - bin_name=bin_name, - provider_type='install', - default_provider=self.on_install, - overrides=overrides, - subdeps=subdeps, - ) - - installed_abspath = self.get_abspath(bin_name) - assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}' - - installed_version = self.get_version(bin_name, abspath=installed_abspath) - assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}' - - result = InstalledBin(abspath=installed_abspath, version=installed_version) - self._install_cache[bin_name] = result - return result - - @validate_call - def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None: - installed_abspath = None - installed_version = None - - if cache: - installed_bin = self._install_cache.get(bin_name) - if installed_bin: - return installed_bin - installed_abspath = self._abspath_cache.get(bin_name) - installed_version = self._version_cache.get(bin_name) - - - installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides) - if not installed_abspath: - return None - - installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides) - if not installed_version: - return None - - return InstalledBin(abspath=installed_abspath, version=installed_version) - - @validate_call - def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None: - installed = self.load(bin_name, overrides=overrides, cache=cache) - if not installed: - installed = self.install(bin_name, overrides=overrides) - return installed - - -class PipProvider(BinProvider): - name: BinProviderName = 'pip' - - def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_): - subdeps = subdeps or self.on_get_subdeps(bin_name) - print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} 
({subdeps})') - - proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE) - - if proc.returncode != 0: - print(proc.stdout.strip().decode()) - print(proc.stderr.strip().decode()) - raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {subdeps}: {subdeps}') - - -class AptProvider(BinProvider): - name: BinProviderName = 'apt' - - subdeps_provider: ProviderLookupDict = { - 'yt-dlp': lambda: 'yt-dlp ffmpeg', - } - - def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_): - subdeps = subdeps or self.on_get_subdeps(bin_name) - print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})') - - run(['apt-get', 'update', '-qq']) - proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE) - - if proc.returncode != 0: - print(proc.stdout.strip().decode()) - print(proc.stderr.strip().decode()) - raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}') - -class BrewProvider(BinProvider): - name: BinProviderName = 'brew' - - def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_): - subdeps = subdeps or self.on_get_subdeps(bin_name) - print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})') - - proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE) - - if proc.returncode != 0: - print(proc.stdout.strip().decode()) - print(proc.stderr.strip().decode()) - raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}') - - -class EnvProvider(BinProvider): - name: BinProviderName = 'env' - - abspath_provider: ProviderLookupDict = { - # 'python': lambda: Path('/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'), - } - version_provider: ProviderLookupDict = { - # 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]), - } - - def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_): - """The env provider is ready-only and does not install any packages, so this is a no-op""" - pass diff --git a/archivebox/plugantic/extractors.py b/archivebox/plugantic/extractors.py index 3befa5b5..56d594f3 100644 --- a/archivebox/plugantic/extractors.py +++ b/archivebox/plugantic/extractors.py @@ -31,7 +31,7 @@ def no_empty_args(args: List[str]) -> List[str]: assert all(len(arg) for arg in args) return args -ExtractorName = Literal['wget', 'warc', 'media'] +ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))] CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)] diff --git a/archivebox/plugantic/plugins.py b/archivebox/plugantic/plugins.py index c34c4703..d213fced 100644 --- a/archivebox/plugantic/plugins.py +++ b/archivebox/plugantic/plugins.py @@ -14,9 +14,6 @@ from pydantic import ( from .binaries import ( Binary, - PythonBinary, - SqliteBinary, - DjangoBinary, WgetBinary, YtdlpBinary, ) @@ -28,7 +25,6 @@ from .extractors import ( ) from .replayers import ( Replayer, - GENERIC_REPLAYER, MEDIA_REPLAYER, ) from .configs import ( @@ -80,12 +76,6 @@ class Plugin(BaseModel): }) -class CorePlugin(Plugin): - name: str = 'core' - configs: List[SerializeAsAny[ConfigSet]] = [] - binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()] - 
extractors: List[SerializeAsAny[Extractor]] = [] - replayers: List[SerializeAsAny[Replayer]] = [GENERIC_REPLAYER] class YtdlpPlugin(Plugin): name: str = 'ytdlp' @@ -101,11 +91,9 @@ class WgetPlugin(Plugin): extractors: List[SerializeAsAny[Extractor]] = [WgetExtractor(), WarcExtractor()] -CORE_PLUGIN = CorePlugin() YTDLP_PLUGIN = YtdlpPlugin() WGET_PLUGIN = WgetPlugin() PLUGINS = [ - CORE_PLUGIN, YTDLP_PLUGIN, WGET_PLUGIN, ] diff --git a/archivebox/plugantic/replayers.py b/archivebox/plugantic/replayers.py index 12ade623..08f1cd88 100644 --- a/archivebox/plugantic/replayers.py +++ b/archivebox/plugantic/replayers.py @@ -22,5 +22,4 @@ class Replayer(BaseModel): # thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon' -GENERIC_REPLAYER = Replayer(name='generic') MEDIA_REPLAYER = Replayer(name='media') diff --git a/archivebox/plugantic/views.py b/archivebox/plugantic/views.py index b29a8cf5..24f256de 100644 --- a/archivebox/plugantic/views.py +++ b/archivebox/plugantic/views.py @@ -1,5 +1,8 @@ __package__ = 'archivebox.plugantic' +import inspect +from typing import Any + from django.http import HttpRequest from django.utils.html import format_html, mark_safe @@ -10,6 +13,44 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view from plugantic.plugins import LOADED_PLUGINS from django.conf import settings +def obj_to_yaml(obj: Any, indent: int=0) -> str: + indent_str = " " * indent + + if isinstance(obj, dict): + if not obj: + return "{}" + result = "\n" + for key, value in obj.items(): + result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n" + return result + + elif isinstance(obj, list): + if not obj: + return "[]" + result = "\n" + for item in obj: + result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n" + return result.rstrip() + + elif isinstance(obj, str): + if "\n" in obj: + return f" |\n{indent_str} " + obj.replace("\n", f"\n{indent_str} ") + else: + return f" {obj}" + + elif isinstance(obj, (int, float, bool)): + return f" {str(obj)}" + + elif callable(obj): + source = '\n'.join( + '' if 'def ' in line else line + for line in inspect.getsource(obj).split('\n') + if line.strip() + ).split('lambda: ')[-1].rstrip(',') + return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ") + + else: + return f" {str(obj)}" @render_with_table_view def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: @@ -18,13 +59,13 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: rows = { "Binary": [], - "From Plugin": [], "Found Version": [], + "From Plugin": [], "Provided By": [], "Found Abspath": [], "Related Configuration": [], "Overrides": [], - "Description": [], + # "Description": [], } relevant_configs = { @@ -38,8 +79,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: binary = binary.load_or_install() rows['Binary'].append(ItemLink(binary.name, key=binary.name)) - rows['From Plugin'].append(plugin.name) rows['Found Version'].append(binary.loaded_version) + rows['From Plugin'].append(plugin.name) rows['Provided By'].append(binary.loaded_provider) rows['Found Abspath'].append(binary.loaded_abspath) rows['Related Configuration'].append(mark_safe(', '.join( @@ -48,8 +89,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: if binary.name.lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower() # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower() ))) - 
rows['Overrides'].append(str(binary.provider_overrides)) - rows['Description'].append(binary.description) + rows['Overrides'].append(obj_to_yaml(binary.provider_overrides)) + # rows['Description'].append(binary.description) return TableContext( title="Binaries", @@ -85,8 +126,8 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: 'binprovider': binary.loaded_provider, 'abspath': binary.loaded_abspath, 'version': binary.loaded_version, - 'overrides': str(binary.provider_overrides), - 'providers': str(binary.providers_supported), + 'overrides': obj_to_yaml(binary.provider_overrides), + 'providers': obj_to_yaml(binary.providers_supported), }, "help_texts": { # TODO diff --git a/archivebox/system.py b/archivebox/system.py index bced0bac..58571000 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -11,13 +11,12 @@ from typing import Optional, Union, Set, Tuple from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired from crontab import CronTab -from .vendor.atomicwrites import atomic_write as lib_atomic_write +from atomicwrites import atomic_write as lib_atomic_write from .util import enforce_types, ExtendedEncoder from .config import PYTHON_BINARY, OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES - def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs): """Patched of subprocess.run to kill forked child subprocesses and fix blocking io making timeout=innefective Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py diff --git a/archivebox/util.py b/archivebox/util.py index d9dd4dbf..c96c1d1a 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -16,7 +16,7 @@ from datetime import datetime, timezone from dateparser import parse as dateparser from requests.exceptions import RequestException, ReadTimeout -from .vendor.base32_crockford import encode as base32_encode # type: ignore +from base32_crockford import encode as base32_encode # type: ignore from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding from os.path import lexists from os import remove as remove_file @@ -273,8 +273,8 @@ def get_headers(url: str, timeout: int=None) -> str: { 'URL': url, 'Status-Code': response.status_code, - 'Elapsed': response.elapsed, - 'Encoding': response.encoding, + 'Elapsed': response.elapsed.total_seconds()*1000, + 'Encoding': str(response.encoding), 'Apparent-Encoding': response.apparent_encoding, **dict(response.headers), }, @@ -304,11 +304,7 @@ def chrome_args(**options) -> List[str]: cmd_args += CHROME_EXTRA_ARGS if options['CHROME_HEADLESS']: - chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1]) - if chrome_major_version >= 111: - cmd_args += ("--headless=new",) - else: - cmd_args += ('--headless',) + cmd_args += ("--headless=new",) # expects chrome version >= 111 if not options['CHROME_SANDBOX']: # assume this means we are running inside a docker container diff --git a/archivebox/vendor/__init__.py b/archivebox/vendor/__init__.py index e69de29b..e19c45af 100644 --- a/archivebox/vendor/__init__.py +++ b/archivebox/vendor/__init__.py @@ -0,0 +1,34 @@ +import sys +import inspect +import importlib +from pathlib import Path + +VENDOR_DIR = Path(__file__).parent + +VENDORED_LIBS = { + # sys.path dir: library name + 'python-atomicwrites': 'atomicwrites', + 'django-taggit': 'taggit', + 'pydantic-pkgr': 'pydantic_pkgr', + 'pocket': 'pocket', + 
'base32-crockford': 'base32_crockford', +} + +def load_vendored_libs(): + for lib_subdir, lib_name in VENDORED_LIBS.items(): + lib_dir = VENDOR_DIR / lib_subdir + assert lib_dir.is_dir(), f'Expected vendor library {lib_name} could not be found in {lib_dir}' + + try: + lib = importlib.import_module(lib_name) + # print(f"Successfully imported lib from environment {lib_name}: {inspect.getfile(lib)}") + except ImportError: + sys.path.append(str(lib_dir)) + try: + lib = importlib.import_module(lib_name) + # print(f"Successfully imported lib from vendored fallback {lib_name}: {inspect.getfile(lib)}") + except ImportError as e: + print(f"Failed to import lib from environment or vendored fallback {lib_name}: {e}", file=sys.stderr) + sys.exit(1) + + diff --git a/archivebox/vendor/atomicwrites.py b/archivebox/vendor/atomicwrites.py deleted file mode 120000 index 73abfe4c..00000000 --- a/archivebox/vendor/atomicwrites.py +++ /dev/null @@ -1 +0,0 @@ -python-atomicwrites/atomicwrites/__init__.py \ No newline at end of file diff --git a/archivebox/vendor/base32_crockford.py b/archivebox/vendor/base32_crockford.py deleted file mode 120000 index a5d9c64f..00000000 --- a/archivebox/vendor/base32_crockford.py +++ /dev/null @@ -1 +0,0 @@ -base32-crockford/base32_crockford.py \ No newline at end of file diff --git a/archivebox/vendor/package-lock.json b/archivebox/vendor/package-lock.json deleted file mode 120000 index 322001ae..00000000 --- a/archivebox/vendor/package-lock.json +++ /dev/null @@ -1 +0,0 @@ -../../package-lock.json \ No newline at end of file diff --git a/archivebox/vendor/package.json b/archivebox/vendor/package.json deleted file mode 120000 index 138a42cd..00000000 --- a/archivebox/vendor/package.json +++ /dev/null @@ -1 +0,0 @@ -../../package.json \ No newline at end of file diff --git a/archivebox/vendor/pocket.py b/archivebox/vendor/pocket.py deleted file mode 120000 index 37352d27..00000000 --- a/archivebox/vendor/pocket.py +++ /dev/null @@ -1 +0,0 @@ -pocket/pocket.py \ No newline at end of file diff --git a/archivebox/vendor/pydantic-pkgr b/archivebox/vendor/pydantic-pkgr new file mode 160000 index 00000000..2cd84453 --- /dev/null +++ b/archivebox/vendor/pydantic-pkgr @@ -0,0 +1 @@ +Subproject commit 2cd844533d888ce29b9bf32b8363510dd0d76166 diff --git a/archivebox/vendor/taggit_utils.py b/archivebox/vendor/taggit_utils.py deleted file mode 120000 index f36776db..00000000 --- a/archivebox/vendor/taggit_utils.py +++ /dev/null @@ -1 +0,0 @@ -django-taggit/taggit/utils.py \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 0645c468..7f5a2969 100644 --- a/package-lock.json +++ b/package-lock.json @@ -236,9 +236,9 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "22.5.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz", - "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==", + "version": "22.5.1", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz", + "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==", "license": "MIT", "optional": true, "dependencies": { @@ -353,9 +353,9 @@ } }, "node_modules/aws4": { - "version": "1.13.1", - "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz", - "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==", + "version": "1.13.2", + "resolved":
"https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz", + "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==", "license": "MIT" }, "node_modules/b4a": { @@ -2376,9 +2376,9 @@ } }, "node_modules/tslib": { - "version": "2.6.3", - "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz", - "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==", + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz", + "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==", "license": "0BSD" }, "node_modules/turndown": { diff --git a/pdm.lock b/pdm.lock index d31f5ca2..4e719c8e 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "ldap", "sonic"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:f2f7ca01f2e18a1ef07d59b7a8985d89785a4b8a2a4e66452f1f9e8e8ad529ad" +content_hash = "sha256:c6aa1f436032d18d079a4c2e9d9b95a5110579eb96a449751bfaf4d472eba401" [[metadata.targets]] requires_python = "==3.10.*" @@ -78,6 +78,29 @@ files = [ {file = "asttokens-2.4.1.tar.gz", hash = "sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0"}, ] +[[package]] +name = "atomicwrites" +version = "1.4.0" +requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +summary = "Atomic file writes." +groups = ["default"] +marker = "python_version == \"3.10\"" +files = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] + +[[package]] +name = "base32-crockford" +version = "0.3.0" +summary = "A Python implementation of Douglas Crockford's base32 encoding scheme" +groups = ["default"] +marker = "python_version == \"3.10\"" +files = [ + {file = "base32-crockford-0.3.0.tar.gz", hash = "sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969"}, + {file = "base32_crockford-0.3.0-py2.py3-none-any.whl", hash = "sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"}, +] + [[package]] name = "brotli" version = "1.1.0" @@ -407,6 +430,21 @@ files = [ {file = "django_stubs_ext-5.0.4.tar.gz", hash = "sha256:85da065224204774208be29c7d02b4482d5a69218a728465c2fbe41725fdc819"}, ] +[[package]] +name = "django-taggit" +version = "1.3.0" +requires_python = ">=3.5" +summary = "django-taggit is a reusable Django application for simple tagging." +groups = ["default"] +marker = "python_version == \"3.10\"" +dependencies = [ + "Django>=1.11", +] +files = [ + {file = "django-taggit-1.3.0.tar.gz", hash = "sha256:4a833bf71f4c2deddd9745924eee53be1c075d7f0020a06f12e29fa3d752732d"}, + {file = "django_taggit-1.3.0-py3-none-any.whl", hash = "sha256:609b0223d8a652f3fae088b7fd29f294fdadaca2d7931d45c27d6c59b02fdf31"}, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -479,7 +517,7 @@ files = [ [[package]] name = "httpx" -version = "0.27.0" +version = "0.27.2" requires_python = ">=3.8" summary = "The next generation HTTP client." 
groups = ["default"] @@ -492,20 +530,20 @@ dependencies = [ "sniffio", ] files = [ - {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"}, - {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"}, + {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"}, + {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"}, ] [[package]] name = "idna" -version = "3.7" -requires_python = ">=3.5" +version = "3.8" +requires_python = ">=3.6" summary = "Internationalized Domain Names in Applications (IDNA)" groups = ["default"] marker = "python_version == \"3.10\"" files = [ - {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, - {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, + {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"}, + {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"}, ] [[package]] @@ -613,6 +651,32 @@ files = [ {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, ] +[[package]] +name = "pocket" +version = "0.3.7" +git = "https://github.com/tapanpandita/pocket.git" +ref = "v0.3.7" +revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1" +summary = "api wrapper for getpocket.com" +groups = ["default"] +marker = "python_version == \"3.10\"" +dependencies = [ + "requests", +] + +[[package]] +name = "pocket" +version = "0.3.7" +git = "https://github.com/tapanpandita/pocket.git" +ref = "v0.3.7" +revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1" +summary = "api wrapper for getpocket.com" +groups = ["default"] +marker = "python_version == \"3.10\"" +dependencies = [ + "requests", +] + [[package]] name = "prompt-toolkit" version = "3.0.47" @@ -739,6 +803,23 @@ files = [ {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"}, ] +[[package]] +name = "pydantic-pkgr" +version = "0.1.4" +requires_python = ">=3.10" +summary = "System package manager APIs in strongly typed Python" +groups = ["default"] +marker = "python_version == \"3.10\"" +dependencies = [ + "pydantic-core>=2.18.2", + "pydantic>=2.7.1", + "typing-extensions>=4.11.0", +] +files = [ + {file = "pydantic_pkgr-0.1.4-py3-none-any.whl", hash = "sha256:bd9ddfa8eeb4d361257c4d3d8d36ba44a72515b497ee52cf0763240c66006417"}, + {file = "pydantic_pkgr-0.1.4.tar.gz", hash = "sha256:e0422022dd83341f1e869a54da9aca903a6407a983ece0735f69493841b0fbb8"}, +] + [[package]] name = "pygments" version = "2.18.0" @@ -841,14 +922,14 @@ files = [ [[package]] name = "setuptools" -version = "73.0.1" +version = "74.0.0" requires_python = ">=3.8" summary = "Easily download, build, install, upgrade, and uninstall Python packages" groups = ["default"] marker = "python_version == \"3.10\"" files = [ - {file = "setuptools-73.0.1-py3-none-any.whl", hash = "sha256:b208925fcb9f7af924ed2dc04708ea89791e24bde0d3020b27df0e116088b34e"}, - {file = "setuptools-73.0.1.tar.gz", hash = "sha256:d59a3e788ab7e012ab2c4baed1b376da6366883ee20d7a5fc426816e3d7b1193"}, + {file = "setuptools-74.0.0-py3-none-any.whl", hash = 
"sha256:0274581a0037b638b9fc1c6883cc71c0210865aaa76073f7882376b641b84e8f"}, + {file = "setuptools-74.0.0.tar.gz", hash = "sha256:a85e96b8be2b906f3e3e789adec6a9323abf79758ecfa3065bd740d81158b11e"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index ebeccd59..6d3f8521 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,12 +29,9 @@ dependencies = [ "croniter>=2.0.5", # for: archivebox schedule "ipython>=8.23.0", # for: archivebox shell # Extractor Dependencies - "yt-dlp>=2024.4.9", # for: media + "yt-dlp>=2024.8.6", # for: media # "playwright>=1.43.0; platform_machine != 'armv7l'", # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages - # TODO: add more extractors - # - gallery-dl - # - scihubdl - # - See Github issues for more... + "django-signal-webhooks>=0.3.0", "django-admin-data-views>=0.3.1", "ulid-py>=1.1.0", @@ -43,6 +40,14 @@ dependencies = [ "django-pydantic-field>=0.3.9", "django-jsonform>=2.22.0", "django-stubs>=5.0.2", + + # these can be safely omitted when installation subsystem does not provide these as packages (e.g. apt/debian) + # archivebox will automatically load fallback vendored copies bundled via archivebox/vendor/__init__.py + "pydantic-pkgr>=0.1.4", + "atomicwrites==1.4.0", + "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7", + "django-taggit==1.3.0", + "base32-crockford==0.3.0", ] homepage = "https://github.com/ArchiveBox/ArchiveBox" @@ -139,7 +144,7 @@ exclude = [ "**/migrations", "archivebox/vendor", ] -stubPath = "./typings" +stubPath = "./archivebox/typings" venvPath = "." venv = ".venv" # ignore = ["src/oldstuff"] @@ -169,6 +174,9 @@ debug = [ "djdt_flamegraph", "ipdb", "requests-tracker>=0.3.3", + "logfire[django]>=0.51.0", + "opentelemetry-instrumentation-django>=0.47b0", + "opentelemetry-instrumentation-sqlite3>=0.47b0", ] test = [ "pytest", @@ -177,8 +185,6 @@ test = [ lint = [ "flake8", "mypy", -] -dev = [ "django-autotyping>=0.5.1", ] diff --git a/requirements.txt b/requirements.txt index c464bf68..0ee4c4f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,8 @@ annotated-types==0.7.0; python_version == "3.10" anyio==4.4.0; python_version == "3.10" asgiref==3.8.1; python_version == "3.10" asttokens==2.4.1; python_version == "3.10" +atomicwrites==1.4.0; python_version == "3.10" +base32-crockford==0.3.0; python_version == "3.10" brotli==1.1.0; implementation_name == "cpython" and python_version == "3.10" brotlicffi==1.1.0.0; implementation_name != "cpython" and python_version == "3.10" certifi==2024.7.4; python_version == "3.10" @@ -26,13 +28,14 @@ django-settings-holder==0.1.2; python_version == "3.10" django-signal-webhooks==0.3.0; python_version == "3.10" django-stubs==5.0.4; python_version == "3.10" django-stubs-ext==5.0.4; python_version == "3.10" +django-taggit==1.3.0; python_version == "3.10" exceptiongroup==1.2.2; python_version == "3.10" executing==2.0.1; python_version == "3.10" feedparser==6.0.11; python_version == "3.10" h11==0.14.0; python_version == "3.10" httpcore==1.0.5; python_version == "3.10" -httpx==0.27.0; python_version == "3.10" -idna==3.7; python_version == "3.10" +httpx==0.27.2; python_version == "3.10" +idna==3.8; python_version == "3.10" ipython==8.26.0; python_version == "3.10" jedi==0.19.1; python_version == "3.10" matplotlib-inline==0.1.7; python_version == "3.10" @@ -40,6 +43,7 @@ mutagen==1.47.0; python_version == "3.10" mypy-extensions==1.0.0; python_version == "3.10" parso==0.8.4; python_version == "3.10" 
diff --git a/requirements.txt b/requirements.txt
index c464bf68..0ee4c4f8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,8 @@ annotated-types==0.7.0; python_version == "3.10"
 anyio==4.4.0; python_version == "3.10"
 asgiref==3.8.1; python_version == "3.10"
 asttokens==2.4.1; python_version == "3.10"
+atomicwrites==1.4.0; python_version == "3.10"
+base32-crockford==0.3.0; python_version == "3.10"
 brotli==1.1.0; implementation_name == "cpython" and python_version == "3.10"
 brotlicffi==1.1.0.0; implementation_name != "cpython" and python_version == "3.10"
 certifi==2024.7.4; python_version == "3.10"
@@ -26,13 +28,14 @@ django-settings-holder==0.1.2; python_version == "3.10"
 django-signal-webhooks==0.3.0; python_version == "3.10"
 django-stubs==5.0.4; python_version == "3.10"
 django-stubs-ext==5.0.4; python_version == "3.10"
+django-taggit==1.3.0; python_version == "3.10"
 exceptiongroup==1.2.2; python_version == "3.10"
 executing==2.0.1; python_version == "3.10"
 feedparser==6.0.11; python_version == "3.10"
 h11==0.14.0; python_version == "3.10"
 httpcore==1.0.5; python_version == "3.10"
-httpx==0.27.0; python_version == "3.10"
-idna==3.7; python_version == "3.10"
+httpx==0.27.2; python_version == "3.10"
+idna==3.8; python_version == "3.10"
 ipython==8.26.0; python_version == "3.10"
 jedi==0.19.1; python_version == "3.10"
 matplotlib-inline==0.1.7; python_version == "3.10"
@@ -40,6 +43,7 @@ mutagen==1.47.0; python_version == "3.10"
 mypy-extensions==1.0.0; python_version == "3.10"
 parso==0.8.4; python_version == "3.10"
 pexpect==4.9.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
+pocket @ git+https://github.com/tapanpandita/pocket.git@5a144438cc89bfc0ec94db960718ccf1f76468c1 ; python_version == "3.10"
 prompt-toolkit==3.0.47; python_version == "3.10"
 ptyprocess==0.7.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
 pure-eval==0.2.3; python_version == "3.10"
@@ -49,6 +53,7 @@ pycparser==2.22; platform_python_implementation != "PyPy" and python_version == "3.10"
 pycryptodomex==3.20.0; python_version == "3.10"
 pydantic==2.8.2; python_version == "3.10"
 pydantic-core==2.20.1; python_version == "3.10"
+pydantic-pkgr==0.1.4; python_version == "3.10"
 pygments==2.18.0; python_version == "3.10"
 python-crontab==3.2.0; python_version == "3.10"
 python-dateutil==2.9.0.post0; python_version == "3.10"
@@ -56,7 +61,7 @@ python-ldap==3.4.4; python_version == "3.10"
 pytz==2024.1; python_version == "3.10"
 regex==2024.7.24; python_version == "3.10"
 requests==2.32.3; python_version == "3.10"
-setuptools==73.0.1; python_version == "3.10"
+setuptools==74.0.0; python_version == "3.10"
 sgmllib3k==1.0.0; python_version == "3.10"
 six==1.16.0; python_version == "3.10"
 sniffio==1.3.1; python_version == "3.10"