v0.8.+: Massive speed improvements for Admin UI & common queries, v3 plugins progress, and bugfixes (#1498)

Commit 43e87ef437 authored by Nick Sweeting on 2024-08-30 23:44:19 -07:00, committed via GitHub
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
66 changed files with 1150 additions and 1100 deletions

.gitmodules (vendored): 3 changes
View file

@@ -26,3 +26,6 @@
 [submodule "archivebox/vendor/python-atomicwrites"]
 	path = archivebox/vendor/python-atomicwrites
 	url = https://github.com/untitaker/python-atomicwrites
+[submodule "archivebox/vendor/pydantic-pkgr"]
+	path = archivebox/vendor/pydantic-pkgr
+	url = https://github.com/ArchiveBox/pydantic-pkgr

View file

@@ -61,6 +61,11 @@ def get_or_create_system_user_pk(username='system'):
     return user.pk

+class AutoDateTimeField(models.DateTimeField):
+    def pre_save(self, model_instance, add):
+        return timezone.now()
+
 class ABIDModel(models.Model):
     """
     Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface.
@@ -76,13 +81,16 @@ class ABIDModel(models.Model):
     abid = ABIDField(prefix=abid_prefix)

     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
-    created = models.DateTimeField(auto_now_add=True)
+    created = AutoDateTimeField(default=timezone.now, db_index=True)
     modified = models.DateTimeField(auto_now=True)

     class Meta(TypedModelMeta):
         abstract = True

     def save(self, *args: Any, **kwargs: Any) -> None:
+        if self._state.adding or not self.created:
+            self.created = timezone.now()
+
         # when first creating a row, self.ABID is the source of truth
         # overwrite default prefilled self.id & self.abid with generated self.ABID value
         if self._state.adding or not self.id:
@@ -93,6 +101,7 @@ class ABIDModel(models.Model):
         super().save(*args, **kwargs)
         assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}'
         assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}'
+        assert str(self.uuid) == str(self.ABID.uuid), f'self.uuid ({self.uuid}) does not match .ABID.uuid ({self.ABID.uuid})'

     @property
     def abid_values(self) -> Dict[str, Any]:
@@ -186,6 +195,14 @@ class ABIDModel(models.Model):
         Get a uuid.UUID (v4) representation of the object's ABID.
         """
         return self.ABID.uuid

+    @property
+    def uuid(self) -> str:
+        """
+        Get a str uuid.UUID (v4) representation of the object's ABID.
+        """
+        assert str(self.id) == str(self.ABID.uuid)
+        return str(self.id)
+
     @property
     def TypeID(self) -> TypeID:
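A note on the `created` field change above: `auto_now_add=True` silently discards any value assigned before the first save, while a plain `default=timezone.now` stays overridable. A minimal sketch of the difference (the `ImportedRecord` model is hypothetical, for illustration only):

import datetime
from django.db import models
from django.utils import timezone

class ImportedRecord(models.Model):   # hypothetical example model
    # created = models.DateTimeField(auto_now_add=True)   # would clobber any passed-in value on INSERT
    created = models.DateTimeField(default=timezone.now, db_index=True)   # overridable, and indexable

# record = ImportedRecord(created=datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc))
# record.save()   # keeps the 2020 timestamp instead of replacing it with now()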

View file

View file

@@ -0,0 +1,3 @@
from django.contrib import admin

# Register your models here.

View file

@@ -0,0 +1,83 @@
import sys
import inspect
from typing import List, Dict, Any, Optional
from pathlib import Path

import django
from django.apps import AppConfig
from django.core.checks import Tags, Warning, register
from django.db.backends.sqlite3.base import Database as sqlite3

from pydantic import (
    Field,
    SerializeAsAny,
)
from pydantic_pkgr import SemVer, BinProvider, BinProviderName, ProviderLookupDict, BinName, Binary, EnvProvider, NpmProvider

from plugantic.extractors import Extractor, ExtractorName
from plugantic.plugins import Plugin
from plugantic.configs import ConfigSet, ConfigSectionName
from plugantic.replayers import Replayer


class PythonBinary(Binary):
    name: BinName = 'python'

    providers_supported: List[BinProvider] = [EnvProvider()]
    provider_overrides: Dict[str, Any] = {
        'env': {
            'subdeps': lambda: 'python3 python3-minimal python3-pip python3-virtualenv',
            'abspath': lambda: sys.executable,
            'version': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
        },
    }

class SqliteBinary(Binary):
    name: BinName = 'sqlite'
    providers_supported: List[BinProvider] = [EnvProvider()]
    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
        'env': {
            'abspath': lambda: Path(inspect.getfile(sqlite3)),
            'version': lambda: SemVer(sqlite3.version),
        },
    }

class DjangoBinary(Binary):
    name: BinName = 'django'
    providers_supported: List[BinProvider] = [EnvProvider()]
    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
        'env': {
            'abspath': lambda: inspect.getfile(django),
            'version': lambda: django.VERSION[:3],
        },
    }


class BasicReplayer(Replayer):
    name: str = 'basic'


class BasePlugin(Plugin):
    name: str = 'base'
    configs: List[SerializeAsAny[ConfigSet]] = []
    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
    extractors: List[SerializeAsAny[Extractor]] = []
    replayers: List[SerializeAsAny[Replayer]] = [BasicReplayer()]


PLUGINS = [BasePlugin()]


class BaseConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'builtin_plugins.base'
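The `provider_overrides` entries above are zero-argument lambdas rather than precomputed values, so potentially expensive lookups (shelling out, filesystem probes) only run when a given field is actually requested. A minimal sketch of that pattern in isolation (the `lookup` helper is hypothetical, not the pydantic-pkgr API):

import sys

overrides = {
    'env': {
        'abspath': lambda: sys.executable,
        'version': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
    },
}

def lookup(provider: str, key: str) -> str:   # hypothetical consumer for illustration
    getter = overrides[provider][key]   # nothing is evaluated yet
    return str(getter())                # evaluated lazily, only on demand

print(lookup('env', 'version'))   # e.g. '3.11.9'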

View file

@@ -0,0 +1,3 @@
from django.db import models

# Create your models here.

View file

@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.

View file

@@ -0,0 +1,3 @@
from django.shortcuts import render

# Create your views here.

View file

@@ -0,0 +1,113 @@
from typing import List, Optional, Dict
from pathlib import Path

from django.apps import AppConfig
from django.core.checks import Tags, Warning, register

from pydantic import (
    Field,
    SerializeAsAny,
)
from pydantic_pkgr import BinProvider, BinName, Binary, EnvProvider, NpmProvider
from pydantic_pkgr.binprovider import bin_abspath
from pydantic_pkgr.binary import BinProviderName, ProviderLookupDict

from plugantic.extractors import Extractor, ExtractorName
from plugantic.plugins import Plugin
from plugantic.configs import ConfigSet, ConfigSectionName

from pkg.settings import env


###################### Config ##########################

class SinglefileToggleConfig(ConfigSet):
    section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES'

    SAVE_SINGLEFILE: bool = True


class SinglefileDependencyConfig(ConfigSet):
    section: ConfigSectionName = 'DEPENDENCY_CONFIG'

    SINGLEFILE_BINARY: str = Field(default='wget')
    SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
    SINGLEFILE_EXTRA_ARGS: List[str] = []
    SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']


class SinglefileOptionsConfig(ConfigSet):
    section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS'

    # loaded from shared config
    SINGLEFILE_USER_AGENT: str = Field(default='', alias='USER_AGENT')
    SINGLEFILE_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
    SINGLEFILE_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')


DEFAULT_CONFIG = {
    'CHECK_SSL_VALIDITY': False,
    'SAVE_SINGLEFILE': True,
    'TIMEOUT': 120,
}

PLUGIN_CONFIG = [
    SinglefileToggleConfig(**DEFAULT_CONFIG),
    SinglefileDependencyConfig(**DEFAULT_CONFIG),
    SinglefileOptionsConfig(**DEFAULT_CONFIG),
]

###################### Binaries ############################

min_version: str = "1.1.54"
max_version: str = "2.0.0"

class SinglefileBinary(Binary):
    name: BinName = 'single-file'
    providers_supported: List[BinProvider] = [NpmProvider()]

    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
        'env': {
            'abspath': lambda: bin_abspath('single-file-node.js', PATH=env.PATH) or bin_abspath('single-file', PATH=env.PATH),
        },
        'npm': {
            # 'abspath': lambda: bin_abspath('single-file', PATH=NpmProvider().PATH) or bin_abspath('single-file', PATH=env.PATH),
            'subdeps': lambda: f'single-file-cli@>={min_version} <{max_version}',
        },
    }

###################### Extractors ##########################

class SinglefileExtractor(Extractor):
    name: ExtractorName = 'singlefile'
    binary: Binary = SinglefileBinary()

    def get_output_path(self, snapshot) -> Path:
        return Path(snapshot.link_dir) / 'singlefile.html'

###################### Plugins #############################

class SinglefilePlugin(Plugin):
    name: str = 'singlefile'

    configs: List[SerializeAsAny[ConfigSet]] = [*PLUGIN_CONFIG]
    binaries: List[SerializeAsAny[Binary]] = [SinglefileBinary()]
    extractors: List[SerializeAsAny[Extractor]] = [SinglefileExtractor()]

PLUGINS = [SinglefilePlugin()]

###################### Django Apps #########################

class SinglefileConfig(AppConfig):
    name = 'builtin_plugins.singlefile'
    verbose_name = 'SingleFile'

    def ready(self):
        pass
        # print('Loaded singlefile plugin')
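The `alias=` fields above let plugin-scoped options inherit their values from shared global config keys. A minimal sketch of that aliasing in plain pydantic (the class name is a hypothetical stand-in for ConfigSet):

from pydantic import BaseModel, Field

class OptionsExample(BaseModel):   # hypothetical stand-in for a ConfigSet
    SINGLEFILE_TIMEOUT: int = Field(default=60, alias='TIMEOUT')

print(OptionsExample(**{'TIMEOUT': 120}).SINGLEFILE_TIMEOUT)   # -> 120, populated from the shared TIMEOUT key
print(OptionsExample().SINGLEFILE_TIMEOUT)                     # -> 60, falls back to the plugin default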

View file

@@ -0,0 +1,66 @@
name: singlefile
plugin_version: '0.0.1'
plugin_spec: '0.0.1'

binaries:
    singlefile:
        providers:
            - env
            - npm
        commands:
            - singlefile.exec
            - singlefile.extract
            - singlefile.should_extract
            - singlefile.get_output_path

extractors:
    singlefile:
        binary: singlefile
        test: singlefile.should_extract
        extract: singlefile.extract
        output_files:
            - singlefile.html

configs:
    ARCHIVE_METHOD_TOGGLES:
        SAVE_SINGLEFILE:
            type: bool
            default: true

    DEPENDENCY_CONFIG:
        SINGLEFILE_BINARY:
            type: str
            default: wget
        SINGLEFILE_ARGS:
            type: Optional[List[str]]
            default: null
        SINGLEFILE_EXTRA_ARGS:
            type: List[str]
            default: []
        SINGLEFILE_DEFAULT_ARGS:
            type: List[str]
            default:
                - "--timeout={TIMEOUT-10}"

    ARCHIVE_METHOD_OPTIONS:
        SINGLEFILE_USER_AGENT:
            type: str
            default: ""
            alias: USER_AGENT
        SINGLEFILE_TIMEOUT:
            type: int
            default: 60
            alias: TIMEOUT
        SINGLEFILE_CHECK_SSL_VALIDITY:
            type: bool
            default: true
            alias: CHECK_SSL_VALIDITY
        SINGLEFILE_RESTRICT_FILE_NAMES:
            type: str
            default: windows
            alias: RESTRICT_FILE_NAMES
        SINGLEFILE_COOKIES_FILE:
            type: Optional[Path]
            default: null
            alias: COOKIES_FILE

View file

@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.

View file

@@ -31,8 +31,6 @@ import getpass
 import platform
 import shutil
 import requests
-import django
-from sqlite3 import dbapi2 as sqlite3

 from hashlib import md5
 from pathlib import Path
@@ -43,6 +41,11 @@ from configparser import ConfigParser
 from collections import defaultdict
 import importlib.metadata

+from pydantic_pkgr import SemVer
+
+import django
+from django.db.backends.sqlite3.base import Database as sqlite3
+
 from .config_stubs import (
     AttrDict,
     SimpleConfigValueDict,
@@ -52,6 +55,11 @@ from .config_stubs import (
     ConfigDefaultDict,
 )

+# load fallback libraries from vendor dir
+from .vendor import load_vendored_libs
+load_vendored_libs()
+
 ############################### Config Schema ##################################
@@ -89,13 +97,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SECRET_KEY': {'type': str, 'default': None},
     'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
     'ALLOWED_HOSTS': {'type': str, 'default': '*'},  # e.g. archivebox.example.com,archivebox2.example.com
-    'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': ''},  # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
+    'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c['BIND_ADDR'])},  # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
     'DEBUG': {'type': bool, 'default': False},
     'PUBLIC_INDEX': {'type': bool, 'default': True},
     'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
     'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
     'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
-    'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
+    'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 100},
     'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
     'TIME_ZONE': {'type': str, 'default': 'UTC'},
     'TIMEZONE': {'type': str, 'default': 'UTC'},
@@ -565,7 +573,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},

     'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
-    'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
+    'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},

     'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
     'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
@@ -902,16 +910,9 @@ def bin_version(binary: Optional[str], cmd: Optional[str]=None) -> Optional[str]
         version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT).stdout.strip().decode()
         # take first 3 columns of first line of version info
-        version_ptn = re.compile(r"\d+?\.\d+?\.?\d*", re.MULTILINE)
-        try:
-            version_nums = version_ptn.findall(version_str.split('\n')[0])[0]
-            if version_nums:
-                return version_nums
-            else:
-                raise IndexError
-        except IndexError:
-            # take first 3 columns of first line of version info
-            return ' '.join(version_str.split('\n')[0].strip().split()[:3])
+        semver = SemVer.parse(version_str)
+        if semver:
+            return str(semver)
     except OSError:
         pass
     # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
@@ -1524,5 +1525,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
             assert sql_index_path.exists(), (
                 f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')

+            # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
+            if settings.DEBUG_LOGFIRE:
+                from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
+                SQLite3Instrumentor().instrument()
+
+                import logfire
+                logfire.configure()
+                logfire.instrument_django(is_sql_commentor_enabled=True)
+                logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+
     except KeyboardInterrupt:
         raise SystemExit(2)
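The bin_version() rewrite above replaces the hand-rolled regex with SemVer.parse() from pydantic-pkgr, which pulls a semantic version out of arbitrary `--version` output and appears to return a falsy value when nothing parseable is found (hence the `if semver:` guard). A quick sketch (the sample version strings are illustrative):

from pydantic_pkgr import SemVer

for version_str in ('GNU Wget 1.21.4 built on linux-gnu.', 'git version 2.44.0', 'no version here'):
    semver = SemVer.parse(version_str)
    print(str(semver) if semver else None)   # -> '1.21.4', '2.44.0', None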

View file

@@ -10,12 +10,15 @@ from datetime import datetime, timezone
 from typing import Dict, Any

 from django.contrib import admin
-from django.db.models import Count, Q
-from django.urls import path, reverse
+from django.db.models import Count, Q, Prefetch
+from django.urls import path, reverse, resolve
+from django.utils import timezone
+from django.utils.functional import cached_property
 from django.utils.html import format_html
 from django.utils.safestring import mark_safe
 from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
+from django.core.paginator import Paginator
 from django.core.exceptions import ValidationError
 from django.conf import settings
 from django import forms
@@ -126,22 +129,99 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad
 archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)

+class AccelleratedPaginator(Paginator):
+    """
+    Accellerated Pagniator ignores DISTINCT when counting total number of rows.
+    Speeds up SELECT Count(*) on Admin views by >20x.
+    https://hakibenita.com/optimizing-the-django-admin-paginator
+    """
+
+    @cached_property
+    def count(self):
+        if self.object_list._has_filters():                             # type: ignore
+            # fallback to normal count method on filtered queryset
+            return super().count
+        else:
+            # otherwise count total rows in a separate fast query
+            return self.object_list.model.objects.count()
+
+        # Alternative approach for PostgreSQL: fallback count takes > 200ms
+        # from django.db import connection, transaction, OperationalError
+        # with transaction.atomic(), connection.cursor() as cursor:
+        #     cursor.execute('SET LOCAL statement_timeout TO 200;')
+        #     try:
+        #         return super().count
+        #     except OperationalError:
+        #         return 9999999999999
+
+
 class ArchiveResultInline(admin.TabularInline):
     name = 'Archive Results Log'
     model = ArchiveResult
+    parent_model = Snapshot
     # fk_name = 'snapshot'
-    extra = 1
-    readonly_fields = ('result_id', 'start_ts', 'end_ts', 'extractor', 'command', 'cmd_version')
-    fields = ('id', *readonly_fields, 'status', 'output')
+    extra = 0
+    sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
+    readonly_fields = ('result_id', 'completed', 'extractor', 'command', 'version')
+    fields = ('id', 'start_ts', 'end_ts', *readonly_fields, 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output')
+    # exclude = ('id',)
+    ordering = ('end_ts',)
     show_change_link = True
     # # classes = ['collapse']
     # # list_display_links = ['abid']

+    def get_parent_object_from_request(self, request):
+        resolved = resolve(request.path_info)
+        return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
+
+    @admin.display(
+        description='Completed',
+        ordering='end_ts',
+    )
+    def completed(self, obj):
+        return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
+
     def result_id(self, obj):
-        return format_html('<a href="{}"><small><code>[{}]</code></small></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
+        return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)

     def command(self, obj):
         return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))

+    def version(self, obj):
+        return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
+
+    def get_formset(self, request, obj=None, **kwargs):
+        formset = super().get_formset(request, obj, **kwargs)
+        snapshot = self.get_parent_object_from_request(request)
+        # import ipdb; ipdb.set_trace()
+        formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
+
+        # default values for new entries
+        formset.form.base_fields['status'].initial = 'succeeded'
+        formset.form.base_fields['start_ts'].initial = timezone.now()
+        formset.form.base_fields['end_ts'].initial = timezone.now()
+        formset.form.base_fields['cmd_version'].initial = '-'
+        formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
+        formset.form.base_fields['created_by'].initial = request.user
+        formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
+        formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
+
+        if obj is not None:
+            # hidden values for existing entries and new entries
+            formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
+            formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
+            formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
+            formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
+            formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
+            formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
+        return formset
+
+    def get_readonly_fields(self, request, obj=None):
+        if obj is not None:
+            return self.readonly_fields
+        else:
+            return []
+

 class TagInline(admin.TabularInline):
@@ -222,25 +302,22 @@ def get_abid_info(self, obj):

 @admin.register(Snapshot, site=archivebox_admin)
 class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
-    class Meta:
-        model = Snapshot
-
     list_display = ('added', 'title_str', 'files', 'size', 'url_str')
-    # list_editable = ('title',)
     sort_fields = ('title_str', 'url_str', 'added', 'files')
-    readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
+    readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
     search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name')
-    list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags')
+    list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name')
     fields = ('url', 'created_by', 'title', *readonly_fields)
     ordering = ['-added']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
-    autocomplete_fields = ['tags']
     inlines = [TagInline, ArchiveResultInline]
-    list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
+    list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000)

     action_form = SnapshotActionForm
+    paginator = AccelleratedPaginator

     save_on_top = True
+    show_full_result_count = False

     def changelist_view(self, request, extra_context=None):
         extra_context = extra_context or {}
@@ -286,12 +363,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         ]
         return custom_urls + urls

-    def get_queryset(self, request):
-        self.request = request
-        return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult'))
+    # def get_queryset(self, request):
+    #     # tags_qs = SnapshotTag.objects.all().select_related('tag')
+    #     # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
+    #     self.request = request
+    #     return super().get_queryset(request).prefetch_related('archiveresult_set').distinct()  # .annotate(archiveresult_count=Count('archiveresult'))

     def tag_list(self, obj):
-        return ', '.join(obj.tags.values_list('name', flat=True))
+        return ', '.join(tag.name for tag in obj.tags.all())

     # TODO: figure out a different way to do this, you cant nest forms so this doenst work
     # def action(self, obj):
@@ -360,21 +440,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         ordering='title',
     )
     def title_str(self, obj):
-        canon = obj.as_link().canonical_outputs()
         tags = ''.join(
-            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
+            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.pk, tag.name)
             for tag in obj.tags.all()
-            if str(tag).strip()
+            if str(tag.name).strip()
         )
         return format_html(
             '<a href="/{}">'
-                '<img src="/{}/{}" class="favicon" onerror="this.remove()">'
+                '<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
             '</a>'
             '<a href="/{}/index.html">'
                 '<b class="status-{}">{}</b>'
             '</a>',
             obj.archive_path,
-            obj.archive_path, canon['favicon_path'],
+            obj.archive_path,
             obj.archive_path,
             'fetched' if obj.latest_title or obj.title else 'pending',
             urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
@@ -382,14 +461,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):

     @admin.display(
         description='Files Saved',
-        ordering='archiveresult_count',
+        # ordering='archiveresult_count',
     )
     def files(self, obj):
         return snapshot_icons(obj)

     @admin.display(
-        ordering='archiveresult_count'
+        # ordering='archiveresult_count'
     )
     def size(self, obj):
         archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
@@ -536,6 +615,8 @@ class TagAdmin(ABIDModelAdmin):
     actions = ['delete_selected']
     ordering = ['-created']

+    paginator = AccelleratedPaginator
+
     def API(self, obj):
         try:
             return get_abid_info(self, obj)
@@ -574,6 +655,8 @@ class ArchiveResultAdmin(ABIDModelAdmin):
     list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
     ordering = ['-start_ts']
     list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
+
+    paginator = AccelleratedPaginator

     @admin.display(
         description='Snapshot Info'
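The AccelleratedPaginator above targets the admin changelist's SELECT COUNT(*): counting a DISTINCT or JOINed queryset forces the database to materialize the combined rows, while an unfiltered Model.objects.count() is a cheap single-table count that returns the same number whenever no filters are applied. A rough sketch of the cost difference (queryset shapes are illustrative; the Snapshot import path is assumed from this codebase):

from core.models import Snapshot   # assumed import path

slow = Snapshot.objects.prefetch_related('tags').distinct().count()   # COUNT over a DISTINCT subquery
fast = Snapshot.objects.count()                                       # plain COUNT(*) on one table
assert slow == fast   # same answer whenever no filters are applied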

View file

@@ -4,7 +4,7 @@ from django import forms

 from ..util import URL_REGEX
 from ..parsers import PARSERS
-from ..vendor.taggit_utils import edit_string_for_tags, parse_tags
+from taggit.utils import edit_string_for_tags, parse_tags

 PARSER_CHOICES = [
     (parser_key, parser[0])

View file

@@ -52,7 +52,7 @@ def update_snapshot_ids(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
     num_total = Snapshot.objects.all().count()
     print(f'   Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
-    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
+    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
         assert snapshot.abid
         snapshot.abid_prefix = 'snp_'
         snapshot.abid_ts_src = 'self.added'
@@ -72,7 +72,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
         assert result.abid
         result.abid_prefix = 'res_'
         result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
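These migrations (and the ones below) swap bare .iterator() for .iterator(chunk_size=500): the iterator streams rows without populating the queryset result cache, and the chunk size caps how many rows each database round-trip fetches, keeping memory flat on multi-million-row collections. A minimal sketch of the pattern (process() is a hypothetical placeholder; inside a migration, Snapshot comes from apps.get_model as shown above):

for idx, snapshot in enumerate(Snapshot.objects.only('abid').iterator(chunk_size=500)):
    process(snapshot)   # one row at a time; at most ~500 rows buffered per fetch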

View file

@@ -11,7 +11,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
         assert result.abid
         result.uuid = ABID.parse(result.abid).uuid
         result.save(update_fields=["uuid"])

View file

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f'   Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
         assert snapshottag.snapshot_old_id
         snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
         snapshottag.snapshot_id = snapshot.id

View file

@@ -49,7 +49,7 @@ def update_archiveresult_ids(apps, schema_editor):
     Tag = apps.get_model("core", "Tag")
     num_total = Tag.objects.all().count()
     print(f'   Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
-    for idx, tag in enumerate(Tag.objects.all().iterator()):
+    for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
         if not tag.slug:
             tag.slug = tag.name.lower().replace(' ', '_')
         if not tag.name:

View file

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f'   Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
         assert snapshottag.old_tag_id
         tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
         snapshottag.tag_id = tag.id

View file

@@ -0,0 +1,35 @@
# Generated by Django 5.1 on 2024-08-28 09:40

import abid_utils.models
import django.utils.timezone
from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0068_alter_archiveresult_options'),
    ]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='created',
            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='added',
            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='created',
            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
        migrations.AlterField(
            model_name='tag',
            name='created',
            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
    ]

View file

@@ -12,6 +12,7 @@ from uuid import uuid4
 from pathlib import Path

 from django.db import models
+from django.utils import timezone
 from django.utils.functional import cached_property
 from django.utils.text import slugify
 from django.core.cache import cache
@@ -19,7 +20,7 @@ from django.urls import reverse, reverse_lazy
 from django.db.models import Case, When, Value, IntegerField
 from django.conf import settings

-from abid_utils.models import ABIDModel, ABIDField
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField

 from ..system import get_dir_size
 from ..util import parse_date, base_url
@@ -50,7 +51,7 @@ class Tag(ABIDModel):
     Based on django-taggit model + ABID base.
     """
     abid_prefix = 'tag_'
-    abid_ts_src = 'self.created'    # TODO: add created/modified time
+    abid_ts_src = 'self.created'
     abid_uri_src = 'self.slug'
     abid_subtype_src = '"03"'
     abid_rand_src = 'self.old_id'
@@ -60,7 +61,6 @@ class Tag(ABIDModel):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
     abid = ABIDField(prefix=abid_prefix)

     name = models.CharField(unique=True, blank=False, max_length=100)
-
     slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
     # slug is autoset on save from name, never set it manually
@@ -125,6 +125,12 @@ class SnapshotTag(models.Model):
         db_table = 'core_snapshot_tags'
         unique_together = [('snapshot', 'tag')]

+class SnapshotManager(models.Manager):
+    def get_queryset(self):
+        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')   # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
+
+
 class Snapshot(ABIDModel):
     abid_prefix = 'snp_'
     abid_ts_src = 'self.added'
@@ -143,16 +149,15 @@ class Snapshot(ABIDModel):
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))

-    added = models.DateTimeField(auto_now_add=True, db_index=True)
+    added = AutoDateTimeField(default=timezone.now, db_index=True)
     updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)

     keys = ('url', 'timestamp', 'title', 'tags', 'updated')

     archiveresult_set: models.Manager['ArchiveResult']

-    @property
-    def uuid(self):
-        return self.id
+    objects = SnapshotManager()

     def __repr__(self) -> str:
         title = (self.title_stripped or '-')[:64]
@@ -162,13 +167,6 @@ class Snapshot(ABIDModel):
         title = (self.title_stripped or '-')[:64]
         return f'[{self.timestamp}] {self.url[:64]} ({title})'

-    def save(self, *args, **kwargs):
-        super().save(*args, **kwargs)
-        try:
-            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
-        except AssertionError as e:
-            print(e)
-
     @classmethod
     def from_json(cls, info: dict):
         info = {k: v for k, v in info.items() if k in cls.keys}
@@ -177,8 +175,7 @@ class Snapshot(ABIDModel):
     def as_json(self, *args) -> dict:
         args = args or self.keys
         return {
-            key: getattr(self, key)
-            if key != 'tags' else self.tags_str()
+            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
             for key in args
         }
@@ -190,8 +187,14 @@ class Snapshot(ABIDModel):
         return load_link_details(self.as_link())

     def tags_str(self, nocache=True) -> str | None:
-        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
         cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
+        calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
+
+        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
+            # tags are pre-fetched already, use them directly (best because db is always freshest)
+            tags_str = calc_tags_str()
+            return tags_str

         if nocache:
             tags_str = calc_tags_str()
             cache.set(cache_key, tags_str)
@@ -234,7 +237,10 @@ class Snapshot(ABIDModel):
     @cached_property
     def num_outputs(self) -> int:
-        return self.archiveresult_set.filter(status='succeeded').count()
+        # DONT DO THIS: it will trigger a separate query for every snapshot
+        # return self.archiveresult_set.filter(status='succeeded').count()
+        # this is better:
+        return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded'))

     @cached_property
     def base_url(self):
@@ -262,10 +268,21 @@ class Snapshot(ABIDModel):
     @cached_property
     def thumbnail_url(self) -> Optional[str]:
-        result = self.archiveresult_set.filter(
-            extractor='screenshot',
-            status='succeeded'
-        ).only('output').last()
+        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+            result = (sorted(
+                (
+                    result
+                    for result in self.archiveresult_set.all()
+                    if result.extractor == 'screenshot' and result.status == 'succeeded' and result.output
+                ),
+                key=lambda result: result.created,
+            ) or [None])[-1]
+        else:
+            result = self.archiveresult_set.filter(
+                extractor='screenshot',
+                status='succeeded'
+            ).only('output').last()
         if result:
             return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
         return None
@@ -292,6 +309,21 @@ class Snapshot(ABIDModel):
         if self.title:
             return self.title   # whoopdedoo that was easy

+        # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
+        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+            try:
+                return (sorted(
+                    (
+                        result.output.strip()
+                        for result in self.archiveresult_set.all()
+                        if result.extractor == 'title' and result.status == 'succeeded' and result.output
+                    ),
+                    key=lambda title: len(title),
+                ) or [None])[-1]
+            except IndexError:
+                pass
+
         try:
             # take longest successful title from ArchiveResult db history
             return sorted(
@@ -355,12 +387,23 @@
 class ArchiveResultManager(models.Manager):
     def indexable(self, sorted: bool = True):
+        """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
         INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
-        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
+        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')

         if sorted:
-            precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
-            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+            precedence = [
+                When(extractor=method, then=Value(precedence))
+                for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
+            ]
+            qs = qs.annotate(
+                indexing_precedence=Case(
+                    *precedence,
+                    default=Value(1000),
+                    output_field=IntegerField()
+                )
+            ).order_by('indexing_precedence')
         return qs

 class ArchiveResult(ABIDModel):
@@ -418,17 +461,6 @@ class ArchiveResult(ABIDModel):
     def __str__(self):
         return self.extractor

-    def save(self, *args, **kwargs):
-        super().save(*args, **kwargs)
-        try:
-            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
-        except AssertionError as e:
-            print(e)
-
-    @property
-    def uuid(self):
-        return self.id
-
     @cached_property
     def snapshot_dir(self):
         return Path(self.snapshot.link_dir)
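A recurring pattern in the model changes above: before touching archiveresult_set or tags, check _prefetched_objects_cache and, when the related rows were already loaded by prefetch_related(), filter them in Python instead of issuing a fresh query per row. This is what turns the admin changelist from N+1 queries into a constant number. A condensed sketch of the pattern (the helper name is hypothetical):

def succeeded_count(snapshot) -> int:   # hypothetical helper for illustration
    if 'archiveresult_set' in getattr(snapshot, '_prefetched_objects_cache', {}):
        # related rows already in memory: no extra query
        return sum(1 for r in snapshot.archiveresult_set.all() if r.status == 'succeeded')
    # not prefetched: let the database do the filtering instead
    return snapshot.archiveresult_set.filter(status='succeeded').count()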

View file

@ -4,7 +4,9 @@ import os
import sys import sys
import re import re
import logging import logging
import inspect
import tempfile import tempfile
from typing import Any, Dict
from pathlib import Path from pathlib import Path
from django.utils.crypto import get_random_string from django.utils.crypto import get_random_string
@ -33,22 +35,20 @@ APPEND_SLASH = True
DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv) DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
# add plugins folders to system path, and load plugins in installed_apps BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'builtin_plugins'
BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'plugins' USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'user_plugins'
USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'plugins'
sys.path.insert(0, str(BUILTIN_PLUGINS_DIR))
sys.path.insert(0, str(USER_PLUGINS_DIR))
def find_plugins(plugins_dir): def find_plugins(plugins_dir, prefix: str) -> Dict[str, Any]:
return { plugins = {
# plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA f'{prefix}.{plugin_entrypoint.parent.name}': plugin_entrypoint.parent
plugin_entrypoint.parent.name: plugin_entrypoint.parent
for plugin_entrypoint in plugins_dir.glob('*/apps.py') for plugin_entrypoint in plugins_dir.glob('*/apps.py')
} }
# print(f'Found {prefix} plugins:\n', '\n '.join(plugins.keys()))
return plugins
INSTALLED_PLUGINS = { INSTALLED_PLUGINS = {
**find_plugins(BUILTIN_PLUGINS_DIR), **find_plugins(BUILTIN_PLUGINS_DIR, prefix='builtin_plugins'),
**find_plugins(USER_PLUGINS_DIR), **find_plugins(USER_PLUGINS_DIR, prefix='user_plugins'),
} }
@ -66,11 +66,11 @@ INSTALLED_APPS = [
'plugantic', 'plugantic',
'core', 'core',
'api', 'api',
'pkg',
*INSTALLED_PLUGINS.keys(), *INSTALLED_PLUGINS.keys(),
'admin_data_views', 'admin_data_views',
'django_extensions', 'django_extensions',
] ]
@ -144,64 +144,6 @@ if CONFIG.LDAP:
# sys.exit(1) # sys.exit(1)
################################################################################
### Debug Settings
################################################################################
# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
if DEBUG_TOOLBAR:
try:
import debug_toolbar # noqa
DEBUG_TOOLBAR = True
except ImportError:
DEBUG_TOOLBAR = False
if DEBUG_TOOLBAR:
INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
DEBUG_TOOLBAR_CONFIG = {
"SHOW_TOOLBAR_CALLBACK": lambda request: True,
"RENDER_PANELS": True,
}
DEBUG_TOOLBAR_PANELS = [
'debug_toolbar.panels.history.HistoryPanel',
'debug_toolbar.panels.versions.VersionsPanel',
'debug_toolbar.panels.timer.TimerPanel',
'debug_toolbar.panels.settings.SettingsPanel',
'debug_toolbar.panels.headers.HeadersPanel',
'debug_toolbar.panels.request.RequestPanel',
'debug_toolbar.panels.sql.SQLPanel',
'debug_toolbar.panels.staticfiles.StaticFilesPanel',
# 'debug_toolbar.panels.templates.TemplatesPanel',
'debug_toolbar.panels.cache.CachePanel',
'debug_toolbar.panels.signals.SignalsPanel',
'debug_toolbar.panels.logging.LoggingPanel',
'debug_toolbar.panels.redirects.RedirectsPanel',
'debug_toolbar.panels.profiling.ProfilingPanel',
'djdt_flamegraph.FlamegraphPanel',
]
MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
if DEBUG:
from django_autotyping.typing import AutotypingSettingsDict
INSTALLED_APPS += ['django_autotyping']
AUTOTYPING: AutotypingSettingsDict = {
"STUBS_GENERATION": {
"LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
}
}
# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
# Must delete archivebox/templates/admin to use because it relies on some things we override
# visit /__requests_tracker__/ to access
DEBUG_REQUESTS_TRACKER = False
if DEBUG_REQUESTS_TRACKER:
INSTALLED_APPS += ["requests_tracker"]
MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
################################################################################ ################################################################################
### Staticfile and Template Settings ### Staticfile and Template Settings
@ -317,13 +259,15 @@ STORAGES = {
SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_') SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',') ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
CSRF_TRUSTED_ORIGINS = CONFIG.CSRF_TRUSTED_ORIGINS.split(',') CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
# automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com) # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
# but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
if CONFIG.ALLOWED_HOSTS != '*' and (not CSRF_TRUSTED_ORIGINS): for hostname in ALLOWED_HOSTS:
for hostname in ALLOWED_HOSTS: https_endpoint = f'https://{hostname}'
CSRF_TRUSTED_ORIGINS.append(f'https://{hostname}') if hostname != '*' and https_endpoint not in CSRF_TRUSTED_ORIGINS:
print(f'[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS')
CSRF_TRUSTED_ORIGINS.append(https_endpoint)
SECURE_BROWSER_XSS_FILTER = True SECURE_BROWSER_XSS_FILTER = True
SECURE_CONTENT_TYPE_NOSNIFF = True SECURE_CONTENT_TYPE_NOSNIFF = True
@ -345,6 +289,8 @@ AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'}, {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
] ]
DATA_UPLOAD_MAX_NUMBER_FIELDS = None
################################################################################ ################################################################################
### Shell Settings ### Shell Settings
################################################################################ ################################################################################
@ -385,6 +331,10 @@ IGNORABLE_404_URLS = [
re.compile(r'robots\.txt$'), re.compile(r'robots\.txt$'),
re.compile(r'.*\.(css|js)\.map$'), re.compile(r'.*\.(css|js)\.map$'),
] ]
IGNORABLE_200_URLS = [
re.compile(r'^"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M),
re.compile(r'^"GET /admin/jsi18n/ HTTP/.*" (200|30.) .+', re.I | re.M),
]
class NoisyRequestsFilter(logging.Filter): class NoisyRequestsFilter(logging.Filter):
def filter(self, record) -> bool: def filter(self, record) -> bool:
@ -396,19 +346,26 @@ class NoisyRequestsFilter(logging.Filter):
if ignorable_log_pattern.match(logline): if ignorable_log_pattern.match(logline):
return False return False
# ignore staticfile requests that 200 or 30* ignorable_log_pattern = re.compile(f'^Not Found: /.*/?{ignorable_url_pattern.pattern}', re.I | re.M)
ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M) if ignorable_log_pattern.match(logline):
if ignoreable_200_log_pattern.match(logline): return False
return False
# ignore staticfile requests that 200 or 30*
for ignorable_url_pattern in IGNORABLE_200_URLS:
if ignorable_log_pattern.match(logline):
return False
return True return True
ERROR_LOG = tempfile.NamedTemporaryFile().name
if CONFIG.LOGS_DIR.exists(): if CONFIG.LOGS_DIR.exists():
ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log') ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log')
else: else:
# historically too many edge cases here around creating log dir w/ correct permissions early on # historically too many edge cases here around creating log dir w/ correct permissions early on
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
ERROR_LOG = tempfile.NamedTemporaryFile().name print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
LOGGING = { LOGGING = {
'version': 1, 'version': 1,
@ -445,6 +402,10 @@ LOGGING = {
} }
################################################################################
### REST API Outbound Webhooks settings
################################################################################
# Add default webhook configuration to the User model # Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook' SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
SIGNAL_WEBHOOKS = { SIGNAL_WEBHOOKS = {
@ -458,7 +419,9 @@ SIGNAL_WEBHOOKS = {
}, },
} }
DATA_UPLOAD_MAX_NUMBER_FIELDS = None ################################################################################
### Admin Data View Settings
################################################################################
ADMIN_DATA_VIEWS = { ADMIN_DATA_VIEWS = {
"NAME": "Environment", "NAME": "Environment",
@ -495,3 +458,86 @@ ADMIN_DATA_VIEWS = {
}, },
], ],
} }
################################################################################
### Debug Settings
################################################################################
# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
DEBUG_TOOLBAR = False
DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
if DEBUG_TOOLBAR:
try:
import debug_toolbar # noqa
DEBUG_TOOLBAR = True
except ImportError:
DEBUG_TOOLBAR = False
if DEBUG_TOOLBAR:
INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
DEBUG_TOOLBAR_CONFIG = {
"SHOW_TOOLBAR_CALLBACK": lambda request: True,
"RENDER_PANELS": True,
}
DEBUG_TOOLBAR_PANELS = [
'debug_toolbar.panels.history.HistoryPanel',
'debug_toolbar.panels.versions.VersionsPanel',
'debug_toolbar.panels.timer.TimerPanel',
'debug_toolbar.panels.settings.SettingsPanel',
'debug_toolbar.panels.headers.HeadersPanel',
'debug_toolbar.panels.request.RequestPanel',
'debug_toolbar.panels.sql.SQLPanel',
'debug_toolbar.panels.staticfiles.StaticFilesPanel',
# 'debug_toolbar.panels.templates.TemplatesPanel',
'debug_toolbar.panels.cache.CachePanel',
'debug_toolbar.panels.signals.SignalsPanel',
'debug_toolbar.panels.logging.LoggingPanel',
'debug_toolbar.panels.redirects.RedirectsPanel',
'debug_toolbar.panels.profiling.ProfilingPanel',
'djdt_flamegraph.FlamegraphPanel',
]
MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
if DEBUG:
from django_autotyping.typing import AutotypingSettingsDict
INSTALLED_APPS += ['django_autotyping']
AUTOTYPING: AutotypingSettingsDict = {
"STUBS_GENERATION": {
"LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
}
}
# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
# Must delete archivebox/templates/admin to use because it relies on some things we override
# visit /__requests_tracker__/ to access
DEBUG_REQUESTS_TRACKER = True
DEBUG_REQUESTS_TRACKER = DEBUG_REQUESTS_TRACKER and DEBUG
if DEBUG_REQUESTS_TRACKER:
import requests_tracker
INSTALLED_APPS += ["requests_tracker"]
MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
TEMPLATE_DIRS.insert(0, str(Path(inspect.getfile(requests_tracker)).parent / "templates"))
REQUESTS_TRACKER_CONFIG = {
"TRACK_SQL": True,
"ENABLE_STACKTRACES": False,
"IGNORE_PATHS_PATTERNS": (
r".*/favicon\.ico",
r".*\.png",
r"/admin/jsi18n/",
),
"IGNORE_SQL_PATTERNS": (
r"^SELECT .* FROM django_migrations WHERE app = 'requests_tracker'",
r"^SELECT .* FROM django_migrations WHERE app = 'auth'",
),
}
# https://docs.pydantic.dev/logfire/integrations/django/ (similar to DataDog / NewRelic / etc.)
DEBUG_LOGFIRE = False
DEBUG_LOGFIRE = DEBUG_LOGFIRE and (Path(CONFIG.OUTPUT_DIR) / '.logfire').is_dir()
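
For reference, a hedged sketch of how these debug flags combine in practice (the exact invocations are assumptions, not part of this commit):

# django-debug-toolbar only activates on a single-threaded dev server without autoreload:
#   archivebox manage runserver 0.0.0.0:8000 --nothreading --noreload
# requests-tracker additionally requires DEBUG=True and the bundled admin
# template overrides to be removed first:
#   rm -rf archivebox/templates/admin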

View file

@@ -218,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
         get_link = lambda x: x.as_link_with_details()
-        all_links = all_links.iterator()
+        all_links = all_links.iterator(chunk_size=500)
     else:
         num_links: int = len(all_links)
         get_link = lambda x: x
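
For context on the chunk_size change: QuerySet.iterator() streams rows instead of caching the whole result set, and chunk_size bounds how many rows are fetched per database round-trip. A minimal sketch (not part of this diff; process() is a hypothetical per-row handler):

from core.models import Snapshot

for snapshot in Snapshot.objects.iterator(chunk_size=500):
    process(snapshot)  # only ~500 rows are held in memory per batch,
                       # instead of materializing the entire table at once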

View file

@@ -197,7 +197,7 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
 @enforce_types
-def wget_output_path(link: Link) -> Optional[str]:
+def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
@@ -245,6 +245,15 @@ def wget_output_path(link: Link) -> Optional[str]:
     # https://example.com/abc/test/?v=zzVa_tX1OiI
     # > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
 
+    cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path'
+
+    if not nocache:
+        from django.core.cache import cache
+        cached_result = cache.get(cache_key)
+        if cached_result:
+            return cached_result
+
     # There's also lots of complexity around how the urlencoding and renaming
     # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
     # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
@@ -271,6 +280,8 @@ def wget_output_path(link: Link) -> Optional[str]:
             output_path = None
 
     if output_path:
+        if not nocache:
+            cache.set(cache_key, output_path)
         return output_path
 
     # fallback to just the domain dir
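
The new nocache flag wraps the expensive filesystem search in Django's cache framework. A condensed sketch of the memoization pattern used above (compute_output_path() stands in for the real wget path search and is hypothetical):

from django.core.cache import cache

def cached_output_path(cache_key: str, nocache: bool=False) -> str | None:
    if not nocache:
        cached_result = cache.get(cache_key)
        if cached_result:
            return cached_result
    output_path = compute_output_path(cache_key)  # hypothetical expensive lookup
    if output_path and not nocache:
        cache.set(cache_key, output_path)        # memoize for the next caller
    return output_path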

View file

@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
@@ -448,7 +448,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)
@@ -475,7 +475,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
         if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
-    for path in chain(snapshots.iterator(), data_folders):
+    for path in chain(snapshots.iterator(chunk_size=500), data_folders):
         link = None
         if type(path) is not str:
             path = path.as_link().link_dir
@@ -518,7 +518,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=500):
         link = snapshot.as_link()
         if is_corrupt(link):
             corrupted[link.link_dir] = link

View file

@@ -124,7 +124,15 @@ def snapshot_icons(snapshot) -> str:
     from core.models import ArchiveResult
     # start = datetime.now(timezone.utc)
 
-    archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+    if hasattr(snapshot, '_prefetched_objects_cache') and 'archiveresult_set' in snapshot._prefetched_objects_cache:
+        archive_results = [
+            result
+            for result in snapshot.archiveresult_set.all()
+            if result.status == "succeeded" and result.output
+        ]
+    else:
+        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
     link = snapshot.as_link()
     path = link.archive_path
     canon = link.canonical_outputs()
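
This branch exists to avoid an N+1 query pattern: when the caller prefetches archiveresult_set, filtering in Python reuses the rows already sitting in _prefetched_objects_cache instead of issuing one extra SQL query per snapshot. A rough usage sketch of such a caller (assumed, not part of this diff):

from core.models import Snapshot

# two queries total: one for the snapshots, one for all their ArchiveResults
for snapshot in Snapshot.objects.prefetch_related('archiveresult_set'):
    html = snapshot_icons(snapshot)  # hits the prefetch cache, no extra query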

View file

@@ -37,9 +37,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
 @enforce_types
 def write_link_to_sql_index(link: Link, created_by_id: int | None=None):
     from core.models import Snapshot, ArchiveResult
+    from abid_utils.models import get_or_create_system_user_pk
 
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-    info['created_by_id'] = created_by_id
+    info['created_by_id'] = created_by_id or get_or_create_system_user_pk()
 
     tag_list = list(dict.fromkeys(
         tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
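
The effect of the fallback is that rows written outside a logged-in request still get a valid owner. A minimal sketch (link and user are assumed to exist in scope):

write_link_to_sql_index(link)                         # created_by_id falls back to the system user pk
write_link_to_sql_index(link, created_by_id=user.pk)  # explicit attribution still wins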

View file

@@ -960,7 +960,8 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
         run_subcommand('init', stdin=None, pwd=out_dir)
 
     setup_django(out_dir=out_dir, check_db=True)
-    from core.models import User
+    from django.contrib.auth import get_user_model
+    User = get_user_model()
 
     if not User.objects.filter(is_superuser=True).exists():
         stderr('\n[+] Creating new admin user for the Web UI...', color='green')
@@ -979,16 +980,16 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
                 '--upgrade',
                 '--no-cache-dir',
                 '--no-warn-script-location',
-                'youtube_dl',
+                'yt-dlp',
             ], capture_output=False, cwd=out_dir)
             pkg_path = run_shell([
                 PYTHON_BINARY, '-m', 'pip',
                 'show',
-                'youtube_dl',
+                'yt-dlp',
             ], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
-            NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'youtube_dl' / '__main__.py'
+            NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt_dlp' / '__main__.py'   # pip installs the module dir as yt_dlp (underscore)
             os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
-            assert NEW_YOUTUBEDL_BINARY.exists(), f'youtube_dl must exist inside {pkg_path}'
+            assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
             config(f'YOUTUBEDL_BINARY={NEW_YOUTUBEDL_BINARY}', set=True, out_dir=out_dir)
         except BaseException as e:  # lgtm [py/catch-base-exception]
             stderr(f'[X] Failed to install python packages: {e}', color='red')

View file

@@ -11,7 +11,7 @@
         "dependencies": {
             "@postlight/parser": "^2.2.3",
             "readability-extractor": "github:ArchiveBox/readability-extractor",
-            "single-file-cli": "^1.1.54"
+            "single-file-cli": "^2.0.58"
         }
     },
     "node_modules/@asamuzakjp/dom-selector": {
@@ -236,9 +236,9 @@
         "license": "MIT"
     },
     "node_modules/@types/node": {
-        "version": "22.5.0",
-        "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
-        "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
+        "version": "22.5.1",
+        "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
+        "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
         "license": "MIT",
         "optional": true,
         "dependencies": {
@@ -353,9 +353,9 @@
         }
     },
    "node_modules/aws4": {
-        "version": "1.13.1",
-        "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
-        "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
+        "version": "1.13.2",
+        "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
+        "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
         "license": "MIT"
     },
     "node_modules/b4a": {
@@ -2376,9 +2376,9 @@
        }
    },
    "node_modules/tslib": {
-        "version": "2.6.3",
-        "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
-        "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
+        "version": "2.7.0",
+        "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
+        "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
         "license": "0BSD"
     },
     "node_modules/turndown": {

View file

@@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 from pathlib import Path
 
-from ..vendor.pocket import Pocket
+from pocket import Pocket
 
 from ..index.schema import Link
 from ..util import enforce_types

View file

3
archivebox/pkg/admin.py Normal file
View file

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

14
archivebox/pkg/apps.py Normal file
View file

@ -0,0 +1,14 @@
__package__ = 'archivebox.pkg'
from django.apps import AppConfig
class PkgsConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'pkg'
def ready(self):
from .settings import LOADED_DEPENDENCIES
# print(LOADED_DEPENDENCIES)

View file

View file

@ -0,0 +1,75 @@
__package__ = 'archivebox.pkg.management.commands'
from django.core.management.base import BaseCommand
from django.conf import settings
from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
from pydantic_pkgr.binprovider import bin_abspath
from ....config import NODE_BIN_PATH, bin_path
from plugantic.plugins import LOADED_PLUGINS
from pkg.settings import env
class Command(BaseCommand):
def handle(self, *args, method, **options):
method(*args, **options)
def add_arguments(self, parser):
subparsers = parser.add_subparsers(title="sub-commands", required=True)
list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
list_parser.set_defaults(method=self.list)
install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
install_parser.add_argument("package_names", nargs="+", type=str)
install_parser.set_defaults(method=self.install)
def list(self, *args, **options):
self.stdout.write('################# PLUGINS ####################')
for plugin in LOADED_PLUGINS:
self.stdout.write(f'{plugin.name}:')
for binary in plugin.binaries:
try:
binary = binary.install()
except Exception as e:
# import ipdb; ipdb.set_trace()
raise
self.stdout.write(f' {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}')
self.stdout.write('\n################# LEGACY ####################')
for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
bin_name = settings.CONFIG[bin_key]
self.stdout.write(f'{bin_key}: {bin_name}')
# binary = Binary(name=package_name, providers=[env])
# print(binary)
# try:
# loaded_bin = binary.load()
# self.stdout.write(
# self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
# )
# except Exception as e:
# self.stderr.write(
# self.style.ERROR(f"Error loading {package_name}: {e}")
# )
def install(self, *args, **options):
for package_name in options["package_names"]:
binary = Binary(name=package_name, providers=[env])
print(binary)
try:
loaded_bin = binary.load()
self.stdout.write(
self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
)
except Exception as e:
self.stderr.write(
self.style.ERROR(f"Error loading {package_name}: {e}")
)

View file

3
archivebox/pkg/models.py Normal file
View file

@ -0,0 +1,3 @@
from django.db import models
# Create your models here.

View file

@ -0,0 +1,86 @@
__package__ = 'archivebox.pkg'
import os
import sys
import shutil
import inspect
from pathlib import Path
import django
from django.conf import settings
from django.db.backends.sqlite3.base import Database as sqlite3
from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
from pydantic_pkgr.binprovider import bin_abspath
from ..config import NODE_BIN_PATH, bin_path
env = EnvProvider(PATH=NODE_BIN_PATH + ':' + os.environ.get('PATH', '/bin'))
LOADED_DEPENDENCIES = {}
for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
# 'PYTHON_BINARY': {
# 'path': bin_path(config['PYTHON_BINARY']),
# 'version': config['PYTHON_VERSION'],
# 'hash': bin_hash(config['PYTHON_BINARY']),
# 'enabled': True,
# 'is_valid': bool(config['PYTHON_VERSION']),
# },
bin_name = settings.CONFIG[bin_key]
if bin_name.endswith('django/__init__.py'):
binary_spec = Binary(name='django', providers=[env], provider_overrides={
'env': {
'abspath': lambda: Path(inspect.getfile(django)),
'version': lambda: SemVer('{}.{}.{} {} ({})'.format(*django.VERSION)),
}
})
elif bin_name.endswith('sqlite3/dbapi2.py'):
binary_spec = Binary(name='sqlite3', providers=[env], provider_overrides={
'env': {
'abspath': lambda: Path(inspect.getfile(sqlite3)),
'version': lambda: SemVer(sqlite3.version),
}
})
elif bin_name.endswith('archivebox'):
binary_spec = Binary(name='archivebox', providers=[env], provider_overrides={
'env': {
'abspath': lambda: shutil.which(str(Path('archivebox').expanduser())),
'version': lambda: settings.CONFIG.VERSION,
}
})
elif bin_name.endswith('postlight/parser/cli.js'):
binary_spec = Binary(name='postlight-parser', providers=[env], provider_overrides={
'env': {
'abspath': lambda: bin_path('postlight-parser'),
'version': lambda: SemVer('1.0.0'),
}
})
else:
binary_spec = Binary(name=bin_name, providers=[env])
try:
binary = binary_spec.load()
except Exception as e:
# print(f"- ❌ Binary {bin_name} failed to load with error: {e}")
continue
assert isinstance(binary.loaded_version, SemVer)
try:
assert str(binary.loaded_version) == dependency['version'], f"Expected {bin_name} version {dependency['version']}, got {binary.loaded_version}"
assert str(binary.loaded_respath) == str(bin_abspath(dependency['path']).resolve()), f"Expected {bin_name} abspath {bin_abspath(dependency['path']).resolve()}, got {binary.loaded_respath}"
assert binary.is_valid == dependency['is_valid'], f"Expected {bin_name} is_valid={dependency['is_valid']}, got {binary.is_valid}"
except Exception as e:
pass
# print(f"WARNING: Error loading {bin_name}: {e}")
# import ipdb; ipdb.set_trace()
# print(f"- ✅ Binary {bin_name} loaded successfully")
LOADED_DEPENDENCIES[bin_key] = binary
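
Registering an additional dependency follows the same pattern as the branches above. A hypothetical example (ripgrep is not part of this commit), using only the Binary/EnvProvider API already imported here:

RIPGREP_SPEC = Binary(name='rg', providers=[env])
try:
    # .load() resolves the abspath + version via the env provider, or raises
    LOADED_DEPENDENCIES['RIPGREP_BINARY'] = RIPGREP_SPEC.load()
except Exception:
    pass  # optional dependency: skip it if it's not on $PATH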

3
archivebox/pkg/tests.py Normal file
View file

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

3
archivebox/pkg/views.py Normal file
View file

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

View file

@@ -1,6 +1,5 @@
 __package__ = 'archivebox.plugantic'
 
-from .binproviders import BinProvider
 from .binaries import Binary
 from .extractors import Extractor
 from .replayers import Replayer

View file

@@ -1,6 +1,17 @@
+import importlib
+
 from django.apps import AppConfig
 
 
 class PluganticConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
     name = 'plugantic'
+
+    def ready(self) -> None:
+        from django.conf import settings
+        from .plugins import PLUGINS
+
+        for plugin_name in settings.INSTALLED_PLUGINS.keys():
+            lib = importlib.import_module(f'{plugin_name}.apps')
+            if hasattr(lib, 'PLUGINS'):
+                for plugin_instance in lib.PLUGINS:
+                    PLUGINS.append(plugin_instance)
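
For this discovery hook to find anything, a plugin package just has to expose a PLUGINS list in its apps module. A hypothetical third-party plugin (archivebox_example is an assumed name; it would also need an entry in settings.INSTALLED_PLUGINS):

# archivebox_example/apps.py  (hypothetical plugin package)
from plugantic.plugins import Plugin

class ExamplePlugin(Plugin):
    name: str = 'example'

PLUGINS = [ExamplePlugin()]  # picked up by PluganticConfig.ready() above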

View file

@@ -10,285 +10,17 @@ from typing import Any, Optional, Dict, List
 from typing_extensions import Self
 from subprocess import run, PIPE
 
+from pydantic_pkgr import Binary, SemVer, BinName, BinProvider, EnvProvider, AptProvider, BrewProvider, PipProvider, BinProviderName, ProviderLookupDict
+
+import django
+from django.db.backends.sqlite3.base import Database as sqlite3
+
-from pydantic_core import ValidationError
-from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer
from .binproviders import (
SemVer,
BinName,
BinProviderName,
HostBinPath,
BinProvider,
EnvProvider,
AptProvider,
BrewProvider,
PipProvider,
ProviderLookupDict,
bin_name,
bin_abspath,
path_is_script,
path_is_executable,
)
class Binary(BaseModel):
name: BinName
description: str = Field(default='')
providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers')
provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides')
loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider')
loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath')
loaded_version: Optional[SemVer] = Field(default=None, alias='version')
# bin_filename: see below
# is_executable: see below
# is_script
# is_valid: see below
@model_validator(mode='after')
def validate(self):
self.loaded_abspath = bin_abspath(self.name) or self.name
self.description = self.description or self.name
assert self.providers_supported, f'No providers were given for package {self.name}'
# pull in any overrides from the binproviders
for provider in self.providers_supported:
overrides_by_provider = provider.get_providers_for_bin(self.name)
if overrides_by_provider:
self.provider_overrides[provider.name] = {
**overrides_by_provider,
**self.provider_overrides.get(provider.name, {}),
}
return self
@field_validator('loaded_abspath', mode='before')
def parse_abspath(cls, value: Any):
return bin_abspath(value)
@field_validator('loaded_version', mode='before')
def parse_version(cls, value: Any):
return value and SemVer(value)
@field_serializer('provider_overrides', when_used='json')
def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]:
return {
provider_name: {
key: str(val)
for key, val in overrides.items()
}
for provider_name, overrides in provider_overrides.items()
}
@computed_field # type: ignore[misc] # see mypy issue #1362
@property
def bin_filename(self) -> BinName:
if self.is_script:
# e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite
name = self.name
elif self.loaded_abspath:
# e.g. '/opt/homebrew/bin/wget' -> wget
name = bin_name(self.loaded_abspath)
else:
# e.g. 'ytdlp' -> 'yt-dlp'
name = bin_name(self.name)
return name
@computed_field # type: ignore[misc] # see mypy issue #1362
@property
def is_executable(self) -> bool:
try:
assert self.loaded_abspath and path_is_executable(self.loaded_abspath)
return True
except (ValidationError, AssertionError):
return False
@computed_field # type: ignore[misc] # see mypy issue #1362
@property
def is_script(self) -> bool:
try:
assert self.loaded_abspath and path_is_script(self.loaded_abspath)
return True
except (ValidationError, AssertionError):
return False
@computed_field # type: ignore[misc] # see mypy issue #1362
@property
def is_valid(self) -> bool:
return bool(
self.name
and self.loaded_abspath
and self.loaded_version
and (self.is_executable or self.is_script)
)
@validate_call
def install(self) -> Self:
if not self.providers_supported:
return self
exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
for provider in self.providers_supported:
try:
installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name))
if installed_bin:
# print('INSTALLED', self.name, installed_bin)
return self.model_copy(update={
'loaded_provider': provider.name,
'loaded_abspath': installed_bin.abspath,
'loaded_version': installed_bin.version,
})
except Exception as err:
print(err)
exc = err
raise exc
@validate_call
def load(self, cache=True) -> Self:
if self.is_valid:
return self
if not self.providers_supported:
return self
exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
for provider in self.providers_supported:
try:
installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name))
if installed_bin:
# print('LOADED', provider, self.name, installed_bin)
return self.model_copy(update={
'loaded_provider': provider.name,
'loaded_abspath': installed_bin.abspath,
'loaded_version': installed_bin.version,
})
except Exception as err:
print(err)
exc = err
raise exc
@validate_call
def load_or_install(self, cache=True) -> Self:
if self.is_valid:
return self
if not self.providers_supported:
return self
exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
for provider in self.providers_supported:
try:
installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache)
if installed_bin:
# print('LOADED_OR_INSTALLED', self.name, installed_bin)
return self.model_copy(update={
'loaded_provider': provider.name,
'loaded_abspath': installed_bin.abspath,
'loaded_version': installed_bin.version,
})
except Exception as err:
print(err)
exc = err
raise exc
@validate_call
def exec(self, args=(), pwd='.'):
assert self.loaded_abspath
assert self.loaded_version
return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, pwd=pwd)
+def get_ytdlp_version() -> str:
+    import yt_dlp
+    return yt_dlp.version.__version__
+
-class SystemPythonHelpers:
-    @staticmethod
-    def get_subdeps() -> str:
return 'python3 python3-minimal python3-pip python3-virtualenv'
@staticmethod
def get_abspath() -> str:
return sys.executable
@staticmethod
def get_version() -> str:
return '{}.{}.{}'.format(*sys.version_info[:3])
class SqliteHelpers:
@staticmethod
def get_abspath() -> Path:
import sqlite3
importlib.reload(sqlite3)
return Path(inspect.getfile(sqlite3))
@staticmethod
def get_version() -> SemVer:
import sqlite3
importlib.reload(sqlite3)
version = sqlite3.version
assert version
return SemVer(version)
class DjangoHelpers:
@staticmethod
def get_django_abspath() -> str:
import django
return inspect.getfile(django)
@staticmethod
def get_django_version() -> str:
import django
return '{}.{}.{} {} ({})'.format(*django.VERSION)
class YtdlpHelpers:
@staticmethod
def get_ytdlp_subdeps() -> str:
return 'yt-dlp ffmpeg'
@staticmethod
def get_ytdlp_version() -> str:
import yt_dlp
importlib.reload(yt_dlp)
version = yt_dlp.version.__version__
assert version
return version
class PythonBinary(Binary):
name: BinName = 'python'
providers_supported: List[BinProvider] = [
EnvProvider(
subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'},
abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'},
version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'},
),
]
class SqliteBinary(Binary):
name: BinName = 'sqlite'
providers_supported: List[BinProvider] = [
EnvProvider(
version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'},
abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'},
),
]
class DjangoBinary(Binary):
name: BinName = 'django'
providers_supported: List[BinProvider] = [
EnvProvider(
abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'},
version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'},
),
]
@@ -296,16 +28,26 @@ class DjangoBinary(Binary):
 
 class YtdlpBinary(Binary):
     name: BinName = 'yt-dlp'
     providers_supported: List[BinProvider] = [
-        # EnvProvider(),
-        PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}),
-        BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}),
-        # AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}),
+        EnvProvider(),
+        PipProvider(),
+        BrewProvider(),
+        AptProvider(),
     ]
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        'pip': {
+            'version': get_ytdlp_version,
+        },
+        'brew': {
+            'subdeps': lambda: 'yt-dlp ffmpeg',
+        },
+        'apt': {
+            'subdeps': lambda: 'yt-dlp ffmpeg',
+        }
+    }
 
 class WgetBinary(Binary):
     name: BinName = 'wget'
-    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()]
+    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider(), BrewProvider()]
 
 # if __name__ == '__main__':
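
A rough usage sketch of the overrides above (assumes a host where at least one of pip/brew/apt is available; the printed output is illustrative, not captured from a run):

ytdlp = YtdlpBinary().load_or_install()
print(ytdlp.loaded_provider, ytdlp.loaded_version, ytdlp.loaded_abspath)
# e.g. pip 2024.8.6 /usr/local/bin/yt-dlp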

View file

@ -1,561 +0,0 @@
__package__ = 'archivebox.plugantic'
import os
import shutil
import operator
from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING
from typing_extensions import Self
from abc import ABC, abstractmethod
from collections import namedtuple
from pathlib import Path
from subprocess import run, PIPE
from pydantic_core import core_schema, ValidationError
from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler
def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool:
"""returns True if a lambda func takes args/kwargs of any kind, otherwise false if it's pure/argless"""
code = lambda_func.__code__
has_args = code.co_argcount > 0
has_varargs = code.co_flags & 0x04 != 0
has_varkw = code.co_flags & 0x08 != 0
return has_args or has_varargs or has_varkw
def is_semver_str(semver: Any) -> bool:
if isinstance(semver, str):
return (semver.count('.') == 2 and semver.replace('.', '').isdigit())
return False
def semver_to_str(semver: tuple[int, int, int] | str) -> str:
if isinstance(semver, (list, tuple)):
return '.'.join(str(chunk) for chunk in semver)
if is_semver_str(semver):
return semver
raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver))
SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0))
SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int]
class SemVer(SemVerTuple):
major: int
minor: int = 0
patch: int = 0
if TYPE_CHECKING:
full_text: str | None = ''
def __new__(cls, *args, full_text=None, **kwargs):
# '1.1.1'
if len(args) == 1 and is_semver_str(args[0]):
result = SemVer.parse(args[0])
# ('1', '2', '3')
elif len(args) == 1 and isinstance(args[0], (tuple, list)):
result = SemVer.parse(args[0])
# (1, '2', None)
elif not all(isinstance(arg, (int, type(None))) for arg in args):
result = SemVer.parse(args)
# (None)
elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())):
result = None
# 1, 2, 3
else:
result = SemVerTuple.__new__(cls, *args, **kwargs)
if result is not None:
# add first line as extra hidden metadata so it can be logged without having to re-run version cmd
result.full_text = full_text or str(result)
return result
@classmethod
def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None:
"""
parses a version tag string formatted like into (major, minor, patch) ints
'Google Chrome 124.0.6367.208' -> (124, 0, 6367)
'GNU Wget 1.24.5 built on darwin23.2.0.' -> (1, 24, 5)
'curl 8.4.0 (x86_64-apple-darwin23.0) ...' -> (8, 4, 0)
'2024.04.09' -> (2024, 4, 9)
"""
# print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout)
if isinstance(version_stdout, (tuple, list)):
version_stdout = '.'.join(str(chunk) for chunk in version_stdout)
elif isinstance(version_stdout, bytes):
version_stdout = version_stdout.decode()
elif not isinstance(version_stdout, str):
version_stdout = str(version_stdout)
# no text to work with, return None immediately
if not version_stdout.strip():
# raise Exception('Tried to parse semver from empty version output (is binary installed and available?)')
return None
just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0]
contains_semver = lambda col: (
col.count('.') in (1, 2, 3)
and all(chunk.isdigit() for chunk in col.split('.')[:3]) # first 3 chunks can only be nums
)
full_text = version_stdout.split('\n')[0].strip()
first_line_columns = full_text.split()[:4]
version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns)))
# could not find any column of first line that looks like a version number, despite there being some text
if not version_columns:
# raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns)))
return None
# take first col containing a semver, and truncate it to 3 chunks (e.g. 2024.04.09.91) -> (2024, 04, 09)
first_version_tuple = version_columns[0].split('.', 3)[:3]
# print('FINAL_VALUE', first_version_tuple)
return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text)
def __str__(self):
return '.'.join(str(chunk) for chunk in self)
# @classmethod
# def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema:
# default_schema = handler(source)
# return core_schema.no_info_after_validator_function(
# cls.parse,
# default_schema,
# serialization=core_schema.plain_serializer_function_ser_schema(
# lambda semver: str(semver),
# info_arg=False,
# return_schema=core_schema.str_schema(),
# ),
# )
assert SemVer(None) == None
assert SemVer('') == None
assert SemVer.parse('') == None
assert SemVer(1) == (1, 0, 0)
assert SemVer(1, 2) == (1, 2, 0)
assert SemVer('1.2+234234') == (1, 2, 0)
assert SemVer((1, 2, 3)) == (1, 2, 3)
assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3'
assert SemVer(('1', '2', '3')) == (1, 2, 3)
assert SemVer.parse('5.6.7') == (5, 6, 7)
assert SemVer.parse('124.0.6367.208') == (124, 0, 6367)
assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0)
assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367)
assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367)
assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 234.234.123'
assert SemVer.parse('Google Chrome') == None
@validate_call
def bin_name(bin_path_or_name: str | Path) -> str:
name = Path(bin_path_or_name).name
assert len(name) > 1
assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), (
f'Binary name can only contain a-Z0-9-_.: {name}')
return name
BinName = Annotated[str, AfterValidator(bin_name)]
@validate_call
def path_is_file(path: Path | str) -> Path:
path = Path(path) if isinstance(path, str) else path
assert path.is_file(), f'Path is not a file: {path}'
return path
HostExistsPath = Annotated[Path, AfterValidator(path_is_file)]
@validate_call
def path_is_executable(path: HostExistsPath) -> HostExistsPath:
assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})'
return path
@validate_call
def path_is_script(path: HostExistsPath) -> HostExistsPath:
SCRIPT_EXTENSIONS = ('.py', '.js', '.sh')
assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS))
return path
HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)]
@validate_call
def path_is_abspath(path: Path) -> Path:
return path.resolve()
HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)]
HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)]
@validate_call
def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None:
assert bin_path_or_name
if str(bin_path_or_name).startswith('/'):
# already a path, get its absolute form
abspath = Path(bin_path_or_name).resolve()
else:
# not a path yet, get path using os.which
binpath = shutil.which(bin_path_or_name)
if not binpath:
return None
abspath = Path(binpath).resolve()
try:
return TypeAdapter(HostBinPath).validate_python(abspath)
except ValidationError:
return None
@validate_call
def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None:
return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode())
class InstalledBin(BaseModel):
abspath: HostBinPath
version: SemVer
def is_valid_install_string(pkgs_str: str) -> str:
"""Make sure a string is a valid install string for a package manager, e.g. 'yt-dlp ffmpeg'"""
assert pkgs_str
assert all(len(pkg) > 1 for pkg in pkgs_str.split(' '))
return pkgs_str
def is_valid_python_dotted_import(import_str: str) -> str:
assert import_str and import_str.replace('.', '').replace('_', '').isalnum()
return import_str
InstallStr = Annotated[str, AfterValidator(is_valid_install_string)]
LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)]
ProviderHandler = Callable[..., Any] | Callable[[], Any] # must take no args [], or [bin_name: str, **kwargs]
#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
ProviderHandlerRef = LazyImportStr | ProviderHandler
ProviderLookupDict = Dict[str, LazyImportStr]
ProviderType = Literal['abspath', 'version', 'subdeps', 'install']
# class Host(BaseModel):
# machine: str
# system: str
# platform: str
# in_docker: bool
# in_qemu: bool
# python: str
BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor']
class BinProvider(ABC, BaseModel):
name: BinProviderName
abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True)
version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True)
subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True)
install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True)
_abspath_cache: ClassVar = {}
_version_cache: ClassVar = {}
_install_cache: ClassVar = {}
# def provider_version(self) -> SemVer | None:
# """Version of the actual underlying package manager (e.g. pip v20.4.1)"""
# if self.name in ('env', 'vendor'):
# return SemVer('0.0.0')
# installer_binpath = Path(shutil.which(self.name)).resolve()
# return bin_version(installer_binpath)
# def provider_host(self) -> Host:
# """Information about the host env, archictecture, and OS needed to select & build packages"""
# p = platform.uname()
# return Host(
# machine=p.machine,
# system=p.system,
# platform=platform.platform(),
# python=sys.implementation.name,
# in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true',
# in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true',
# )
def get_default_providers(self):
return self.get_providers_for_bin('*')
def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None:
if provider_func is None:
return None
# if provider_func is a dotted path to a function on self, swap it for the actual function
if isinstance(provider_func, str) and provider_func.startswith('self.'):
provider_func = getattr(self, provider_func.split('self.', 1)[-1])
# if provider_func is a dot-formatted import string, import the function
if isinstance(provider_func, str):
from django.utils.module_loading import import_string
package_name, module_name, classname, path = provider_func.split('.', 3) # -> abc, def, ghi.jkl
# get .ghi.jkl nested attr present on module abc.def
imported_module = import_string(f'{package_name}.{module_name}.{classname}')
provider_func = operator.attrgetter(path)(imported_module)
# # abc.def.ghi.jkl -> 1, 2, 3
# for idx in range(1, len(path)):
# parent_path = '.'.join(path[:-idx]) # abc.def.ghi
# try:
# parent_module = import_string(parent_path)
# provider_func = getattr(parent_module, path[-idx])
# except AttributeError, ImportError:
# continue
assert TypeAdapter(ProviderHandler).validate_python(provider_func), (
f'{self.__class__.__name__} provider func for {bin_name} was not a function or dotted-import path: {provider_func}')
return provider_func
@validate_call
def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict:
providers_for_bin = {
'abspath': self.abspath_provider.get(bin_name),
'version': self.version_provider.get(bin_name),
'subdeps': self.subdeps_provider.get(bin_name),
'install': self.install_provider.get(bin_name),
}
only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None}
return only_set_providers_for_bin
@validate_call
def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler:
"""
Get the provider func for a given key + Dict of provider callbacks + fallback default provider.
e.g. get_provider_for_action(bin_name='yt-dlp', 'install', default_provider=self.on_install, ...) -> Callable
"""
provider_func_ref = (
(overrides or {}).get(provider_type)
or self.get_providers_for_bin(bin_name).get(provider_type)
or self.get_default_providers().get(provider_type)
or default_provider
)
# print('getting provider for action', bin_name, provider_type, provider_func)
provider_func = self.resolve_provider_func(provider_func_ref)
assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.'
return provider_func
@validate_call
def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any:
provider_func: ProviderHandler = self.get_provider_for_action(
bin_name=bin_name,
provider_type=provider_type,
default_provider=default_provider,
overrides=overrides,
)
if not func_takes_args_or_kwargs(provider_func):
# if it's a pure argless lambdas, dont pass bin_path and other **kwargs
provider_func_without_args = cast(Callable[[], Any], provider_func)
return provider_func_without_args()
provider_func = cast(Callable[..., Any], provider_func)
return provider_func(bin_name, **kwargs)
def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None:
print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...')
try:
return bin_abspath(bin_name)
except ValidationError:
return None
def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None:
abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name)
if not abspath: return None
print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...')
try:
return bin_version(abspath)
except ValidationError:
return None
def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr:
print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}')
# ... subdependency calculation logic here
return TypeAdapter(InstallStr).validate_python(bin_name)
@abstractmethod
def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
subdeps = subdeps or self.get_subdeps(bin_name)
print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
# ... install logic here
assert True
@validate_call
def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None:
abspath = self.call_provider_for_action(
bin_name=bin_name,
provider_type='abspath',
default_provider=self.on_get_abspath,
overrides=overrides,
)
if not abspath:
return None
result = TypeAdapter(HostBinPath).validate_python(abspath)
self._abspath_cache[bin_name] = result
return result
@validate_call
def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None:
version = self.call_provider_for_action(
bin_name=bin_name,
provider_type='version',
default_provider=self.on_get_version,
overrides=overrides,
abspath=abspath,
)
if not version:
return None
result = SemVer(version)
self._version_cache[bin_name] = result
return result
@validate_call
def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr:
subdeps = self.call_provider_for_action(
bin_name=bin_name,
provider_type='subdeps',
default_provider=self.on_get_subdeps,
overrides=overrides,
)
if not subdeps:
subdeps = bin_name
result = TypeAdapter(InstallStr).validate_python(subdeps)
return result
@validate_call
def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None:
subdeps = self.get_subdeps(bin_name, overrides=overrides)
self.call_provider_for_action(
bin_name=bin_name,
provider_type='install',
default_provider=self.on_install,
overrides=overrides,
subdeps=subdeps,
)
installed_abspath = self.get_abspath(bin_name)
assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}'
installed_version = self.get_version(bin_name, abspath=installed_abspath)
assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}'
result = InstalledBin(abspath=installed_abspath, version=installed_version)
self._install_cache[bin_name] = result
return result
@validate_call
def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None:
installed_abspath = None
installed_version = None
if cache:
installed_bin = self._install_cache.get(bin_name)
if installed_bin:
return installed_bin
installed_abspath = self._abspath_cache.get(bin_name)
installed_version = self._version_cache.get(bin_name)
installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides)
if not installed_abspath:
return None
installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides)
if not installed_version:
return None
return InstalledBin(abspath=installed_abspath, version=installed_version)
@validate_call
def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None:
installed = self.load(bin_name, overrides=overrides, cache=cache)
if not installed:
installed = self.install(bin_name, overrides=overrides)
return installed
class PipProvider(BinProvider):
name: BinProviderName = 'pip'
def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
subdeps = subdeps or self.on_get_subdeps(bin_name)
print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
if proc.returncode != 0:
print(proc.stdout.strip().decode())
print(proc.stderr.strip().decode())
raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
class AptProvider(BinProvider):
name: BinProviderName = 'apt'
subdeps_provider: ProviderLookupDict = {
'yt-dlp': lambda: 'yt-dlp ffmpeg',
}
def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
subdeps = subdeps or self.on_get_subdeps(bin_name)
print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
run(['apt-get', 'update', '-qq'])
proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
if proc.returncode != 0:
print(proc.stdout.strip().decode())
print(proc.stderr.strip().decode())
raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
class BrewProvider(BinProvider):
name: BinProviderName = 'brew'
def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
subdeps = subdeps or self.on_get_subdeps(bin_name)
print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
if proc.returncode != 0:
print(proc.stdout.strip().decode())
print(proc.stderr.strip().decode())
raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
class EnvProvider(BinProvider):
name: BinProviderName = 'env'
abspath_provider: ProviderLookupDict = {
# 'python': lambda: Path('/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'),
}
version_provider: ProviderLookupDict = {
# 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
}
def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
"""The env provider is ready-only and does not install any packages, so this is a no-op"""
pass

View file

@@ -31,7 +31,7 @@ def no_empty_args(args: List[str]) -> List[str]:
     assert all(len(arg) for arg in args)
     return args
 
-ExtractorName = Literal['wget', 'warc', 'media']
+ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
 
 HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
 CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)]

View file

@@ -14,9 +14,6 @@ from pydantic import (
 from .binaries import (
     Binary,
-    PythonBinary,
-    SqliteBinary,
-    DjangoBinary,
     WgetBinary,
     YtdlpBinary,
 )
@@ -28,7 +25,6 @@ from .extractors import (
 )
 from .replayers import (
     Replayer,
-    GENERIC_REPLAYER,
     MEDIA_REPLAYER,
 )
 from .configs import (
@@ -80,12 +76,6 @@ class Plugin(BaseModel):
     })
 
-class CorePlugin(Plugin):
-    name: str = 'core'
-    configs: List[SerializeAsAny[ConfigSet]] = []
-    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
-    extractors: List[SerializeAsAny[Extractor]] = []
-    replayers: List[SerializeAsAny[Replayer]] = [GENERIC_REPLAYER]
 
 class YtdlpPlugin(Plugin):
     name: str = 'ytdlp'
@@ -101,11 +91,9 @@ class WgetPlugin(Plugin):
     extractors: List[SerializeAsAny[Extractor]] = [WgetExtractor(), WarcExtractor()]
 
-CORE_PLUGIN = CorePlugin()
 YTDLP_PLUGIN = YtdlpPlugin()
 WGET_PLUGIN = WgetPlugin()
 
 PLUGINS = [
-    CORE_PLUGIN,
     YTDLP_PLUGIN,
     WGET_PLUGIN,
 ]

View file

@@ -22,5 +22,4 @@ class Replayer(BaseModel):
     # thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
 
-GENERIC_REPLAYER = Replayer(name='generic')
 MEDIA_REPLAYER = Replayer(name='media')

View file

@@ -1,5 +1,8 @@
 __package__ = 'archivebox.plugantic'
 
+import inspect
+from typing import Any
+
 from django.http import HttpRequest
 from django.utils.html import format_html, mark_safe
@@ -10,6 +13,44 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
 from plugantic.plugins import LOADED_PLUGINS
 from django.conf import settings
def obj_to_yaml(obj: Any, indent: int=0) -> str:
indent_str = " " * indent
if isinstance(obj, dict):
if not obj:
return "{}"
result = "\n"
for key, value in obj.items():
result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n"
return result
elif isinstance(obj, list):
if not obj:
return "[]"
result = "\n"
for item in obj:
result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n"
return result.rstrip()
elif isinstance(obj, str):
if "\n" in obj:
return f" |\n{indent_str} " + obj.replace("\n", f"\n{indent_str} ")
else:
return f" {obj}"
elif isinstance(obj, (int, float, bool)):
return f" {str(obj)}"
elif callable(obj):
source = '\n'.join(
'' if 'def ' in line else line
for line in inspect.getsource(obj).split('\n')
if line.strip()
).split('lambda: ')[-1].rstrip(',')
return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ")
else:
return f" {str(obj)}"
 @render_with_table_view
 def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@@ -18,13 +59,13 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
     rows = {
         "Binary": [],
-        "From Plugin": [],
         "Found Version": [],
+        "From Plugin": [],
         "Provided By": [],
         "Found Abspath": [],
         "Related Configuration": [],
         "Overrides": [],
-        "Description": [],
+        # "Description": [],
     }
 
     relevant_configs = {
@@ -38,8 +79,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
             binary = binary.load_or_install()
 
             rows['Binary'].append(ItemLink(binary.name, key=binary.name))
-            rows['From Plugin'].append(plugin.name)
             rows['Found Version'].append(binary.loaded_version)
+            rows['From Plugin'].append(plugin.name)
             rows['Provided By'].append(binary.loaded_provider)
             rows['Found Abspath'].append(binary.loaded_abspath)
             rows['Related Configuration'].append(mark_safe(', '.join(
@@ -48,8 +89,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
                 if binary.name.lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
                 # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
             )))
-            rows['Overrides'].append(str(binary.provider_overrides))
-            rows['Description'].append(binary.description)
+            rows['Overrides'].append(obj_to_yaml(binary.provider_overrides))
+            # rows['Description'].append(binary.description)
 
     return TableContext(
         title="Binaries",
@@ -85,8 +126,8 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
             'binprovider': binary.loaded_provider,
             'abspath': binary.loaded_abspath,
             'version': binary.loaded_version,
-            'overrides': str(binary.provider_overrides),
-            'providers': str(binary.providers_supported),
+            'overrides': obj_to_yaml(binary.provider_overrides),
+            'providers': obj_to_yaml(binary.providers_supported),
         },
         "help_texts": {
             # TODO

View file

@@ -11,13 +11,12 @@ from typing import Optional, Union, Set, Tuple
 from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
 
 from crontab import CronTab
-from .vendor.atomicwrites import atomic_write as lib_atomic_write
+from atomicwrites import atomic_write as lib_atomic_write
 
 from .util import enforce_types, ExtendedEncoder
 from .config import PYTHON_BINARY, OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
 
 def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
     """Patched version of subprocess.run to kill forked child subprocesses and fix blocking io making timeout ineffective
     Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py

View file

@@ -16,7 +16,7 @@ from datetime import datetime, timezone
from dateparser import parse as dateparser
from requests.exceptions import RequestException, ReadTimeout
-from .vendor.base32_crockford import encode as base32_encode  # type: ignore
+from base32_crockford import encode as base32_encode  # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
from os.path import lexists
from os import remove as remove_file
@@ -273,8 +273,8 @@ def get_headers(url: str, timeout: int=None) -> str:
    {
        'URL': url,
        'Status-Code': response.status_code,
-       'Elapsed': response.elapsed,
+       'Elapsed': response.elapsed.total_seconds()*1000,
-       'Encoding': response.encoding,
+       'Encoding': str(response.encoding),
        'Apparent-Encoding': response.apparent_encoding,
        **dict(response.headers),
    },
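The two changed fields look cosmetic but presumably matter for serialization: requests returns Elapsed as a datetime.timedelta, which json.dumps() rejects, and Encoding can be None, so str() pins it to a string. A quick illustration of the failure mode being avoided (assumed motivation for the change):

    import json
    from datetime import timedelta

    elapsed = timedelta(milliseconds=123)
    json.dumps({'Elapsed': elapsed.total_seconds() * 1000})  # ok: '{"Elapsed": 123.0}'
    # json.dumps({'Elapsed': elapsed})  # raises TypeError: Object of type timedelta is not JSON serializable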
@@ -304,11 +304,7 @@ def chrome_args(**options) -> List[str]:
    cmd_args += CHROME_EXTRA_ARGS

    if options['CHROME_HEADLESS']:
-       chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
-       if chrome_major_version >= 111:
-           cmd_args += ("--headless=new",)
-       else:
-           cmd_args += ('--headless',)
+       cmd_args += ("--headless=new",)  # expects chrome version >= 111

    if not options['CHROME_SANDBOX']:
        # assume this means we are running inside a docker container
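Dropping the version sniff simplifies the code but silently assumes Chrome >= 111, where --headless=new became the supported spelling. If older Chromes ever need to be caught early, a fail-fast guard in the spirit of the removed code could look like this (sketch only; CHROME_VERSION is a stand-in for the detected version string):

    import re

    CHROME_VERSION = 'Google Chrome 124.0.6367.78'  # hypothetical detected value
    major = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
    assert major >= 111, f'--headless=new requires Chrome >= 111, found {major}'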

View file

@@ -0,0 +1,34 @@
import sys
import inspect
import importlib
from pathlib import Path
VENDOR_DIR = Path(__file__).parent
VENDORED_LIBS = {
# sys.path dir: library name
'python-atomicwrites': 'atomicwrites',
'django-taggit': 'taggit',
'pydantic-pkgr': 'pydantic_pkgr',
'pocket': 'pocket',
'base32-crockford': 'base32_crockford',
}
def load_vendored_libs():
for lib_subdir, lib_name in VENDORED_LIBS.items():
lib_dir = VENDOR_DIR / lib_subdir
        assert lib_dir.is_dir(), f'Expected vendor library {lib_name} could not be found in {lib_dir}'
try:
lib = importlib.import_module(lib_name)
# print(f"Successfully imported lib from environment {lib_name}: {inspect.getfile(lib)}")
except ImportError:
sys.path.append(str(lib_dir))
try:
lib = importlib.import_module(lib_name)
# print(f"Successfully imported lib from vendored fallback {lib_name}: {inspect.getfile(lib)}")
except ImportError as e:
print(f"Failed to import lib from environment or vendored fallback {lib_name}: {e}", file=sys.stderr)
sys.exit(1)
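A usage sketch (the actual call site isn't shown in this diff): the loader is meant to run early, before anything imports the optional dependencies, so that environment-provided packages win and the vendored copies only kick in as a fallback:

    # Hypothetical call site; the real one may live elsewhere in archivebox's startup code.
    from archivebox.vendor import load_vendored_libs

    load_vendored_libs()

    # After this, imports resolve from site-packages if installed there,
    # otherwise from the bundled copies under archivebox/vendor/.
    import atomicwrites
    import pydantic_pkgr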

View file

@@ -1 +0,0 @@
-python-atomicwrites/atomicwrites/__init__.py

View file

@@ -1 +0,0 @@
-base32-crockford/base32_crockford.py

View file

@@ -1 +0,0 @@
-../../package-lock.json

View file

@@ -1 +0,0 @@
-../../package.json

View file

@@ -1 +0,0 @@
-pocket/pocket.py

1
archivebox/vendor/pydantic-pkgr vendored Submodule

@@ -0,0 +1 @@
+Subproject commit 2cd844533d888ce29b9bf32b8363510dd0d76166

View file

@@ -1 +0,0 @@
-django-taggit/taggit/utils.py

18
package-lock.json generated
View file

@@ -236,9 +236,9 @@
        "license": "MIT"
    },
    "node_modules/@types/node": {
-       "version": "22.5.0",
-       "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
-       "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
+       "version": "22.5.1",
+       "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
+       "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
        "license": "MIT",
        "optional": true,
        "dependencies": {
@@ -353,9 +353,9 @@
    }
    },
    "node_modules/aws4": {
-       "version": "1.13.1",
-       "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
-       "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
+       "version": "1.13.2",
+       "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
+       "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
        "license": "MIT"
    },
    "node_modules/b4a": {
@@ -2376,9 +2376,9 @@
    }
    },
    "node_modules/tslib": {
-       "version": "2.6.3",
-       "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
-       "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
+       "version": "2.7.0",
+       "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
+       "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
        "license": "0BSD"
    },
    "node_modules/turndown": {

103
pdm.lock generated
View file

@@ -5,7 +5,7 @@
groups = ["default", "ldap", "sonic"]
strategy = ["inherit_metadata"]
lock_version = "4.5.0"
-content_hash = "sha256:f2f7ca01f2e18a1ef07d59b7a8985d89785a4b8a2a4e66452f1f9e8e8ad529ad"
+content_hash = "sha256:c6aa1f436032d18d079a4c2e9d9b95a5110579eb96a449751bfaf4d472eba401"

[[metadata.targets]]
requires_python = "==3.10.*"
@@ -78,6 +78,29 @@ files = [
    {file = "asttokens-2.4.1.tar.gz", hash = "sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0"},
]

+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+summary = "Atomic file writes."
+groups = ["default"]
+marker = "python_version == \"3.10\""
+files = [
+    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
+    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
+]
+
+[[package]]
+name = "base32-crockford"
+version = "0.3.0"
+summary = "A Python implementation of Douglas Crockford's base32 encoding scheme"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+files = [
+    {file = "base32-crockford-0.3.0.tar.gz", hash = "sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969"},
+    {file = "base32_crockford-0.3.0-py2.py3-none-any.whl", hash = "sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"},
+]
+
[[package]]
name = "brotli"
version = "1.1.0"
@@ -407,6 +430,21 @@ files = [
    {file = "django_stubs_ext-5.0.4.tar.gz", hash = "sha256:85da065224204774208be29c7d02b4482d5a69218a728465c2fbe41725fdc819"},
]

+[[package]]
+name = "django-taggit"
+version = "1.3.0"
+requires_python = ">=3.5"
+summary = "django-taggit is a reusable Django application for simple tagging."
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "Django>=1.11",
+]
+files = [
+    {file = "django-taggit-1.3.0.tar.gz", hash = "sha256:4a833bf71f4c2deddd9745924eee53be1c075d7f0020a06f12e29fa3d752732d"},
+    {file = "django_taggit-1.3.0-py3-none-any.whl", hash = "sha256:609b0223d8a652f3fae088b7fd29f294fdadaca2d7931d45c27d6c59b02fdf31"},
+]
+
[[package]]
name = "exceptiongroup"
version = "1.2.2"
@@ -479,7 +517,7 @@ files = [

[[package]]
name = "httpx"
-version = "0.27.0"
+version = "0.27.2"
requires_python = ">=3.8"
summary = "The next generation HTTP client."
groups = ["default"]
@@ -492,20 +530,20 @@ dependencies = [
    "sniffio",
]
files = [
-    {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"},
-    {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"},
+    {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"},
+    {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"},
]

[[package]]
name = "idna"
-version = "3.7"
-requires_python = ">=3.5"
+version = "3.8"
+requires_python = ">=3.6"
summary = "Internationalized Domain Names in Applications (IDNA)"
groups = ["default"]
marker = "python_version == \"3.10\""
files = [
-    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
-    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
+    {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"},
+    {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"},
]

[[package]]
@@ -613,6 +651,32 @@ files = [
    {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
]

+[[package]]
+name = "pocket"
+version = "0.3.7"
+git = "https://github.com/tapanpandita/pocket.git"
+ref = "v0.3.7"
+revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1"
+summary = "api wrapper for getpocket.com"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "requests",
+]
+
+[[package]]
+name = "pocket"
+version = "0.3.7"
+git = "https://github.com/tapanpandita/pocket.git"
+ref = "v0.3.7"
+revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1"
+summary = "api wrapper for getpocket.com"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "requests",
+]
+
[[package]]
name = "prompt-toolkit"
version = "3.0.47"
@@ -739,6 +803,23 @@ files = [
    {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"},
]

+[[package]]
+name = "pydantic-pkgr"
+version = "0.1.4"
+requires_python = ">=3.10"
+summary = "System package manager APIs in strongly typed Python"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "pydantic-core>=2.18.2",
+    "pydantic>=2.7.1",
+    "typing-extensions>=4.11.0",
+]
+files = [
+    {file = "pydantic_pkgr-0.1.4-py3-none-any.whl", hash = "sha256:bd9ddfa8eeb4d361257c4d3d8d36ba44a72515b497ee52cf0763240c66006417"},
+    {file = "pydantic_pkgr-0.1.4.tar.gz", hash = "sha256:e0422022dd83341f1e869a54da9aca903a6407a983ece0735f69493841b0fbb8"},
+]
+
[[package]]
name = "pygments"
version = "2.18.0"
@@ -841,14 +922,14 @@ files = [

[[package]]
name = "setuptools"
-version = "73.0.1"
+version = "74.0.0"
requires_python = ">=3.8"
summary = "Easily download, build, install, upgrade, and uninstall Python packages"
groups = ["default"]
marker = "python_version == \"3.10\""
files = [
-    {file = "setuptools-73.0.1-py3-none-any.whl", hash = "sha256:b208925fcb9f7af924ed2dc04708ea89791e24bde0d3020b27df0e116088b34e"},
-    {file = "setuptools-73.0.1.tar.gz", hash = "sha256:d59a3e788ab7e012ab2c4baed1b376da6366883ee20d7a5fc426816e3d7b1193"},
+    {file = "setuptools-74.0.0-py3-none-any.whl", hash = "sha256:0274581a0037b638b9fc1c6883cc71c0210865aaa76073f7882376b641b84e8f"},
+    {file = "setuptools-74.0.0.tar.gz", hash = "sha256:a85e96b8be2b906f3e3e789adec6a9323abf79758ecfa3065bd740d81158b11e"},
]

[[package]]
[[package]] [[package]]

View file

@@ -29,12 +29,9 @@ dependencies = [
    "croniter>=2.0.5",      # for: archivebox schedule
    "ipython>=8.23.0",      # for: archivebox shell
    # Extractor Dependencies
-   "yt-dlp>=2024.4.9",     # for: media
+   "yt-dlp>=2024.8.6",     # for: media
    # "playwright>=1.43.0; platform_machine != 'armv7l'",  # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
-   # TODO: add more extractors
-   # - gallery-dl
-   # - scihubdl
-   # - See Github issues for more...
    "django-signal-webhooks>=0.3.0",
    "django-admin-data-views>=0.3.1",
    "ulid-py>=1.1.0",
@@ -43,6 +40,14 @@ dependencies = [
    "django-pydantic-field>=0.3.9",
    "django-jsonform>=2.22.0",
    "django-stubs>=5.0.2",
+   # These can be safely omitted when the installation subsystem does not provide them as packages (e.g. apt/debian);
+   # archivebox will automatically load fallback vendored copies bundled via archivebox/vendor/__init__.py
+   "pydantic-pkgr>=0.1.4",
+   "atomicwrites==1.4.0",
+   "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7",
+   "django-taggit==1.3.0",
+   "base32-crockford==0.3.0",
]

homepage = "https://github.com/ArchiveBox/ArchiveBox"
@@ -139,7 +144,7 @@ exclude = [
    "**/migrations",
    "archivebox/vendor",
]
-stubPath = "./typings"
+stubPath = "./archivebox/typings"
venvPath = "."
venv = ".venv"
# ignore = ["src/oldstuff"]
@@ -169,6 +174,9 @@ debug = [
    "djdt_flamegraph",
    "ipdb",
    "requests-tracker>=0.3.3",
+   "logfire[django]>=0.51.0",
+   "opentelemetry-instrumentation-django>=0.47b0",
+   "opentelemetry-instrumentation-sqlite3>=0.47b0",
]
test = [
    "pytest",
@@ -177,8 +185,6 @@ test = [
lint = [
    "flake8",
    "mypy",
-]
-dev = [
    "django-autotyping>=0.5.1",
]
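A quick way to check whether the optional dependencies above resolved from the environment or will fall back to the bundled copies (illustrative only; the module names are assumed from the VENDORED_LIBS mapping earlier in this diff):

    import importlib.util

    for mod in ('atomicwrites', 'base32_crockford', 'taggit', 'pocket', 'pydantic_pkgr'):
        spec = importlib.util.find_spec(mod)
        origin = spec.origin if spec else 'not installed -> archivebox/vendor fallback'
        print(f'{mod}: {origin}')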

View file

@@ -5,6 +5,8 @@ annotated-types==0.7.0; python_version == "3.10"
anyio==4.4.0; python_version == "3.10"
asgiref==3.8.1; python_version == "3.10"
asttokens==2.4.1; python_version == "3.10"
+atomicwrites==1.4.0; python_version == "3.10"
+base32-crockford==0.3.0; python_version == "3.10"
brotli==1.1.0; implementation_name == "cpython" and python_version == "3.10"
brotlicffi==1.1.0.0; implementation_name != "cpython" and python_version == "3.10"
certifi==2024.7.4; python_version == "3.10"
@@ -26,13 +28,14 @@ django-settings-holder==0.1.2; python_version == "3.10"
django-signal-webhooks==0.3.0; python_version == "3.10"
django-stubs==5.0.4; python_version == "3.10"
django-stubs-ext==5.0.4; python_version == "3.10"
+django-taggit==1.3.0; python_version == "3.10"
exceptiongroup==1.2.2; python_version == "3.10"
executing==2.0.1; python_version == "3.10"
feedparser==6.0.11; python_version == "3.10"
h11==0.14.0; python_version == "3.10"
httpcore==1.0.5; python_version == "3.10"
-httpx==0.27.0; python_version == "3.10"
+httpx==0.27.2; python_version == "3.10"
-idna==3.7; python_version == "3.10"
+idna==3.8; python_version == "3.10"
ipython==8.26.0; python_version == "3.10"
jedi==0.19.1; python_version == "3.10"
matplotlib-inline==0.1.7; python_version == "3.10"
@@ -40,6 +43,7 @@ mutagen==1.47.0; python_version == "3.10"
mypy-extensions==1.0.0; python_version == "3.10"
parso==0.8.4; python_version == "3.10"
pexpect==4.9.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
+pocket @ git+https://github.com/tapanpandita/pocket.git@5a144438cc89bfc0ec94db960718ccf1f76468c1 ; python_version == "3.10"
prompt-toolkit==3.0.47; python_version == "3.10"
ptyprocess==0.7.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
pure-eval==0.2.3; python_version == "3.10"
@@ -49,6 +53,7 @@ pycparser==2.22; platform_python_implementation != "PyPy" and python_version == "3.10"
pycryptodomex==3.20.0; python_version == "3.10"
pydantic==2.8.2; python_version == "3.10"
pydantic-core==2.20.1; python_version == "3.10"
+pydantic-pkgr==0.1.4; python_version == "3.10"
pygments==2.18.0; python_version == "3.10"
python-crontab==3.2.0; python_version == "3.10"
python-dateutil==2.9.0.post0; python_version == "3.10"
@@ -56,7 +61,7 @@ python-ldap==3.4.4; python_version == "3.10"
pytz==2024.1; python_version == "3.10"
regex==2024.7.24; python_version == "3.10"
requests==2.32.3; python_version == "3.10"
-setuptools==73.0.1; python_version == "3.10"
+setuptools==74.0.0; python_version == "3.10"
sgmllib3k==1.0.0; python_version == "3.10"
six==1.16.0; python_version == "3.10"
sniffio==1.3.1; python_version == "3.10"