mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 06:34:25 -04:00)
v0.8.+: Massive speed improvements for Admin UI & common queries, v3 plugins progress, and bugfixes (#1498)
Some checks failed
Build Debian package / build (push) Has been cancelled
Build Docker image / buildx (push) Has been cancelled
Build Homebrew package / build (push) Has been cancelled
Build GitHub Pages website / build (push) Has been cancelled
Run linters / lint (push) Has been cancelled
CodeQL / Analyze (python) (push) Has been cancelled
Build Pip package / build (push) Has been cancelled
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Has been cancelled
Run tests / docker_tests (push) Has been cancelled
Build GitHub Pages website / deploy (push) Has been cancelled
commit 43e87ef437
66 changed files with 1150 additions and 1100 deletions
3	.gitmodules	vendored

@@ -26,3 +26,6 @@
 [submodule "archivebox/vendor/python-atomicwrites"]
     path = archivebox/vendor/python-atomicwrites
     url = https://github.com/untitaker/python-atomicwrites
+[submodule "archivebox/vendor/pydantic-pkgr"]
+    path = archivebox/vendor/pydantic-pkgr
+    url = https://github.com/ArchiveBox/pydantic-pkgr
@@ -61,6 +61,11 @@ def get_or_create_system_user_pk(username='system'):
     return user.pk
 
 
+class AutoDateTimeField(models.DateTimeField):
+    def pre_save(self, model_instance, add):
+        return timezone.now()
+
+
 class ABIDModel(models.Model):
     """
     Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface.
@@ -76,13 +81,16 @@ class ABIDModel(models.Model):
     abid = ABIDField(prefix=abid_prefix)
 
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
-    created = models.DateTimeField(auto_now_add=True)
+    created = AutoDateTimeField(default=timezone.now, db_index=True)
     modified = models.DateTimeField(auto_now=True)
 
     class Meta(TypedModelMeta):
         abstract = True
 
     def save(self, *args: Any, **kwargs: Any) -> None:
+        if self._state.adding or not self.created:
+            self.created = timezone.now()
+
         # when first creating a row, self.ABID is the source of truth
         # overwrite default prefilled self.id & self.abid with generated self.ABID value
         if self._state.adding or not self.id:
@@ -93,6 +101,7 @@ class ABIDModel(models.Model):
         super().save(*args, **kwargs)
         assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}'
         assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}'
+        assert str(self.uuid) == str(self.ABID.uuid), f'self.uuid ({self.uuid}) does not match .ABID.uuid ({self.ABID.uuid})'
 
     @property
     def abid_values(self) -> Dict[str, Any]:
@@ -187,6 +196,14 @@ class ABIDModel(models.Model):
         """
         return self.ABID.uuid
 
+    @property
+    def uuid(self) -> str:
+        """
+        Get a str uuid.UUID (v4) representation of the object's ABID.
+        """
+        assert str(self.id) == str(self.ABID.uuid)
+        return str(self.id)
+
     @property
     def TypeID(self) -> TypeID:
         """
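Side note (general Django background, not part of the diff): the AutoDateTimeField added above relies on Django's Field.pre_save() hook, which is invoked once per save(), and whatever it returns is what actually gets written to that column. A minimal sketch of the contract, using a hypothetical field name:

    from django.db import models
    from django.utils import timezone

    class TouchedAtField(models.DateTimeField):
        # Hypothetical field: always writes the current time on save
        # (this is essentially how Django implements auto_now internally).
        def pre_save(self, model_instance, add):
            value = timezone.now()
            # keep the in-memory instance consistent with what hits the DB
            setattr(model_instance, self.attname, value)
            return value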
0	archivebox/builtin_plugins/__init__.py	Normal file
0	archivebox/builtin_plugins/base/__init__.py	Normal file
3	archivebox/builtin_plugins/base/admin.py	Normal file
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
83	archivebox/builtin_plugins/base/apps.py	Normal file
@@ -0,0 +1,83 @@
+import sys
+import inspect
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
+import django
+from django.apps import AppConfig
+from django.core.checks import Tags, Warning, register
+from django.db.backends.sqlite3.base import Database as sqlite3
+
+from pydantic import (
+    Field,
+    SerializeAsAny,
+)
+
+from pydantic_pkgr import SemVer, BinProvider, BinProviderName, ProviderLookupDict, BinName, Binary, EnvProvider, NpmProvider
+
+from plugantic.extractors import Extractor, ExtractorName
+from plugantic.plugins import Plugin
+from plugantic.configs import ConfigSet, ConfigSectionName
+from plugantic.replayers import Replayer
+
+
+class PythonBinary(Binary):
+    name: BinName = 'python'
+
+    providers_supported: List[BinProvider] = [EnvProvider()]
+    provider_overrides: Dict[str, Any] = {
+        'env': {
+            'subdeps': \
+                lambda: 'python3 python3-minimal python3-pip python3-virtualenv',
+            'abspath': \
+                lambda: sys.executable,
+            'version': \
+                lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
+        },
+    }
+
+class SqliteBinary(Binary):
+    name: BinName = 'sqlite'
+    providers_supported: List[BinProvider] = [EnvProvider()]
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        'env': {
+            'abspath': \
+                lambda: Path(inspect.getfile(sqlite3)),
+            'version': \
+                lambda: SemVer(sqlite3.version),
+        },
+    }
+
+
+class DjangoBinary(Binary):
+    name: BinName = 'django'
+
+    providers_supported: List[BinProvider] = [EnvProvider()]
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        'env': {
+            'abspath': \
+                lambda: inspect.getfile(django),
+            'version': \
+                lambda: django.VERSION[:3],
+        },
+    }
+
+
+class BasicReplayer(Replayer):
+    name: str = 'basic'
+
+
+class BasePlugin(Plugin):
+    name: str = 'base'
+    configs: List[SerializeAsAny[ConfigSet]] = []
+    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = []
+    replayers: List[SerializeAsAny[Replayer]] = [BasicReplayer()]
+
+
+PLUGINS = [BasePlugin()]
+
+
+class BaseConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'builtin_plugins.base'
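Side note (illustration, not from the commit): the provider_overrides values are plain zero-argument callables, so what a provider "looks up" can be checked by calling them directly. For example, the 'env' overrides on PythonBinary evaluate to ordinary strings:

    import sys

    overrides = {
        'abspath': lambda: sys.executable,
        'version': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
    }
    print(overrides['abspath']())   # e.g. /usr/bin/python3 (machine-dependent)
    print(overrides['version']())   # e.g. 3.11.4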
0	archivebox/builtin_plugins/base/migrations/__init__.py	Normal file
3	archivebox/builtin_plugins/base/models.py	Normal file
@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
3	archivebox/builtin_plugins/base/tests.py	Normal file
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
3	archivebox/builtin_plugins/base/views.py	Normal file
@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.
0	archivebox/builtin_plugins/singlefile/__init__.py	Normal file
113	archivebox/builtin_plugins/singlefile/apps.py	Normal file
@@ -0,0 +1,113 @@
+from typing import List, Optional, Dict
+from pathlib import Path
+
+from django.apps import AppConfig
+from django.core.checks import Tags, Warning, register
+
+from pydantic import (
+    Field,
+    SerializeAsAny,
+)
+
+from pydantic_pkgr import BinProvider, BinName, Binary, EnvProvider, NpmProvider
+from pydantic_pkgr.binprovider import bin_abspath
+from pydantic_pkgr.binary import BinProviderName, ProviderLookupDict
+
+from plugantic.extractors import Extractor, ExtractorName
+from plugantic.plugins import Plugin
+from plugantic.configs import ConfigSet, ConfigSectionName
+
+from pkg.settings import env
+
+
+###################### Config ##########################
+
+class SinglefileToggleConfig(ConfigSet):
+    section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES'
+
+    SAVE_SINGLEFILE: bool = True
+
+
+class SinglefileDependencyConfig(ConfigSet):
+    section: ConfigSectionName = 'DEPENDENCY_CONFIG'
+
+    SINGLEFILE_BINARY: str = Field(default='wget')
+    SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
+    SINGLEFILE_EXTRA_ARGS: List[str] = []
+    SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
+
+class SinglefileOptionsConfig(ConfigSet):
+    section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS'
+
+    # loaded from shared config
+    SINGLEFILE_USER_AGENT: str = Field(default='', alias='USER_AGENT')
+    SINGLEFILE_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
+    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
+    SINGLEFILE_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
+    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
+
+
+DEFAULT_CONFIG = {
+    'CHECK_SSL_VALIDITY': False,
+    'SAVE_SINGLEFILE': True,
+    'TIMEOUT': 120,
+}
+
+PLUGIN_CONFIG = [
+    SinglefileToggleConfig(**DEFAULT_CONFIG),
+    SinglefileDependencyConfig(**DEFAULT_CONFIG),
+    SinglefileOptionsConfig(**DEFAULT_CONFIG),
+]
+
+###################### Binaries ############################
+
+min_version: str = "1.1.54"
+max_version: str = "2.0.0"
+
+class SinglefileBinary(Binary):
+    name: BinName = 'single-file'
+    providers_supported: List[BinProvider] = [NpmProvider()]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        'env': {
+            'abspath': lambda: bin_abspath('single-file-node.js', PATH=env.PATH) or bin_abspath('single-file', PATH=env.PATH),
+        },
+        'npm': {
+            # 'abspath': lambda: bin_abspath('single-file', PATH=NpmProvider().PATH) or bin_abspath('single-file', PATH=env.PATH),
+            'subdeps': lambda: f'single-file-cli@>={min_version} <{max_version}',
+        },
+    }
+
+
+###################### Extractors ##########################
+
+class SinglefileExtractor(Extractor):
+    name: ExtractorName = 'singlefile'
+    binary: Binary = SinglefileBinary()
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(snapshot.link_dir) / 'singlefile.html'
+
+
+###################### Plugins #############################
+
+
+class SinglefilePlugin(Plugin):
+    name: str = 'singlefile'
+    configs: List[SerializeAsAny[ConfigSet]] = [*PLUGIN_CONFIG]
+    binaries: List[SerializeAsAny[Binary]] = [SinglefileBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = [SinglefileExtractor()]
+
+PLUGINS = [SinglefilePlugin()]
+
+###################### Django Apps #########################
+
+class SinglefileConfig(AppConfig):
+    name = 'builtin_plugins.singlefile'
+    verbose_name = 'SingleFile'
+
+    def ready(self):
+        pass
+        # print('Loaded singlefile plugin')
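Side note (assumes ConfigSet behaves like a standard pydantic model, which the Field(alias=...) usage above implies): passing the shared DEFAULT_CONFIG dict populates fields through their aliases, and keys that don't match any field are simply ignored:

    opts = SinglefileOptionsConfig(**DEFAULT_CONFIG)
    print(opts.SINGLEFILE_TIMEOUT)              # 120, filled in via the TIMEOUT alias
    print(opts.SINGLEFILE_CHECK_SSL_VALIDITY)   # False, via CHECK_SSL_VALIDITY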
66	archivebox/builtin_plugins/singlefile/config.yaml	Normal file
@@ -0,0 +1,66 @@
+name: singlefile
+plugin_version: '0.0.1'
+plugin_spec: '0.0.1'
+
+binaries:
+  singlefile:
+    providers:
+      - env
+      - npm
+
+commands:
+  - singlefile.exec
+  - singlefile.extract
+  - singlefile.should_extract
+  - singlefile.get_output_path
+
+extractors:
+  singlefile:
+    binary: singlefile
+    test: singlefile.should_extract
+    extract: singlefile.extract
+    output_files:
+      - singlefile.html
+
+configs:
+  ARCHIVE_METHOD_TOGGLES:
+    SAVE_SINGLEFILE:
+      type: bool
+      default: true
+
+  DEPENDENCY_CONFIG:
+    SINGLEFILE_BINARY:
+      type: str
+      default: wget
+    SINGLEFILE_ARGS:
+      type: Optional[List[str]]
+      default: null
+    SINGLEFILE_EXTRA_ARGS:
+      type: List[str]
+      default: []
+    SINGLEFILE_DEFAULT_ARGS:
+      type: List[str]
+      default:
+        - "--timeout={TIMEOUT-10}"
+
+  ARCHIVE_METHOD_OPTIONS:
+    SINGLEFILE_USER_AGENT:
+      type: str
+      default: ""
+      alias: USER_AGENT
+    SINGLEFILE_TIMEOUT:
+      type: int
+      default: 60
+      alias: TIMEOUT
+    SINGLEFILE_CHECK_SSL_VALIDITY:
+      type: bool
+      default: true
+      alias: CHECK_SSL_VALIDITY
+    SINGLEFILE_RESTRICT_FILE_NAMES:
+      type: str
+      default: windows
+      alias: RESTRICT_FILE_NAMES
+    SINGLEFILE_COOKIES_FILE:
+      type: Optional[Path]
+      default: null
+      alias: COOKIES_FILE
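Side note (hypothetical consumer, assuming standard PyYAML): the spec file above is plain YAML, so a loader would see it as nested dicts and lists:

    import yaml

    with open('archivebox/builtin_plugins/singlefile/config.yaml') as f:
        spec = yaml.safe_load(f)

    print(spec['binaries']['singlefile']['providers'])                              # ['env', 'npm']
    print(spec['configs']['ARCHIVE_METHOD_TOGGLES']['SAVE_SINGLEFILE']['default'])  # True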
3	archivebox/builtin_plugins/singlefile/tests.py	Normal file
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
@@ -31,8 +31,6 @@ import getpass
 import platform
 import shutil
 import requests
-import django
-from sqlite3 import dbapi2 as sqlite3
 
 from hashlib import md5
 from pathlib import Path
@@ -43,6 +41,11 @@ from configparser import ConfigParser
 from collections import defaultdict
 import importlib.metadata
 
+from pydantic_pkgr import SemVer
+
+import django
+from django.db.backends.sqlite3.base import Database as sqlite3
+
 from .config_stubs import (
     AttrDict,
     SimpleConfigValueDict,
@@ -52,6 +55,11 @@ from .config_stubs import (
     ConfigDefaultDict,
 )
 
+# load fallback libraries from vendor dir
+from .vendor import load_vendored_libs
+load_vendored_libs()
+
+
 
 ############################### Config Schema ##################################
 
@@ -89,13 +97,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SECRET_KEY': {'type': str, 'default': None},
         'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
         'ALLOWED_HOSTS': {'type': str, 'default': '*'}, # e.g. archivebox.example.com,archivebox2.example.com
-        'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': ''}, # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
+        'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c['BIND_ADDR'])}, # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
         'DEBUG': {'type': bool, 'default': False},
         'PUBLIC_INDEX': {'type': bool, 'default': True},
         'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
         'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
-        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
+        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 100},
         'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
-        'TIME_ZONE': {'type': str, 'default': 'UTC'},
+        'TIMEZONE': {'type': str, 'default': 'UTC'},
@@ -565,7 +573,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
 
     'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
-    'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
+    'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},
 
     'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
     'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
@@ -902,16 +910,9 @@ def bin_version(binary: Optional[str], cmd: Optional[str]=None) -> Optional[str]
     version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT).stdout.strip().decode()
 
-    # take first 3 columns of first line of version info
-    version_ptn = re.compile(r"\d+?\.\d+?\.?\d*", re.MULTILINE)
     try:
-        version_nums = version_ptn.findall(version_str.split('\n')[0])[0]
-        if version_nums:
-            return version_nums
-        else:
-            raise IndexError
-    except IndexError:
-        # take first 3 columns of first line of version info
-        return ' '.join(version_str.split('\n')[0].strip().split()[:3])
+        semver = SemVer.parse(version_str)
+        if semver:
+            return str(semver)
     except OSError:
         pass
         # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
@@ -1524,5 +1525,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
         assert sql_index_path.exists(), (
             f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
 
+        # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
+        if settings.DEBUG_LOGFIRE:
+            from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
+            SQLite3Instrumentor().instrument()
+
+            import logfire
+
+            logfire.configure()
+            logfire.instrument_django(is_sql_commentor_enabled=True)
+            logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+
     except KeyboardInterrupt:
         raise SystemExit(2)
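Side note on the bin_version() change above: SemVer.parse() (from pydantic-pkgr, imported by this diff) replaces the hand-rolled regex, so raw --version output is parsed in one call. A rough sketch of the intended behavior (the exact output format here is an assumption):

    from pydantic_pkgr import SemVer

    semver = SemVer.parse('GNU Wget 1.21.4 built on linux-gnu.')
    if semver:
        print(str(semver))   # expected: '1.21.4'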
@@ -10,12 +10,15 @@ from datetime import datetime, timezone
 from typing import Dict, Any
 
 from django.contrib import admin
-from django.db.models import Count, Q
-from django.urls import path, reverse
+from django.db.models import Count, Q, Prefetch
+from django.urls import path, reverse, resolve
+from django.utils import timezone
+from django.utils.functional import cached_property
 from django.utils.html import format_html
 from django.utils.safestring import mark_safe
 from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
+from django.core.paginator import Paginator
 from django.core.exceptions import ValidationError
 from django.conf import settings
 from django import forms
@@ -126,23 +129,100 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad
 archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
 
 
+class AccelleratedPaginator(Paginator):
+    """
+    Accellerated Pagniator ignores DISTINCT when counting total number of rows.
+    Speeds up SELECT Count(*) on Admin views by >20x.
+    https://hakibenita.com/optimizing-the-django-admin-paginator
+    """
+
+    @cached_property
+    def count(self):
+        if self.object_list._has_filters():  # type: ignore
+            # fallback to normal count method on filtered queryset
+            return super().count
+        else:
+            # otherwise count total rows in a separate fast query
+            return self.object_list.model.objects.count()
+
+        # Alternative approach for PostgreSQL: fallback count takes > 200ms
+        # from django.db import connection, transaction, OperationalError
+        # with transaction.atomic(), connection.cursor() as cursor:
+        #     cursor.execute('SET LOCAL statement_timeout TO 200;')
+        #     try:
+        #         return super().count
+        #     except OperationalError:
+        #         return 9999999999999
+
 
 class ArchiveResultInline(admin.TabularInline):
     name = 'Archive Results Log'
     model = ArchiveResult
     parent_model = Snapshot
     # fk_name = 'snapshot'
-    extra = 1
-    readonly_fields = ('result_id', 'start_ts', 'end_ts', 'extractor', 'command', 'cmd_version')
-    fields = ('id', *readonly_fields, 'status', 'output')
+    extra = 0
+    sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
+    readonly_fields = ('result_id', 'completed', 'extractor', 'command', 'version')
+    fields = ('id', 'start_ts', 'end_ts', *readonly_fields, 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output')
+    # exclude = ('id',)
+    ordering = ('end_ts',)
+    show_change_link = True
+    # # classes = ['collapse']
+    # # list_display_links = ['abid']
+
+    def get_parent_object_from_request(self, request):
+        resolved = resolve(request.path_info)
+        return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
+
+    @admin.display(
+        description='Completed',
+        ordering='end_ts',
+    )
+    def completed(self, obj):
+        return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
 
     def result_id(self, obj):
-        return format_html('<a href="{}"><small><code>[{}]</code></small></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
+        return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
 
     def command(self, obj):
         return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
 
+    def version(self, obj):
+        return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
+
+    def get_formset(self, request, obj=None, **kwargs):
+        formset = super().get_formset(request, obj, **kwargs)
+        snapshot = self.get_parent_object_from_request(request)
+
+        # import ipdb; ipdb.set_trace()
+        formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
+
+        # default values for new entries
+        formset.form.base_fields['status'].initial = 'succeeded'
+        formset.form.base_fields['start_ts'].initial = timezone.now()
+        formset.form.base_fields['end_ts'].initial = timezone.now()
+        formset.form.base_fields['cmd_version'].initial = '-'
+        formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
+        formset.form.base_fields['created_by'].initial = request.user
+        formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
+        formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
+
+        if obj is not None:
+            # hidden values for existing entries and new entries
+            formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
+            formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
+            formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
+            formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
+            formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
+            formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
+        return formset
+
+    def get_readonly_fields(self, request, obj=None):
+        if obj is not None:
+            return self.readonly_fields
+        else:
+            return []
+
 
 class TagInline(admin.TabularInline):
     model = Tag.snapshot_set.through  # type: ignore
@@ -222,25 +302,22 @@ def get_abid_info(self, obj):
 
 @admin.register(Snapshot, site=archivebox_admin)
 class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
     class Meta:
         model = Snapshot
 
     list_display = ('added', 'title_str', 'files', 'size', 'url_str')
     # list_editable = ('title',)
     sort_fields = ('title_str', 'url_str', 'added', 'files')
-    readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
+    readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
     search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name')
-    list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags')
+    list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name')
     fields = ('url', 'created_by', 'title', *readonly_fields)
     ordering = ['-added']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
-    autocomplete_fields = ['tags']
     inlines = [TagInline, ArchiveResultInline]
-    list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
+    list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000)
 
     action_form = SnapshotActionForm
+    paginator = AccelleratedPaginator
 
     save_on_top = True
+    show_full_result_count = False
 
     def changelist_view(self, request, extra_context=None):
         extra_context = extra_context or {}
@@ -286,12 +363,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         ]
         return custom_urls + urls
 
-    def get_queryset(self, request):
-        self.request = request
-        return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult'))
+    # def get_queryset(self, request):
+    #     # tags_qs = SnapshotTag.objects.all().select_related('tag')
+    #     # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
+
+    #     self.request = request
+    #     return super().get_queryset(request).prefetch_related('archiveresult_set').distinct()  # .annotate(archiveresult_count=Count('archiveresult'))
 
     def tag_list(self, obj):
-        return ', '.join(obj.tags.values_list('name', flat=True))
+        return ', '.join(tag.name for tag in obj.tags.all())
 
     # TODO: figure out a different way to do this, you cant nest forms so this doenst work
     # def action(self, obj):
@@ -360,21 +440,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         ordering='title',
     )
     def title_str(self, obj):
-        canon = obj.as_link().canonical_outputs()
         tags = ''.join(
-            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
+            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.pk, tag.name)
             for tag in obj.tags.all()
-            if str(tag).strip()
+            if str(tag.name).strip()
         )
         return format_html(
             '<a href="/{}">'
-            '<img src="/{}/{}" class="favicon" onerror="this.remove()">'
+            '<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
            '</a>'
            '<a href="/{}/index.html">'
            '<b class="status-{}">{}</b>'
            '</a>',
            obj.archive_path,
-            obj.archive_path, canon['favicon_path'],
+            obj.archive_path,
            obj.archive_path,
            'fetched' if obj.latest_title or obj.title else 'pending',
            urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
@@ -382,14 +461,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
 
     @admin.display(
         description='Files Saved',
-        ordering='archiveresult_count',
+        # ordering='archiveresult_count',
     )
     def files(self, obj):
         return snapshot_icons(obj)
 
 
     @admin.display(
-        ordering='archiveresult_count'
+        # ordering='archiveresult_count'
     )
     def size(self, obj):
         archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
@@ -536,6 +615,8 @@ class TagAdmin(ABIDModelAdmin):
     actions = ['delete_selected']
     ordering = ['-created']
 
+    paginator = AccelleratedPaginator
+
     def API(self, obj):
         try:
             return get_abid_info(self, obj)
@@ -575,6 +656,8 @@ class ArchiveResultAdmin(ABIDModelAdmin):
     ordering = ['-start_ts']
     list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
 
+    paginator = AccelleratedPaginator
+
     @admin.display(
         description='Snapshot Info'
     )
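Side note (standalone sketch, not the committed code): the trick behind AccelleratedPaginator is that Django admin calls paginator.count once per changelist render, and an unfiltered changelist can be answered with a cheap COUNT(*) over the whole table instead of counting a JOINed/DISTINCT queryset. A generic version using only public queryset APIs (hypothetical class name):

    from django.core.paginator import Paginator
    from django.utils.functional import cached_property

    class FastCountPaginator(Paginator):
        @cached_property
        def count(self):
            qs = self.object_list
            if qs.query.where:                   # filters applied: count the real queryset
                return qs.count()
            return qs.model.objects.count()      # unfiltered: plain COUNT(*) on the table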
@@ -4,7 +4,7 @@ from django import forms
 
 from ..util import URL_REGEX
 from ..parsers import PARSERS
-from ..vendor.taggit_utils import edit_string_for_tags, parse_tags
+from taggit.utils import edit_string_for_tags, parse_tags
 
 PARSER_CHOICES = [
     (parser_key, parser[0])
@@ -52,7 +52,7 @@ def update_snapshot_ids(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
     num_total = Snapshot.objects.all().count()
     print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
-    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
+    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
         assert snapshot.abid
         snapshot.abid_prefix = 'snp_'
         snapshot.abid_ts_src = 'self.added'
@@ -72,7 +72,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
         assert result.abid
         result.abid_prefix = 'res_'
         result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
@@ -11,7 +11,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
         assert result.abid
         result.uuid = ABID.parse(result.abid).uuid
         result.save(update_fields=["uuid"])
@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f' Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
         assert snapshottag.snapshot_old_id
         snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
         snapshottag.snapshot_id = snapshot.id
@@ -49,7 +49,7 @@ def update_archiveresult_ids(apps, schema_editor):
     Tag = apps.get_model("core", "Tag")
     num_total = Tag.objects.all().count()
     print(f' Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
-    for idx, tag in enumerate(Tag.objects.all().iterator()):
+    for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
         if not tag.slug:
             tag.slug = tag.name.lower().replace(' ', '_')
         if not tag.name:
@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f' Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
         assert snapshottag.old_tag_id
         tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
         snapshottag.tag_id = tag.id
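Side note on the repeated .iterator(chunk_size=500) change above: .iterator() streams rows instead of materializing and caching the whole queryset, and chunk_size bounds how many rows are fetched per database round-trip, so peak memory stays roughly constant regardless of collection size. Illustrative pattern (the loop body stands in for the migration work above):

    for snapshot in Snapshot.objects.only('abid').iterator(chunk_size=500):
        migrate_one(snapshot)   # hypothetical per-row work; ~500 rows in memory at a time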
@@ -0,0 +1,35 @@
+# Generated by Django 5.1 on 2024-08-28 09:40
+
+import abid_utils.models
+import django.utils.timezone
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0068_alter_archiveresult_options'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='created',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='added',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='created',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='tag',
+            name='created',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+    ]
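Side note (assumption based on standard Django migration machinery): the generated migration can reference the new field by its dotted path because custom fields are serialized via Field.deconstruct():

    field = AutoDateTimeField(default=timezone.now, db_index=True)
    name, path, args, kwargs = field.deconstruct()
    print(path)     # 'abid_utils.models.AutoDateTimeField'
    print(kwargs)   # roughly {'default': timezone.now, 'db_index': True}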
@@ -12,6 +12,7 @@ from uuid import uuid4
 from pathlib import Path
 
 from django.db import models
+from django.utils import timezone
 from django.utils.functional import cached_property
 from django.utils.text import slugify
 from django.core.cache import cache
@@ -19,7 +20,7 @@ from django.urls import reverse, reverse_lazy
 from django.db.models import Case, When, Value, IntegerField
 from django.conf import settings
 
-from abid_utils.models import ABIDModel, ABIDField
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
 
 from ..system import get_dir_size
 from ..util import parse_date, base_url
@@ -50,7 +51,7 @@ class Tag(ABIDModel):
     Based on django-taggit model + ABID base.
     """
     abid_prefix = 'tag_'
-    abid_ts_src = 'self.created'    # TODO: add created/modified time
+    abid_ts_src = 'self.created'
     abid_uri_src = 'self.slug'
     abid_subtype_src = '"03"'
     abid_rand_src = 'self.old_id'
@@ -60,7 +61,6 @@ class Tag(ABIDModel):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
     abid = ABIDField(prefix=abid_prefix)
 
-
     name = models.CharField(unique=True, blank=False, max_length=100)
     slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
     # slug is autoset on save from name, never set it manually
@@ -125,6 +125,12 @@ class SnapshotTag(models.Model):
         db_table = 'core_snapshot_tags'
         unique_together = [('snapshot', 'tag')]
 
+
+class SnapshotManager(models.Manager):
+    def get_queryset(self):
+        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
+
+
 class Snapshot(ABIDModel):
     abid_prefix = 'snp_'
     abid_ts_src = 'self.added'
@@ -143,16 +149,15 @@ class Snapshot(ABIDModel):
 
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
 
-    added = models.DateTimeField(auto_now_add=True, db_index=True)
+    added = AutoDateTimeField(default=timezone.now, db_index=True)
     updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
 
     keys = ('url', 'timestamp', 'title', 'tags', 'updated')
 
     archiveresult_set: models.Manager['ArchiveResult']
 
-    @property
-    def uuid(self):
-        return self.id
+    objects = SnapshotManager()
 
     def __repr__(self) -> str:
         title = (self.title_stripped or '-')[:64]
@@ -162,13 +167,6 @@ class Snapshot(ABIDModel):
         title = (self.title_stripped or '-')[:64]
         return f'[{self.timestamp}] {self.url[:64]} ({title})'
 
-    def save(self, *args, **kwargs):
-        super().save(*args, **kwargs)
-        try:
-            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
-        except AssertionError as e:
-            print(e)
-
     @classmethod
     def from_json(cls, info: dict):
         info = {k: v for k, v in info.items() if k in cls.keys}
@@ -177,8 +175,7 @@ class Snapshot(ABIDModel):
     def as_json(self, *args) -> dict:
         args = args or self.keys
         return {
-            key: getattr(self, key)
-            if key != 'tags' else self.tags_str()
+            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
             for key in args
         }
@@ -190,8 +187,14 @@ class Snapshot(ABIDModel):
         return load_link_details(self.as_link())
 
     def tags_str(self, nocache=True) -> str | None:
-        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
         cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
+        calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
+
+        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
+            # tags are pre-fetched already, use them directly (best because db is always freshest)
+            tags_str = calc_tags_str()
+            return tags_str
 
         if nocache:
             tags_str = calc_tags_str()
             cache.set(cache_key, tags_str)
@@ -234,7 +237,10 @@ class Snapshot(ABIDModel):
 
     @cached_property
     def num_outputs(self) -> int:
-        return self.archiveresult_set.filter(status='succeeded').count()
+        # DONT DO THIS: it will trigger a separate query for every snapshot
+        # return self.archiveresult_set.filter(status='succeeded').count()
+        # this is better:
+        return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded'))
 
     @cached_property
     def base_url(self):
@@ -262,10 +268,21 @@ class Snapshot(ABIDModel):
 
     @cached_property
     def thumbnail_url(self) -> Optional[str]:
+        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+            result = (sorted(
+                (
+                    result
+                    for result in self.archiveresult_set.all()
+                    if result.extractor == 'screenshot' and result.status =='succeeded' and result.output
+                ),
+                key=lambda result: result.created,
+            ) or [None])[-1]
+        else:
+            result = self.archiveresult_set.filter(
+                extractor='screenshot',
+                status='succeeded'
+            ).only('output').last()
+
         if result:
             return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
         return None
@@ -292,6 +309,21 @@ class Snapshot(ABIDModel):
         if self.title:
             return self.title   # whoopdedoo that was easy
 
+        # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
+        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+            try:
+                return (sorted(
+                    (
+                        result.output.strip()
+                        for result in self.archiveresult_set.all()
+                        if result.extractor == 'title' and result.status =='succeeded' and result.output
+                    ),
+                    key=lambda title: len(title),
+                ) or [None])[-1]
+            except IndexError:
+                pass
+
         try:
             # take longest successful title from ArchiveResult db history
             return sorted(
@@ -355,12 +387,23 @@ class Snapshot(ABIDModel):
 
 class ArchiveResultManager(models.Manager):
     def indexable(self, sorted: bool = True):
+        """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
+
         INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
-        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
+        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')
 
         if sorted:
-            precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
-            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+            precedence = [
+                When(extractor=method, then=Value(precedence))
+                for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
+            ]
+            qs = qs.annotate(
+                indexing_precedence=Case(
+                    *precedence,
+                    default=Value(1000),
+                    output_field=IntegerField()
+                )
+            ).order_by('indexing_precedence')
         return qs
 
 class ArchiveResult(ABIDModel):
@@ -418,17 +461,6 @@ class ArchiveResult(ABIDModel):
     def __str__(self):
         return self.extractor
 
-    def save(self, *args, **kwargs):
-        super().save(*args, **kwargs)
-        try:
-            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
-        except AssertionError as e:
-            print(e)
-
-    @property
-    def uuid(self):
-        return self.id
-
     @cached_property
     def snapshot_dir(self):
         return Path(self.snapshot.link_dir)
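Side note (generic illustration of the N+1 problem these model changes avoid): checking _prefetched_objects_cache lets num_outputs, tags_str, and thumbnail_url reuse rows already loaded by prefetch_related instead of issuing a fresh query per Snapshot:

    # N+1: one COUNT query per snapshot rendered in the changelist
    for snapshot in Snapshot.objects.all():
        n = snapshot.archiveresult_set.filter(status='succeeded').count()

    # Two queries total: counting happens in Python over the prefetched rows
    for snapshot in Snapshot.objects.prefetch_related('archiveresult_set'):
        n = sum(1 for r in snapshot.archiveresult_set.all() if r.status == 'succeeded')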
@@ -4,7 +4,9 @@ import os
 import sys
 import re
 import logging
+import inspect
+import tempfile
 from typing import Any, Dict
 
 from pathlib import Path
 from django.utils.crypto import get_random_string
@@ -33,22 +35,20 @@ APPEND_SLASH = True
 DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
 
 
 # add plugins folders to system path, and load plugins in installed_apps
-BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'plugins'
-USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'plugins'
-sys.path.insert(0, str(BUILTIN_PLUGINS_DIR))
-sys.path.insert(0, str(USER_PLUGINS_DIR))
+BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'builtin_plugins'
+USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'user_plugins'
 
-def find_plugins(plugins_dir):
-    return {
-        # plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA
-        plugin_entrypoint.parent.name: plugin_entrypoint.parent
+def find_plugins(plugins_dir, prefix: str) -> Dict[str, Any]:
+    plugins = {
+        f'{prefix}.{plugin_entrypoint.parent.name}': plugin_entrypoint.parent
         for plugin_entrypoint in plugins_dir.glob('*/apps.py')
     }
+    # print(f'Found {prefix} plugins:\n', '\n '.join(plugins.keys()))
+    return plugins
 
 INSTALLED_PLUGINS = {
-    **find_plugins(BUILTIN_PLUGINS_DIR),
-    **find_plugins(USER_PLUGINS_DIR),
+    **find_plugins(BUILTIN_PLUGINS_DIR, prefix='builtin_plugins'),
+    **find_plugins(USER_PLUGINS_DIR, prefix='user_plugins'),
 }
@@ -66,11 +66,11 @@ INSTALLED_APPS = [
     'plugantic',
     'core',
     'api',
+    'pkg',
 
     *INSTALLED_PLUGINS.keys(),
 
     'admin_data_views',
 
     'django_extensions',
 ]
@@ -144,64 +144,6 @@ if CONFIG.LDAP:
     # sys.exit(1)
 
 
-################################################################################
-### Debug Settings
-################################################################################
-
-# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
-DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
-if DEBUG_TOOLBAR:
-    try:
-        import debug_toolbar   # noqa
-        DEBUG_TOOLBAR = True
-    except ImportError:
-        DEBUG_TOOLBAR = False
-
-if DEBUG_TOOLBAR:
-    INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
-    INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
-    DEBUG_TOOLBAR_CONFIG = {
-        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
-        "RENDER_PANELS": True,
-    }
-    DEBUG_TOOLBAR_PANELS = [
-        'debug_toolbar.panels.history.HistoryPanel',
-        'debug_toolbar.panels.versions.VersionsPanel',
-        'debug_toolbar.panels.timer.TimerPanel',
-        'debug_toolbar.panels.settings.SettingsPanel',
-        'debug_toolbar.panels.headers.HeadersPanel',
-        'debug_toolbar.panels.request.RequestPanel',
-        'debug_toolbar.panels.sql.SQLPanel',
-        'debug_toolbar.panels.staticfiles.StaticFilesPanel',
-        # 'debug_toolbar.panels.templates.TemplatesPanel',
-        'debug_toolbar.panels.cache.CachePanel',
-        'debug_toolbar.panels.signals.SignalsPanel',
-        'debug_toolbar.panels.logging.LoggingPanel',
-        'debug_toolbar.panels.redirects.RedirectsPanel',
-        'debug_toolbar.panels.profiling.ProfilingPanel',
-        'djdt_flamegraph.FlamegraphPanel',
-    ]
-    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
-
-if DEBUG:
-    from django_autotyping.typing import AutotypingSettingsDict
-
-    INSTALLED_APPS += ['django_autotyping']
-    AUTOTYPING: AutotypingSettingsDict = {
-        "STUBS_GENERATION": {
-            "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
-        }
-    }
-
-# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
-# Must delete archivebox/templates/admin to use because it relies on some things we override
-# visit /__requests_tracker__/ to access
-DEBUG_REQUESTS_TRACKER = False
-if DEBUG_REQUESTS_TRACKER:
-    INSTALLED_APPS += ["requests_tracker"]
-    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
-    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
-
 
 ################################################################################
 ### Staticfile and Template Settings
@@ -317,13 +259,15 @@ STORAGES = {
 SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
 
 ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
-CSRF_TRUSTED_ORIGINS = CONFIG.CSRF_TRUSTED_ORIGINS.split(',')
+CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
 
 # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
 # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
-if CONFIG.ALLOWED_HOSTS != '*' and (not CSRF_TRUSTED_ORIGINS):
-    for hostname in ALLOWED_HOSTS:
-        CSRF_TRUSTED_ORIGINS.append(f'https://{hostname}')
+for hostname in ALLOWED_HOSTS:
+    https_endpoint = f'https://{hostname}'
+    if hostname != '*' and https_endpoint not in CSRF_TRUSTED_ORIGINS:
+        print(f'[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS')
+        CSRF_TRUSTED_ORIGINS.append(https_endpoint)
 
 SECURE_BROWSER_XSS_FILTER = True
 SECURE_CONTENT_TYPE_NOSNIFF = True
@@ -345,6 +289,8 @@ AUTH_PASSWORD_VALIDATORS = [
     {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
 ]
 
+DATA_UPLOAD_MAX_NUMBER_FIELDS = None
+
 ################################################################################
 ### Shell Settings
 ################################################################################
@@ -385,6 +331,10 @@ IGNORABLE_404_URLS = [
     re.compile(r'robots\.txt$'),
     re.compile(r'.*\.(css|js)\.map$'),
 ]
+IGNORABLE_200_URLS = [
+    re.compile(r'^"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M),
+    re.compile(r'^"GET /admin/jsi18n/ HTTP/.*" (200|30.) .+', re.I | re.M),
+]
 
 class NoisyRequestsFilter(logging.Filter):
     def filter(self, record) -> bool:
@@ -396,19 +346,26 @@ class NoisyRequestsFilter(logging.Filter):
             if ignorable_log_pattern.match(logline):
                 return False
 
             ignorable_log_pattern = re.compile(f'^Not Found: /.*/?{ignorable_url_pattern.pattern}', re.I | re.M)
             if ignorable_log_pattern.match(logline):
                 return False
 
-        # ignore staticfile requests that 200 or 30*
-        ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
-        if ignoreable_200_log_pattern.match(logline):
+        for ignorable_url_pattern in IGNORABLE_200_URLS:
+            if ignorable_log_pattern.match(logline):
                 return False
 
         return True
 
 
-ERROR_LOG = tempfile.NamedTemporaryFile().name
-
 if CONFIG.LOGS_DIR.exists():
     ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log')
+else:
+    # historically too many edge cases here around creating log dir w/ correct permissions early on
+    # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
+    ERROR_LOG = tempfile.NamedTemporaryFile().name
+    print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
 
 LOGGING = {
     'version': 1,
@@ -445,6 +402,10 @@ LOGGING = {
 }
 
 
+################################################################################
+### REST API Outbound Webhooks settings
+################################################################################
+
 # Add default webhook configuration to the User model
 SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
 SIGNAL_WEBHOOKS = {
@@ -458,7 +419,9 @@ SIGNAL_WEBHOOKS = {
     },
 }
 
-DATA_UPLOAD_MAX_NUMBER_FIELDS = None
+################################################################################
+### Admin Data View Settings
+################################################################################
 
 ADMIN_DATA_VIEWS = {
     "NAME": "Environment",
@@ -495,3 +458,86 @@ ADMIN_DATA_VIEWS = {
     },
   ],
 }
+
+
+################################################################################
+### Debug Settings
+################################################################################
+
+# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
+DEBUG_TOOLBAR = False
+DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
+if DEBUG_TOOLBAR:
+    try:
+        import debug_toolbar   # noqa
+        DEBUG_TOOLBAR = True
+    except ImportError:
+        DEBUG_TOOLBAR = False
+
+if DEBUG_TOOLBAR:
+    INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
+    INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
+    DEBUG_TOOLBAR_CONFIG = {
+        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
+        "RENDER_PANELS": True,
+    }
+    DEBUG_TOOLBAR_PANELS = [
+        'debug_toolbar.panels.history.HistoryPanel',
+        'debug_toolbar.panels.versions.VersionsPanel',
+        'debug_toolbar.panels.timer.TimerPanel',
+        'debug_toolbar.panels.settings.SettingsPanel',
+        'debug_toolbar.panels.headers.HeadersPanel',
+        'debug_toolbar.panels.request.RequestPanel',
+        'debug_toolbar.panels.sql.SQLPanel',
+        'debug_toolbar.panels.staticfiles.StaticFilesPanel',
+        # 'debug_toolbar.panels.templates.TemplatesPanel',
+        'debug_toolbar.panels.cache.CachePanel',
+        'debug_toolbar.panels.signals.SignalsPanel',
+        'debug_toolbar.panels.logging.LoggingPanel',
+        'debug_toolbar.panels.redirects.RedirectsPanel',
+        'debug_toolbar.panels.profiling.ProfilingPanel',
+        'djdt_flamegraph.FlamegraphPanel',
+    ]
+    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
+
+if DEBUG:
+    from django_autotyping.typing import AutotypingSettingsDict
+
+    INSTALLED_APPS += ['django_autotyping']
+    AUTOTYPING: AutotypingSettingsDict = {
+        "STUBS_GENERATION": {
+            "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
+        }
+    }
+
+# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
+# Must delete archivebox/templates/admin to use because it relies on some things we override
+# visit /__requests_tracker__/ to access
+DEBUG_REQUESTS_TRACKER = True
+DEBUG_REQUESTS_TRACKER = DEBUG_REQUESTS_TRACKER and DEBUG
+if DEBUG_REQUESTS_TRACKER:
+    import requests_tracker
+
+    INSTALLED_APPS += ["requests_tracker"]
+    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
+    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
+
+    TEMPLATE_DIRS.insert(0, str(Path(inspect.getfile(requests_tracker)).parent / "templates"))
+
+    REQUESTS_TRACKER_CONFIG = {
+        "TRACK_SQL": True,
+        "ENABLE_STACKTRACES": False,
+        "IGNORE_PATHS_PATTERNS": (
+            r".*/favicon\.ico",
+            r".*\.png",
+            r"/admin/jsi18n/",
+        ),
+        "IGNORE_SQL_PATTERNS": (
+            r"^SELECT .* FROM django_migrations WHERE app = 'requests_tracker'",
+            r"^SELECT .* FROM django_migrations WHERE app = 'auth'",
+        ),
+    }
+
+# https://docs.pydantic.dev/logfire/integrations/django/ (similar to DataDog / NewRelic / etc.)
+DEBUG_LOGFIRE = False
+DEBUG_LOGFIRE = DEBUG_LOGFIRE and (Path(CONFIG.OUTPUT_DIR) / '.logfire').is_dir()
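Side note (hypothetical directory layout): with the new prefixed discovery, INSTALLED_PLUGINS maps dotted app labels to plugin directories, e.g.:

    INSTALLED_PLUGINS = {
        'builtin_plugins.base':       Path('/app/archivebox/builtin_plugins/base'),
        'builtin_plugins.singlefile': Path('/app/archivebox/builtin_plugins/singlefile'),
        'user_plugins.myplugin':      Path('/data/user_plugins/myplugin'),   # example user plugin
    }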
@@ -218,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
         get_link = lambda x: x.as_link_with_details()
-        all_links = all_links.iterator()
+        all_links = all_links.iterator(chunk_size=500)
     else:
         num_links: int = len(all_links)
         get_link = lambda x: x
@@ -197,7 +197,7 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:


 @enforce_types
-def wget_output_path(link: Link) -> Optional[str]:
+def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.


@@ -245,6 +245,15 @@ def wget_output_path(link: Link) -> Optional[str]:
     # https://example.com/abc/test/?v=zzVa_tX1OiI
     # > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
+
+    cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path'
+
+    if not nocache:
+        from django.core.cache import cache
+        cached_result = cache.get(cache_key)
+        if cached_result:
+            return cached_result
+
     # There's also lots of complexity around how the urlencoding and renaming
     # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
     # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than

@@ -271,6 +280,8 @@ def wget_output_path(link: Link) -> Optional[str]:
         output_path = None

     if output_path:
+        if not nocache:
+            cache.set(cache_key, output_path)
         return output_path

     # fallback to just the domain dir
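The memoization added to wget_output_path() is the standard Django low-level cache recipe: derive a key from the inputs, try cache.get(), fall back to the expensive computation, then cache.set() the result. A minimal sketch of the same get/compute/set shape (expensive_lookup and compute are illustrative names, not ArchiveBox APIs):

    from django.core.cache import cache

    def expensive_lookup(key: str, nocache: bool=False) -> str:
        cache_key = f'demo:{key}'
        if not nocache:
            cached_result = cache.get(cache_key)
            if cached_result:
                return cached_result
        result = compute(key)            # hypothetical expensive work
        if not nocache:
            cache.set(cache_key, result)
        return result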
@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type

 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in links

@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option

 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_archived, links)

@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio

 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)

@@ -448,7 +448,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option

 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)

@@ -475,7 +475,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
         if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )

-    for path in chain(snapshots.iterator(), data_folders):
+    for path in chain(snapshots.iterator(chunk_size=500), data_folders):
         link = None
         if type(path) is not str:
             path = path.as_link().link_dir

@@ -518,7 +518,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=500):
         link = snapshot.as_link()
         if is_corrupt(link):
             corrupted[link.link_dir] = link
@@ -124,7 +124,15 @@ def snapshot_icons(snapshot) -> str:
     from core.models import ArchiveResult
     # start = datetime.now(timezone.utc)

-    archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+    if hasattr(snapshot, '_prefetched_objects_cache') and 'archiveresult_set' in snapshot._prefetched_objects_cache:
+        archive_results = [
+            result
+            for result in snapshot.archiveresult_set.all()
+            if result.status == "succeeded" and result.output
+        ]
+    else:
+        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)

     link = snapshot.as_link()
     path = link.archive_path
     canon = link.canonical_outputs()
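The hasattr(snapshot, '_prefetched_objects_cache') branch lets callers that already ran prefetch_related() get icons with zero extra queries, while uncached callers keep the old filtered query. Sketch of the intended call pattern (hypothetical usage):

    from core.models import Snapshot

    # prefetch_related() fills snapshot._prefetched_objects_cache, so the
    # in-Python filter branch in snapshot_icons() runs without N+1 queries:
    snapshots = Snapshot.objects.all().prefetch_related('archiveresult_set')
    for snapshot in snapshots:
        html = snapshot_icons(snapshot)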
@@ -37,9 +37,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
 @enforce_types
 def write_link_to_sql_index(link: Link, created_by_id: int | None=None):
     from core.models import Snapshot, ArchiveResult
+    from abid_utils.models import get_or_create_system_user_pk
+
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}

-    info['created_by_id'] = created_by_id
+    info['created_by_id'] = created_by_id or get_or_create_system_user_pk()

     tag_list = list(dict.fromkeys(
         tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
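The tag_list construction relies on dict.fromkeys() as an order-preserving deduplicator, which is guaranteed because dicts keep insertion order since Python 3.7. For example:

    tags = ['news', 'tech', 'news', 'python']
    unique_tags = list(dict.fromkeys(tags))
    assert unique_tags == ['news', 'tech', 'python']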
@@ -960,7 +960,8 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
         run_subcommand('init', stdin=None, pwd=out_dir)

     setup_django(out_dir=out_dir, check_db=True)
-    from core.models import User
+    from django.contrib.auth import get_user_model
+    User = get_user_model()

     if not User.objects.filter(is_superuser=True).exists():
         stderr('\n[+] Creating new admin user for the Web UI...', color='green')

@@ -979,16 +980,16 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
             '--upgrade',
             '--no-cache-dir',
             '--no-warn-script-location',
-            'youtube_dl',
+            'yt-dlp',
         ], capture_output=False, cwd=out_dir)
         pkg_path = run_shell([
             PYTHON_BINARY, '-m', 'pip',
             'show',
-            'youtube_dl',
+            'yt-dlp',
         ], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
-        NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'youtube_dl' / '__main__.py'
+        NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt-dlp' / '__main__.py'
         os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
-        assert NEW_YOUTUBEDL_BINARY.exists(), f'youtube_dl must exist inside {pkg_path}'
+        assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
         config(f'YOUTUBEDL_BINARY={NEW_YOUTUBEDL_BINARY}', set=True, out_dir=out_dir)
     except BaseException as e:  # lgtm [py/catch-base-exception]
         stderr(f'[X] Failed to install python packages: {e}', color='red')
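Swapping the hardcoded `from core.models import User` for get_user_model() follows Django's recommended way to reference the user model, so setup() keeps working even if settings.AUTH_USER_MODEL is swapped out later. The resolved model behaves identically:

    from django.contrib.auth import get_user_model

    User = get_user_model()  # resolves settings.AUTH_USER_MODEL at call time
    has_admin = User.objects.filter(is_superuser=True).exists()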
archivebox/package-lock.json (generated, 20 changed lines)

@@ -11,7 +11,7 @@
     "dependencies": {
       "@postlight/parser": "^2.2.3",
       "readability-extractor": "github:ArchiveBox/readability-extractor",
-      "single-file-cli": "^1.1.54"
+      "single-file-cli": "^2.0.58"
     }
   },
   "node_modules/@asamuzakjp/dom-selector": {

@@ -236,9 +236,9 @@
       "license": "MIT"
     },
     "node_modules/@types/node": {
-      "version": "22.5.0",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
-      "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
+      "version": "22.5.1",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
+      "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
       "license": "MIT",
       "optional": true,
       "dependencies": {

@@ -353,9 +353,9 @@
       }
     },
     "node_modules/aws4": {
-      "version": "1.13.1",
-      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
-      "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
+      "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
       "license": "MIT"
     },
     "node_modules/b4a": {

@@ -2376,9 +2376,9 @@
       }
     },
     "node_modules/tslib": {
-      "version": "2.6.3",
-      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
-      "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
+      "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
       "license": "0BSD"
     },
     "node_modules/turndown": {
@@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional
 from configparser import ConfigParser

 from pathlib import Path
-from ..vendor.pocket import Pocket
+from pocket import Pocket

 from ..index.schema import Link
 from ..util import enforce_types
archivebox/pkg/__init__.py (new file, 0 lines)

archivebox/pkg/admin.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from django.contrib import admin

# Register your models here.

archivebox/pkg/apps.py (new file, 14 lines)
@@ -0,0 +1,14 @@
__package__ = 'archivebox.pkg'

from django.apps import AppConfig


class PkgsConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'pkg'

    def ready(self):
        from .settings import LOADED_DEPENDENCIES

        # print(LOADED_DEPENDENCIES)

archivebox/pkg/management/__init__.py (new file, 0 lines)
archivebox/pkg/management/commands/__init__.py (new file, 0 lines)

archivebox/pkg/management/commands/pkg.py (new file, 75 lines)
@@ -0,0 +1,75 @@
__package__ = 'archivebox.pkg.management.commands'

from django.core.management.base import BaseCommand
from django.conf import settings

from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
from pydantic_pkgr.binprovider import bin_abspath

from ....config import NODE_BIN_PATH, bin_path

from plugantic.plugins import LOADED_PLUGINS

from pkg.settings import env


class Command(BaseCommand):
    def handle(self, *args, method, **options):
        method(*args, **options)

    def add_arguments(self, parser):
        subparsers = parser.add_subparsers(title="sub-commands", required=True)

        list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
        list_parser.set_defaults(method=self.list)

        install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
        install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
        install_parser.add_argument("package_names", nargs="+", type=str)
        install_parser.set_defaults(method=self.install)

    def list(self, *args, **options):
        self.stdout.write('################# PLUGINS ####################')
        for plugin in LOADED_PLUGINS:
            self.stdout.write(f'{plugin.name}:')
            for binary in plugin.binaries:
                try:
                    binary = binary.install()
                except Exception as e:
                    # import ipdb; ipdb.set_trace()
                    raise
                self.stdout.write(f'    {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}')

        self.stdout.write('\n################# LEGACY ####################')
        for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
            bin_name = settings.CONFIG[bin_key]

            self.stdout.write(f'{bin_key}: {bin_name}')

            # binary = Binary(name=package_name, providers=[env])
            # print(binary)

            # try:
            #     loaded_bin = binary.load()
            #     self.stdout.write(
            #         self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
            #     )
            # except Exception as e:
            #     self.stderr.write(
            #         self.style.ERROR(f"Error loading {package_name}: {e}")
            #     )

    def install(self, *args, **options):
        for package_name in options["package_names"]:
            binary = Binary(name=package_name, providers=[env])
            print(binary)

            try:
                loaded_bin = binary.load()
                self.stdout.write(
                    self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
                )
            except Exception as e:
                self.stderr.write(
                    self.style.ERROR(f"Error loading {package_name}: {e}")
                )
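The new pkg management command plugs into Django's normal command dispatch, so it can be driven from code as well as the CLI. A hypothetical invocation sketch (assumes the pkg app is in INSTALLED_APPS and settings are configured; roughly equivalent to `archivebox manage pkg ...` on the CLI):

    from django.core.management import call_command

    call_command('pkg', 'list')                       # print plugin + legacy binaries
    call_command('pkg', 'install', 'wget', 'yt-dlp')  # load each name via the env provider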
archivebox/pkg/migrations/__init__.py (new file, 0 lines)

archivebox/pkg/models.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from django.db import models

# Create your models here.

archivebox/pkg/settings.py (new file, 86 lines)
@@ -0,0 +1,86 @@
__package__ = 'archivebox.pkg'

import os
import sys
import shutil
import inspect
from pathlib import Path

import django
from django.conf import settings
from django.db.backends.sqlite3.base import Database as sqlite3

from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
from pydantic_pkgr.binprovider import bin_abspath

from ..config import NODE_BIN_PATH, bin_path

env = EnvProvider(PATH=NODE_BIN_PATH + ':' + os.environ.get('PATH', '/bin'))


LOADED_DEPENDENCIES = {}

for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
    # 'PYTHON_BINARY': {
    #     'path': bin_path(config['PYTHON_BINARY']),
    #     'version': config['PYTHON_VERSION'],
    #     'hash': bin_hash(config['PYTHON_BINARY']),
    #     'enabled': True,
    #     'is_valid': bool(config['PYTHON_VERSION']),
    # },

    bin_name = settings.CONFIG[bin_key]

    if bin_name.endswith('django/__init__.py'):
        binary_spec = Binary(name='django', providers=[env], provider_overrides={
            'env': {
                'abspath': lambda: Path(inspect.getfile(django)),
                'version': lambda: SemVer('{}.{}.{} {} ({})'.format(*django.VERSION)),
            }
        })
    elif bin_name.endswith('sqlite3/dbapi2.py'):
        binary_spec = Binary(name='sqlite3', providers=[env], provider_overrides={
            'env': {
                'abspath': lambda: Path(inspect.getfile(sqlite3)),
                'version': lambda: SemVer(sqlite3.version),
            }
        })
    elif bin_name.endswith('archivebox'):
        binary_spec = Binary(name='archivebox', providers=[env], provider_overrides={
            'env': {
                'abspath': lambda: shutil.which(str(Path('archivebox').expanduser())),
                'version': lambda: settings.CONFIG.VERSION,
            }
        })
    elif bin_name.endswith('postlight/parser/cli.js'):
        binary_spec = Binary(name='postlight-parser', providers=[env], provider_overrides={
            'env': {
                'abspath': lambda: bin_path('postlight-parser'),
                'version': lambda: SemVer('1.0.0'),
            }
        })
    else:
        binary_spec = Binary(name=bin_name, providers=[env])

    try:
        binary = binary_spec.load()
    except Exception as e:
        # print(f"- ❌ Binary {bin_name} failed to load with error: {e}")
        continue

    assert isinstance(binary.loaded_version, SemVer)

    try:
        assert str(binary.loaded_version) == dependency['version'], f"Expected {bin_name} version {dependency['version']}, got {binary.loaded_version}"
        assert str(binary.loaded_respath) == str(bin_abspath(dependency['path']).resolve()), f"Expected {bin_name} abspath {bin_abspath(dependency['path']).resolve()}, got {binary.loaded_respath}"
        assert binary.is_valid == dependency['is_valid'], f"Expected {bin_name} is_valid={dependency['is_valid']}, got {binary.is_valid}"
    except Exception as e:
        pass
        # print(f"WARNING: Error loading {bin_name}: {e}")
        # import ipdb; ipdb.set_trace()

    # print(f"- ✅ Binary {bin_name} loaded successfully")
    LOADED_DEPENDENCIES[bin_key] = binary
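The provider_overrides mechanism used above replaces the provider's default detection (normally a PATH lookup plus a --version call, per the provider code later in this diff) with custom lambdas per provider. A minimal sketch with a hypothetical 'mytool' binary:

    from pathlib import Path
    from pydantic_pkgr import Binary, EnvProvider, SemVer

    env = EnvProvider()
    mytool = Binary(name='mytool', providers=[env], provider_overrides={
        'env': {
            'abspath': lambda: Path('/usr/local/bin/mytool'),  # skip the PATH lookup
            'version': lambda: SemVer('1.2.3'),                # skip the --version call
        },
    })
    mytool = mytool.load()  # uses the overrides instead of shelling out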
archivebox/pkg/tests.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.

archivebox/pkg/views.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from django.shortcuts import render

# Create your views here.
@@ -1,6 +1,5 @@
 __package__ = 'archivebox.plugantic'

-from .binproviders import BinProvider
 from .binaries import Binary
 from .extractors import Extractor
 from .replayers import Replayer
@@ -1,6 +1,17 @@
+import importlib
 from django.apps import AppConfig


 class PluganticConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
     name = 'plugantic'
+
+    def ready(self) -> None:
+        from django.conf import settings
+        from .plugins import PLUGINS
+
+        for plugin_name in settings.INSTALLED_PLUGINS.keys():
+            lib = importlib.import_module(f'{plugin_name}.apps')
+            if hasattr(lib, 'PLUGINS'):
+                for plugin_instance in lib.PLUGINS:
+                    PLUGINS.append(plugin_instance)
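PluganticConfig.ready() implements a simple discovery protocol: any installed plugin package may export a module-level PLUGINS list from its apps module. A hypothetical plugin package would therefore look like this ('my_plugin/apps.py' is an invented example, not part of this commit):

    from plugantic.plugins import Plugin

    class MyPlugin(Plugin):
        name: str = 'my_plugin'

    PLUGINS = [MyPlugin()]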
@@ -10,285 +10,17 @@ from typing import Any, Optional, Dict, List
 from typing_extensions import Self
 from subprocess import run, PIPE

+from pydantic_pkgr import Binary, SemVer, BinName, BinProvider, EnvProvider, AptProvider, BrewProvider, PipProvider, BinProviderName, ProviderLookupDict
-from pydantic_core import ValidationError
-from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer
-
-from .binproviders import (
-    SemVer,
-    BinName,
-    BinProviderName,
-    HostBinPath,
-    BinProvider,
-    EnvProvider,
-    AptProvider,
-    BrewProvider,
-    PipProvider,
-    ProviderLookupDict,
-    bin_name,
-    bin_abspath,
-    path_is_script,
-    path_is_executable,
-)
-
-
-class Binary(BaseModel):
-    name: BinName
-    description: str = Field(default='')
-
-    providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers')
-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides')
-
-    loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider')
-    loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath')
-    loaded_version: Optional[SemVer] = Field(default=None, alias='version')
-
-    # bin_filename: see below
-    # is_executable: see below
-    # is_script
-    # is_valid: see below
-
-    @model_validator(mode='after')
-    def validate(self):
-        self.loaded_abspath = bin_abspath(self.name) or self.name
-        self.description = self.description or self.name
-
-        assert self.providers_supported, f'No providers were given for package {self.name}'
-
-        # pull in any overrides from the binproviders
-        for provider in self.providers_supported:
-            overrides_by_provider = provider.get_providers_for_bin(self.name)
-            if overrides_by_provider:
-                self.provider_overrides[provider.name] = {
-                    **overrides_by_provider,
-                    **self.provider_overrides.get(provider.name, {}),
-                }
-        return self
-
-    @field_validator('loaded_abspath', mode='before')
-    def parse_abspath(cls, value: Any):
-        return bin_abspath(value)
-
-    @field_validator('loaded_version', mode='before')
-    def parse_version(cls, value: Any):
-        return value and SemVer(value)
-
-    @field_serializer('provider_overrides', when_used='json')
-    def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]:
-        return {
-            provider_name: {
-                key: str(val)
-                for key, val in overrides.items()
-            }
-            for provider_name, overrides in provider_overrides.items()
-        }
-
-    @computed_field  # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def bin_filename(self) -> BinName:
-        if self.is_script:
-            # e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite
-            name = self.name
-        elif self.loaded_abspath:
-            # e.g. '/opt/homebrew/bin/wget' -> wget
-            name = bin_name(self.loaded_abspath)
-        else:
-            # e.g. 'ytdlp' -> 'yt-dlp'
-            name = bin_name(self.name)
-        return name
-
-    @computed_field  # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def is_executable(self) -> bool:
-        try:
-            assert self.loaded_abspath and path_is_executable(self.loaded_abspath)
-            return True
-        except (ValidationError, AssertionError):
-            return False
-
-    @computed_field  # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def is_script(self) -> bool:
-        try:
-            assert self.loaded_abspath and path_is_script(self.loaded_abspath)
-            return True
-        except (ValidationError, AssertionError):
-            return False
-
-    @computed_field  # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def is_valid(self) -> bool:
-        return bool(
-            self.name
-            and self.loaded_abspath
-            and self.loaded_version
-            and (self.is_executable or self.is_script)
-        )
-
-    @validate_call
-    def install(self) -> Self:
-        if not self.providers_supported:
-            return self
-
-        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
-        for provider in self.providers_supported:
-            try:
-                installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name))
-                if installed_bin:
-                    # print('INSTALLED', self.name, installed_bin)
-                    return self.model_copy(update={
-                        'loaded_provider': provider.name,
-                        'loaded_abspath': installed_bin.abspath,
-                        'loaded_version': installed_bin.version,
-                    })
-            except Exception as err:
-                print(err)
-                exc = err
-        raise exc
-
-    @validate_call
-    def load(self, cache=True) -> Self:
-        if self.is_valid:
-            return self
-
-        if not self.providers_supported:
-            return self
-
-        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
-        for provider in self.providers_supported:
-            try:
-                installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name))
-                if installed_bin:
-                    # print('LOADED', provider, self.name, installed_bin)
-                    return self.model_copy(update={
-                        'loaded_provider': provider.name,
-                        'loaded_abspath': installed_bin.abspath,
-                        'loaded_version': installed_bin.version,
-                    })
-            except Exception as err:
-                print(err)
-                exc = err
-        raise exc
-
-    @validate_call
-    def load_or_install(self, cache=True) -> Self:
-        if self.is_valid:
-            return self
-
-        if not self.providers_supported:
-            return self
-
-        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
-        for provider in self.providers_supported:
-            try:
-                installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache)
-                if installed_bin:
-                    # print('LOADED_OR_INSTALLED', self.name, installed_bin)
-                    return self.model_copy(update={
-                        'loaded_provider': provider.name,
-                        'loaded_abspath': installed_bin.abspath,
-                        'loaded_version': installed_bin.version,
-                    })
-            except Exception as err:
-                print(err)
-                exc = err
-        raise exc
-
-    @validate_call
-    def exec(self, args=(), pwd='.'):
-        assert self.loaded_abspath
-        assert self.loaded_version
-        return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, pwd=pwd)
-
-import django
-from django.db.backends.sqlite3.base import Database as sqlite3
-
-
-class SystemPythonHelpers:
-    @staticmethod
-    def get_subdeps() -> str:
-        return 'python3 python3-minimal python3-pip python3-virtualenv'
-
-    @staticmethod
-    def get_abspath() -> str:
-        return sys.executable
-
-    @staticmethod
-    def get_version() -> str:
-        return '{}.{}.{}'.format(*sys.version_info[:3])
-
-
-class SqliteHelpers:
-    @staticmethod
-    def get_abspath() -> Path:
-        import sqlite3
-        importlib.reload(sqlite3)
-        return Path(inspect.getfile(sqlite3))
-
-    @staticmethod
-    def get_version() -> SemVer:
-        import sqlite3
-        importlib.reload(sqlite3)
-        version = sqlite3.version
-        assert version
-        return SemVer(version)
-
-
-class DjangoHelpers:
-    @staticmethod
-    def get_django_abspath() -> str:
-        import django
-        return inspect.getfile(django)
-
-    @staticmethod
-    def get_django_version() -> str:
-        import django
-        return '{}.{}.{} {} ({})'.format(*django.VERSION)
-
-
-class YtdlpHelpers:
-    @staticmethod
-    def get_ytdlp_subdeps() -> str:
-        return 'yt-dlp ffmpeg'
-
-    @staticmethod
-    def get_ytdlp_version() -> str:
-        import yt_dlp
-        importlib.reload(yt_dlp)
-
-        version = yt_dlp.version.__version__
-        assert version
-        return version
-
-
-class PythonBinary(Binary):
-    name: BinName = 'python'
-
-    providers_supported: List[BinProvider] = [
-        EnvProvider(
-            subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'},
-            abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'},
-            version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'},
-        ),
-    ]
-
-class SqliteBinary(Binary):
-    name: BinName = 'sqlite'
-    providers_supported: List[BinProvider] = [
-        EnvProvider(
-            version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'},
-            abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'},
-        ),
-    ]
-
-class DjangoBinary(Binary):
-    name: BinName = 'django'
-    providers_supported: List[BinProvider] = [
-        EnvProvider(
-            abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'},
-            version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'},
-        ),
-    ]
+
+def get_ytdlp_version() -> str:
+    import yt_dlp
+    importlib.reload(yt_dlp)
+
+    return yt_dlp.version.__version__
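A note on the (now superseded) in-house Binary class removed above: install()/load()/load_or_install() never mutate self; they return an updated model_copy(), which is why callers reassign the result (as pkg.py does with `binary = binary.install()`). Sketch of the lifecycle under those semantics:

    wget = Binary(name='wget', providers=[EnvProvider(), AptProvider()])
    wget = wget.load_or_install()   # try PATH first, then apt; returns an updated copy
    assert wget.is_valid            # both abspath and version were resolved
    print(wget.loaded_provider, wget.loaded_abspath, wget.loaded_version)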
@@ -296,16 +28,26 @@ class DjangoBinary(Binary):
 class YtdlpBinary(Binary):
     name: BinName = 'yt-dlp'
     providers_supported: List[BinProvider] = [
-        # EnvProvider(),
-        PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}),
-        BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}),
-        # AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}),
+        EnvProvider(),
+        PipProvider(),
+        BrewProvider(),
+        AptProvider(),
     ]

+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        'pip': {
+            'version': get_ytdlp_version,
+        },
+        'brew': {
+            'subdeps': lambda: 'yt-dlp ffmpeg',
+        },
+        'apt': {
+            'subdeps': lambda: 'yt-dlp ffmpeg',
+        }
+    }
+
 class WgetBinary(Binary):
     name: BinName = 'wget'
-    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()]
+    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider(), BrewProvider()]


 # if __name__ == '__main__':
@@ -1,561 +0,0 @@
__package__ = 'archivebox.plugantic'

import os
import shutil
import operator

from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING
from typing_extensions import Self
from abc import ABC, abstractmethod
from collections import namedtuple
from pathlib import Path
from subprocess import run, PIPE

from pydantic_core import core_schema, ValidationError
from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler


def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool:
    """returns True if a lambda func takes args/kwargs of any kind, otherwise False if it's pure/argless"""
    code = lambda_func.__code__
    has_args = code.co_argcount > 0
    has_varargs = code.co_flags & 0x04 != 0
    has_varkw = code.co_flags & 0x08 != 0
    return has_args or has_varargs or has_varkw


def is_semver_str(semver: Any) -> bool:
    if isinstance(semver, str):
        return (semver.count('.') == 2 and semver.replace('.', '').isdigit())
    return False

def semver_to_str(semver: tuple[int, int, int] | str) -> str:
    if isinstance(semver, (list, tuple)):
        return '.'.join(str(chunk) for chunk in semver)
    if is_semver_str(semver):
        return semver
    raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver))


SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0))
SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int]

class SemVer(SemVerTuple):
    major: int
    minor: int = 0
    patch: int = 0

    if TYPE_CHECKING:
        full_text: str | None = ''

    def __new__(cls, *args, full_text=None, **kwargs):
        # '1.1.1'
        if len(args) == 1 and is_semver_str(args[0]):
            result = SemVer.parse(args[0])

        # ('1', '2', '3')
        elif len(args) == 1 and isinstance(args[0], (tuple, list)):
            result = SemVer.parse(args[0])

        # (1, '2', None)
        elif not all(isinstance(arg, (int, type(None))) for arg in args):
            result = SemVer.parse(args)

        # (None)
        elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())):
            result = None

        # 1, 2, 3
        else:
            result = SemVerTuple.__new__(cls, *args, **kwargs)

        if result is not None:
            # add first line as extra hidden metadata so it can be logged without having to re-run version cmd
            result.full_text = full_text or str(result)
        return result

    @classmethod
    def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None:
        """
        parses a version tag string formatted like the examples below into (major, minor, patch) ints
        'Google Chrome 124.0.6367.208'             -> (124, 0, 6367)
        'GNU Wget 1.24.5 built on darwin23.2.0.'   -> (1, 24, 5)
        'curl 8.4.0 (x86_64-apple-darwin23.0) ...' -> (8, 4, 0)
        '2024.04.09'                               -> (2024, 4, 9)
        """
        # print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout)

        if isinstance(version_stdout, (tuple, list)):
            version_stdout = '.'.join(str(chunk) for chunk in version_stdout)
        elif isinstance(version_stdout, bytes):
            version_stdout = version_stdout.decode()
        elif not isinstance(version_stdout, str):
            version_stdout = str(version_stdout)

        # no text to work with, return None immediately
        if not version_stdout.strip():
            # raise Exception('Tried to parse semver from empty version output (is binary installed and available?)')
            return None

        just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0]
        contains_semver = lambda col: (
            col.count('.') in (1, 2, 3)
            and all(chunk.isdigit() for chunk in col.split('.')[:3])  # first 3 chunks can only be nums
        )

        full_text = version_stdout.split('\n')[0].strip()
        first_line_columns = full_text.split()[:4]
        version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns)))

        # could not find any column of first line that looks like a version number, despite there being some text
        if not version_columns:
            # raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns)))
            return None

        # take first col containing a semver, and truncate it to 3 chunks (e.g. 2024.04.09.91) -> (2024, 04, 09)
        first_version_tuple = version_columns[0].split('.', 3)[:3]

        # print('FINAL_VALUE', first_version_tuple)

        return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text)

    def __str__(self):
        return '.'.join(str(chunk) for chunk in self)

    # @classmethod
    # def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema:
    #     default_schema = handler(source)
    #     return core_schema.no_info_after_validator_function(
    #         cls.parse,
    #         default_schema,
    #         serialization=core_schema.plain_serializer_function_ser_schema(
    #             lambda semver: str(semver),
    #             info_arg=False,
    #             return_schema=core_schema.str_schema(),
    #         ),
    #     )

assert SemVer(None) == None
assert SemVer('') == None
assert SemVer.parse('') == None
assert SemVer(1) == (1, 0, 0)
assert SemVer(1, 2) == (1, 2, 0)
assert SemVer('1.2+234234') == (1, 2, 0)
assert SemVer((1, 2, 3)) == (1, 2, 3)
assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3'
assert SemVer(('1', '2', '3')) == (1, 2, 3)
assert SemVer.parse('5.6.7') == (5, 6, 7)
assert SemVer.parse('124.0.6367.208') == (124, 0, 6367)
assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0)
assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367)
assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367)
assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 234.234.123'
assert SemVer.parse('Google Chrome') == None
@validate_call
def bin_name(bin_path_or_name: str | Path) -> str:
    name = Path(bin_path_or_name).name
    assert len(name) > 1
    assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), (
        f'Binary name can only contain a-Z0-9-_.: {name}')
    return name

BinName = Annotated[str, AfterValidator(bin_name)]

@validate_call
def path_is_file(path: Path | str) -> Path:
    path = Path(path) if isinstance(path, str) else path
    assert path.is_file(), f'Path is not a file: {path}'
    return path

HostExistsPath = Annotated[Path, AfterValidator(path_is_file)]

@validate_call
def path_is_executable(path: HostExistsPath) -> HostExistsPath:
    assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})'
    return path

@validate_call
def path_is_script(path: HostExistsPath) -> HostExistsPath:
    SCRIPT_EXTENSIONS = ('.py', '.js', '.sh')
    assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS))
    return path

HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)]

@validate_call
def path_is_abspath(path: Path) -> Path:
    return path.resolve()

HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)]
HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)]


@validate_call
def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None:
    assert bin_path_or_name

    if str(bin_path_or_name).startswith('/'):
        # already a path, get its absolute form
        abspath = Path(bin_path_or_name).resolve()
    else:
        # not a path yet, get path using shutil.which
        binpath = shutil.which(bin_path_or_name)
        if not binpath:
            return None
        abspath = Path(binpath).resolve()

    try:
        return TypeAdapter(HostBinPath).validate_python(abspath)
    except ValidationError:
        return None


@validate_call
def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None:
    return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode())


class InstalledBin(BaseModel):
    abspath: HostBinPath
    version: SemVer


def is_valid_install_string(pkgs_str: str) -> str:
    """Make sure a string is a valid install string for a package manager, e.g. 'yt-dlp ffmpeg'"""
    assert pkgs_str
    assert all(len(pkg) > 1 for pkg in pkgs_str.split(' '))
    return pkgs_str

def is_valid_python_dotted_import(import_str: str) -> str:
    assert import_str and import_str.replace('.', '').replace('_', '').isalnum()
    return import_str

InstallStr = Annotated[str, AfterValidator(is_valid_install_string)]

LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)]

ProviderHandler = Callable[..., Any] | Callable[[], Any]  # must take no args [], or [bin_name: str, **kwargs]
#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
ProviderHandlerRef = LazyImportStr | ProviderHandler
ProviderLookupDict = Dict[str, LazyImportStr]
ProviderType = Literal['abspath', 'version', 'subdeps', 'install']


# class Host(BaseModel):
#     machine: str
#     system: str
#     platform: str
#     in_docker: bool
#     in_qemu: bool
#     python: str

BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor']


class BinProvider(ABC, BaseModel):
    name: BinProviderName

    abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True)
    version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True)
    subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True)
    install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True)

    _abspath_cache: ClassVar = {}
    _version_cache: ClassVar = {}
    _install_cache: ClassVar = {}

    # def provider_version(self) -> SemVer | None:
    #     """Version of the actual underlying package manager (e.g. pip v20.4.1)"""
    #     if self.name in ('env', 'vendor'):
    #         return SemVer('0.0.0')
    #     installer_binpath = Path(shutil.which(self.name)).resolve()
    #     return bin_version(installer_binpath)

    # def provider_host(self) -> Host:
    #     """Information about the host env, architecture, and OS needed to select & build packages"""
    #     p = platform.uname()
    #     return Host(
    #         machine=p.machine,
    #         system=p.system,
    #         platform=platform.platform(),
    #         python=sys.implementation.name,
    #         in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true',
    #         in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true',
    #     )

    def get_default_providers(self):
        return self.get_providers_for_bin('*')

    def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None:
        if provider_func is None:
            return None

        # if provider_func is a dotted path to a function on self, swap it for the actual function
        if isinstance(provider_func, str) and provider_func.startswith('self.'):
            provider_func = getattr(self, provider_func.split('self.', 1)[-1])

        # if provider_func is a dot-formatted import string, import the function
        if isinstance(provider_func, str):
            from django.utils.module_loading import import_string

            package_name, module_name, classname, path = provider_func.split('.', 3)  # -> abc, def, ghi.jkl

            # get .ghi.jkl nested attr present on module abc.def
            imported_module = import_string(f'{package_name}.{module_name}.{classname}')
            provider_func = operator.attrgetter(path)(imported_module)

            # # abc.def.ghi.jkl -> 1, 2, 3
            # for idx in range(1, len(path)):
            #     parent_path = '.'.join(path[:-idx])  # abc.def.ghi
            #     try:
            #         parent_module = import_string(parent_path)
            #         provider_func = getattr(parent_module, path[-idx])
            #     except (AttributeError, ImportError):
            #         continue

        assert TypeAdapter(ProviderHandler).validate_python(provider_func), (
            f'{self.__class__.__name__} provider func for {bin_name} was not a function or dotted-import path: {provider_func}')

        return provider_func
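(Review note: resolve_provider_func() above accepts either a bound-method reference like 'self.on_get_version' or a dotted import path resolved lazily. The dotted form is what the helper classes used, e.g. 'plugantic.binaries.SqliteHelpers.get_version' splits into a module import plus an attrgetter walk:

    import operator
    from django.utils.module_loading import import_string

    dotted = 'plugantic.binaries.SqliteHelpers.get_version'
    package, module, classname, path = dotted.split('.', 3)
    imported = import_string(f'{package}.{module}.{classname}')  # plugantic.binaries.SqliteHelpers
    func = operator.attrgetter(path)(imported)                   # .get_version
)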
    @validate_call
    def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict:
        providers_for_bin = {
            'abspath': self.abspath_provider.get(bin_name),
            'version': self.version_provider.get(bin_name),
            'subdeps': self.subdeps_provider.get(bin_name),
            'install': self.install_provider.get(bin_name),
        }
        only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None}

        return only_set_providers_for_bin

    @validate_call
    def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler:
        """
        Get the provider func for a given key + Dict of provider callbacks + fallback default provider.
        e.g. get_provider_for_action(bin_name='yt-dlp', provider_type='install', default_provider=self.on_install, ...) -> Callable
        """

        provider_func_ref = (
            (overrides or {}).get(provider_type)
            or self.get_providers_for_bin(bin_name).get(provider_type)
            or self.get_default_providers().get(provider_type)
            or default_provider
        )
        # print('getting provider for action', bin_name, provider_type, provider_func)

        provider_func = self.resolve_provider_func(provider_func_ref)

        assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.'

        return provider_func

    @validate_call
    def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any:
        provider_func: ProviderHandler = self.get_provider_for_action(
            bin_name=bin_name,
            provider_type=provider_type,
            default_provider=default_provider,
            overrides=overrides,
        )
        if not func_takes_args_or_kwargs(provider_func):
            # if it's a pure argless lambda, don't pass bin_path and other **kwargs
            provider_func_without_args = cast(Callable[[], Any], provider_func)
            return provider_func_without_args()

        provider_func = cast(Callable[..., Any], provider_func)
        return provider_func(bin_name, **kwargs)


    def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None:
        print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...')
        try:
            return bin_abspath(bin_name)
        except ValidationError:
            return None

    def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None:
        abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name)
        if not abspath: return None

        print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...')
        try:
            return bin_version(abspath)
        except ValidationError:
            return None

    def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr:
        print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}')
        # ... subdependency calculation logic here
        return TypeAdapter(InstallStr).validate_python(bin_name)

    @abstractmethod
    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
        subdeps = subdeps or self.get_subdeps(bin_name)
        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
        # ... install logic here
        assert True


    @validate_call
    def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None:
        abspath = self.call_provider_for_action(
            bin_name=bin_name,
            provider_type='abspath',
            default_provider=self.on_get_abspath,
            overrides=overrides,
        )
        if not abspath:
            return None
        result = TypeAdapter(HostBinPath).validate_python(abspath)
        self._abspath_cache[bin_name] = result
        return result

    @validate_call
    def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None:
        version = self.call_provider_for_action(
            bin_name=bin_name,
            provider_type='version',
            default_provider=self.on_get_version,
            overrides=overrides,
            abspath=abspath,
        )
        if not version:
            return None
        result = SemVer(version)
        self._version_cache[bin_name] = result
        return result

    @validate_call
    def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr:
        subdeps = self.call_provider_for_action(
            bin_name=bin_name,
            provider_type='subdeps',
            default_provider=self.on_get_subdeps,
            overrides=overrides,
        )
        if not subdeps:
            subdeps = bin_name
        result = TypeAdapter(InstallStr).validate_python(subdeps)
        return result

    @validate_call
    def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None:
        subdeps = self.get_subdeps(bin_name, overrides=overrides)

        self.call_provider_for_action(
            bin_name=bin_name,
            provider_type='install',
            default_provider=self.on_install,
            overrides=overrides,
            subdeps=subdeps,
        )

        installed_abspath = self.get_abspath(bin_name)
        assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}'

        installed_version = self.get_version(bin_name, abspath=installed_abspath)
        assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}'

        result = InstalledBin(abspath=installed_abspath, version=installed_version)
        self._install_cache[bin_name] = result
        return result

    @validate_call
    def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None:
        installed_abspath = None
        installed_version = None

        if cache:
            installed_bin = self._install_cache.get(bin_name)
            if installed_bin:
                return installed_bin
            installed_abspath = self._abspath_cache.get(bin_name)
            installed_version = self._version_cache.get(bin_name)

        installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides)
        if not installed_abspath:
            return None

        installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides)
        if not installed_version:
            return None

        return InstalledBin(abspath=installed_abspath, version=installed_version)

    @validate_call
    def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None:
        installed = self.load(bin_name, overrides=overrides, cache=cache)
        if not installed:
            installed = self.install(bin_name, overrides=overrides)
        return installed


class PipProvider(BinProvider):
    name: BinProviderName = 'pip'

    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
        subdeps = subdeps or self.on_get_subdeps(bin_name)
        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')

        proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)

        if proc.returncode != 0:
            print(proc.stdout.strip().decode())
            print(proc.stderr.strip().decode())
            raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')


class AptProvider(BinProvider):
    name: BinProviderName = 'apt'

    subdeps_provider: ProviderLookupDict = {
        'yt-dlp': lambda: 'yt-dlp ffmpeg',
    }

    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
        subdeps = subdeps or self.on_get_subdeps(bin_name)
        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')

        run(['apt-get', 'update', '-qq'])
        proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)

        if proc.returncode != 0:
            print(proc.stdout.strip().decode())
            print(proc.stderr.strip().decode())
            raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')

class BrewProvider(BinProvider):
    name: BinProviderName = 'brew'

    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
        subdeps = subdeps or self.on_get_subdeps(bin_name)
        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')

        proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)

        if proc.returncode != 0:
            print(proc.stdout.strip().decode())
            print(proc.stderr.strip().decode())
            raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')


class EnvProvider(BinProvider):
    name: BinProviderName = 'env'

    abspath_provider: ProviderLookupDict = {
        # 'python': lambda: Path('/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'),
    }
    version_provider: ProviderLookupDict = {
        # 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
    }

    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
        """The env provider is read-only and does not install any packages, so this is a no-op"""
        pass
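The 561-line module deleted above (plugantic's binproviders, judging by the import block removed from plugantic/__init__.py earlier in this diff) is superseded by the pydantic_pkgr package now imported in binaries.py and pkg/settings.py. Its subclassing pattern carries over: a provider implements on_install() and inherits the abspath/version machinery. A minimal sketch of a custom provider in that style (hypothetical npm logic, not part of this commit):

    from subprocess import run, PIPE

    class NpmProvider(BinProvider):
        name: BinProviderName = 'npm'

        def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
            subdeps = subdeps or self.on_get_subdeps(bin_name)
            proc = run(['npm', 'install', '-g', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
            if proc.returncode != 0:
                raise Exception(f'npm install failed with returncode {proc.returncode}')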
@@ -31,7 +31,7 @@ def no_empty_args(args: List[str]) -> List[str]:
     assert all(len(arg) for arg in args)
     return args

-ExtractorName = Literal['wget', 'warc', 'media']
+ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str

 HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
 CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)]
@@ -14,9 +14,6 @@ from pydantic import (

 from .binaries import (
     Binary,
-    PythonBinary,
-    SqliteBinary,
-    DjangoBinary,
     WgetBinary,
     YtdlpBinary,
 )

@@ -28,7 +25,6 @@ from .extractors import (
 )
 from .replayers import (
     Replayer,
     GENERIC_REPLAYER,
-    MEDIA_REPLAYER,
 )
 from .configs import (

@@ -80,12 +76,6 @@ class Plugin(BaseModel):
     })


-class CorePlugin(Plugin):
-    name: str = 'core'
-    configs: List[SerializeAsAny[ConfigSet]] = []
-    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
-    extractors: List[SerializeAsAny[Extractor]] = []
-    replayers: List[SerializeAsAny[Replayer]] = [GENERIC_REPLAYER]

 class YtdlpPlugin(Plugin):
     name: str = 'ytdlp'

@@ -101,11 +91,9 @@ class WgetPlugin(Plugin):
     extractors: List[SerializeAsAny[Extractor]] = [WgetExtractor(), WarcExtractor()]


-CORE_PLUGIN = CorePlugin()
 YTDLP_PLUGIN = YtdlpPlugin()
 WGET_PLUGIN = WgetPlugin()
 PLUGINS = [
-    CORE_PLUGIN,
     YTDLP_PLUGIN,
     WGET_PLUGIN,
 ]
@@ -22,5 +22,4 @@ class Replayer(BaseModel):
     # thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'


 GENERIC_REPLAYER = Replayer(name='generic')
-MEDIA_REPLAYER = Replayer(name='media')
@@ -1,5 +1,8 @@
 __package__ = 'archivebox.plugantic'

+import inspect
+from typing import Any
+
 from django.http import HttpRequest
 from django.utils.html import format_html, mark_safe
@@ -10,6 +13,44 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
 from plugantic.plugins import LOADED_PLUGINS
 from django.conf import settings


def obj_to_yaml(obj: Any, indent: int=0) -> str:
    indent_str = " " * indent

    if isinstance(obj, dict):
        if not obj:
            return "{}"
        result = "\n"
        for key, value in obj.items():
            result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n"
        return result

    elif isinstance(obj, list):
        if not obj:
            return "[]"
        result = "\n"
        for item in obj:
            result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n"
        return result.rstrip()

    elif isinstance(obj, str):
        if "\n" in obj:
            return f" |\n{indent_str}  " + obj.replace("\n", f"\n{indent_str}  ")
        else:
            return f" {obj}"

    elif isinstance(obj, (int, float, bool)):
        return f" {str(obj)}"

    elif callable(obj):
        source = '\n'.join(
            '' if 'def ' in line else line
            for line in inspect.getsource(obj).split('\n')
            if line.strip()
        ).split('lambda: ')[-1].rstrip(',')
        return f" {indent_str}  " + source.replace("\n", f"\n{indent_str}  ")

    else:
        return f" {str(obj)}"

@render_with_table_view
def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@ -18,13 +59,13 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||
|
||||
rows = {
|
||||
"Binary": [],
|
||||
"From Plugin": [],
|
||||
"Found Version": [],
|
||||
"From Plugin": [],
|
||||
"Provided By": [],
|
||||
"Found Abspath": [],
|
||||
"Related Configuration": [],
|
||||
"Overrides": [],
|
||||
"Description": [],
|
||||
# "Description": [],
|
||||
}
|
||||
|
||||
relevant_configs = {
|
||||
|
@ -38,8 +79,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||
binary = binary.load_or_install()
|
||||
|
||||
rows['Binary'].append(ItemLink(binary.name, key=binary.name))
|
||||
rows['From Plugin'].append(plugin.name)
|
||||
rows['Found Version'].append(binary.loaded_version)
|
||||
rows['From Plugin'].append(plugin.name)
|
||||
rows['Provided By'].append(binary.loaded_provider)
|
||||
rows['Found Abspath'].append(binary.loaded_abspath)
|
||||
rows['Related Configuration'].append(mark_safe(', '.join(
|
||||
|
@ -48,8 +89,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||
if binary.name.lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
|
||||
# or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
|
||||
)))
|
||||
rows['Overrides'].append(str(binary.provider_overrides))
|
||||
rows['Description'].append(binary.description)
|
||||
rows['Overrides'].append(obj_to_yaml(binary.provider_overrides))
|
||||
# rows['Description'].append(binary.description)
|
||||
|
||||
return TableContext(
|
||||
title="Binaries",
|
||||
|
@ -85,8 +126,8 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
|||
'binprovider': binary.loaded_provider,
|
||||
'abspath': binary.loaded_abspath,
|
||||
'version': binary.loaded_version,
|
||||
'overrides': str(binary.provider_overrides),
|
||||
'providers': str(binary.providers_supported),
|
||||
'overrides': obj_to_yaml(binary.provider_overrides),
|
||||
'providers': obj_to_yaml(binary.providers_supported),
|
||||
},
|
||||
"help_texts": {
|
||||
# TODO
|
||||
|
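
The view above follows the admin_data_views pattern of building a dict of column-name -> list-of-cell-values and returning it in a TableContext. A stripped-down sketch of the same pattern; any TableContext kwargs beyond title= are assumptions, not confirmed by this diff:

# Minimal sketch of the table-view pattern used above (assumed kwargs flagged):
@render_with_table_view
def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
    rows = {
        "Plugin": [],
        "Binaries": [],
    }
    for plugin in LOADED_PLUGINS:
        rows["Plugin"].append(plugin.name)
        rows["Binaries"].append(', '.join(binary.name for binary in plugin.binaries))
    return TableContext(title="Plugins", table=rows)   # table= is an assumption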

@@ -11,13 +11,12 @@ from typing import Optional, Union, Set, Tuple
from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired

from crontab import CronTab
from .vendor.atomicwrites import atomic_write as lib_atomic_write
from atomicwrites import atomic_write as lib_atomic_write

from .util import enforce_types, ExtendedEncoder
from .config import PYTHON_BINARY, OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES


def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
    """Patched version of subprocess.run that kills forked child subprocesses and fixes the blocking io that made timeout= ineffective
    Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py
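
The docstring's claim rests on starting the child in its own session so the whole process group can be killed on timeout. A minimal sketch of that technique (illustrative only, not the actual patched implementation):

# Sketch: start_new_session=True makes the child a process-group leader,
# so a timeout can kill the group, including any forked grandchildren.
import os, signal, subprocess

proc = subprocess.Popen(['sh', '-c', 'sleep 60 & sleep 60'], start_new_session=True)
try:
    proc.wait(timeout=5)
except subprocess.TimeoutExpired:
    # killing the group also reaps the backgrounded `sleep 60 &`
    os.killpg(os.getpgid(proc.pid), signal.SIGKILL)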
@@ -16,7 +16,7 @@ from datetime import datetime, timezone
from dateparser import parse as dateparser
from requests.exceptions import RequestException, ReadTimeout

from .vendor.base32_crockford import encode as base32_encode  # type: ignore
from base32_crockford import encode as base32_encode  # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
from os.path import lexists
from os import remove as remove_file

@@ -273,8 +273,8 @@ def get_headers(url: str, timeout: int=None) -> str:
    {
        'URL': url,
        'Status-Code': response.status_code,
        'Elapsed': response.elapsed,
        'Encoding': response.encoding,
        'Elapsed': response.elapsed.total_seconds()*1000,
        'Encoding': str(response.encoding),
        'Apparent-Encoding': response.apparent_encoding,
        **dict(response.headers),
    },
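
The change above serializes Elapsed as milliseconds and Encoding as a plain string so both values JSON-encode cleanly. A quick illustration of why (hypothetical URL, not from the diff):

# response.elapsed is a datetime.timedelta -- not JSON-serializable as-is,
# and response.encoding can be None; the conversions make both safe to dump.
import json, requests

response = requests.get('https://example.com', timeout=10)
elapsed_ms = response.elapsed.total_seconds() * 1000   # plain float, e.g. 132.4
encoding = str(response.encoding)                      # always a str, even if None
print(json.dumps({'Elapsed': elapsed_ms, 'Encoding': encoding}))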

@@ -304,11 +304,7 @@ def chrome_args(**options) -> List[str]:
    cmd_args += CHROME_EXTRA_ARGS

    if options['CHROME_HEADLESS']:
        chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
        if chrome_major_version >= 111:
            cmd_args += ("--headless=new",)
        else:
            cmd_args += ('--headless',)
        cmd_args += ("--headless=new",)  # expects chrome version >= 111

    if not options['CHROME_SANDBOX']:
        # assume this means we are running inside a docker container
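
For reference, the removed branch parsed the major version out of `chrome --version`-style output; a sketch of that check, assuming a version string of that shape:

# Sketch of the version check this diff removes (version string is hypothetical):
import re

CHROME_VERSION = 'Google Chrome 110.0.5481.77'
chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
flag = '--headless=new' if chrome_major_version >= 111 else '--headless'
print(chrome_major_version, flag)   # -> 110 --headless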

34  archivebox/vendor/__init__.py (vendored)

@@ -0,0 +1,34 @@
import sys
import inspect
import importlib
from pathlib import Path

VENDOR_DIR = Path(__file__).parent

VENDORED_LIBS = {
    # sys.path dir: library name
    'python-atomicwrites': 'atomicwrites',
    'django-taggit': 'taggit',
    'pydantic-pkgr': 'pydantic_pkgr',
    'pocket': 'pocket',
    'base32-crockford': 'base32_crockford',
}

def load_vendored_libs():
    for lib_subdir, lib_name in VENDORED_LIBS.items():
        lib_dir = VENDOR_DIR / lib_subdir
        assert lib_dir.is_dir(), f'Expected vendored library {lib_name} could not be found in {lib_dir}'

        try:
            lib = importlib.import_module(lib_name)
            # print(f"Successfully imported lib from environment {lib_name}: {inspect.getfile(lib)}")
        except ImportError:
            sys.path.append(str(lib_dir))
            try:
                lib = importlib.import_module(lib_name)
                # print(f"Successfully imported lib from vendored fallback {lib_name}: {inspect.getfile(lib)}")
            except ImportError as e:
                print(f"Failed to import lib from environment or vendored fallback {lib_name}: {e}", file=sys.stderr)
                sys.exit(1)
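
A sketch of how this loader is intended to be used at startup; the call site shown here is an assumption, not part of this diff:

# Hypothetical call site: run the fallback loader before anything imports
# the optional dependencies by their real names.
from archivebox.vendor import load_vendored_libs

load_vendored_libs()    # appends archivebox/vendor/<subdir> to sys.path as needed

import atomicwrites     # resolved from the environment, or the vendored copy
import base32_crockford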

1  archivebox/vendor/atomicwrites.py (vendored)
@@ -1 +0,0 @@
python-atomicwrites/atomicwrites/__init__.py

1  archivebox/vendor/base32_crockford.py (vendored)
@@ -1 +0,0 @@
base32-crockford/base32_crockford.py

1  archivebox/vendor/package-lock.json (generated, vendored)
@@ -1 +0,0 @@
../../package-lock.json

1  archivebox/vendor/package.json (generated, vendored)
@@ -1 +0,0 @@
../../package.json

1  archivebox/vendor/pocket.py (vendored)
@@ -1 +0,0 @@
pocket/pocket.py

1  archivebox/vendor/pydantic-pkgr (vendored submodule)
@@ -0,0 +1 @@
Subproject commit 2cd844533d888ce29b9bf32b8363510dd0d76166

1  archivebox/vendor/taggit_utils.py (vendored)
@@ -1 +0,0 @@
django-taggit/taggit/utils.py

18  package-lock.json (generated)

@@ -236,9 +236,9 @@
        "license": "MIT"
    },
    "node_modules/@types/node": {
        "version": "22.5.0",
        "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
        "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
        "version": "22.5.1",
        "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
        "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
        "license": "MIT",
        "optional": true,
        "dependencies": {

@@ -353,9 +353,9 @@
        }
    },
    "node_modules/aws4": {
        "version": "1.13.1",
        "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
        "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
        "version": "1.13.2",
        "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
        "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
        "license": "MIT"
    },
    "node_modules/b4a": {

@@ -2376,9 +2376,9 @@
        }
    },
    "node_modules/tslib": {
        "version": "2.6.3",
        "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
        "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
        "version": "2.7.0",
        "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
        "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
        "license": "0BSD"
    },
    "node_modules/turndown": {

103  pdm.lock (generated)

@@ -5,7 +5,7 @@
groups = ["default", "ldap", "sonic"]
strategy = ["inherit_metadata"]
lock_version = "4.5.0"
content_hash = "sha256:f2f7ca01f2e18a1ef07d59b7a8985d89785a4b8a2a4e66452f1f9e8e8ad529ad"
content_hash = "sha256:c6aa1f436032d18d079a4c2e9d9b95a5110579eb96a449751bfaf4d472eba401"

[[metadata.targets]]
requires_python = "==3.10.*"

@@ -78,6 +78,29 @@ files = [
    {file = "asttokens-2.4.1.tar.gz", hash = "sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0"},
]

[[package]]
name = "atomicwrites"
version = "1.4.0"
requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
summary = "Atomic file writes."
groups = ["default"]
marker = "python_version == \"3.10\""
files = [
    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
]

[[package]]
name = "base32-crockford"
version = "0.3.0"
summary = "A Python implementation of Douglas Crockford's base32 encoding scheme"
groups = ["default"]
marker = "python_version == \"3.10\""
files = [
    {file = "base32-crockford-0.3.0.tar.gz", hash = "sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969"},
    {file = "base32_crockford-0.3.0-py2.py3-none-any.whl", hash = "sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"},
]

[[package]]
name = "brotli"
version = "1.1.0"

@@ -407,6 +430,21 @@ files = [
    {file = "django_stubs_ext-5.0.4.tar.gz", hash = "sha256:85da065224204774208be29c7d02b4482d5a69218a728465c2fbe41725fdc819"},
]

[[package]]
name = "django-taggit"
version = "1.3.0"
requires_python = ">=3.5"
summary = "django-taggit is a reusable Django application for simple tagging."
groups = ["default"]
marker = "python_version == \"3.10\""
dependencies = [
    "Django>=1.11",
]
files = [
    {file = "django-taggit-1.3.0.tar.gz", hash = "sha256:4a833bf71f4c2deddd9745924eee53be1c075d7f0020a06f12e29fa3d752732d"},
    {file = "django_taggit-1.3.0-py3-none-any.whl", hash = "sha256:609b0223d8a652f3fae088b7fd29f294fdadaca2d7931d45c27d6c59b02fdf31"},
]

[[package]]
name = "exceptiongroup"
version = "1.2.2"

@@ -479,7 +517,7 @@ files = [

[[package]]
name = "httpx"
version = "0.27.0"
version = "0.27.2"
requires_python = ">=3.8"
summary = "The next generation HTTP client."
groups = ["default"]

@@ -492,20 +530,20 @@ dependencies = [
    "sniffio",
]
files = [
    {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"},
    {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"},
    {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"},
    {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"},
]

[[package]]
name = "idna"
version = "3.7"
requires_python = ">=3.5"
version = "3.8"
requires_python = ">=3.6"
summary = "Internationalized Domain Names in Applications (IDNA)"
groups = ["default"]
marker = "python_version == \"3.10\""
files = [
    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
    {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"},
    {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"},
]

[[package]]

@@ -613,6 +651,32 @@ files = [
    {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
]

[[package]]
name = "pocket"
version = "0.3.7"
git = "https://github.com/tapanpandita/pocket.git"
ref = "v0.3.7"
revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1"
summary = "api wrapper for getpocket.com"
groups = ["default"]
marker = "python_version == \"3.10\""
dependencies = [
    "requests",
]

[[package]]
name = "pocket"
version = "0.3.7"
git = "https://github.com/tapanpandita/pocket.git"
ref = "v0.3.7"
revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1"
summary = "api wrapper for getpocket.com"
groups = ["default"]
marker = "python_version == \"3.10\""
dependencies = [
    "requests",
]

[[package]]
name = "prompt-toolkit"
version = "3.0.47"

@@ -739,6 +803,23 @@ files = [
    {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"},
]

[[package]]
name = "pydantic-pkgr"
version = "0.1.4"
requires_python = ">=3.10"
summary = "System package manager APIs in strongly typed Python"
groups = ["default"]
marker = "python_version == \"3.10\""
dependencies = [
    "pydantic-core>=2.18.2",
    "pydantic>=2.7.1",
    "typing-extensions>=4.11.0",
]
files = [
    {file = "pydantic_pkgr-0.1.4-py3-none-any.whl", hash = "sha256:bd9ddfa8eeb4d361257c4d3d8d36ba44a72515b497ee52cf0763240c66006417"},
    {file = "pydantic_pkgr-0.1.4.tar.gz", hash = "sha256:e0422022dd83341f1e869a54da9aca903a6407a983ece0735f69493841b0fbb8"},
]

[[package]]
name = "pygments"
version = "2.18.0"

@@ -841,14 +922,14 @@ files = [

[[package]]
name = "setuptools"
version = "73.0.1"
version = "74.0.0"
requires_python = ">=3.8"
summary = "Easily download, build, install, upgrade, and uninstall Python packages"
groups = ["default"]
marker = "python_version == \"3.10\""
files = [
    {file = "setuptools-73.0.1-py3-none-any.whl", hash = "sha256:b208925fcb9f7af924ed2dc04708ea89791e24bde0d3020b27df0e116088b34e"},
    {file = "setuptools-73.0.1.tar.gz", hash = "sha256:d59a3e788ab7e012ab2c4baed1b376da6366883ee20d7a5fc426816e3d7b1193"},
    {file = "setuptools-74.0.0-py3-none-any.whl", hash = "sha256:0274581a0037b638b9fc1c6883cc71c0210865aaa76073f7882376b641b84e8f"},
    {file = "setuptools-74.0.0.tar.gz", hash = "sha256:a85e96b8be2b906f3e3e789adec6a9323abf79758ecfa3065bd740d81158b11e"},
]

[[package]]

@@ -29,12 +29,9 @@ dependencies = [
    "croniter>=2.0.5",      # for: archivebox schedule
    "ipython>=8.23.0",      # for: archivebox shell
    # Extractor Dependencies
    "yt-dlp>=2024.4.9",     # for: media
    "yt-dlp>=2024.8.6",     # for: media
    # "playwright>=1.43.0; platform_machine != 'armv7l'",  # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
    # TODO: add more extractors
    #  - gallery-dl
    #  - scihubdl
    #  - See GitHub issues for more...
    "django-signal-webhooks>=0.3.0",
    "django-admin-data-views>=0.3.1",
    "ulid-py>=1.1.0",

@@ -43,6 +40,14 @@ dependencies = [
    "django-pydantic-field>=0.3.9",
    "django-jsonform>=2.22.0",
    "django-stubs>=5.0.2",

    # these can be safely omitted when the installation subsystem does not provide them as packages (e.g. apt/debian);
    # archivebox will automatically load fallback vendored copies bundled via archivebox/vendor/__init__.py
    "pydantic-pkgr>=0.1.4",
    "atomicwrites==1.4.0",
    "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7",
    "django-taggit==1.3.0",
    "base32-crockford==0.3.0",
]

homepage = "https://github.com/ArchiveBox/ArchiveBox"

@@ -139,7 +144,7 @@ exclude = [
    "**/migrations",
    "archivebox/vendor",
]
stubPath = "./typings"
stubPath = "./archivebox/typings"
venvPath = "."
venv = ".venv"
# ignore = ["src/oldstuff"]

@@ -169,6 +174,9 @@ debug = [
    "djdt_flamegraph",
    "ipdb",
    "requests-tracker>=0.3.3",
    "logfire[django]>=0.51.0",
    "opentelemetry-instrumentation-django>=0.47b0",
    "opentelemetry-instrumentation-sqlite3>=0.47b0",
]
test = [
    "pytest",

@@ -177,8 +185,6 @@ test = [
lint = [
    "flake8",
    "mypy",
]
dev = [
    "django-autotyping>=0.5.1",
]

@@ -5,6 +5,8 @@ annotated-types==0.7.0; python_version == "3.10"
anyio==4.4.0; python_version == "3.10"
asgiref==3.8.1; python_version == "3.10"
asttokens==2.4.1; python_version == "3.10"
atomicwrites==1.4.0; python_version == "3.10"
base32-crockford==0.3.0; python_version == "3.10"
brotli==1.1.0; implementation_name == "cpython" and python_version == "3.10"
brotlicffi==1.1.0.0; implementation_name != "cpython" and python_version == "3.10"
certifi==2024.7.4; python_version == "3.10"

@@ -26,13 +28,14 @@ django-settings-holder==0.1.2; python_version == "3.10"
django-signal-webhooks==0.3.0; python_version == "3.10"
django-stubs==5.0.4; python_version == "3.10"
django-stubs-ext==5.0.4; python_version == "3.10"
django-taggit==1.3.0; python_version == "3.10"
exceptiongroup==1.2.2; python_version == "3.10"
executing==2.0.1; python_version == "3.10"
feedparser==6.0.11; python_version == "3.10"
h11==0.14.0; python_version == "3.10"
httpcore==1.0.5; python_version == "3.10"
httpx==0.27.0; python_version == "3.10"
idna==3.7; python_version == "3.10"
httpx==0.27.2; python_version == "3.10"
idna==3.8; python_version == "3.10"
ipython==8.26.0; python_version == "3.10"
jedi==0.19.1; python_version == "3.10"
matplotlib-inline==0.1.7; python_version == "3.10"

@@ -40,6 +43,7 @@ mutagen==1.47.0; python_version == "3.10"
mypy-extensions==1.0.0; python_version == "3.10"
parso==0.8.4; python_version == "3.10"
pexpect==4.9.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
pocket @ git+https://github.com/tapanpandita/pocket.git@5a144438cc89bfc0ec94db960718ccf1f76468c1 ; python_version == "3.10"
prompt-toolkit==3.0.47; python_version == "3.10"
ptyprocess==0.7.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
pure-eval==0.2.3; python_version == "3.10"

@@ -49,6 +53,7 @@ pycparser==2.22; platform_python_implementation != "PyPy" and python_version ==
pycryptodomex==3.20.0; python_version == "3.10"
pydantic==2.8.2; python_version == "3.10"
pydantic-core==2.20.1; python_version == "3.10"
pydantic-pkgr==0.1.4; python_version == "3.10"
pygments==2.18.0; python_version == "3.10"
python-crontab==3.2.0; python_version == "3.10"
python-dateutil==2.9.0.post0; python_version == "3.10"

@@ -56,7 +61,7 @@ python-ldap==3.4.4; python_version == "3.10"
pytz==2024.1; python_version == "3.10"
regex==2024.7.24; python_version == "3.10"
requests==2.32.3; python_version == "3.10"
setuptools==73.0.1; python_version == "3.10"
setuptools==74.0.0; python_version == "3.10"
sgmllib3k==1.0.0; python_version == "3.10"
six==1.16.0; python_version == "3.10"
sniffio==1.3.1; python_version == "3.10"