move util.py into misc folder

This commit is contained in:
Nick Sweeting 2024-09-30 17:25:15 -07:00
parent dfca4b13b2
commit 363a499289
No known key found for this signature in database
68 changed files with 136 additions and 161 deletions

View file

@ -11,7 +11,7 @@ from uuid import UUID
from typeid import TypeID # type: ignore[import-untyped] from typeid import TypeID # type: ignore[import-untyped]
from datetime import datetime from datetime import datetime
from ..util import enforce_types from archivebox.misc.util import enforce_types
ABID_PREFIX_LEN = 4 ABID_PREFIX_LEN = 4

View file

@ -13,7 +13,7 @@ from django_object_actions import DjangoObjectActions, action
from api.auth import get_or_create_api_token from api.auth import get_or_create_api_token
from ..util import parse_date from archivebox.misc.util import parse_date
from .abid import ABID from .abid import ABID
def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color_same: str | None=None, color_diff: str | None=None): def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color_same: str | None=None, color_diff: str | None=None):

View file

@ -25,7 +25,7 @@ class BaseQueue(BaseHook):
@property @property
def tasks(self) -> Dict[str, 'TaskWrapper']: def tasks(self) -> Dict[str, 'TaskWrapper']:
"""Return an AttrDict of all the background worker tasks defined in the plugin's tasks.py file.""" """Return a dict of all the background worker tasks defined in the plugin's tasks.py file."""
tasks = importlib.import_module(f"{self.plugin_module}.tasks") tasks = importlib.import_module(f"{self.plugin_module}.tasks")
all_tasks = {} all_tasks = {}
@ -83,7 +83,7 @@ class BaseQueue(BaseHook):
worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy) worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
# Update settings.WORKERS to include this worker # Update settings.WORKERS to include this worker
settings.WORKERS = getattr(settings, "WORKERS", None) or AttrDict({}) settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({})
settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True) settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True)
return worker return worker

View file

@ -12,7 +12,7 @@ from ..main import (
list_all, list_all,
schedule, schedule,
) )
from ..util import ansi_to_html from archivebox.misc.util import ansi_to_html
from ..config.legacy import ONLY_NEW from ..config.legacy import ONLY_NEW

View file

@ -9,7 +9,7 @@ import argparse
from typing import List, Optional, IO from typing import List, Optional, IO
from ..main import add from ..main import add
from ..util import docstring from archivebox.misc.util import docstring
from ..parsers import PARSERS from ..parsers import PARSERS
from ..config.legacy import OUTPUT_DIR, ONLY_NEW from ..config.legacy import OUTPUT_DIR, ONLY_NEW
from ..logging_util import SmartFormatter, accept_stdin, stderr from ..logging_util import SmartFormatter, accept_stdin, stderr

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import config from ..main import config
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, accept_stdin from ..logging_util import SmartFormatter, accept_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import help from ..main import help
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import init from ..main import init
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import list_all from ..main import list_all
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..index import ( from ..index import (
LINK_FILTERS, LINK_FILTERS,

View file

@ -8,7 +8,7 @@ import sys
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import manage from ..main import manage
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR

View file

@ -10,7 +10,7 @@ from pathlib import Path
from typing import List, Optional, IO from typing import List, Optional, IO
from ..main import oneshot from ..main import oneshot
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, accept_stdin, stderr from ..logging_util import SmartFormatter, accept_stdin, stderr

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import remove from ..main import remove
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, accept_stdin from ..logging_util import SmartFormatter, accept_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import schedule from ..main import schedule
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import server from ..main import server
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR, BIND_ADDR from ..config.legacy import OUTPUT_DIR, BIND_ADDR
from ..logging_util import SmartFormatter, reject_stdin from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import setup from ..main import setup
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import shell from ..main import shell
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import status from ..main import status
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import List, Optional, IO from typing import List, Optional, IO
from ..main import update from ..main import update
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..index import ( from ..index import (
LINK_FILTERS, LINK_FILTERS,

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO from typing import Optional, List, IO
from ..main import version from ..main import version
from ..util import docstring from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,17 +9,12 @@ SimpleConfigValueDict = Dict[str, SimpleConfigValue]
SimpleConfigValueGetter = Callable[[], SimpleConfigValue] SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter] ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
# class AttrDict(dict):
# def __init__(self, *args, **kwargs):
# super().__init__(*args, **kwargs)
# self.__dict__ = self
AttrDict = benedict # https://github.com/fabiocaccamo/python-benedict/
class BaseConfig(TypedDict): class BaseConfig(TypedDict):
pass pass
class ConfigDict(BaseConfig, AttrDict, total=False): class ConfigDict(BaseConfig, benedict, total=False):
""" """
# Regenerate by pasting this quine into `archivebox shell` 🥚 # Regenerate by pasting this quine into `archivebox shell` 🥚
from archivebox.config import ConfigDict, CONFIG_DEFAULTS from archivebox.config import ConfigDict, CONFIG_DEFAULTS

View file

@ -173,7 +173,7 @@ class ConstantsDict(Mapping):
# actually empty so that we don't clobber someone's home directory or desktop by accident. # actually empty so that we don't clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir, # These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc. # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset(( ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
*INGORED_PATHS, *INGORED_PATHS,
*PIP_RELATED_NAMES, *PIP_RELATED_NAMES,
*NPM_RELATED_NAMES, *NPM_RELATED_NAMES,
@ -212,7 +212,7 @@ class ConstantsDict(Mapping):
}) })
DATA_LOCATIONS = benedict({ DATA_LOCATIONS = benedict({
"OUTPUT_DIR": { "DATA_DIR": {
"path": DATA_DIR.resolve(), "path": DATA_DIR.resolve(),
"enabled": True, "enabled": True,
"is_valid": DATABASE_FILE.exists(), "is_valid": DATABASE_FILE.exists(),

View file

@ -23,7 +23,7 @@ from signal_webhooks.utils import get_webhook_model
from archivebox.config import VERSION from archivebox.config import VERSION
from ..util import htmldecode, urldecode from archivebox.misc.util import htmldecode, urldecode
from core.models import Snapshot, ArchiveResult, Tag from core.models import Snapshot, ArchiveResult, Tag
from core.mixins import SearchResultsAdminMixin from core.mixins import SearchResultsAdminMixin

View file

@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
from django import forms from django import forms
from ..util import URL_REGEX from archivebox.misc.util import URL_REGEX
from ..parsers import PARSERS from ..parsers import PARSERS
from taggit.utils import edit_string_for_tags, parse_tags from taggit.utils import edit_string_for_tags, parse_tags

View file

@ -23,7 +23,7 @@ from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
from queues.tasks import bg_archive_snapshot from queues.tasks import bg_archive_snapshot
from archivebox.misc.system import get_dir_size from archivebox.misc.system import get_dir_size
from ..util import parse_date, base_url from archivebox.misc.util import parse_date, base_url
from ..index.schema import Link from ..index.schema import Link
from ..index.html import snapshot_icons from ..index.html import snapshot_icons
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
@ -231,7 +231,7 @@ class Snapshot(ABIDModel):
@cached_property @cached_property
def extension(self) -> str: def extension(self) -> str:
from ..util import extension from archivebox.misc.util import extension
return extension(self.url) return extension(self.url)
@cached_property @cached_property

View file

@ -37,7 +37,7 @@ from ..config.legacy import (
CONFIG, CONFIG,
) )
from ..logging_util import printable_filesize from ..logging_util import printable_filesize
from ..util import base_url, htmlencode, ts_to_date_str from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from ..search import query_search_index from ..search import query_search_index
from .serve_static import serve_static_with_byterange_support from .serve_static import serve_static_with_byterange_support

View file

@ -20,7 +20,7 @@ from ..index import (
load_link_details, load_link_details,
write_link_details, write_link_details,
) )
from ..util import enforce_types from archivebox.misc.util import enforce_types
from ..logging_util import ( from ..logging_util import (
log_archiving_started, log_archiving_started,
log_archiving_paused, log_archiving_paused,

View file

@ -7,7 +7,7 @@ from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
dedupe, dedupe,

View file

@ -5,7 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file, atomic_write from archivebox.misc.system import run, chmod_file, atomic_write
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
) )

View file

@ -6,7 +6,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..index.schema import Link, ArchiveResult, ArchiveOutput
from archivebox.misc.system import chmod_file, run from archivebox.misc.system import chmod_file, run
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
domain, domain,
dedupe, dedupe,

View file

@ -6,7 +6,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
domain, domain,

View file

@ -6,7 +6,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..index.schema import Link, ArchiveResult, ArchiveOutput
from archivebox.misc.system import atomic_write from archivebox.misc.system import atomic_write
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
get_headers, get_headers,
dedupe, dedupe,

View file

@ -13,7 +13,7 @@ from ..config.legacy import (
from ..index.schema import Link, ArchiveResult, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveError
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from archivebox.misc.system import atomic_write from archivebox.misc.system import atomic_write
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
) )

View file

@ -5,7 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from ..util import enforce_types, is_static_file, dedupe from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..logging_util import TimedProgress from ..logging_util import TimedProgress

View file

@ -8,7 +8,7 @@ import json
from ..index.schema import Link, ArchiveResult, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveError
from archivebox.misc.system import run, atomic_write from archivebox.misc.system import run, atomic_write
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
dedupe, dedupe,

View file

@ -5,7 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
is_static_file, is_static_file,
) )

View file

@ -8,7 +8,7 @@ import json
from ..index.schema import Link, ArchiveResult, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveError
from archivebox.misc.system import run, atomic_write from archivebox.misc.system import run, atomic_write
from ..util import enforce_types, is_static_file from archivebox.misc.util import enforce_types, is_static_file
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from .title import get_html from .title import get_html

View file

@ -5,7 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from ..util import enforce_types, is_static_file from archivebox.misc.util import enforce_types, is_static_file
from ..logging_util import TimedProgress from ..logging_util import TimedProgress

View file

@ -7,7 +7,7 @@ import json
from ..index.schema import Link, ArchiveResult, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from ..util import enforce_types, is_static_file, dedupe from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..logging_util import TimedProgress from ..logging_util import TimedProgress

View file

@ -6,7 +6,7 @@ from pathlib import Path
from typing import Optional from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
download_url, download_url,
htmldecode, htmldecode,

View file

@ -8,7 +8,7 @@ from datetime import datetime, timezone
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file from archivebox.misc.system import run, chmod_file
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
without_fragment, without_fragment,
without_query, without_query,

View file

@ -13,7 +13,7 @@ from django.db.models import QuerySet, Q
from archivebox.config import DATA_DIR, CONSTANTS, SEARCH_BACKEND_CONFIG from archivebox.config import DATA_DIR, CONSTANTS, SEARCH_BACKEND_CONFIG
from ..util import ( from archivebox.misc.util import (
scheme, scheme,
enforce_types, enforce_types,
ExtendedEncoder, ExtendedEncoder,

View file

@ -2,7 +2,7 @@ __package__ = 'archivebox.index'
from typing import List, Optional, Any from typing import List, Optional, Any
from ..util import enforce_types from archivebox.misc.util import enforce_types
from .schema import Link from .schema import Link

View file

@ -11,7 +11,7 @@ from django.core.cache import cache
from .schema import Link from .schema import Link
from archivebox.misc.system import atomic_write from archivebox.misc.system import atomic_write
from ..logging_util import printable_filesize from ..logging_util import printable_filesize
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
ts_to_date_str, ts_to_date_str,
urlencode, urlencode,

View file

@ -12,7 +12,7 @@ from archivebox.config import VERSION, DATA_DIR, CONSTANTS, SERVER_CONFIG, SHELL
from .schema import Link from .schema import Link
from archivebox.misc.system import atomic_write from archivebox.misc.system import atomic_write
from ..util import enforce_types from archivebox.misc.util import enforce_types

View file

@ -22,7 +22,7 @@ from archivebox.config.constants import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from plugins_extractor.favicon.apps import FAVICON_CONFIG from plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.misc.system import get_dir_size from archivebox.misc.system import get_dir_size
from ..util import ts_to_date_str, parse_date from archivebox.misc.util import ts_to_date_str, parse_date
class ArchiveError(Exception): class ArchiveError(Exception):
@ -67,7 +67,7 @@ class ArchiveResult:
@classmethod @classmethod
def guess_ts(_cls, dict_info): def guess_ts(_cls, dict_info):
from ..util import parse_date from archivebox.misc.util import parse_date
parsed_timestamp = parse_date(dict_info["timestamp"]) parsed_timestamp = parse_date(dict_info["timestamp"])
start_ts = parsed_timestamp start_ts = parsed_timestamp
end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"])) end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
@ -75,7 +75,7 @@ class ArchiveResult:
@classmethod @classmethod
def from_json(cls, json_info, guess=False): def from_json(cls, json_info, guess=False):
from ..util import parse_date from archivebox.misc.util import parse_date
info = { info = {
key: val key: val
@ -231,7 +231,7 @@ class Link:
@classmethod @classmethod
def from_json(cls, json_info, guess=False): def from_json(cls, json_info, guess=False):
from ..util import parse_date from archivebox.misc.util import parse_date
info = { info = {
key: val key: val
@ -299,38 +299,38 @@ class Link:
### URL Helpers ### URL Helpers
@property @property
def url_hash(self): def url_hash(self):
from ..util import hashurl from archivebox.misc.util import hashurl
return hashurl(self.url) return hashurl(self.url)
@property @property
def scheme(self) -> str: def scheme(self) -> str:
from ..util import scheme from archivebox.misc.util import scheme
return scheme(self.url) return scheme(self.url)
@property @property
def extension(self) -> str: def extension(self) -> str:
from ..util import extension from archivebox.misc.util import extension
return extension(self.url) return extension(self.url)
@property @property
def domain(self) -> str: def domain(self) -> str:
from ..util import domain from archivebox.misc.util import domain
return domain(self.url) return domain(self.url)
@property @property
def path(self) -> str: def path(self) -> str:
from ..util import path from archivebox.misc.util import path
return path(self.url) return path(self.url)
@property @property
def basename(self) -> str: def basename(self) -> str:
from ..util import basename from archivebox.misc.util import basename
return basename(self.url) return basename(self.url)
@property @property
def base_url(self) -> str: def base_url(self) -> str:
from ..util import base_url from archivebox.misc.util import base_url
return base_url(self.url) return base_url(self.url)
### Pretty Printing Helpers ### Pretty Printing Helpers
@ -380,12 +380,12 @@ class Link:
@property @property
def is_static(self) -> bool: def is_static(self) -> bool:
from ..util import is_static_file from archivebox.misc.util import is_static_file
return is_static_file(self.url) return is_static_file(self.url)
@property @property
def is_archived(self) -> bool: def is_archived(self) -> bool:
from ..util import domain from archivebox.misc.util import domain
output_paths = ( output_paths = (
domain(self.url), domain(self.url),

View file

@ -9,7 +9,7 @@ from django.db.models import QuerySet
from django.db import transaction from django.db import transaction
from .schema import Link from .schema import Link
from ..util import enforce_types, parse_date from archivebox.misc.util import enforce_types, parse_date
from ..config.legacy import ( from ..config.legacy import (
OUTPUT_DIR, OUTPUT_DIR,
TAG_SEPARATOR_PATTERN, TAG_SEPARATOR_PATTERN,

View file

@ -23,8 +23,8 @@ from rich.panel import Panel
from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG
from archivebox.misc.system import get_dir_size from archivebox.misc.system import get_dir_size
from .util import enforce_types from archivebox.misc.util import enforce_types
from .misc.logging import ANSI, stderr from archivebox.misc.logging import ANSI, stderr
@dataclass @dataclass
class RuntimeStats: class RuntimeStats:

View file

@ -28,10 +28,10 @@ from .parsers import (
save_file_as_source, save_file_as_source,
parse_links_memory, parse_links_memory,
) )
from .index.schema import Link from archivebox.misc.util import enforce_types # type: ignore
from .util import enforce_types # type: ignore
from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from archivebox.misc.system import run as run_shell from archivebox.misc.system import run as run_shell
from .index.schema import Link
from .index import ( from .index import (
load_main_index, load_main_index,
parse_links_from_source, parse_links_from_source,
@ -61,14 +61,12 @@ from .index.sql import (
apply_migrations, apply_migrations,
remove_from_sql_main_index, remove_from_sql_main_index,
) )
from .index.html import ( from .index.html import generate_index_from_links
generate_index_from_links,
)
from .index.csv import links_to_csv from .index.csv import links_to_csv
from .extractors import archive_links, archive_link, ignore_methods from .extractors import archive_links, archive_link, ignore_methods
from .misc.logging import stderr, hint from archivebox.misc.logging import stderr, hint
from .misc.checks import check_data_folder from archivebox.misc.checks import check_data_folder
from .config.legacy import ( from archivebox.config.legacy import (
write_config_file, write_config_file,
DEPENDENCIES, DEPENDENCIES,
load_all_config, load_all_config,
@ -194,7 +192,7 @@ def version(quiet: bool=False,
f'PLATFORM={platform.platform()}', f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}', f'PYTHON={sys.implementation.name.title()}',
) )
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['DATA_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount'] OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
print( print(
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}', f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}', f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
@ -221,7 +219,7 @@ def version(quiet: bool=False,
print() print()
print('{white}[i] New dependency versions:{reset}'.format(**SHELL_CONFIG.ANSI)) print('{white}[i] New dependency versions:{reset}'.format(**SHELL_CONFIG.ANSI))
for name, binary in settings.BINARIES.items(): for name, binary in reversed(list(settings.BINARIES.items())):
err = None err = None
try: try:
loaded_bin = binary.load() loaded_bin = binary.load()

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox' __package__ = 'archivebox.misc'
import os import os
@ -14,8 +14,8 @@ from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedPro
from crontab import CronTab from crontab import CronTab
from atomicwrites import atomic_write as lib_atomic_write from atomicwrites import atomic_write as lib_atomic_write
from .util import enforce_types, ExtendedEncoder from archivebox.config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
from .config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES from archivebox.misc.util import enforce_types, ExtendedEncoder
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs): def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):

View file

@ -317,22 +317,6 @@ def dedupe(options: List[str]) -> List[str]:
return list(deduped.values()) return list(deduped.values())
class AttributeDict(dict):
"""Helper to allow accessing dict values via Example.key or Example['key']"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Recursively convert nested dicts to AttributeDicts (optional):
# for key, val in self.items():
# if isinstance(val, dict) and type(val) is not AttributeDict:
# self[key] = AttributeDict(val)
def __getattr__(self, attr: str) -> Any:
return dict.__getitem__(self, attr)
def __setattr__(self, attr: str, value: Any) -> None:
return dict.__setitem__(self, attr, value)
class ExtendedEncoder(pyjson.JSONEncoder): class ExtendedEncoder(pyjson.JSONEncoder):
""" """

View file

@ -22,7 +22,7 @@ from ..config.legacy import (
stderr, stderr,
hint, hint,
) )
from ..util import ( from archivebox.misc.util import (
basename, basename,
htmldecode, htmldecode,
download_url, download_url,

View file

@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional
from datetime import datetime, timezone from datetime import datetime, timezone
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
find_all_urls, find_all_urls,

View file

@ -6,7 +6,7 @@ from typing import IO, Iterable
from datetime import datetime, timezone from datetime import datetime, timezone
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
) )

View file

@ -5,7 +5,7 @@ import json
from typing import IO, Iterable from typing import IO, Iterable
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
) )

View file

@ -6,7 +6,7 @@ from time import mktime
from feedparser import parse as feedparser from feedparser import parse as feedparser
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types enforce_types
) )

View file

@ -6,7 +6,7 @@ from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
find_all_urls, find_all_urls,

View file

@ -7,7 +7,7 @@ from datetime import datetime
from xml.etree import ElementTree from xml.etree import ElementTree
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
) )

View file

@ -7,7 +7,7 @@ from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
) )

View file

@ -6,7 +6,7 @@ from time import mktime
from feedparser import parse as feedparser from feedparser import parse as feedparser
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types enforce_types
) )

View file

@ -11,7 +11,7 @@ from pocket import Pocket
from archivebox.config import CONSTANTS from archivebox.config import CONSTANTS
from ..index.schema import Link from ..index.schema import Link
from ..util import enforce_types from archivebox.misc.util import enforce_types
from archivebox.misc.system import atomic_write from archivebox.misc.system import atomic_write
from ..config.legacy import ( from ..config.legacy import (
POCKET_CONSUMER_KEY, POCKET_CONSUMER_KEY,

View file

@ -7,7 +7,7 @@ from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
) )

View file

@ -11,7 +11,7 @@ from configparser import ConfigParser
from archivebox.config import CONSTANTS from archivebox.config import CONSTANTS
from ..index.schema import Link from ..index.schema import Link
from ..util import enforce_types from archivebox.misc.util import enforce_types
from archivebox.misc.system import atomic_write from archivebox.misc.system import atomic_write
from ..config.legacy import READWISE_READER_TOKENS from ..config.legacy import READWISE_READER_TOKENS

View file

@ -5,7 +5,7 @@ from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
str_between, str_between,

View file

@ -7,7 +7,7 @@ from typing import IO, Iterable
from datetime import datetime, timezone from datetime import datetime, timezone
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
enforce_types, enforce_types,
URL_REGEX, URL_REGEX,
) )

View file

@ -5,7 +5,7 @@ from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from archivebox.misc.util import (
htmldecode, htmldecode,
enforce_types, enforce_types,
str_between, str_between,

View file

@ -3,7 +3,7 @@ __package__ = 'archivebox.plugins_extractor.chrome'
import sys import sys
import platform import platform
from pathlib import Path from pathlib import Path
from typing import List, Optional, Dict, ClassVar from typing import List, Optional, Dict
# Depends on other PyPI/vendor packages: # Depends on other PyPI/vendor packages:
from rich import print from rich import print
@ -29,7 +29,7 @@ from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
from ...util import dedupe from archivebox.misc.util import dedupe
CHROMIUM_BINARY_NAMES_LINUX = [ CHROMIUM_BINARY_NAMES_LINUX = [

View file

@ -1,3 +1,5 @@
__package__ = 'archivebox.search'
from typing import List, Union from typing import List, Union
from pathlib import Path from pathlib import Path
@ -5,12 +7,53 @@ from django.db.models import QuerySet
from django.conf import settings from django.conf import settings
from archivebox.index.schema import Link from archivebox.index.schema import Link
from archivebox.util import enforce_types from archivebox.misc.util import enforce_types
from archivebox.misc.logging import stderr from archivebox.misc.logging import stderr
from archivebox.config.legacy import ANSI
# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig # from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
from .utils import get_indexable_content, log_index_started
def log_index_started(url):
    """Announce on stdout that `url` is being added to the search index.

    The notice is formatted with the `green` and `reset` entries of the
    module-level `ANSI` mapping, and is followed by an empty line.
    """
    banner = '{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI)
    print(banner)
    print()
def get_file_result_content(res, extra_path, use_pwd=False):
    """Read the text stored at an extractor result's output path.

    Args:
        res: object exposing `output` (and `pwd` when `use_pwd` is true).
        extra_path: optional sub-path appended to the output path; skipped
            when falsy.
        use_pwd: when true, the path is rooted at `res.pwd`.

    Returns:
        A one-element list holding the file's text, or an empty list when
        the file is empty.
    """
    # Base location: the bare output path, optionally rooted at res.pwd.
    target = f'{res.pwd}/{res.output}' if use_pwd else f'{res.output}'
    if extra_path:
        target = f'{target}/{extra_path}'

    with open(target, 'r', encoding='utf-8') as handle:
        text = handle.read()

    return [text] if text else []
# This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
    """Return the indexable text for the first result in `results`.

    Args:
        results: queryset of extractor results; each row is expected to
            expose `extractor`, `pwd` and `output`
            (presumably ArchiveResult rows -- confirm against callers).

    Returns:
        The list produced by `get_file_result_content` for the first row,
        or an empty list when `results` is empty or the row's extractor is
        not one of the supported methods.
    """
    if not results:
        return []

    # Only use the first method available.
    # Fetch the row once: each `.first()` call may hit the database again.
    res = results.first()
    method = res.extractor

    # This should come from a plugin interface
    # TODO: banish this duplication and get these from the extractor file
    # Maps each supported extractor to the sub-path (under its output) to read.
    extra_path_by_method = {
        'readability': 'content.txt',
        'singlefile': '',
        'dom': '',
        'wget': '',
    }
    if method not in extra_path_by_method:
        return []

    return get_file_result_content(res, extra_path_by_method[method], use_pwd=True)
def import_backend(): def import_backend():

View file

@ -1,45 +0,0 @@
from django.db.models import QuerySet
from archivebox.util import enforce_types
from archivebox.config.legacy import ANSI
def log_index_started(url):
    """Announce on stdout that `url` is being added to the search index.

    The notice is formatted with the `green` and `reset` entries of the
    module-level `ANSI` mapping, and is followed by an empty line.
    """
    banner = '{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI)
    print(banner)
    print()
def get_file_result_content(res, extra_path, use_pwd=False):
    """Read the text stored at an extractor result's output path.

    Args:
        res: object exposing `output` (and `pwd` when `use_pwd` is true).
        extra_path: optional sub-path appended to the output path; skipped
            when falsy.
        use_pwd: when true, the path is rooted at `res.pwd`.

    Returns:
        A one-element list holding the file's text, or an empty list when
        the file is empty.
    """
    # Base location: the bare output path, optionally rooted at res.pwd.
    target = f'{res.pwd}/{res.output}' if use_pwd else f'{res.output}'
    if extra_path:
        target = f'{target}/{extra_path}'

    with open(target, 'r', encoding='utf-8') as handle:
        text = handle.read()

    return [text] if text else []
# This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
    """Return the indexable text for the first result in `results`.

    Args:
        results: queryset of extractor results; each row is expected to
            expose `extractor`, `pwd` and `output`
            (presumably ArchiveResult rows -- confirm against callers).

    Returns:
        The list produced by `get_file_result_content` for the first row,
        or an empty list when `results` is empty or the row's extractor is
        not one of the supported methods.
    """
    if not results:
        return []

    # Only use the first method available.
    # Fetch the row once: each `.first()` call may hit the database again.
    res = results.first()
    method = res.extractor

    # This should come from a plugin interface
    # TODO: banish this duplication and get these from the extractor file
    # Maps each supported extractor to the sub-path (under its output) to read.
    extra_path_by_method = {
        'readability': 'content.txt',
        'singlefile': '',
        'dom': '',
        'wget': '',
    }
    if method not in extra_path_by_method:
        return []

    return get_file_result_content(res, extra_path_by_method[method], use_pwd=True)