mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 14:44:29 -04:00
move util.py into misc folder
This commit is contained in:
parent
dfca4b13b2
commit
363a499289
68 changed files with 136 additions and 161 deletions
|
@ -11,7 +11,7 @@ from uuid import UUID
|
||||||
from typeid import TypeID # type: ignore[import-untyped]
|
from typeid import TypeID # type: ignore[import-untyped]
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from ..util import enforce_types
|
from archivebox.misc.util import enforce_types
|
||||||
|
|
||||||
|
|
||||||
ABID_PREFIX_LEN = 4
|
ABID_PREFIX_LEN = 4
|
||||||
|
|
|
@ -13,7 +13,7 @@ from django_object_actions import DjangoObjectActions, action
|
||||||
|
|
||||||
from api.auth import get_or_create_api_token
|
from api.auth import get_or_create_api_token
|
||||||
|
|
||||||
from ..util import parse_date
|
from archivebox.misc.util import parse_date
|
||||||
from .abid import ABID
|
from .abid import ABID
|
||||||
|
|
||||||
def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color_same: str | None=None, color_diff: str | None=None):
|
def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color_same: str | None=None, color_diff: str | None=None):
|
||||||
|
|
|
@ -25,7 +25,7 @@ class BaseQueue(BaseHook):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tasks(self) -> Dict[str, 'TaskWrapper']:
|
def tasks(self) -> Dict[str, 'TaskWrapper']:
|
||||||
"""Return an AttrDict of all the background worker tasks defined in the plugin's tasks.py file."""
|
"""Return a dict of all the background worker tasks defined in the plugin's tasks.py file."""
|
||||||
tasks = importlib.import_module(f"{self.plugin_module}.tasks")
|
tasks = importlib.import_module(f"{self.plugin_module}.tasks")
|
||||||
|
|
||||||
all_tasks = {}
|
all_tasks = {}
|
||||||
|
@ -83,7 +83,7 @@ class BaseQueue(BaseHook):
|
||||||
worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
|
worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
|
||||||
|
|
||||||
# Update settings.WORKERS to include this worker
|
# Update settings.WORKERS to include this worker
|
||||||
settings.WORKERS = getattr(settings, "WORKERS", None) or AttrDict({})
|
settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({})
|
||||||
settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True)
|
settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True)
|
||||||
|
|
||||||
return worker
|
return worker
|
||||||
|
|
|
@ -12,7 +12,7 @@ from ..main import (
|
||||||
list_all,
|
list_all,
|
||||||
schedule,
|
schedule,
|
||||||
)
|
)
|
||||||
from ..util import ansi_to_html
|
from archivebox.misc.util import ansi_to_html
|
||||||
from ..config.legacy import ONLY_NEW
|
from ..config.legacy import ONLY_NEW
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import List, Optional, IO
|
from typing import List, Optional, IO
|
||||||
|
|
||||||
from ..main import add
|
from ..main import add
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..parsers import PARSERS
|
from ..parsers import PARSERS
|
||||||
from ..config.legacy import OUTPUT_DIR, ONLY_NEW
|
from ..config.legacy import OUTPUT_DIR, ONLY_NEW
|
||||||
from ..logging_util import SmartFormatter, accept_stdin, stderr
|
from ..logging_util import SmartFormatter, accept_stdin, stderr
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import config
|
from ..main import config
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, accept_stdin
|
from ..logging_util import SmartFormatter, accept_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import help
|
from ..main import help
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, reject_stdin
|
from ..logging_util import SmartFormatter, reject_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import init
|
from ..main import init
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, reject_stdin
|
from ..logging_util import SmartFormatter, reject_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import list_all
|
from ..main import list_all
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..index import (
|
from ..index import (
|
||||||
LINK_FILTERS,
|
LINK_FILTERS,
|
||||||
|
|
|
@ -8,7 +8,7 @@ import sys
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import manage
|
from ..main import manage
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ from pathlib import Path
|
||||||
from typing import List, Optional, IO
|
from typing import List, Optional, IO
|
||||||
|
|
||||||
from ..main import oneshot
|
from ..main import oneshot
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, accept_stdin, stderr
|
from ..logging_util import SmartFormatter, accept_stdin, stderr
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import remove
|
from ..main import remove
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, accept_stdin
|
from ..logging_util import SmartFormatter, accept_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import schedule
|
from ..main import schedule
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, reject_stdin
|
from ..logging_util import SmartFormatter, reject_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import server
|
from ..main import server
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR, BIND_ADDR
|
from ..config.legacy import OUTPUT_DIR, BIND_ADDR
|
||||||
from ..logging_util import SmartFormatter, reject_stdin
|
from ..logging_util import SmartFormatter, reject_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import setup
|
from ..main import setup
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, reject_stdin
|
from ..logging_util import SmartFormatter, reject_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import shell
|
from ..main import shell
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, reject_stdin
|
from ..logging_util import SmartFormatter, reject_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import status
|
from ..main import status
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, reject_stdin
|
from ..logging_util import SmartFormatter, reject_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import List, Optional, IO
|
from typing import List, Optional, IO
|
||||||
|
|
||||||
from ..main import update
|
from ..main import update
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..index import (
|
from ..index import (
|
||||||
LINK_FILTERS,
|
LINK_FILTERS,
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from typing import Optional, List, IO
|
from typing import Optional, List, IO
|
||||||
|
|
||||||
from ..main import version
|
from ..main import version
|
||||||
from ..util import docstring
|
from archivebox.misc.util import docstring
|
||||||
from ..config.legacy import OUTPUT_DIR
|
from ..config.legacy import OUTPUT_DIR
|
||||||
from ..logging_util import SmartFormatter, reject_stdin
|
from ..logging_util import SmartFormatter, reject_stdin
|
||||||
|
|
||||||
|
|
|
@ -9,17 +9,12 @@ SimpleConfigValueDict = Dict[str, SimpleConfigValue]
|
||||||
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
|
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
|
||||||
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
|
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
|
||||||
|
|
||||||
# class AttrDict(dict):
|
|
||||||
# def __init__(self, *args, **kwargs):
|
|
||||||
# super().__init__(*args, **kwargs)
|
|
||||||
# self.__dict__ = self
|
|
||||||
AttrDict = benedict # https://github.com/fabiocaccamo/python-benedict/
|
|
||||||
|
|
||||||
|
|
||||||
class BaseConfig(TypedDict):
|
class BaseConfig(TypedDict):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class ConfigDict(BaseConfig, AttrDict, total=False):
|
class ConfigDict(BaseConfig, benedict, total=False):
|
||||||
"""
|
"""
|
||||||
# Regenerate by pasting this quine into `archivebox shell` 🥚
|
# Regenerate by pasting this quine into `archivebox shell` 🥚
|
||||||
from archivebox.config import ConfigDict, CONFIG_DEFAULTS
|
from archivebox.config import ConfigDict, CONFIG_DEFAULTS
|
||||||
|
|
|
@ -173,7 +173,7 @@ class ConstantsDict(Mapping):
|
||||||
# actually empty so that we don't clobber someone's home directory or desktop by accident.
|
# actually empty so that we don't clobber someone's home directory or desktop by accident.
|
||||||
# These files are exceptions to the is_empty check when we're trying to init a new dir,
|
# These files are exceptions to the is_empty check when we're trying to init a new dir,
|
||||||
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
|
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
|
||||||
ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset((
|
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
|
||||||
*INGORED_PATHS,
|
*INGORED_PATHS,
|
||||||
*PIP_RELATED_NAMES,
|
*PIP_RELATED_NAMES,
|
||||||
*NPM_RELATED_NAMES,
|
*NPM_RELATED_NAMES,
|
||||||
|
@ -212,7 +212,7 @@ class ConstantsDict(Mapping):
|
||||||
})
|
})
|
||||||
|
|
||||||
DATA_LOCATIONS = benedict({
|
DATA_LOCATIONS = benedict({
|
||||||
"OUTPUT_DIR": {
|
"DATA_DIR": {
|
||||||
"path": DATA_DIR.resolve(),
|
"path": DATA_DIR.resolve(),
|
||||||
"enabled": True,
|
"enabled": True,
|
||||||
"is_valid": DATABASE_FILE.exists(),
|
"is_valid": DATABASE_FILE.exists(),
|
||||||
|
|
|
@ -23,7 +23,7 @@ from signal_webhooks.utils import get_webhook_model
|
||||||
|
|
||||||
from archivebox.config import VERSION
|
from archivebox.config import VERSION
|
||||||
|
|
||||||
from ..util import htmldecode, urldecode
|
from archivebox.misc.util import htmldecode, urldecode
|
||||||
|
|
||||||
from core.models import Snapshot, ArchiveResult, Tag
|
from core.models import Snapshot, ArchiveResult, Tag
|
||||||
from core.mixins import SearchResultsAdminMixin
|
from core.mixins import SearchResultsAdminMixin
|
||||||
|
|
|
@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
|
||||||
|
|
||||||
from django import forms
|
from django import forms
|
||||||
|
|
||||||
from ..util import URL_REGEX
|
from archivebox.misc.util import URL_REGEX
|
||||||
from ..parsers import PARSERS
|
from ..parsers import PARSERS
|
||||||
from taggit.utils import edit_string_for_tags, parse_tags
|
from taggit.utils import edit_string_for_tags, parse_tags
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,7 @@ from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
|
||||||
from queues.tasks import bg_archive_snapshot
|
from queues.tasks import bg_archive_snapshot
|
||||||
|
|
||||||
from archivebox.misc.system import get_dir_size
|
from archivebox.misc.system import get_dir_size
|
||||||
from ..util import parse_date, base_url
|
from archivebox.misc.util import parse_date, base_url
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..index.html import snapshot_icons
|
from ..index.html import snapshot_icons
|
||||||
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
|
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
|
||||||
|
@ -231,7 +231,7 @@ class Snapshot(ABIDModel):
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def extension(self) -> str:
|
def extension(self) -> str:
|
||||||
from ..util import extension
|
from archivebox.misc.util import extension
|
||||||
return extension(self.url)
|
return extension(self.url)
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
|
|
|
@ -37,7 +37,7 @@ from ..config.legacy import (
|
||||||
CONFIG,
|
CONFIG,
|
||||||
)
|
)
|
||||||
from ..logging_util import printable_filesize
|
from ..logging_util import printable_filesize
|
||||||
from ..util import base_url, htmlencode, ts_to_date_str
|
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
|
||||||
from ..search import query_search_index
|
from ..search import query_search_index
|
||||||
from .serve_static import serve_static_with_byterange_support
|
from .serve_static import serve_static_with_byterange_support
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ from ..index import (
|
||||||
load_link_details,
|
load_link_details,
|
||||||
write_link_details,
|
write_link_details,
|
||||||
)
|
)
|
||||||
from ..util import enforce_types
|
from archivebox.misc.util import enforce_types
|
||||||
from ..logging_util import (
|
from ..logging_util import (
|
||||||
log_archiving_started,
|
log_archiving_started,
|
||||||
log_archiving_paused,
|
log_archiving_paused,
|
||||||
|
|
|
@ -7,7 +7,7 @@ from collections import defaultdict
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
dedupe,
|
dedupe,
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from archivebox.misc.system import run, chmod_file, atomic_write
|
from archivebox.misc.system import run, chmod_file, atomic_write
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
)
|
)
|
||||||
|
|
|
@ -6,7 +6,7 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||||
from archivebox.misc.system import chmod_file, run
|
from archivebox.misc.system import chmod_file, run
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
domain,
|
domain,
|
||||||
dedupe,
|
dedupe,
|
||||||
|
|
|
@ -6,7 +6,7 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
domain,
|
domain,
|
||||||
|
|
|
@ -6,7 +6,7 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||||
from archivebox.misc.system import atomic_write
|
from archivebox.misc.system import atomic_write
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
get_headers,
|
get_headers,
|
||||||
dedupe,
|
dedupe,
|
||||||
|
|
|
@ -13,7 +13,7 @@ from ..config.legacy import (
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
from archivebox.misc.system import atomic_write
|
from archivebox.misc.system import atomic_write
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
)
|
)
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from ..util import enforce_types, is_static_file, dedupe
|
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ import json
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||||
from archivebox.misc.system import run, atomic_write
|
from archivebox.misc.system import run, atomic_write
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
dedupe,
|
dedupe,
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
)
|
)
|
||||||
|
|
|
@ -8,7 +8,7 @@ import json
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||||
from archivebox.misc.system import run, atomic_write
|
from archivebox.misc.system import run, atomic_write
|
||||||
from ..util import enforce_types, is_static_file
|
from archivebox.misc.util import enforce_types, is_static_file
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
from .title import get_html
|
from .title import get_html
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from ..util import enforce_types, is_static_file
|
from archivebox.misc.util import enforce_types, is_static_file
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@ import json
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from ..util import enforce_types, is_static_file, dedupe
|
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
download_url,
|
download_url,
|
||||||
htmldecode,
|
htmldecode,
|
||||||
|
|
|
@ -8,7 +8,7 @@ from datetime import datetime, timezone
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from archivebox.misc.system import run, chmod_file
|
from archivebox.misc.system import run, chmod_file
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
without_fragment,
|
without_fragment,
|
||||||
without_query,
|
without_query,
|
||||||
|
|
|
@ -13,7 +13,7 @@ from django.db.models import QuerySet, Q
|
||||||
|
|
||||||
|
|
||||||
from archivebox.config import DATA_DIR, CONSTANTS, SEARCH_BACKEND_CONFIG
|
from archivebox.config import DATA_DIR, CONSTANTS, SEARCH_BACKEND_CONFIG
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
scheme,
|
scheme,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
ExtendedEncoder,
|
ExtendedEncoder,
|
||||||
|
|
|
@ -2,7 +2,7 @@ __package__ = 'archivebox.index'
|
||||||
|
|
||||||
from typing import List, Optional, Any
|
from typing import List, Optional, Any
|
||||||
|
|
||||||
from ..util import enforce_types
|
from archivebox.misc.util import enforce_types
|
||||||
from .schema import Link
|
from .schema import Link
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ from django.core.cache import cache
|
||||||
from .schema import Link
|
from .schema import Link
|
||||||
from archivebox.misc.system import atomic_write
|
from archivebox.misc.system import atomic_write
|
||||||
from ..logging_util import printable_filesize
|
from ..logging_util import printable_filesize
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
ts_to_date_str,
|
ts_to_date_str,
|
||||||
urlencode,
|
urlencode,
|
||||||
|
|
|
@ -12,7 +12,7 @@ from archivebox.config import VERSION, DATA_DIR, CONSTANTS, SERVER_CONFIG, SHELL
|
||||||
|
|
||||||
from .schema import Link
|
from .schema import Link
|
||||||
from archivebox.misc.system import atomic_write
|
from archivebox.misc.system import atomic_write
|
||||||
from ..util import enforce_types
|
from archivebox.misc.util import enforce_types
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -22,7 +22,7 @@ from archivebox.config.constants import ARCHIVE_DIR, ARCHIVE_DIR_NAME
|
||||||
from plugins_extractor.favicon.apps import FAVICON_CONFIG
|
from plugins_extractor.favicon.apps import FAVICON_CONFIG
|
||||||
|
|
||||||
from archivebox.misc.system import get_dir_size
|
from archivebox.misc.system import get_dir_size
|
||||||
from ..util import ts_to_date_str, parse_date
|
from archivebox.misc.util import ts_to_date_str, parse_date
|
||||||
|
|
||||||
|
|
||||||
class ArchiveError(Exception):
|
class ArchiveError(Exception):
|
||||||
|
@ -67,7 +67,7 @@ class ArchiveResult:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def guess_ts(_cls, dict_info):
|
def guess_ts(_cls, dict_info):
|
||||||
from ..util import parse_date
|
from archivebox.misc.util import parse_date
|
||||||
parsed_timestamp = parse_date(dict_info["timestamp"])
|
parsed_timestamp = parse_date(dict_info["timestamp"])
|
||||||
start_ts = parsed_timestamp
|
start_ts = parsed_timestamp
|
||||||
end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
|
end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
|
||||||
|
@ -75,7 +75,7 @@ class ArchiveResult:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_json(cls, json_info, guess=False):
|
def from_json(cls, json_info, guess=False):
|
||||||
from ..util import parse_date
|
from archivebox.misc.util import parse_date
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
key: val
|
key: val
|
||||||
|
@ -231,7 +231,7 @@ class Link:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_json(cls, json_info, guess=False):
|
def from_json(cls, json_info, guess=False):
|
||||||
from ..util import parse_date
|
from archivebox.misc.util import parse_date
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
key: val
|
key: val
|
||||||
|
@ -299,38 +299,38 @@ class Link:
|
||||||
### URL Helpers
|
### URL Helpers
|
||||||
@property
|
@property
|
||||||
def url_hash(self):
|
def url_hash(self):
|
||||||
from ..util import hashurl
|
from archivebox.misc.util import hashurl
|
||||||
|
|
||||||
return hashurl(self.url)
|
return hashurl(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def scheme(self) -> str:
|
def scheme(self) -> str:
|
||||||
from ..util import scheme
|
from archivebox.misc.util import scheme
|
||||||
return scheme(self.url)
|
return scheme(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def extension(self) -> str:
|
def extension(self) -> str:
|
||||||
from ..util import extension
|
from archivebox.misc.util import extension
|
||||||
return extension(self.url)
|
return extension(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def domain(self) -> str:
|
def domain(self) -> str:
|
||||||
from ..util import domain
|
from archivebox.misc.util import domain
|
||||||
return domain(self.url)
|
return domain(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def path(self) -> str:
|
def path(self) -> str:
|
||||||
from ..util import path
|
from archivebox.misc.util import path
|
||||||
return path(self.url)
|
return path(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def basename(self) -> str:
|
def basename(self) -> str:
|
||||||
from ..util import basename
|
from archivebox.misc.util import basename
|
||||||
return basename(self.url)
|
return basename(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def base_url(self) -> str:
|
def base_url(self) -> str:
|
||||||
from ..util import base_url
|
from archivebox.misc.util import base_url
|
||||||
return base_url(self.url)
|
return base_url(self.url)
|
||||||
|
|
||||||
### Pretty Printing Helpers
|
### Pretty Printing Helpers
|
||||||
|
@ -380,12 +380,12 @@ class Link:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_static(self) -> bool:
|
def is_static(self) -> bool:
|
||||||
from ..util import is_static_file
|
from archivebox.misc.util import is_static_file
|
||||||
return is_static_file(self.url)
|
return is_static_file(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_archived(self) -> bool:
|
def is_archived(self) -> bool:
|
||||||
from ..util import domain
|
from archivebox.misc.util import domain
|
||||||
|
|
||||||
output_paths = (
|
output_paths = (
|
||||||
domain(self.url),
|
domain(self.url),
|
||||||
|
|
|
@ -9,7 +9,7 @@ from django.db.models import QuerySet
|
||||||
from django.db import transaction
|
from django.db import transaction
|
||||||
|
|
||||||
from .schema import Link
|
from .schema import Link
|
||||||
from ..util import enforce_types, parse_date
|
from archivebox.misc.util import enforce_types, parse_date
|
||||||
from ..config.legacy import (
|
from ..config.legacy import (
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
TAG_SEPARATOR_PATTERN,
|
TAG_SEPARATOR_PATTERN,
|
||||||
|
|
|
@ -23,8 +23,8 @@ from rich.panel import Panel
|
||||||
|
|
||||||
from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG
|
from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG
|
||||||
from archivebox.misc.system import get_dir_size
|
from archivebox.misc.system import get_dir_size
|
||||||
from .util import enforce_types
|
from archivebox.misc.util import enforce_types
|
||||||
from .misc.logging import ANSI, stderr
|
from archivebox.misc.logging import ANSI, stderr
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RuntimeStats:
|
class RuntimeStats:
|
||||||
|
|
|
@ -28,10 +28,10 @@ from .parsers import (
|
||||||
save_file_as_source,
|
save_file_as_source,
|
||||||
parse_links_memory,
|
parse_links_memory,
|
||||||
)
|
)
|
||||||
from .index.schema import Link
|
from archivebox.misc.util import enforce_types # type: ignore
|
||||||
from .util import enforce_types # type: ignore
|
|
||||||
from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
|
from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
|
||||||
from archivebox.misc.system import run as run_shell
|
from archivebox.misc.system import run as run_shell
|
||||||
|
from .index.schema import Link
|
||||||
from .index import (
|
from .index import (
|
||||||
load_main_index,
|
load_main_index,
|
||||||
parse_links_from_source,
|
parse_links_from_source,
|
||||||
|
@ -61,14 +61,12 @@ from .index.sql import (
|
||||||
apply_migrations,
|
apply_migrations,
|
||||||
remove_from_sql_main_index,
|
remove_from_sql_main_index,
|
||||||
)
|
)
|
||||||
from .index.html import (
|
from .index.html import generate_index_from_links
|
||||||
generate_index_from_links,
|
|
||||||
)
|
|
||||||
from .index.csv import links_to_csv
|
from .index.csv import links_to_csv
|
||||||
from .extractors import archive_links, archive_link, ignore_methods
|
from .extractors import archive_links, archive_link, ignore_methods
|
||||||
from .misc.logging import stderr, hint
|
from archivebox.misc.logging import stderr, hint
|
||||||
from .misc.checks import check_data_folder
|
from archivebox.misc.checks import check_data_folder
|
||||||
from .config.legacy import (
|
from archivebox.config.legacy import (
|
||||||
write_config_file,
|
write_config_file,
|
||||||
DEPENDENCIES,
|
DEPENDENCIES,
|
||||||
load_all_config,
|
load_all_config,
|
||||||
|
@ -194,7 +192,7 @@ def version(quiet: bool=False,
|
||||||
f'PLATFORM={platform.platform()}',
|
f'PLATFORM={platform.platform()}',
|
||||||
f'PYTHON={sys.implementation.name.title()}',
|
f'PYTHON={sys.implementation.name.title()}',
|
||||||
)
|
)
|
||||||
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['DATA_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
|
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
|
||||||
print(
|
print(
|
||||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||||
|
@ -221,7 +219,7 @@ def version(quiet: bool=False,
|
||||||
|
|
||||||
print()
|
print()
|
||||||
print('{white}[i] New dependency versions:{reset}'.format(**SHELL_CONFIG.ANSI))
|
print('{white}[i] New dependency versions:{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||||
for name, binary in settings.BINARIES.items():
|
for name, binary in reversed(list(settings.BINARIES.items())):
|
||||||
err = None
|
err = None
|
||||||
try:
|
try:
|
||||||
loaded_bin = binary.load()
|
loaded_bin = binary.load()
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
__package__ = 'archivebox'
|
__package__ = 'archivebox.misc'
|
||||||
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
@ -14,8 +14,8 @@ from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedPro
|
||||||
from crontab import CronTab
|
from crontab import CronTab
|
||||||
from atomicwrites import atomic_write as lib_atomic_write
|
from atomicwrites import atomic_write as lib_atomic_write
|
||||||
|
|
||||||
from .util import enforce_types, ExtendedEncoder
|
from archivebox.config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
|
||||||
from .config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
|
from archivebox.misc.util import enforce_types, ExtendedEncoder
|
||||||
|
|
||||||
|
|
||||||
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
|
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
|
||||||
|
|
|
@ -317,22 +317,6 @@ def dedupe(options: List[str]) -> List[str]:
|
||||||
return list(deduped.values())
|
return list(deduped.values())
|
||||||
|
|
||||||
|
|
||||||
class AttributeDict(dict):
|
|
||||||
"""Helper to allow accessing dict values via Example.key or Example['key']"""
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
# Recursively convert nested dicts to AttributeDicts (optional):
|
|
||||||
# for key, val in self.items():
|
|
||||||
# if isinstance(val, dict) and type(val) is not AttributeDict:
|
|
||||||
# self[key] = AttributeDict(val)
|
|
||||||
|
|
||||||
def __getattr__(self, attr: str) -> Any:
|
|
||||||
return dict.__getitem__(self, attr)
|
|
||||||
|
|
||||||
def __setattr__(self, attr: str, value: Any) -> None:
|
|
||||||
return dict.__setitem__(self, attr, value)
|
|
||||||
|
|
||||||
|
|
||||||
class ExtendedEncoder(pyjson.JSONEncoder):
|
class ExtendedEncoder(pyjson.JSONEncoder):
|
||||||
"""
|
"""
|
|
@ -22,7 +22,7 @@ from ..config.legacy import (
|
||||||
stderr,
|
stderr,
|
||||||
hint,
|
hint,
|
||||||
)
|
)
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
basename,
|
basename,
|
||||||
htmldecode,
|
htmldecode,
|
||||||
download_url,
|
download_url,
|
||||||
|
|
|
@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
find_all_urls,
|
find_all_urls,
|
||||||
|
|
|
@ -6,7 +6,7 @@ from typing import IO, Iterable
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
)
|
)
|
||||||
|
|
|
@ -5,7 +5,7 @@ import json
|
||||||
from typing import IO, Iterable
|
from typing import IO, Iterable
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ from time import mktime
|
||||||
from feedparser import parse as feedparser
|
from feedparser import parse as feedparser
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types
|
enforce_types
|
||||||
)
|
)
|
||||||
|
|
|
@ -6,7 +6,7 @@ from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
find_all_urls,
|
find_all_urls,
|
||||||
|
|
|
@ -7,7 +7,7 @@ from datetime import datetime
|
||||||
from xml.etree import ElementTree
|
from xml.etree import ElementTree
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
)
|
)
|
||||||
|
|
|
@ -7,7 +7,7 @@ from typing import IO, Iterable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
)
|
)
|
||||||
|
|
|
@ -6,7 +6,7 @@ from time import mktime
|
||||||
from feedparser import parse as feedparser
|
from feedparser import parse as feedparser
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types
|
enforce_types
|
||||||
)
|
)
|
||||||
|
|
|
@ -11,7 +11,7 @@ from pocket import Pocket
|
||||||
from archivebox.config import CONSTANTS
|
from archivebox.config import CONSTANTS
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import enforce_types
|
from archivebox.misc.util import enforce_types
|
||||||
from archivebox.misc.system import atomic_write
|
from archivebox.misc.system import atomic_write
|
||||||
from ..config.legacy import (
|
from ..config.legacy import (
|
||||||
POCKET_CONSUMER_KEY,
|
POCKET_CONSUMER_KEY,
|
||||||
|
|
|
@ -7,7 +7,7 @@ from typing import IO, Iterable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
)
|
)
|
||||||
|
|
|
@ -11,7 +11,7 @@ from configparser import ConfigParser
|
||||||
from archivebox.config import CONSTANTS
|
from archivebox.config import CONSTANTS
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import enforce_types
|
from archivebox.misc.util import enforce_types
|
||||||
from archivebox.misc.system import atomic_write
|
from archivebox.misc.system import atomic_write
|
||||||
from ..config.legacy import READWISE_READER_TOKENS
|
from ..config.legacy import READWISE_READER_TOKENS
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import IO, Iterable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
str_between,
|
str_between,
|
||||||
|
|
|
@ -7,7 +7,7 @@ from typing import IO, Iterable
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
URL_REGEX,
|
URL_REGEX,
|
||||||
)
|
)
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import IO, Iterable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from archivebox.misc.util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
str_between,
|
str_between,
|
||||||
|
|
|
@ -3,7 +3,7 @@ __package__ = 'archivebox.plugins_extractor.chrome'
|
||||||
import sys
|
import sys
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Dict, ClassVar
|
from typing import List, Optional, Dict
|
||||||
|
|
||||||
# Depends on other PyPI/vendor packages:
|
# Depends on other PyPI/vendor packages:
|
||||||
from rich import print
|
from rich import print
|
||||||
|
@ -29,7 +29,7 @@ from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG
|
||||||
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
|
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
|
||||||
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
|
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
|
||||||
|
|
||||||
from ...util import dedupe
|
from archivebox.misc.util import dedupe
|
||||||
|
|
||||||
|
|
||||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
__package__ = 'archivebox.search'
|
||||||
|
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
@ -5,12 +7,53 @@ from django.db.models import QuerySet
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from archivebox.index.schema import Link
|
from archivebox.index.schema import Link
|
||||||
from archivebox.util import enforce_types
|
from archivebox.misc.util import enforce_types
|
||||||
from archivebox.misc.logging import stderr
|
from archivebox.misc.logging import stderr
|
||||||
|
from archivebox.config.legacy import ANSI
|
||||||
|
|
||||||
# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
|
# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
|
||||||
|
|
||||||
from .utils import get_indexable_content, log_index_started
|
|
||||||
|
def log_index_started(url):
|
||||||
|
print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
|
||||||
|
print( )
|
||||||
|
|
||||||
|
def get_file_result_content(res, extra_path, use_pwd=False):
|
||||||
|
if use_pwd:
|
||||||
|
fpath = f'{res.pwd}/{res.output}'
|
||||||
|
else:
|
||||||
|
fpath = f'{res.output}'
|
||||||
|
|
||||||
|
if extra_path:
|
||||||
|
fpath = f'{fpath}/{extra_path}'
|
||||||
|
|
||||||
|
with open(fpath, 'r', encoding='utf-8') as file:
|
||||||
|
data = file.read()
|
||||||
|
if data:
|
||||||
|
return [data]
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
# This should be abstracted by a plugin interface for extractors
|
||||||
|
@enforce_types
|
||||||
|
def get_indexable_content(results: QuerySet):
|
||||||
|
if not results:
|
||||||
|
return []
|
||||||
|
# Only use the first method available
|
||||||
|
res, method = results.first(), results.first().extractor
|
||||||
|
if method not in ('readability', 'singlefile', 'dom', 'wget'):
|
||||||
|
return []
|
||||||
|
# This should come from a plugin interface
|
||||||
|
|
||||||
|
# TODO: banish this duplication and get these from the extractor file
|
||||||
|
if method == 'readability':
|
||||||
|
return get_file_result_content(res, 'content.txt', use_pwd=True)
|
||||||
|
elif method == 'singlefile':
|
||||||
|
return get_file_result_content(res, '', use_pwd=True)
|
||||||
|
elif method == 'dom':
|
||||||
|
return get_file_result_content(res, '', use_pwd=True)
|
||||||
|
elif method == 'wget':
|
||||||
|
return get_file_result_content(res, '', use_pwd=True)
|
||||||
|
|
||||||
|
|
||||||
def import_backend():
|
def import_backend():
|
||||||
|
|
|
@ -1,45 +0,0 @@
|
||||||
from django.db.models import QuerySet
|
|
||||||
|
|
||||||
from archivebox.util import enforce_types
|
|
||||||
from archivebox.config.legacy import ANSI
|
|
||||||
|
|
||||||
def log_index_started(url):
|
|
||||||
print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
|
|
||||||
print( )
|
|
||||||
|
|
||||||
def get_file_result_content(res, extra_path, use_pwd=False):
|
|
||||||
if use_pwd:
|
|
||||||
fpath = f'{res.pwd}/{res.output}'
|
|
||||||
else:
|
|
||||||
fpath = f'{res.output}'
|
|
||||||
|
|
||||||
if extra_path:
|
|
||||||
fpath = f'{fpath}/{extra_path}'
|
|
||||||
|
|
||||||
with open(fpath, 'r', encoding='utf-8') as file:
|
|
||||||
data = file.read()
|
|
||||||
if data:
|
|
||||||
return [data]
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
# This should be abstracted by a plugin interface for extractors
|
|
||||||
@enforce_types
|
|
||||||
def get_indexable_content(results: QuerySet):
|
|
||||||
if not results:
|
|
||||||
return []
|
|
||||||
# Only use the first method available
|
|
||||||
res, method = results.first(), results.first().extractor
|
|
||||||
if method not in ('readability', 'singlefile', 'dom', 'wget'):
|
|
||||||
return []
|
|
||||||
# This should come from a plugin interface
|
|
||||||
|
|
||||||
# TODO: banish this duplication and get these from the extractor file
|
|
||||||
if method == 'readability':
|
|
||||||
return get_file_result_content(res, 'content.txt', use_pwd=True)
|
|
||||||
elif method == 'singlefile':
|
|
||||||
return get_file_result_content(res, '', use_pwd=True)
|
|
||||||
elif method == 'dom':
|
|
||||||
return get_file_result_content(res, '', use_pwd=True)
|
|
||||||
elif method == 'wget':
|
|
||||||
return get_file_result_content(res, '', use_pwd=True)
|
|
Loading…
Add table
Add a link
Reference in a new issue