Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-06-03 08:08:43 -04:00
move almost all config into new archivebox.CONSTANTS
parent f5e8d99fdf
commit bb65b2dbec
32 changed files with 982 additions and 840 deletions
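The pattern repeated across the diff below: constants that were previously imported one at a time from archivebox.config (OUTPUT_DIR, JSON_INDEX_FILENAME, ARCHIVE_DIR_NAME, ...) are now reached through the top-level package, either as archivebox.DATA_DIR / archivebox.CONSTANTS.* or via a lazy `from plugins_sys.config.constants import CONSTANTS` inside each function. A rough before/after sketch of the call-site change (illustrative only, not the actual CONSTANTS implementation):

    # Before: every module cherry-picked loose globals from ..config
    from ..config import OUTPUT_DIR, JSON_INDEX_FILENAME
    index_path = OUTPUT_DIR / JSON_INDEX_FILENAME

    # After: one import, everything namespaced under the top-level package
    import archivebox
    index_path = archivebox.DATA_DIR / archivebox.CONSTANTS.JSON_INDEX_FILENAME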
archivebox/index/__init__.py:

@@ -11,20 +11,19 @@ from contextlib import contextmanager
 from urllib.parse import urlparse

 from django.db.models import QuerySet, Q

+import archivebox
+
 from ..util import (
     scheme,
     enforce_types,
     ExtendedEncoder,
 )
+from ..misc.logging import stderr
 from ..config import (
-    ARCHIVE_DIR_NAME,
-    SQL_INDEX_FILENAME,
-    JSON_INDEX_FILENAME,
-    OUTPUT_DIR,
     TIMEOUT,
     URL_DENYLIST_PTN,
     URL_ALLOWLIST_PTN,
-    stderr,
     OUTPUT_PERMISSIONS
 )
 from ..logging_util import (
@@ -224,28 +223,28 @@ def timed_index_update(out_path: Path):

 @enforce_types
-def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, created_by_id: int | None=None) -> None:
+def write_main_index(links: List[Link], out_dir: Path=archivebox.DATA_DIR, created_by_id: int | None=None) -> None:
     """Writes links to sqlite3 file for a given list of links"""

     log_indexing_process_started(len(links))

     try:
-        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
+        with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
-            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8))  # set here because we don't write it with atomic writes
+            os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8))  # set here because we don't write it with atomic writes

     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsistencies from an ungraceful exit.')
-        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
+        with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
-            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8))  # set here because we don't write it with atomic writes
+            os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8))  # set here because we don't write it with atomic writes
         raise SystemExit(0)

     log_indexing_process_finished()


 @enforce_types
-def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
+def load_main_index(out_dir: Path=archivebox.DATA_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
     from core.models import Snapshot
     try:
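One detail worth calling out in write_main_index() above: OUTPUT_PERMISSIONS is kept as a string of octal digits, so it must be parsed with int(..., base=8) before os.chmod() can use it, and it has to be chmod-ed manually at all because the SQLite database is written by Django rather than by the atomic_write() helper that normally handles output files. A minimal sketch, assuming a value like '644':

    import os

    OUTPUT_PERMISSIONS = '644'               # octal digits stored as a string
    mode = int(OUTPUT_PERMISSIONS, base=8)   # 0o644 == 420 decimal
    assert mode == 0o644
    # os.chmod('index.sqlite3', mode)        # same effect as `chmod 644 index.sqlite3`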
@@ -255,8 +254,8 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
         raise SystemExit(0)

 @enforce_types
-def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
-    index_path = out_dir / JSON_INDEX_FILENAME
+def load_main_index_meta(out_dir: Path=archivebox.DATA_DIR) -> Optional[dict]:
+    index_path = out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             meta_dict = pyjson.load(f)
@@ -407,7 +406,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
     return search_filter(snapshots, filter_patterns, filter_type)


-def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {

@@ -415,7 +414,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
         for link in links
     }

-def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {

@@ -423,7 +422,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
         for link in filter(is_archived, links)
     }

-def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {

@@ -431,12 +430,12 @@ def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opt
         for link in filter(is_unarchived, links)
     }

-def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that actually exist in the archive/ folder"""

     all_folders = {}

-    for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             link = None
             try:

@@ -448,7 +447,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
     return all_folders

-def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
     links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {

@@ -456,16 +455,16 @@ def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional
         for link in filter(is_valid, links)
     }

-def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_invalid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
-    duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
-    orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
-    corrupted = get_corrupted_folders(snapshots, out_dir=OUTPUT_DIR)
-    unrecognized = get_unrecognized_folders(snapshots, out_dir=OUTPUT_DIR)
+    duplicate = get_duplicate_folders(snapshots, out_dir=out_dir)
+    orphaned = get_orphaned_folders(snapshots, out_dir=out_dir)
+    corrupted = get_corrupted_folders(snapshots, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir)
     return {**duplicate, **orphaned, **corrupted, **unrecognized}


-def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_duplicate_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
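Beyond the constant rename, get_invalid_folders() above also picks up a real bugfix: the four helper calls used to hard-code out_dir=OUTPUT_DIR, silently ignoring whatever out_dir the caller passed in; they now forward the function's own parameter. A minimal sketch of the difference, with hypothetical names:

    from pathlib import Path

    def helper(out_dir: Path) -> Path:
        return out_dir / 'archive'

    def broken(out_dir: Path = Path('data')) -> Path:
        return helper(out_dir=Path('data'))    # bug: caller's out_dir is ignored

    def fixed(out_dir: Path = Path('data')) -> Path:
        return helper(out_dir=out_dir)         # forwards whatever the caller passed

    assert broken(Path('/tmp/x')) == Path('data/archive')     # wrong directory
    assert fixed(Path('/tmp/x')) == Path('/tmp/x/archive')    # correct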
@@ -473,7 +472,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti

     data_folders = (
         str(entry)
-        for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir()
+        for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir()
         if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )

@@ -499,11 +498,11 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             duplicate_folders[path] = link
     return duplicate_folders

-def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_orphaned_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that contain a valid index but aren't listed in the main index"""
     orphaned_folders = {}

-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir():
         if entry.is_dir():
             link = None
             try:

@@ -517,7 +516,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
     return orphaned_folders

-def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_corrupted_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
     for snapshot in snapshots.iterator(chunk_size=500):

@@ -526,11 +525,11 @@ def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             corrupted[link.link_dir] = link
     return corrupted

-def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unrecognized_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
     unrecognized_folders: Dict[str, Optional[Link]] = {}

-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (Path(out_dir) / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             index_exists = (entry / "index.json").exists()
             link = None

@@ -595,10 +594,10 @@ def is_unarchived(link: Link) -> bool:
     return not link.is_archived


-def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
+def fix_invalid_folder_locations(out_dir: Path=archivebox.DATA_DIR) -> Tuple[List[str], List[str]]:
     fixed = []
     cant_fix = []
-    for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME):
+    for entry in os.scandir(out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:

@@ -609,7 +608,7 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
                 continue

             if not entry.path.endswith(f'/{link.timestamp}'):
-                dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
+                dest = out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME / link.timestamp
                 if dest.exists():
                     cant_fix.append(entry.path)
                 else:
archivebox/index/html.py:

@@ -1,11 +1,12 @@
 __package__ = 'archivebox.index'

+import archivebox
 from pathlib import Path
 from datetime import datetime, timezone
 from collections import defaultdict
 from typing import List, Optional, Iterator, Mapping

-from django.utils.html import format_html, mark_safe
+from django.utils.html import format_html, mark_safe  # type: ignore
 from django.core.cache import cache

 from .schema import Link

@@ -19,10 +20,6 @@ from ..util import (
     urldecode,
 )
-from ..config import (
-    OUTPUT_DIR,
-    VERSION,
-    FOOTER_INFO,
-    HTML_INDEX_FILENAME,
-    SAVE_ARCHIVE_DOT_ORG,
-    PREVIEW_ORIGINALS,
-)

@@ -36,10 +33,12 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 ### Main Links Index

 @enforce_types
-def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
+def parse_html_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[str]:
     """parse an archive index html file and return the list of urls"""

-    index_path = Path(out_dir) / HTML_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    index_path = Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             for line in f:
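Note that CONSTANTS is now imported inside the function bodies (here and in several functions below) rather than at module top level, presumably because plugins_sys.config is only fully set up during app startup, so deferring the import to call time avoids circular-import and ordering problems. The general pattern, with illustrative module names:

    # module_a.py
    def get_index_path(out_dir):
        # Deferred import: module_b (or something it imports) may itself import
        # module_a at startup, so resolving it at call time breaks the cycle.
        from module_b import CONSTANTS
        return out_dir / CONSTANTS.HTML_INDEX_FILENAME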
@@ -59,14 +58,16 @@ def generate_index_from_links(links: List[Link], with_headers: bool):
 def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
     """render the template for the entire main index"""

+    from plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG
+
     return render_django_template(template, {
-        'version': VERSION,
-        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
+        'version': archivebox.VERSION,
+        'git_sha': SHELL_CONFIG.COMMIT_HASH or archivebox.VERSION,
         'num_links': str(len(links)),
         'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
         'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),
         'links': [link._asdict(extended=True) for link in links],
-        'FOOTER_INFO': FOOTER_INFO,
+        'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
     })


@@ -74,10 +75,11 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->

 @enforce_types
 def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
+    from plugins_sys.config.constants import CONSTANTS
     out_dir = out_dir or link.link_dir

     rendered_html = link_details_template(link)
-    atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
+    atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)


 @enforce_types
archivebox/index/json.py:

@@ -8,38 +8,36 @@ from pathlib import Path
 from datetime import datetime, timezone
 from typing import List, Optional, Iterator, Any, Union

+import archivebox
+
 from .schema import Link
 from ..system import atomic_write
 from ..util import enforce_types
-from ..config import (
-    VERSION,
-    OUTPUT_DIR,
-    FOOTER_INFO,
-    DEPENDENCIES,
-    JSON_INDEX_FILENAME,
-    ARCHIVE_DIR_NAME,
-    ANSI
-)
-
-
-MAIN_INDEX_HEADER = {
-    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
-    'schema': 'archivebox.index.json',
-    'copyright_info': FOOTER_INFO,
-    'meta': {
-        'project': 'ArchiveBox',
-        'version': VERSION,
-        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
-        'website': 'https://ArchiveBox.io',
-        'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
-        'source': 'https://github.com/ArchiveBox/ArchiveBox',
-        'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
-        'dependencies': DEPENDENCIES,
-    },
-}

 @enforce_types
 def generate_json_index_from_links(links: List[Link], with_headers: bool):
+    from django.conf import settings
+    from plugins_sys.config.apps import SERVER_CONFIG
+
+    MAIN_INDEX_HEADER = {
+        'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
+        'schema': 'archivebox.index.json',
+        'copyright_info': SERVER_CONFIG.FOOTER_INFO,
+        'meta': {
+            'project': 'ArchiveBox',
+            'version': archivebox.VERSION,
+            'git_sha': archivebox.VERSION,  # not used anymore, but kept for backwards compatibility
+            'website': 'https://ArchiveBox.io',
+            'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
+            'source': 'https://github.com/ArchiveBox/ArchiveBox',
+            'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
+            'dependencies': settings.BINARIES.to_dict(),
+        },
+    }
+

     if with_headers:
         output = {
             **MAIN_INDEX_HEADER,
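Moving MAIN_INDEX_HEADER from module scope into generate_json_index_from_links() changes when its values are computed: SERVER_CONFIG.FOOTER_INFO and settings.BINARIES.to_dict() are now evaluated on each call, after Django settings are loaded, instead of once at import time. A minimal sketch of the eager-vs-lazy difference, with a hypothetical config object:

    config = {}  # imagine this gets populated during app startup

    # Import-time: evaluated once, possibly before startup has finished.
    HEADER_EAGER = {'footer': config.get('footer', 'MISSING')}

    def build_header():
        # Call-time: sees whatever the config holds right now.
        return {'footer': config.get('footer', 'MISSING')}

    config['footer'] = 'Archived with ArchiveBox'
    assert HEADER_EAGER['footer'] == 'MISSING'                     # stale value
    assert build_header()['footer'] == 'Archived with ArchiveBox'  # fresh value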
@@ -54,10 +52,12 @@ def generate_json_index_from_links(links: List[Link], with_headers: bool):

 @enforce_types
-def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
+def parse_json_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[Link]:
     """parse an archive index json file and return the list of links"""

-    index_path = Path(out_dir) / JSON_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    index_path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             try:

@@ -77,14 +77,14 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
                     yield Link.from_json(link_json)
                 except KeyError:
                     try:
-                        detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
+                        detail_index_path = CONSTANTS.ARCHIVE_DIR / link_json['timestamp']
                         yield parse_json_link_details(str(detail_index_path))
                     except KeyError:
                         # as a last effort, try to guess the missing values out of existing ones
                         try:
                             yield Link.from_json(link_json, guess=True)
                         except KeyError:
-                            print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
+                            # print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
                             continue
     return ()

@@ -94,15 +94,19 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
 def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
     """write a json file with some info about the link"""

+    from plugins_sys.config.constants import CONSTANTS
+
     out_dir = out_dir or link.link_dir
-    path = Path(out_dir) / JSON_INDEX_FILENAME
+    path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     atomic_write(str(path), link._asdict(extended=True))


 @enforce_types
-def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
+def parse_json_link_details(out_dir: Union[Path, str], guess: bool=False) -> Optional[Link]:
     """load the json link index from a given directory"""
-    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    existing_index = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:

@@ -117,7 +121,9 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
 def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
     """read through all the archive data folders and return the parsed links"""

-    for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
+    from plugins_sys.config.constants import CONSTANTS
+
+    for entry in os.scandir(CONSTANTS.ARCHIVE_DIR):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try: