use constants in more places

This commit is contained in:
Nick Sweeting 2024-09-26 02:41:09 -07:00
parent eb360f188a
commit ed45f58758
No known key found for this signature in database
5 changed files with 53 additions and 64 deletions

View file

@ -2,7 +2,6 @@ __package__ = 'archivebox.core'
from typing import Callable from typing import Callable
import threading
from pathlib import Path from pathlib import Path
from django.shortcuts import render, redirect from django.shortcuts import render, redirect
@ -12,6 +11,7 @@ from django.views import View
from django.views.generic.list import ListView from django.views.generic.list import ListView
from django.views.generic import FormView from django.views.generic import FormView
from django.db.models import Q from django.db.models import Q
from django.conf import settings
from django.contrib import messages from django.contrib import messages
from django.contrib.auth.mixins import UserPassesTestMixin from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt from django.views.decorators.csrf import csrf_exempt
@ -20,6 +20,8 @@ from django.utils.decorators import method_decorator
from admin_data_views.typing import TableContext, ItemContext from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import archivebox
from archivebox.constants import CONSTANTS
from core.models import Snapshot from core.models import Snapshot
from core.forms import AddLinkForm from core.forms import AddLinkForm
@ -27,28 +29,17 @@ from core.admin import result_url
from queues.tasks import bg_add from queues.tasks import bg_add
from ..plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG
from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from ..config import ( from ..config import (
OUTPUT_DIR,
PUBLIC_INDEX,
PUBLIC_SNAPSHOTS,
PUBLIC_ADD_VIEW,
VERSION,
COMMIT_HASH,
FOOTER_INFO,
SNAPSHOTS_PER_PAGE,
CONFIG,
CONFIG_SCHEMA, CONFIG_SCHEMA,
DYNAMIC_CONFIG_SCHEMA, DYNAMIC_CONFIG_SCHEMA,
USER_CONFIG, USER_CONFIG,
SAVE_ARCHIVE_DOT_ORG,
PREVIEW_ORIGINALS,
CONSTANTS,
) )
from ..logging_util import printable_filesize from ..logging_util import printable_filesize
from ..main import add from ..util import base_url, htmlencode, ts_to_date_str
from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
from ..search import query_search_index from ..search import query_search_index
from ..extractors.wget import wget_output_path
from .serve_static import serve_static_with_byterange_support from .serve_static import serve_static_with_byterange_support
@ -57,7 +48,7 @@ class HomepageView(View):
if request.user.is_authenticated: if request.user.is_authenticated:
return redirect('/admin/core/snapshot/') return redirect('/admin/core/snapshot/')
if PUBLIC_INDEX: if SERVER_CONFIG.PUBLIC_INDEX:
return redirect('/public') return redirect('/public')
return redirect(f'/admin/login/?next={request.path}') return redirect(f'/admin/login/?next={request.path}')
@ -166,8 +157,8 @@ class SnapshotView(View):
'status_color': 'success' if link.is_archived else 'danger', 'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date), 'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
'warc_path': warc_path, 'warc_path': warc_path,
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, 'SAVE_ARCHIVE_DOT_ORG': ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS, 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']), 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result, 'best_result': best_result,
# 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234', # 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234',
@ -176,7 +167,7 @@ class SnapshotView(View):
def get(self, request, path): def get(self, request, path):
if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS: if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}') return redirect(f'/admin/login/?next={request.path}')
snapshot = None snapshot = None
@ -381,15 +372,15 @@ class SnapshotView(View):
class PublicIndexView(ListView): class PublicIndexView(ListView):
template_name = 'public_index.html' template_name = 'public_index.html'
model = Snapshot model = Snapshot
paginate_by = SNAPSHOTS_PER_PAGE paginate_by = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
ordering = ['-bookmarked_at', '-created_at'] ordering = ['-bookmarked_at', '-created_at']
def get_context_data(self, **kwargs): def get_context_data(self, **kwargs):
return { return {
**super().get_context_data(**kwargs), **super().get_context_data(**kwargs),
'VERSION': VERSION, 'VERSION': archivebox.VERSION,
'COMMIT_HASH': COMMIT_HASH, 'COMMIT_HASH': SHELL_CONFIG.COMMIT_HASH,
'FOOTER_INFO': FOOTER_INFO, 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
} }
def get_queryset(self, **kwargs): def get_queryset(self, **kwargs):
@ -428,7 +419,7 @@ class PublicIndexView(ListView):
return qs.distinct() return qs.distinct()
def get(self, *args, **kwargs): def get(self, *args, **kwargs):
if PUBLIC_INDEX or self.request.user.is_authenticated: if SERVER_CONFIG.PUBLIC_INDEX or self.request.user.is_authenticated:
response = super().get(*args, **kwargs) response = super().get(*args, **kwargs)
return response return response
else: else:
@ -449,7 +440,7 @@ class AddView(UserPassesTestMixin, FormView):
return super().get_initial() return super().get_initial()
def test_func(self): def test_func(self):
return PUBLIC_ADD_VIEW or self.request.user.is_authenticated return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated
def get_context_data(self, **kwargs): def get_context_data(self, **kwargs):
return { return {
@ -457,8 +448,8 @@ class AddView(UserPassesTestMixin, FormView):
'title': "Add URLs", 'title': "Add URLs",
# We can't just call request.build_absolute_uri in the template, because it would include query parameters # We can't just call request.build_absolute_uri in the template, because it would include query parameters
'absolute_add_path': self.request.build_absolute_uri(self.request.path), 'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION, 'VERSION': archivebox.VERSION,
'FOOTER_INFO': FOOTER_INFO, 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
'stdout': '', 'stdout': '',
} }
@ -475,7 +466,7 @@ class AddView(UserPassesTestMixin, FormView):
"depth": depth, "depth": depth,
"parser": parser, "parser": parser,
"update_all": False, "update_all": False,
"out_dir": OUTPUT_DIR, "out_dir": archivebox.DATA_DIR,
"created_by_id": self.request.user.pk, "created_by_id": self.request.user.pk,
} }
if extractors: if extractors:

View file

@ -9,8 +9,6 @@ These are the old types we used to use before ArchiveBox v0.4 (before we switche
__package__ = 'archivebox.index' __package__ = 'archivebox.index'
from pathlib import Path
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
from typing import List, Dict, Any, Optional, Union from typing import List, Dict, Any, Optional, Union
@ -19,9 +17,13 @@ from dataclasses import dataclass, asdict, field, fields
from django.utils.functional import cached_property from django.utils.functional import cached_property
from archivebox.constants import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from plugins_extractor.favicon.apps import FAVICON_CONFIG
from ..system import get_dir_size from ..system import get_dir_size
from ..util import ts_to_date_str, parse_date from ..util import ts_to_date_str, parse_date
from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME, FAVICON_PROVIDER
class ArchiveError(Exception): class ArchiveError(Exception):
def __init__(self, message, hints=None): def __init__(self, message, hints=None):
@ -88,7 +90,7 @@ class ArchiveResult:
info['start_ts'] = parse_date(info['start_ts']) info['start_ts'] = parse_date(info['start_ts'])
info['end_ts'] = parse_date(info['end_ts']) info['end_ts'] = parse_date(info['end_ts'])
if "pwd" not in keys: if "pwd" not in keys:
info["pwd"] = str(Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / json_info["timestamp"]) info["pwd"] = str(ARCHIVE_DIR / json_info["timestamp"])
if "cmd_version" not in keys: if "cmd_version" not in keys:
info["cmd_version"] = "Undefined" info["cmd_version"] = "Undefined"
if "cmd" not in keys: if "cmd" not in keys:
@ -281,12 +283,10 @@ class Link:
@property @property
def link_dir(self) -> str: def link_dir(self) -> str:
from ..config import CONFIG return str(ARCHIVE_DIR / self.timestamp)
return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
@property @property
def archive_path(self) -> str: def archive_path(self) -> str:
from ..config import ARCHIVE_DIR_NAME
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
@property @property
@ -385,7 +385,6 @@ class Link:
@property @property
def is_archived(self) -> bool: def is_archived(self) -> bool:
from ..config import ARCHIVE_DIR
from ..util import domain from ..util import domain
output_paths = ( output_paths = (
@ -402,7 +401,7 @@ class Link:
) )
return any( return any(
(Path(ARCHIVE_DIR) / self.timestamp / path).exists() (ARCHIVE_DIR / self.timestamp / path).exists()
for path in output_paths for path in output_paths
) )
@ -438,7 +437,7 @@ class Link:
canonical = { canonical = {
'index_path': 'index.html', 'index_path': 'index.html',
'favicon_path': 'favicon.ico', 'favicon_path': 'favicon.ico',
'google_favicon_path': FAVICON_PROVIDER.format(self.domain), 'google_favicon_path': FAVICON_CONFIG.FAVICON_PROVIDER.format(self.domain),
'wget_path': wget_output_path(self), 'wget_path': wget_output_path(self),
'warc_path': 'warc/', 'warc_path': 'warc/',
'singlefile_path': 'singlefile.html', 'singlefile_path': 'singlefile.html',

View file

@ -12,6 +12,8 @@ from django.utils.html import format_html, mark_safe
from admin_data_views.typing import TableContext, ItemContext from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import archivebox
from ..config_stubs import AttrDict from ..config_stubs import AttrDict
from ..util import parse_date from ..util import parse_date
@ -378,9 +380,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
def log_list_view(request: HttpRequest, **kwargs) -> TableContext: def log_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert request.user.is_superuser, "Must be a superuser to view configuration settings." assert request.user.is_superuser, "Must be a superuser to view configuration settings."
from django.conf import settings
log_files = settings.CONFIG.LOGS_DIR.glob("*.log") log_files = archivebox.CONSTANTS.LOGS_DIR.glob("*.log")
log_files = sorted(log_files, key=os.path.getmtime)[::-1] log_files = sorted(log_files, key=os.path.getmtime)[::-1]
rows = { rows = {
@ -418,7 +419,7 @@ def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
from django.conf import settings from django.conf import settings
log_file = [logfile for logfile in settings.CONFIG.LOGS_DIR.glob('*.log') if key in logfile.name][0] log_file = [logfile for logfile in archivebox.CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0]
log_text = log_file.read_text() log_text = log_file.read_text()
log_stat = log_file.stat() log_stat = log_file.stat()

View file

@ -37,7 +37,7 @@ class RipgrepConfig(BaseConfigSet):
'--files-with-matches', '--files-with-matches',
'--regexp', '--regexp',
]) ])
RIPGREP_SEARCH_DIR: str = Field(default=lambda: str(settings.ARCHIVE_DIR)) RIPGREP_SEARCH_DIR: Path = archivebox.CONSTANTS.ARCHIVE_DIR
RIPGREP_CONFIG = RipgrepConfig() RIPGREP_CONFIG = RipgrepConfig()
@ -81,7 +81,7 @@ class RipgrepSearchBackend(BaseSearchBackend):
ripgrep_binary.abspath, ripgrep_binary.abspath,
*RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT, *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT,
text, text,
RIPGREP_CONFIG.RIPGREP_SEARCH_DIR, str(RIPGREP_CONFIG.RIPGREP_SEARCH_DIR),
] ]
proc = run(cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True) proc = run(cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True)
timestamps = set() timestamps = set()

View file

@ -18,13 +18,19 @@ from requests.exceptions import RequestException, ReadTimeout
from base32_crockford import encode as base32_encode # type: ignore from base32_crockford import encode as base32_encode # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
try: try:
import chardet import chardet
detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"] detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
except ImportError: except ImportError:
detect_encoding = lambda rawdata: "utf-8" detect_encoding = lambda rawdata: "utf-8"
from archivebox.constants import STATICFILE_EXTENSIONS
from archivebox.plugins_sys.config.apps import ARCHIVING_CONFIG
from .misc.logging import COLOR_DICT
### Parsing Helpers ### Parsing Helpers
# All of these are (str) -> str # All of these are (str) -> str
@ -114,7 +120,6 @@ def find_all_urls(urls_str: str):
def is_static_file(url: str): def is_static_file(url: str):
# TODO: the proper way is with MIME type detection + ext, not only extension # TODO: the proper way is with MIME type detection + ext, not only extension
from .config import STATICFILE_EXTENSIONS
return extension(url).lower() in STATICFILE_EXTENSIONS return extension(url).lower() in STATICFILE_EXTENSIONS
@ -206,25 +211,20 @@ def parse_date(date: Any) -> Optional[datetime]:
@enforce_types @enforce_types
def download_url(url: str, timeout: int=None) -> str: def download_url(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the text""" """Download the contents of a remote url and return the text"""
from .config import (
TIMEOUT, timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
COOKIES_FILE,
)
timeout = timeout or TIMEOUT
session = requests.Session() session = requests.Session()
if COOKIES_FILE and Path(COOKIES_FILE).is_file(): if ARCHIVING_CONFIG.COOKIES_FILE and Path(ARCHIVING_CONFIG.COOKIES_FILE).is_file():
cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE) cookie_jar = http.cookiejar.MozillaCookieJar(ARCHIVING_CONFIG.COOKIES_FILE)
cookie_jar.load(ignore_discard=True, ignore_expires=True) cookie_jar.load(ignore_discard=True, ignore_expires=True)
for cookie in cookie_jar: for cookie in cookie_jar:
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path) session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
response = session.get( response = session.get(
url, url,
headers={'User-Agent': WGET_USER_AGENT}, headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
verify=CHECK_SSL_VALIDITY, verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
timeout=timeout, timeout=timeout,
) )
@ -243,14 +243,13 @@ def download_url(url: str, timeout: int=None) -> str:
@enforce_types @enforce_types
def get_headers(url: str, timeout: int=None) -> str: def get_headers(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the headers""" """Download the contents of a remote url and return the headers"""
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
timeout = timeout or TIMEOUT
try: try:
response = requests.head( response = requests.head(
url, url,
headers={'User-Agent': WGET_USER_AGENT}, headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
verify=CHECK_SSL_VALIDITY, verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
timeout=timeout, timeout=timeout,
allow_redirects=True, allow_redirects=True,
) )
@ -261,8 +260,8 @@ def get_headers(url: str, timeout: int=None) -> str:
except RequestException: except RequestException:
response = requests.get( response = requests.get(
url, url,
headers={'User-Agent': WGET_USER_AGENT}, headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
verify=CHECK_SSL_VALIDITY, verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
timeout=timeout, timeout=timeout,
stream=True stream=True
) )
@ -285,7 +284,6 @@ def ansi_to_html(text: str) -> str:
""" """
Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
""" """
from .config import COLOR_DICT
TEMPLATE = '<span style="color: rgb{}"><br>' TEMPLATE = '<span style="color: rgb{}"><br>'
text = text.replace('[m', '</span>') text = text.replace('[m', '</span>')