Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-27 21:24:16 -04:00)

Commit c6d644be29: Merge branch 'dev' into issue1316

74 changed files with 5518 additions and 1370 deletions
archivebox/api/__init__.py (new file, 0 lines)
archivebox/api/apps.py (new file, 5 lines)

from django.apps import AppConfig


class APIConfig(AppConfig):
    name = 'api'
archivebox/api/archive.py (new file, 184 lines)

# archivebox_api.py
from typing import List, Optional
from enum import Enum
from pydantic import BaseModel
from ninja import Router
from main import (
    add,
    remove,
    update,
    list_all,
    ONLY_NEW,
)  # Assuming these functions are defined in main.py


# Schemas

class StatusChoices(str, Enum):
    indexed = 'indexed'
    archived = 'archived'
    unarchived = 'unarchived'
    present = 'present'
    valid = 'valid'
    invalid = 'invalid'
    duplicate = 'duplicate'
    orphaned = 'orphaned'
    corrupted = 'corrupted'
    unrecognized = 'unrecognized'


class AddURLSchema(BaseModel):
    urls: List[str]
    tag: str = ""
    depth: int = 0
    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
    update_all: bool = False
    index_only: bool = False
    overwrite: bool = False
    init: bool = False
    extractors: str = ""
    parser: str = "auto"


class RemoveURLSchema(BaseModel):
    yes: bool = False
    delete: bool = False
    before: Optional[float] = None
    after: Optional[float] = None
    filter_type: str = "exact"
    filter_patterns: Optional[List[str]] = None


class UpdateSchema(BaseModel):
    resume: Optional[float] = None
    only_new: Optional[bool] = None
    index_only: Optional[bool] = False
    overwrite: Optional[bool] = False
    before: Optional[float] = None
    after: Optional[float] = None
    status: Optional[StatusChoices] = None
    filter_type: Optional[str] = 'exact'
    filter_patterns: Optional[List[str]] = None
    extractors: Optional[str] = ""


class ListAllSchema(BaseModel):
    filter_patterns: Optional[List[str]] = None
    filter_type: str = 'exact'
    status: Optional[StatusChoices] = None
    after: Optional[float] = None
    before: Optional[float] = None
    sort: Optional[str] = None
    csv: Optional[str] = None
    json: bool = False
    html: bool = False
    with_headers: bool = False


# API Router
router = Router()


@router.post("/add", response={200: dict})
def api_add(request, payload: AddURLSchema):
    try:
        result = add(
            urls=payload.urls,
            tag=payload.tag,
            depth=payload.depth,
            update=payload.update,
            update_all=payload.update_all,
            index_only=payload.index_only,
            overwrite=payload.overwrite,
            init=payload.init,
            extractors=payload.extractors,
            parser=payload.parser,
        )
        # Currently the add function returns a list of ALL items in the DB, ideally only return new items
        return {
            "status": "success",
            "message": "URLs added successfully.",
            "result": str(result),
        }
    except Exception as e:
        # Handle exceptions raised by the add function or during processing
        return {"status": "error", "message": str(e)}


@router.post("/remove", response={200: dict})
def api_remove(request, payload: RemoveURLSchema):
    try:
        result = remove(
            yes=payload.yes,
            delete=payload.delete,
            before=payload.before,
            after=payload.after,
            filter_type=payload.filter_type,
            filter_patterns=payload.filter_patterns,
        )
        return {
            "status": "success",
            "message": "URLs removed successfully.",
            "result": result,
        }
    except Exception as e:
        # Handle exceptions raised by the remove function or during processing
        return {"status": "error", "message": str(e)}


@router.post("/update", response={200: dict})
def api_update(request, payload: UpdateSchema):
    try:
        result = update(
            resume=payload.resume,
            only_new=payload.only_new,
            index_only=payload.index_only,
            overwrite=payload.overwrite,
            before=payload.before,
            after=payload.after,
            status=payload.status,
            filter_type=payload.filter_type,
            filter_patterns=payload.filter_patterns,
            extractors=payload.extractors,
        )
        return {
            "status": "success",
            "message": "Archive updated successfully.",
            "result": result,
        }
    except Exception as e:
        # Handle exceptions raised by the update function or during processing
        return {"status": "error", "message": str(e)}


@router.post("/list_all", response={200: dict})
def api_list_all(request, payload: ListAllSchema):
    try:
        result = list_all(
            filter_patterns=payload.filter_patterns,
            filter_type=payload.filter_type,
            status=payload.status,
            after=payload.after,
            before=payload.before,
            sort=payload.sort,
            csv=payload.csv,
            json=payload.json,
            html=payload.html,
            with_headers=payload.with_headers,
        )
        # TODO: This is kind of bad, make the format a choice field
        if payload.json:
            return {"status": "success", "format": "json", "data": result}
        elif payload.html:
            return {"status": "success", "format": "html", "data": result}
        elif payload.csv:
            return {"status": "success", "format": "csv", "data": result}
        else:
            return {
                "status": "success",
                "message": "List generated successfully.",
                "data": result,
            }
    except Exception as e:
        # Handle exceptions raised by the list_all function or during processing
        return {"status": "error", "message": str(e)}
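A minimal sketch of how a client might call the new add endpoint once the API is mounted (the /api/ prefix and the /archive/ router path come from the core/urls.py changes further down; the host, port, token value, and use of the requests library are illustrative assumptions, not part of this commit):

# Sketch only: assumes a local server at http://127.0.0.1:8000 and the `requests` package.
import requests

BASE_URL = "http://127.0.0.1:8000/api"
TOKEN = "..."  # placeholder; obtained from POST /api/auth/authenticate (see api/auth.py below)

response = requests.post(
    f"{BASE_URL}/archive/add",
    json={"urls": ["https://example.com"], "tag": "demo", "depth": 0},
    headers={"Authorization": f"Bearer {TOKEN}"},  # GlobalAuth expects a Bearer token
)
print(response.json())  # e.g. {"status": "success", "message": "URLs added successfully.", ...}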
archivebox/api/auth.py (new file, 48 lines)

from django.contrib.auth import authenticate
from ninja import Form, Router, Schema
from ninja.security import HttpBearer

from api.models import Token

router = Router()


class GlobalAuth(HttpBearer):
    def authenticate(self, request, token):
        try:
            return Token.objects.get(token=token).user
        except Token.DoesNotExist:
            pass


class AuthSchema(Schema):
    email: str
    password: str


@router.post("/authenticate", auth=None)  # overriding global auth
def get_token(request, auth_data: AuthSchema):
    user = authenticate(username=auth_data.email, password=auth_data.password)
    if user:
        # Assuming a user can have multiple tokens and you want to create a new one every time
        new_token = Token.objects.create(user=user)
        return {"token": new_token.token, "expires": new_token.expiry_as_iso8601}
    else:
        return {"error": "Invalid credentials"}


class TokenValidationSchema(Schema):
    token: str


@router.post("/validate_token", auth=None)  # No authentication required for this endpoint
def validate_token(request, token_data: TokenValidationSchema):
    try:
        # Attempt to authenticate using the provided token
        user = GlobalAuth().authenticate(request, token_data.token)
        if user:
            return {"status": "valid"}
        else:
            return {"status": "invalid"}
    except Token.DoesNotExist:
        return {"status": "invalid"}
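A minimal sketch of the intended token flow against these two endpoints (host, port, and credentials are placeholders; both endpoints are declared with auth=None, so neither call needs a pre-existing token):

# Sketch only: assumes a local server at http://127.0.0.1:8000 and the `requests` package.
import requests

AUTH_URL = "http://127.0.0.1:8000/api/auth"

# 1. Exchange email/password for a bearer token
token = requests.post(
    f"{AUTH_URL}/authenticate",
    json={"email": "admin@example.com", "password": "changeme"},  # placeholder credentials
).json()["token"]

# 2. Check that the token is still accepted
print(requests.post(f"{AUTH_URL}/validate_token", json={"token": token}).json())
# -> {"status": "valid"} or {"status": "invalid"}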
archivebox/api/migrations/0001_initial.py (new file, 28 lines)

# Generated by Django 3.1.14 on 2024-04-09 18:52

import api.models
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

    initial = True

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='Token',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('token', models.CharField(default=api.models.hex_uuid, max_length=32, unique=True)),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('expiry', models.DateTimeField(blank=True, null=True)),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tokens', to=settings.AUTH_USER_MODEL)),
            ],
        ),
    ]
archivebox/api/migrations/__init__.py (new file, 0 lines)
archivebox/api/models.py (new file, 30 lines)

import uuid
from datetime import timedelta

from django.conf import settings
from django.db import models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _


def hex_uuid():
    return uuid.uuid4().hex


class Token(models.Model):
    user = models.ForeignKey(
        settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="tokens"
    )
    token = models.CharField(max_length=32, default=hex_uuid, unique=True)
    created = models.DateTimeField(auto_now_add=True)
    expiry = models.DateTimeField(null=True, blank=True)

    @property
    def expiry_as_iso8601(self):
        """Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
        expiry_date = (
            self.expiry if self.expiry else timezone.now() + timedelta(days=365 * 100)
        )
        return expiry_date.isoformat()

    def __str__(self):
        return self.token
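A token can also be issued by hand; a sketch from a Django shell session for the ArchiveBox project (the 'admin' username is a placeholder):

# Sketch only: run inside a Django shell with the api app installed.
from django.contrib.auth import get_user_model
from api.models import Token

user = get_user_model().objects.get(username='admin')
token = Token.objects.create(user=user)   # token string is filled in by the hex_uuid() default
print(token.token)                        # 32-character hex string
print(token.expiry_as_iso8601)            # roughly 100 years out when no expiry is set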
archivebox/api/tests.py (new file, 27 lines)

from django.test import TestCase
from ninja.testing import TestClient
from archivebox.api.archive import router as archive_router


class ArchiveBoxAPITestCase(TestCase):
    def setUp(self):
        self.client = TestClient(archive_router)

    def test_add_endpoint(self):
        response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "test"})
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.json()["status"], "success")

    def test_remove_endpoint(self):
        response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.json()["status"], "success")

    def test_update_endpoint(self):
        response = self.client.post("/update", json={})
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.json()["status"], "success")

    def test_list_all_endpoint(self):
        response = self.client.post("/list_all", json={})
        self.assertEqual(response.status_code, 200)
        self.assertTrue("success" in response.json()["status"])
@@ -112,6 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
        'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
    },

    'ARCHIVE_METHOD_TOGGLES': {

@@ -136,14 +137,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
    },

    'ARCHIVE_METHOD_OPTIONS': {
        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

        'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
        'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
        'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
        'USER_AGENT': {'type': str, 'default': None},
        'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
        'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
        'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},

        'COOKIES_FILE': {'type': str, 'default': None},
        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
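The new USER_AGENT option defaults to None, and each tool-specific user agent becomes a lambda default that prefers it over the built-in string. A minimal illustration of how such a default resolves (evaluated outside ArchiveBox's real config loader, with the long UA string abbreviated):

# Sketch only: shows the intended override behavior of the new lambda defaults.
fallback_ua = 'Mozilla/5.0 (...) ArchiveBox/{VERSION} (...) curl/{CURL_VERSION}'  # abbreviated
curl_user_agent_default = lambda c: c['USER_AGENT'] or fallback_ua

print(curl_user_agent_default({'USER_AGENT': None}))         # falls back to the curl-specific string
print(curl_user_agent_default({'USER_AGENT': 'MyBot/1.0'}))  # one USER_AGENT value wins for curl, wget, and chrome alike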
@@ -151,7 +153,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'CHROME_TIMEOUT': {'type': int, 'default': 0},
        'CHROME_HEADLESS': {'type': bool, 'default': True},
        'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
        'CHROME_EXTRA_ARGS': {'type': list, 'default': None},

        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
            '--restrict-filenames',
            '--trim-filenames', '128',
            '--write-description',
            '--write-info-json',
            '--write-annotations',

@@ -173,6 +179,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
            '--add-metadata',
            '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
        ]},
        'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},


        'WGET_ARGS': {'type': list, 'default': ['--no-verbose',

@@ -184,12 +191,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
            '--no-parent',
            '-e', 'robots=off',
        ]},
        'WGET_EXTRA_ARGS': {'type': list, 'default': None},
        'CURL_ARGS': {'type': list, 'default': ['--silent',
            '--location',
            '--compressed'
        ]},
        'CURL_EXTRA_ARGS': {'type': list, 'default': None},
        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
        'SINGLEFILE_ARGS': {'type': list, 'default' : None},
        'SINGLEFILE_ARGS': {'type': list, 'default': None},
        'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
        'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
        'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
    },

@@ -269,6 +281,7 @@ TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
PERSONAS_DIR_NAME = 'personas'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'

@@ -342,9 +355,11 @@ ALLOWED_IN_OUTPUT_DIR = {
    'static',
    'sonic',
    'search.sqlite3',
    'crontabs',
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    PERSONAS_DIR_NAME,
    SQL_INDEX_FILENAME,
    f'{SQL_INDEX_FILENAME}-wal',
    f'{SQL_INDEX_FILENAME}-shm',

@@ -363,24 +378,32 @@ ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE

############################## Version Config ##################################

def get_system_user():
    SYSTEM_USER = getpass.getuser() or os.getlogin()
def get_system_user() -> str:
    # some host OS's are unable to provide a username (k3s, Windows), making this complicated
    # uid 999 is especially problematic and breaks many attempts
    SYSTEM_USER = None
    FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'

    # Option 1
    try:
        import pwd
        return pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
    except KeyError:
        # Process' UID might not map to a user in cases such as running the Docker image
        # (where `archivebox` is 999) as a different UID.
        pass
    except ModuleNotFoundError:
        # pwd doesn't exist on windows
        pass
    except Exception:
        # this should never happen, uncomment to debug
        # raise
        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
    except (ModuleNotFoundError, Exception):
        pass

    return SYSTEM_USER
    # Option 2
    try:
        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
    except Exception:
        pass

    # Option 3
    try:
        SYSTEM_USER = SYSTEM_USER or os.getlogin()
    except Exception:
        pass

    return SYSTEM_USER or FALLBACK_USER_PLACHOLDER

def get_version(config):
    try:
@@ -487,9 +510,10 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
    'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
    'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
    'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
    'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
    'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
    'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},   # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
    'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},   # exec is always needed to list directories

@@ -519,6 +543,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
    'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
    'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
    'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
    'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},

@@ -529,18 +554,22 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
    'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
    'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},

    'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},

    'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
    'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
    'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
    'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},

    'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
    'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},

    'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},   # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
    'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
    'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},

    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},

@@ -550,6 +579,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
    'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
    'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
    'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},

    'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
    'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},

@@ -571,6 +601,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
    'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
    'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
    'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
    'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
    'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
@@ -899,27 +930,36 @@ def find_chrome_binary() -> Optional[str]:

def find_chrome_data_dir() -> Optional[str]:
    """find any installed chrome user data directories in the default locations"""
    # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # make sure data dir finding precedence order always matches binary finding order
    default_profile_paths = (
        '~/.config/chromium',
        '~/Library/Application Support/Chromium',
        '~/AppData/Local/Chromium/User Data',
        '~/.config/chrome',
        '~/.config/google-chrome',
        '~/Library/Application Support/Google/Chrome',
        '~/AppData/Local/Google/Chrome/User Data',
        '~/.config/google-chrome-stable',
        '~/.config/google-chrome-beta',
        '~/Library/Application Support/Google/Chrome Canary',
        '~/AppData/Local/Google/Chrome SxS/User Data',
        '~/.config/google-chrome-unstable',
        '~/.config/google-chrome-dev',
    )
    for path in default_profile_paths:
        full_path = Path(path).resolve()
        if full_path.exists():
            return full_path
    # deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior.

    # Going forward we want to discourage people from using their main chrome profile for archiving.
    # Session tokens, personal data, and cookies are often returned in server responses,
    # when they get archived, they are essentially burned as anyone who can view the archive
    # can use that data to masquerade as the logged-in user that did the archiving.
    # For this reason users should always create dedicated burner profiles for archiving and not use
    # their daily driver main accounts.

    # # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # # make sure data dir finding precedence order always matches binary finding order
    # default_profile_paths = (
    #     '~/.config/chromium',
    #     '~/Library/Application Support/Chromium',
    #     '~/AppData/Local/Chromium/User Data',
    #     '~/.config/chrome',
    #     '~/.config/google-chrome',
    #     '~/Library/Application Support/Google/Chrome',
    #     '~/AppData/Local/Google/Chrome/User Data',
    #     '~/.config/google-chrome-stable',
    #     '~/.config/google-chrome-beta',
    #     '~/Library/Application Support/Google/Chrome Canary',
    #     '~/AppData/Local/Google/Chrome SxS/User Data',
    #     '~/.config/google-chrome-unstable',
    #     '~/.config/google-chrome-dev',
    # )
    # for path in default_profile_paths:
    #     full_path = Path(path).resolve()
    #     if full_path.exists():
    #         return full_path
    return None

def wget_supports_compression(config):
@@ -990,6 +1030,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
            'enabled': True,
            'is_valid': config['LOGS_DIR'].exists(),
        },
        'PERSONAS_DIR': {
            'path': config['PERSONAS_DIR'].resolve(),
            'enabled': True,
            'is_valid': config['PERSONAS_DIR'].exists(),
        },
        'ARCHIVE_DIR': {
            'path': config['ARCHIVE_DIR'].resolve(),
            'enabled': True,

@@ -1337,6 +1382,8 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO

    (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
    (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
    (Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
    (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)
@@ -1 +1,2 @@
__package__ = 'archivebox.core'
@@ -24,8 +24,16 @@ from core.mixins import SearchResultsAdminMixin
from index.html import snapshot_icons
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
from extractors import archive_links
from config import (
    OUTPUT_DIR,
    SNAPSHOTS_PER_PAGE,
    VERSION,
    VERSIONS_AVAILABLE,
    CAN_UPGRADE
)

GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}

# Admin URLs
# /admin/
@@ -40,6 +48,60 @@ from extractors import archive_links
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel


class ArchiveBoxAdmin(admin.AdminSite):
    site_header = 'ArchiveBox'
    index_title = 'Links'
    site_title = 'Index'
    namespace = 'admin'

    def get_urls(self):
        return [
            path('core/snapshot/add/', self.add_view, name='Add'),
        ] + super().get_urls()

    def add_view(self, request):
        if not request.user.is_authenticated:
            return redirect(f'/admin/login/?next={request.path}')

        request.current_app = self.name
        context = {
            **self.each_context(request),
            'title': 'Add URLs',
        }

        if request.method == 'GET':
            context['form'] = AddLinkForm()

        elif request.method == 'POST':
            form = AddLinkForm(request.POST)
            if form.is_valid():
                url = form.cleaned_data["url"]
                print(f'[+] Adding URL: {url}')
                depth = 0 if form.cleaned_data["depth"] == "0" else 1
                input_kwargs = {
                    "urls": url,
                    "depth": depth,
                    "update_all": False,
                    "out_dir": OUTPUT_DIR,
                }
                add_stdout = StringIO()
                with redirect_stdout(add_stdout):
                    add(**input_kwargs)
                print(add_stdout.getvalue())

                context.update({
                    "stdout": ansi_to_html(add_stdout.getvalue().strip()),
                    "form": AddLinkForm()
                })
            else:
                context["form"] = form

        return render(template_name='add.html', request=request, context=context)

archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.register(get_user_model())
archivebox_admin.disable_action('delete_selected')

class ArchiveResultInline(admin.TabularInline):
    model = ArchiveResult
@@ -49,11 +111,11 @@ class TagInline(admin.TabularInline):
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import AutocompleteSelectMultiple

# WIP: broken by Django 3.1.2 -> 4.0 migration
class AutocompleteTags:
    model = Tag
    search_fields = ['name']
    name = 'tags'
    remote_field = TagInline

class AutocompleteTagsAdminStub:
    name = 'admin'

@@ -63,7 +125,6 @@ class SnapshotActionForm(ActionForm):
    tags = forms.ModelMultipleChoiceField(
        queryset=Tag.objects.all(),
        required=False,
        # WIP: broken by Django 3.1.2 -> 4.0 migration
        widget=AutocompleteSelectMultiple(
            AutocompleteTags(),
            AutocompleteTagsAdminStub(),

@@ -82,6 +143,7 @@ class SnapshotActionForm(ActionForm):
    # )


@admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
    list_display = ('added', 'title_str', 'files', 'size', 'url_str')
    sort_fields = ('title_str', 'url_str', 'added', 'files')

@@ -97,6 +159,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):

    action_form = SnapshotActionForm

    def changelist_view(self, request, extra_context=None):
        extra_context = extra_context or {}
        return super().changelist_view(request, extra_context | GLOBAL_CONTEXT)

    def get_urls(self):
        urls = super().get_urls()
        custom_urls = [

@@ -164,6 +230,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
            obj.id,
        )

    @admin.display(
        description='Title',
        ordering='title',
    )
    def title_str(self, obj):
        canon = obj.as_link().canonical_outputs()
        tags = ''.join(

@@ -185,12 +255,17 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
            urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
        ) + mark_safe(f' <span class="tags">{tags}</span>')

    @admin.display(
        description='Files Saved',
        ordering='archiveresult_count',
    )
    def files(self, obj):
        return snapshot_icons(obj)

    files.admin_order_field = 'archiveresult_count'
    files.short_description = 'Files Saved'

    @admin.display(
        ordering='archiveresult_count'
    )
    def size(self, obj):
        archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
        if archive_size:

@@ -205,8 +280,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
            size_txt,
        )

    size.admin_order_field = 'archiveresult_count'

    @admin.display(
        description='Original URL',
        ordering='url',
    )
    def url_str(self, obj):
        return format_html(
            '<a href="{}"><code style="user-select: all;">{}</code></a>',
@@ -243,65 +321,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
    #     print('[*] Got request', request.method, request.POST)
    #     return super().changelist_view(request, extra_context=None)

    @admin.action(
        description="Pull"
    )
    def update_snapshots(self, request, queryset):
        archive_links([
            snapshot.as_link()
            for snapshot in queryset
        ], out_dir=OUTPUT_DIR)
    update_snapshots.short_description = "Pull"

    @admin.action(
        description="⬇️ Title"
    )
    def update_titles(self, request, queryset):
        archive_links([
            snapshot.as_link()
            for snapshot in queryset
        ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
    update_titles.short_description = "⬇️ Title"

    @admin.action(
        description="Re-Snapshot"
    )
    def resnapshot_snapshot(self, request, queryset):
        for snapshot in queryset:
            timestamp = datetime.now(timezone.utc).isoformat('T', 'seconds')
            new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
            add(new_url, tag=snapshot.tags_str())
    resnapshot_snapshot.short_description = "Re-Snapshot"

    @admin.action(
        description="Reset"
    )
    def overwrite_snapshots(self, request, queryset):
        archive_links([
            snapshot.as_link()
            for snapshot in queryset
        ], overwrite=True, out_dir=OUTPUT_DIR)
    overwrite_snapshots.short_description = "Reset"

    @admin.action(
        description="Delete"
    )
    def delete_snapshots(self, request, queryset):
        remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)

    delete_snapshots.short_description = "Delete"

    @admin.action(
        description="+"
    )
    def add_tags(self, request, queryset):
        tags = request.POST.getlist('tags')
        print('[+] Adding tags', tags, 'to Snapshots', queryset)
        for obj in queryset:
            obj.tags.add(*tags)

    add_tags.short_description = "+"

    @admin.action(
        description="–"
    )
    def remove_tags(self, request, queryset):
        tags = request.POST.getlist('tags')
        print('[-] Removing tags', tags, 'to Snapshots', queryset)
        for obj in queryset:
            obj.tags.remove(*tags)

    remove_tags.short_description = "–"



    title_str.short_description = 'Title'
    url_str.short_description = 'Original URL'

    title_str.admin_order_field = 'title'
    url_str.admin_order_field = 'url'




@admin.register(Tag, site=archivebox_admin)
class TagAdmin(admin.ModelAdmin):
    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
    sort_fields = ('id', 'name', 'slug')
@@ -332,6 +421,7 @@ class TagAdmin(admin.ModelAdmin):
        ) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...<a>' if obj.snapshot_set.count() > 10 else ''))


@admin.register(ArchiveResult, site=archivebox_admin)
class ArchiveResultAdmin(admin.ModelAdmin):
    list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str')
    sort_fields = ('start_ts', 'extractor', 'status')

@@ -344,6 +434,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
    ordering = ['-start_ts']
    list_per_page = SNAPSHOTS_PER_PAGE

    @admin.display(
        description='snapshot'
    )
    def snapshot_str(self, obj):
        return format_html(
            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'

@@ -353,6 +446,9 @@ class ArchiveResultAdmin(admin.ModelAdmin):
            obj.snapshot.url[:128],
        )

    @admin.display(
        description='tags'
    )
    def tags_str(self, obj):
        return obj.snapshot.tags_str()


@@ -369,62 +465,3 @@ class ArchiveResultAdmin(admin.ModelAdmin):
            obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
            obj.output,
        )

    tags_str.short_description = 'tags'
    snapshot_str.short_description = 'snapshot'

class ArchiveBoxAdmin(admin.AdminSite):
    site_header = 'ArchiveBox'
    index_title = 'Links'
    site_title = 'Index'

    def get_urls(self):
        return [
            path('core/snapshot/add/', self.add_view, name='Add'),
        ] + super().get_urls()

    def add_view(self, request):
        if not request.user.is_authenticated:
            return redirect(f'/admin/login/?next={request.path}')

        request.current_app = self.name
        context = {
            **self.each_context(request),
            'title': 'Add URLs',
        }

        if request.method == 'GET':
            context['form'] = AddLinkForm()

        elif request.method == 'POST':
            form = AddLinkForm(request.POST)
            if form.is_valid():
                url = form.cleaned_data["url"]
                print(f'[+] Adding URL: {url}')
                depth = 0 if form.cleaned_data["depth"] == "0" else 1
                input_kwargs = {
                    "urls": url,
                    "depth": depth,
                    "update_all": False,
                    "out_dir": OUTPUT_DIR,
                }
                add_stdout = StringIO()
                with redirect_stdout(add_stdout):
                    add(**input_kwargs)
                print(add_stdout.getvalue())

                context.update({
                    "stdout": ansi_to_html(add_stdout.getvalue().strip()),
                    "form": AddLinkForm()
                })
            else:
                context["form"] = form

        return render(template_name='add.html', request=request, context=context)

admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(ArchiveResult, ArchiveResultAdmin)
admin.site.disable_action('delete_selected')
@@ -3,5 +3,8 @@ from django.apps import AppConfig

class CoreConfig(AppConfig):
    name = 'core'
    # WIP: broken by Django 3.1.2 -> 4.0 migration
    default_auto_field = 'django.db.models.UUIDField'

    def ready(self):
        from .auth import register_signals

        register_signals()
archivebox/core/auth.py (new file, 13 lines)

import os
from django.conf import settings
from ..config import (
    LDAP
)

def register_signals():

    if LDAP:
        import django_auth_ldap.backend
        from .auth_ldap import create_user

        django_auth_ldap.backend.populate_user.connect(create_user)
archivebox/core/auth_ldap.py (new file, 12 lines)

from django.conf import settings
from ..config import (
    LDAP_CREATE_SUPERUSER
)

def create_user(sender, user=None, ldap_user=None, **kwargs):

    if not user.id and LDAP_CREATE_SUPERUSER:
        user.is_superuser = True

    user.is_staff = True
    print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
@@ -61,6 +61,7 @@ INSTALLED_APPS = [
    'django.contrib.admin',

    'core',
    'api',

    'django_extensions',
]

@@ -269,9 +270,6 @@ AUTH_PASSWORD_VALIDATORS = [
    {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]

# WIP: broken by Django 3.1.2 -> 4.0 migration
DEFAULT_AUTO_FIELD = 'django.db.models.UUIDField'

################################################################################
### Shell Settings
################################################################################

@@ -290,7 +288,6 @@ if IS_SHELL:

LANGUAGE_CODE = 'en-us'
USE_I18N = True
USE_L10N = True
USE_TZ = True
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
@@ -1,4 +1,4 @@
from django.contrib import admin
from .admin import archivebox_admin

from django.urls import path, include
from django.views import static

@@ -8,6 +8,13 @@ from django.views.generic.base import RedirectView

from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView

from ninja import NinjaAPI
from api.auth import GlobalAuth

api = NinjaAPI(auth=GlobalAuth())
api.add_router("/auth/", "api.auth.router")
api.add_router("/archive/", "api.archive.router")

# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}

@@ -34,10 +41,9 @@ urlpatterns = [


    path('accounts/', include('django.contrib.auth.urls')),
    path('admin/', admin.site.urls),
    path('admin/', archivebox_admin.urls),

    # do not add extra_context like this as not all admin views (e.g. ModelAdmin.autocomplete_view accept extra kwargs)
    # path('admin/', admin.site.urls, {'extra_context': GLOBAL_CONTEXT}),
    path("api/", api.urls),

    path('health/', HealthCheckView.as_view(), name='healthcheck'),
    path('error/', lambda _: 1/0),
@@ -131,7 +131,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s

        link = load_link_details(link, out_dir=out_dir)
        write_link_details(link, out_dir=out_dir, skip_sql_index=False)
        log_link_archiving_started(link, out_dir, is_new)
        log_link_archiving_started(link, str(out_dir), is_new)
        link = link.overwrite(updated=datetime.now(timezone.utc))
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
        start_ts = datetime.now(timezone.utc)

@@ -165,16 +165,6 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                    # print('{black}    X {}{reset}'.format(method_name, **ANSI))
                    stats['skipped'] += 1
            except Exception as e:
                # Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
                # and https://github.com/ArchiveBox/ArchiveBox/issues/1014
                # are fixed.
                """
                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                    method_name,
                    link.url,
                )) from e
                """
                # Instead, use the kludgy workaround from
                # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
                with open(ERROR_LOG, "a", encoding='utf-8') as f:
                    command = ' '.join(sys.argv)

@@ -186,6 +176,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                        ts
                    ) + "\n" + str(e) + "\n"))
                    #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

                # print(f'    ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                    method_name,
                    link.url,
                )) from e


        # print('    ', stats)

@@ -218,7 +215,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa

    if type(all_links) is QuerySet:
        num_links: int = all_links.count()
        get_link = lambda x: x.as_link()
        get_link = lambda x: x.as_link_with_details()
        all_links = all_links.iterator()
    else:
        num_links: int = len(all_links)
@@ -10,10 +10,12 @@ from ..system import run, chmod_file
from ..util import (
    enforce_types,
    is_static_file,
    dedupe,
)
from ..config import (
    TIMEOUT,
    CURL_ARGS,
    CURL_EXTRA_ARGS,
    CHECK_SSL_VALIDITY,
    SAVE_ARCHIVE_DOT_ORG,
    CURL_BINARY,

@@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
    output: ArchiveOutput = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
    cmd = [
        CURL_BINARY,
    # later options take precedence
    options = [
        *CURL_ARGS,
        *CURL_EXTRA_ARGS,
        '--head',
        '--max-time', str(timeout),
        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
    ]
    cmd = [
        CURL_BINARY,
        *dedupe(options),
        submit_url,
    ]
    status = 'succeeded'
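This extractor, and the favicon, headers, media, mercury, singlefile, title, and wget extractors below, all switch to building an options list and passing it through dedupe() before the binary name. util.dedupe() itself is not shown in this diff; based on the "later options take precedence" comments, a sketch of the assumed behavior:

# Sketch only: assumed shape of util.dedupe(), keeping the last value seen per option name.
def dedupe(options):
    deduped = {}
    for option in options:
        deduped[option.split('=')[0]] = option  # later occurrences overwrite earlier ones
    return list(deduped.values())

print(dedupe(['--max-time=30', '--insecure', '--max-time=60']))
# -> ['--max-time=60', '--insecure']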
@@ -6,13 +6,18 @@ from typing import Optional

from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..system import chmod_file, run
from ..util import enforce_types, domain
from ..util import (
    enforce_types,
    domain,
    dedupe,
)
from ..config import (
    TIMEOUT,
    SAVE_FAVICON,
    FAVICON_PROVIDER,
    CURL_BINARY,
    CURL_ARGS,
    CURL_EXTRA_ARGS,
    CURL_VERSION,
    CHECK_SSL_VALIDITY,
    CURL_USER_AGENT,

@@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'favicon.ico'
    cmd = [
        CURL_BINARY,
    # later options take precedence
    options = [
        *CURL_ARGS,
        *CURL_EXTRA_ARGS,
        '--max-time', str(timeout),
        '--output', str(output),
        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
    ]
    cmd = [
        CURL_BINARY,
        *dedupe(options),
        FAVICON_PROVIDER.format(domain(link.url)),
    ]
    status = 'failed'
@@ -9,11 +9,13 @@ from ..system import atomic_write
from ..util import (
    enforce_types,
    get_headers,
    dedupe,
)
from ..config import (
    TIMEOUT,
    CURL_BINARY,
    CURL_ARGS,
    CURL_EXTRA_ARGS,
    CURL_USER_AGENT,
    CURL_VERSION,
    CHECK_SSL_VALIDITY,

@@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')

    cmd = [
        CURL_BINARY,
    # later options take precedence
    options = [
        *CURL_ARGS,
        *CURL_EXTRA_ARGS,
        '--head',
        '--max-time', str(timeout),
        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
    ]
    cmd = [
        CURL_BINARY,
        *dedupe(options),
        link.url,
    ]
    try:
@@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

    out_dir = Path(out_dir or link.link_dir)
    output = "htmltotext.txt"
    cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']

    timer = TimedProgress(timeout, prefix='      ')
    extracted_text = None
    status = 'failed'
    try:
        extractor = HTMLTextExtractor()
        document = get_html(link, out_dir)

@@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
        extracted_text = str(extractor)

        atomic_write(str(out_dir / output), extracted_text)
        status = 'succeeded'
    except (Exception, OSError) as err:
        status = 'failed'
        output = err
        cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
    finally:
        timer.end()
@@ -8,11 +8,13 @@ from ..system import run, chmod_file
from ..util import (
    enforce_types,
    is_static_file,
    dedupe,
)
from ..config import (
    MEDIA_TIMEOUT,
    SAVE_MEDIA,
    YOUTUBEDL_ARGS,
    YOUTUBEDL_EXTRA_ARGS,
    YOUTUBEDL_BINARY,
    YOUTUBEDL_VERSION,
    CHECK_SSL_VALIDITY

@@ -39,11 +41,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
    output: ArchiveOutput = 'media'
    output_path = out_dir / output
    output_path.mkdir(exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
    # later options take precedence
    options = [
        *YOUTUBEDL_ARGS,
        *YOUTUBEDL_EXTRA_ARGS,
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
        # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
    ]
    cmd = [
        YOUTUBEDL_BINARY,
        *dedupe(options),
        link.url,
    ]
    status = 'succeeded'
@@ -11,13 +11,15 @@ from ..system import run, atomic_write
from ..util import (
    enforce_types,
    is_static_file,

    dedupe,
)
from ..config import (
    TIMEOUT,
    SAVE_MERCURY,
    DEPENDENCIES,
    MERCURY_VERSION,
    MERCURY_ARGS,
    MERCURY_EXTRA_ARGS,
)
from ..logging_util import TimedProgress


@@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
    timer = TimedProgress(timeout, prefix='      ')
    try:
        output_folder.mkdir(exist_ok=True)

        # Get plain text version of article
        # later options take precedence
        options = [
            *MERCURY_ARGS,
            *MERCURY_EXTRA_ARGS,
        ]
        # By default, get plain text version of article
        cmd = [
            DEPENDENCIES['MERCURY_BINARY']['path'],
            link.url,
            "--format=text"
            *dedupe(options)
        ]
        result = run(cmd, cwd=out_dir, timeout=timeout)
        try:
@@ -11,6 +11,7 @@ from ..util import (
    enforce_types,
    is_static_file,
    chrome_args,
    dedupe,
)
from ..config import (
    TIMEOUT,

@@ -18,7 +19,9 @@ from ..config import (
    DEPENDENCIES,
    SINGLEFILE_VERSION,
    SINGLEFILE_ARGS,
    SINGLEFILE_EXTRA_ARGS,
    CHROME_BINARY,
    COOKIES_FILE,
)
from ..logging_util import TimedProgress


@@ -46,37 +49,24 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

    # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
    browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
    # later options take precedence
    options = [
        *SINGLEFILE_ARGS,
        '--browser-executable-path={}'.format(CHROME_BINARY),
        *(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
        browser_args,
        *SINGLEFILE_ARGS,
        *SINGLEFILE_EXTRA_ARGS,
    ]

    # Deduplicate options (single-file doesn't like when you use the same option two times)
    #
    # NOTE: Options names that come first clobber conflicting names that come later
    # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
    # specificity, therefore the user sets it with a lot intent, therefore it should take precedence
    # kind of like the ergonomic principle of lexical scope in programming languages.
    seen_option_names = []
    def test_seen(argument):
        option_name = argument.split("=")[0]
        if option_name in seen_option_names:
            return False
        else:
            seen_option_names.append(option_name)
            return True
    deduped_options = list(filter(test_seen, options))

    cmd = [
        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
        *deduped_options,
        *dedupe(options),
        link.url,
        output,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    result = None
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)

@@ -84,7 +74,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
        # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
            if line.strip()
        ]
        hints = (

@@ -94,12 +84,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

        # Check for common failure cases
        if (result.returncode > 0) or not (out_dir / output).is_file():
            raise ArchiveError('SingleFile was not able to archive the page', hints)
            raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
        chmod_file(output, cwd=str(out_dir))
    except (Exception, OSError) as err:
        status = 'failed'
        # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
        cmd[2] = browser_args.replace('"', "\\\"")
        err.hints = (result.stdout + result.stderr).decode().split('\n')
        output = err
    finally:
        timer.end()
@@ -10,6 +10,7 @@ from ..util import (
    enforce_types,
    download_url,
    htmldecode,
    dedupe,
)
from ..config import (
    TIMEOUT,

@@ -17,6 +18,7 @@ from ..config import (
    SAVE_TITLE,
    CURL_BINARY,
    CURL_ARGS,
    CURL_EXTRA_ARGS,
    CURL_VERSION,
    CURL_USER_AGENT,
)

@@ -75,7 +77,7 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
            with open(abs_path / source, "r", encoding="utf-8") as f:
                document = f.read()
                break
        except (FileNotFoundError, TypeError):
        except (FileNotFoundError, TypeError, UnicodeDecodeError):
            continue
    if document is None:
        return download_url(link.url, timeout=timeout)

@@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
    from core.models import Snapshot

    output: ArchiveOutput = None
    cmd = [
        CURL_BINARY,
    # later options take precedence
    options = [
        *CURL_ARGS,
        *CURL_EXTRA_ARGS,
        '--max-time', str(timeout),
        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
    ]
    cmd = [
        CURL_BINARY,
        *dedupe(options),
        link.url,
    ]
    status = 'succeeded'
@ -15,9 +15,11 @@ from ..util import (

path,
domain,
urldecode,
dedupe,
)
from ..config import (
WGET_ARGS,
WGET_EXTRA_ARGS,
TIMEOUT,
SAVE_WGET,
SAVE_WARC,

@ -55,10 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->

# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output: ArchiveOutput = None
cmd = [
WGET_BINARY,
# '--server-response', # print headers for better error parsing
# later options take precedence
options = [
*WGET_ARGS,
*WGET_EXTRA_ARGS,
'--timeout={}'.format(timeout),
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),

@ -68,6 +70,11 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->

*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
# '--server-response', # print headers for better error parsing
]
cmd = [
WGET_BINARY,
*dedupe(options),
link.url,
]
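The extractor changes above all follow the same pattern: collect flags into an options list where later entries are meant to win, then collapse conflicts with the dedupe() helper (defined further down in this diff) before wrapping the binary and the URL around the result. A minimal sketch of that precedence behavior, using made-up flag values for illustration:

    def dedupe(options):
        # later options clobber earlier conflicting options (keyed on the part before '=')
        deduped = {}
        for option in options:
            deduped[option.split('=')[0]] = option
        return list(deduped.values())

    # hypothetical defaults and user-supplied extras, for illustration only
    WGET_ARGS = ['--timeout=60', '--tries=3']
    WGET_EXTRA_ARGS = ['--timeout=120']

    print(dedupe([*WGET_ARGS, *WGET_EXTRA_ARGS, '--timeout=30']))
    # ['--timeout=30', '--tries=3']

Flags that take their value as a separate list item (e.g. '--max-time', str(timeout)) are keyed on each item individually, so only '='-style flags are actually collapsed.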
0
archivebox/index.sqlite3
Normal file
@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:

"""parse and load existing index with any new links from import_path merged in"""
from core.models import Snapshot
try:
return Snapshot.objects.all()
return Snapshot.objects.all().only('id')

except (KeyboardInterrupt, SystemExit):
raise SystemExit(0)
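The switch to .only('id') defers every other Snapshot column, so loading the main index no longer pulls full rows it may never touch. A rough sketch of the difference, assuming a configured Django project with ArchiveBox's core app importable:

    from core.models import Snapshot  # assumes Django settings are already configured

    full_rows = Snapshot.objects.all()             # every column fetched up front
    slim_rows = Snapshot.objects.all().only('id')  # only the primary key is loaded eagerly

    for snapshot in slim_rows.iterator():
        print(snapshot.id)       # already loaded, no extra query
        # snapshot.title         # touching a deferred field would trigger a per-row query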
@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type

def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in links

@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option

def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_archived, links)

@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio

def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_unarchived, links)
@ -432,12 +432,14 @@ def log_archive_method_finished(result: "ArchiveResult"):

**ANSI,
),
]

# import pudb; pudb.set_trace()

# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
else:
if isinstance(hints, bytes):
hints = hints.decode()
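The new comprehension keeps every hint rather than silently dropping anything that is not bytes, which is what the old filter did. A tiny before/after sketch with made-up hint values:

    hints = [b'curl: (6) Could not resolve host', 'Received HTTP 404', 500]

    old = [hint.decode() for hint in hints if isinstance(hint, bytes)]
    new = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]

    assert old == ['curl: (6) Could not resolve host']
    assert new == ['curl: (6) Could not resolve host', 'Received HTTP 404', '500']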
@ -492,12 +494,12 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):

if delete:
file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' +
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
)
else:
print(
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' +
' (Pass --delete if you also want to permanently delete the data folders)'
)
|
|||
|
||||
@enforce_types
|
||||
def printable_dependency_version(name: str, dependency: Dict) -> str:
|
||||
version = None
|
||||
color, symbol, note, version = 'red', 'X', 'invalid', '?'
|
||||
|
||||
if dependency['enabled']:
|
||||
if dependency['is_valid']:
|
||||
color, symbol, note, version = 'green', '√', 'valid', ''
|
||||
color, symbol, note = 'green', '√', 'valid'
|
||||
|
||||
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
|
||||
if parsed_version_num:
|
||||
version = f'v{parsed_version_num[0]}'
|
||||
|
||||
if not version:
|
||||
color, symbol, note, version = 'red', 'X', 'invalid', '?'
|
||||
else:
|
||||
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
|
||||
|
||||
|
|
|
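Instead of trusting each dependency dict to carry a clean version number, the version is now pulled out of the raw --version output with a regex, and anything unparseable falls back to the red 'invalid' state. A quick sketch of what that regex extracts (the sample version strings are illustrative):

    import re

    def parsed_version(version_output: str) -> str:
        match = re.search(r'[\d\.]+', version_output)
        return f'v{match[0]}' if match else '?'

    print(parsed_version('GNU Wget 1.21.4 built on linux-gnu.'))  # v1.21.4
    print(parsed_version('curl 8.5.0 (x86_64-pc-linux-gnu)'))     # v8.5.0
    print(parsed_version('no digits at all'))                     # ?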
@ -791,6 +791,8 @@ def update(resume: Optional[float]=None,

out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""

from core.models import ArchiveResult

check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = [] # TODO: Remove input argument: only_new

@ -798,19 +800,23 @@ def update(resume: Optional[float]=None,

extractors = extractors.split(",") if extractors else []

# Step 1: Filter for selected_links
print('[*] Finding matching Snapshots to update...')
print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
matching_snapshots = list_links(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
)

print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
matching_folders = list_folders(
links=matching_snapshots,
status=status,
out_dir=out_dir,
)
all_links = [link for link in matching_folders.values() if link]
all_links = (link for link in matching_folders.values() if link)
print(' - Sorting by most unfinished -> least unfinished + date archived...')
all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))

if index_only:
for link in all_links:

@ -836,6 +842,7 @@ def update(resume: Optional[float]=None,

if extractors:
archive_kwargs["methods"] = extractors

archive_links(to_archive, overwrite=overwrite, **archive_kwargs)

# Step 4: Re-write links index with updated titles, icons, and resources
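The update command now orders snapshots by how little archiving has been completed for them (fewest existing ArchiveResult rows first), with the snapshot timestamp as a tie-breaker, so the most neglected links are retried first. A plain-Python sketch of that sort key, with a stand-in result_count() in place of the ORM query the real code runs:

    from types import SimpleNamespace

    links = [
        SimpleNamespace(url='https://a.example', timestamp='1700000300'),
        SimpleNamespace(url='https://b.example', timestamp='1700000100'),
        SimpleNamespace(url='https://c.example', timestamp='1700000200'),
    ]
    existing_results = {'https://a.example': 5, 'https://b.example': 0, 'https://c.example': 0}

    def result_count(link):
        # stand-in for ArchiveResult.objects.filter(snapshot__url=link.url).count()
        return existing_results[link.url]

    ordered = sorted(links, key=lambda link: (result_count(link), link.timestamp))
    print([link.url for link in ordered])
    # ['https://b.example', 'https://c.example', 'https://a.example']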
2371
archivebox/package-lock.json
generated
Normal file
File diff suppressed because it is too large
@ -1,6 +1,6 @@

{
"name": "archivebox",
"version": "0.7.3",
"version": "0.8.0",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",

@ -8,6 +8,6 @@

"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.46"
"single-file-cli": "^1.1.54"
}
}
@ -44,6 +44,7 @@ from . import medium_rss

from . import netscape_html
from . import generic_rss
from . import generic_json
from . import generic_jsonl
from . import generic_html
from . import generic_txt
from . import url_list

@ -63,6 +64,7 @@ PARSERS = {

netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER),
generic_html.KEY: (generic_html.NAME, generic_html.PARSER),

# Catchall fallback parser
@ -11,6 +11,60 @@ from ..util import (

enforce_types,
)

# This gets used by generic_jsonl, too
def jsonObjectToLink(link: str, source: str):
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())

# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()

# if we have a list, join it with commas
tags = link.get('tags')
if type(tags) == list:
tags = ','.join(tags)
elif type(tags) == str:
# if there's no comma, assume it was space-separated
if ',' not in tags:
tags = tags.replace(' ', ',')

return Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(tags),
sources=[source],
)

@enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:

@ -18,55 +72,21 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:

json_file.seek(0)

# sometimes the first line is a comment or filepath, so we get everything after the first {
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
links = json.loads(json_file_json_str)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
try:
links = json.load(json_file)
if type(links) != list:
raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
except json.decoder.JSONDecodeError:
# sometimes the first line is a comment or other junk, so try without
json_file.seek(0)
first_line = json_file.readline()
#print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
links = json.load(json_file)
# we may fail again, which means we really don't know what to do

for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if link:
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

# Parse the timestamp
ts_str = str(datetime.now(timezone.utc).timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())

# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()

yield Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name],
)

yield jsonObjectToLink(link,json_file.name)

KEY = 'json'
NAME = 'Generic JSON'
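With the per-entry logic factored into jsonObjectToLink, the JSON and JSONL parsers share one set of fallback rules for URL, timestamp, title, and tags. A simplified sketch of those fallbacks applied to a Pinboard-style record (sample data only; the real helper additionally html-decodes each field and wraps the result in a Link):

    from datetime import datetime

    record = {
        "href": "http://www.reddit.com/r/example",
        "description": "title here",
        "time": "2014-06-14T15:51:42+00:00",
        "tags": "reddit android",
    }

    url = record.get('href') or record.get('url') or record.get('URL')
    title = (record.get('title') or record.get('description') or record.get('name') or '').strip() or None
    timestamp = str(datetime.strptime(record['time'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
    tags = record['tags'].replace(' ', ',') if ',' not in record['tags'] else record['tags']

    print(url, title, timestamp, tags, sep='\n')
    # http://www.reddit.com/r/example
    # title here
    # 1402761102.0
    # reddit,android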
34
archivebox/parsers/generic_jsonl.py
Normal file

@ -0,0 +1,34 @@

__package__ = 'archivebox.parsers'

import json

from typing import IO, Iterable
from datetime import datetime, timezone

from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)

from .generic_json import jsonObjectToLink

def parse_line(line: str):
if line.strip() != "":
return json.loads(line)

@enforce_types
def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse JSONL format bookmarks export files"""

json_file.seek(0)

links = [ parse_line(line) for line in json_file ]

for link in links:
if link:
yield jsonObjectToLink(link,json_file.name)

KEY = 'jsonl'
NAME = 'Generic JSONL'
PARSER = parse_generic_jsonl_export
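The new JSONL parser is essentially "one JSON object per non-empty line, then reuse jsonObjectToLink". A self-contained sketch of that line-by-line flow using an in-memory file and made-up bookmark records:

    import io
    import json

    jsonl_export = io.StringIO(
        '{"url": "https://example.com", "title": "Example", "tags": "demo test"}\n'
        '\n'
        '{"href": "https://example.org/post", "description": "Another one"}\n'
    )

    def parse_line(line):
        if line.strip() != "":
            return json.loads(line)

    records = [parse_line(line) for line in jsonl_export]  # blank lines become None
    for record in records:
        if record:
            print(record.get('url') or record.get('href'))
    # https://example.com
    # https://example.org/post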
@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers'

from typing import IO, Iterable
from datetime import datetime
from time import mktime
from feedparser import parse as feedparser

from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
str_between,
enforce_types
)

@enforce_types

@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:

"""Parse RSS XML-format files into links"""

rss_file.seek(0)
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
title = item.title
time = mktime(item.updated_parsed)

trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')
try:
tags = ','.join(map(lambda tag: tag.term, item.tags))
except AttributeError:
tags = ''

def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue

yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=None,
tags=tags,
sources=[rss_file.name],
)
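Dropping the hand-rolled <item> splitting in favour of feedparser makes the RSS parser tolerant of CDATA, namespaces, and attribute ordering. A self-contained sketch of the feedparser calls the new code relies on, run against an inline sample feed (the feed contents are made up):

    from time import mktime
    from feedparser import parse as feedparser  # pip install feedparser

    SAMPLE_RSS = '''<?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0"><channel><title>Example feed</title>
      <item>
        <title>How JavaScript works: inside the V8 engine</title>
        <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        <category>engines</category>
        <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
      </item>
    </channel></rss>'''

    feed = feedparser(SAMPLE_RSS)
    for item in feed.entries:
        try:
            tags = ','.join(tag.term for tag in item.tags)
        except AttributeError:
            tags = ''
        print(item.link, item.title, mktime(item.updated_parsed), tags)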
@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers'

from typing import IO, Iterable
from datetime import datetime, timezone

from xml.etree import ElementTree
from time import mktime
from feedparser import parse as feedparser

from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
enforce_types
)

@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""

rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore
feed = feedparser(rss_file.read())
for item in feed.entries:
url = item.link
# title will start with "[priv] " if pin was marked private. useful?
title = item.title
time = mktime(item.updated_parsed)

url = find("{http://purl.org/rss/1.0/}link")
tags = find("{http://purl.org/dc/elements/1.1/}subject")
title = find("{http://purl.org/rss/1.0/}title")
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
# all tags are in one entry.tags with spaces in it. annoying!
try:
tags = item.tags[0].term.replace(' ', ',')
except AttributeError:
tags = ''

if url is None:
# Yielding a Link with no URL will
# crash on a URL validation assertion
continue

# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]

if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now(timezone.utc)

yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
timestamp=str(time),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],
|
|||
|
||||
if capture_output:
|
||||
if ('stdout' in kwargs) or ('stderr' in kwargs):
|
||||
raise ValueError('stdout and stderr arguments may not be used '
|
||||
'with capture_output.')
|
||||
raise ValueError('stdout and stderr arguments may not be used with capture_output.')
|
||||
kwargs['stdout'] = PIPE
|
||||
kwargs['stderr'] = PIPE
|
||||
|
||||
|
@ -146,20 +145,24 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional
|
|||
recursively and limiting to a given filter list
|
||||
"""
|
||||
num_bytes, num_dirs, num_files = 0, 0, 0
|
||||
for entry in os.scandir(path):
|
||||
if (pattern is not None) and (pattern not in entry.path):
|
||||
continue
|
||||
if entry.is_dir(follow_symlinks=False):
|
||||
if not recursive:
|
||||
try:
|
||||
for entry in os.scandir(path):
|
||||
if (pattern is not None) and (pattern not in entry.path):
|
||||
continue
|
||||
num_dirs += 1
|
||||
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
|
||||
num_bytes += bytes_inside
|
||||
num_dirs += dirs_inside
|
||||
num_files += files_inside
|
||||
else:
|
||||
num_bytes += entry.stat(follow_symlinks=False).st_size
|
||||
num_files += 1
|
||||
if entry.is_dir(follow_symlinks=False):
|
||||
if not recursive:
|
||||
continue
|
||||
num_dirs += 1
|
||||
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
|
||||
num_bytes += bytes_inside
|
||||
num_dirs += dirs_inside
|
||||
num_files += files_inside
|
||||
else:
|
||||
num_bytes += entry.stat(follow_symlinks=False).st_size
|
||||
num_files += 1
|
||||
except OSError:
|
||||
# e.g. FileNameTooLong or other error while trying to read dir
|
||||
pass
|
||||
return num_bytes, num_dirs, num_files
|
||||
|
||||
|
||||
|
@ -171,7 +174,7 @@ def dedupe_cron_jobs(cron: CronTab) -> CronTab:
|
|||
deduped: Set[Tuple[str, str]] = set()
|
||||
|
||||
for job in list(cron):
|
||||
unique_tuple = (str(job.slices), job.command)
|
||||
unique_tuple = (str(job.slices), str(job.command))
|
||||
if unique_tuple not in deduped:
|
||||
deduped.add(unique_tuple)
|
||||
cron.remove(job)
|
||||
|
|
|
@ -3,6 +3,7 @@ __package__ = 'archivebox'

import re
import requests
import json as pyjson
import http.cookiejar

from typing import List, Optional, Any
from pathlib import Path

@ -56,19 +57,57 @@ short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]

ts_to_date_str = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()

COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')

# https://mathiasbynens.be/demo/url-regex
URL_REGEX = re.compile(
r'(?=('
r'http[s]?://' # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen)
r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols
r'(?=(' +
r'http[s]?://' + # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' + # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' + # or allowed symbols (keep hyphen first to match literal hyphen)
r'|[^\u0000-\u007F])+' + # or allowed unicode bytes
r'[^\]\[<>"\'\s]+' + # stop parsing at these symbols
r'))',
re.IGNORECASE,
re.IGNORECASE | re.UNICODE,
)

COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
def parens_are_matched(string: str, open_char='(', close_char=')'):
"""check that all parentheses in a string are balanced and nested properly"""
count = 0
for c in string:
if c == open_char:
count += 1
elif c == close_char:
count -= 1
if count < 0:
return False
return count == 0

def fix_url_from_markdown(url_str: str) -> str:
"""
cleanup a regex-parsed url that may contain dangling trailing parens from markdown link syntax
helpful to fix URLs parsed from markdown e.g.
input: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def
"""
trimmed_url = url_str

# cut off one trailing character at a time
# until parens are balanced e.g. /a(b)c).x(y)z -> /a(b)c
while not parens_are_matched(trimmed_url):
trimmed_url = trimmed_url[:-1]

# make sure trimmed url is still valid
if re.findall(URL_REGEX, trimmed_url):
return trimmed_url

return url_str

def find_all_urls(urls_str: str):
for url in re.findall(URL_REGEX, urls_str):
yield fix_url_from_markdown(url)


def is_static_file(url: str):
# TODO: the proper way is with MIME type detection + ext, not only extension
|
|||
@enforce_types
|
||||
def download_url(url: str, timeout: int=None) -> str:
|
||||
"""Download the contents of a remote url and return the text"""
|
||||
from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT
|
||||
from .config import (
|
||||
TIMEOUT,
|
||||
CHECK_SSL_VALIDITY,
|
||||
WGET_USER_AGENT,
|
||||
COOKIES_FILE,
|
||||
)
|
||||
timeout = timeout or TIMEOUT
|
||||
response = requests.get(
|
||||
session = requests.Session()
|
||||
|
||||
if COOKIES_FILE and Path(COOKIES_FILE).is_file():
|
||||
cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE)
|
||||
cookie_jar.load(ignore_discard=True, ignore_expires=True)
|
||||
for cookie in cookie_jar:
|
||||
session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
|
||||
|
||||
response = session.get(
|
||||
url,
|
||||
headers={'User-Agent': WGET_USER_AGENT},
|
||||
verify=CHECK_SSL_VALIDITY,
|
||||
|
@ -179,7 +231,11 @@ def download_url(url: str, timeout: int=None) -> str:
|
|||
if encoding is not None:
|
||||
response.encoding = encoding
|
||||
|
||||
return response.text
|
||||
try:
|
||||
return response.text
|
||||
except UnicodeDecodeError:
|
||||
# if response is non-test (e.g. image or other binary files), just return the filename instead
|
||||
return url.rsplit('/', 1)[-1]
|
||||
|
||||
@enforce_types
|
||||
def get_headers(url: str, timeout: int=None) -> str:
|
||||
|
@ -223,7 +279,11 @@ def chrome_args(**options) -> List[str]:
|
|||
|
||||
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
|
||||
|
||||
from .config import CHROME_OPTIONS, CHROME_VERSION
|
||||
from .config import (
|
||||
CHROME_OPTIONS,
|
||||
CHROME_VERSION,
|
||||
CHROME_EXTRA_ARGS,
|
||||
)
|
||||
|
||||
options = {**CHROME_OPTIONS, **options}
|
||||
|
||||
|
@ -232,6 +292,8 @@ def chrome_args(**options) -> List[str]:
|
|||
|
||||
cmd_args = [options['CHROME_BINARY']]
|
||||
|
||||
cmd_args += CHROME_EXTRA_ARGS
|
||||
|
||||
if options['CHROME_HEADLESS']:
|
||||
chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
|
||||
if chrome_major_version >= 111:
|
||||
|
@ -275,8 +337,10 @@ def chrome_args(**options) -> List[str]:
|
|||
|
||||
if options['CHROME_USER_DATA_DIR']:
|
||||
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
|
||||
|
||||
return cmd_args
|
||||
cmd_args.append('--profile-directory=Default')
|
||||
|
||||
return dedupe(cmd_args)
|
||||
|
||||
|
||||
def chrome_cleanup():
|
||||
"""
|
||||
|
@ -313,6 +377,20 @@ def ansi_to_html(text):

return COLOR_REGEX.sub(single_sub, text)


@enforce_types
def dedupe(options: List[str]) -> List[str]:
"""
Deduplicates the given options. Options that come later clobber earlier
conflicting options.
"""
deduped = {}

for option in options:
deduped[option.split('=')[0]] = option

return list(deduped.values())


class AttributeDict(dict):
"""Helper to allow accessing dict values via Example.key or Example['key']"""
@ -359,3 +437,48 @@ class ExtendedEncoder(pyjson.JSONEncoder):

return pyjson.JSONEncoder.default(self, obj)


### URL PARSING TESTS / ASSERTIONS
# they run at runtime because I like having them inline in this file,
# I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
# and these assertions are basically instant, so not a big performance cost to do it on startup

assert fix_url_from_markdown('/a(b)c).x(y)z') == '/a(b)c'
assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'

URL_REGEX_TESTS = [
('https://example.com', ['https://example.com']),
('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),

('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
('<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc', ['https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ']),

('///a', []),
('http://', []),
('http://../', ['http://../']),
('http://-error-.invalid/', ['http://-error-.invalid/']),
('https://a(b)c+1#2?3&4/', ['https://a(b)c+1#2?3&4/']),
('http://उदाहरण.परीक्षा', ['http://उदाहरण.परीक्षा']),
('http://例子.测试', ['http://例子.测试']),
('http://➡.ws/䨹 htps://abc.1243?234', ['http://➡.ws/䨹']),
('http://⌘.ws">https://exa+mple.com//:abc ', ['http://⌘.ws', 'https://exa+mple.com//:abc']),
('http://مثال.إختبار/abc?def=ت&ب=abc#abc=234', ['http://مثال.إختبار/abc?def=ت&ب=abc#abc=234']),
('http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c\'om', ['http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c']),

('http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', ['http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', 'http://ex.co:19/a?_d=4#-a=2.3']),
('http://code.google.com/events/#&product=browser', ['http://code.google.com/events/#&product=browser']),
('http://foo.bar?q=Spaces should be encoded', ['http://foo.bar?q=Spaces']),
('http://foo.com/blah_(wikipedia)#c(i)t[e]-1', ['http://foo.com/blah_(wikipedia)#c(i)t']),
('http://foo.com/(something)?after=parens', ['http://foo.com/(something)?after=parens']),
('http://foo.com/unicode_(✪)_in_parens) abc', ['http://foo.com/unicode_(✪)_in_parens']),
('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),

('[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff', ['http://a.b/?q=(Test)%20U']),
('[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123', ['http://a.b/?q=(Test)%20U', 'https://abc+123']),
('[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3', ['http://a.b/?q=(Test)%20U', 'https://a(b)c+12']),
('[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3', ['http://a.b/?q=(Test)a', 'https://a(b)c+12']),
('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
]
for urls_str, expected_url_matches in URL_REGEX_TESTS:
url_matches = list(find_all_urls(urls_str))
assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'
6
archivebox/vendor/requirements.txt
vendored
Normal file

@ -0,0 +1,6 @@

# this folder contains vendored versions of these packages

atomicwrites==1.4.0
pocket==0.3.7
django-taggit==1.3.0
base32-crockford==0.3.0