Merge branch 'dev' into plugins-browsertrix

2025-05-23 03:06:55 -04:00 · 2024-04-24 16:29:36 -07:00 · 2024-04-24 16:29:36 -07:00 · 33e82736f9
commit 33e82736f9
parent b5311d2c57 8e9cfc8869
33 changed files with 3943 additions and 595 deletions
--- a/archivebox/api/init.py
+++ b/archivebox/api/init.py
--- a/archivebox/api/apps.py
+++ b/archivebox/api/apps.py
@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class APIConfig(AppConfig):
+    """Django application configuration for the ``api`` app."""
+    name = 'api'
--- a/archivebox/api/archive.py
+++ b/archivebox/api/archive.py
@ -0,0 +1,184 @@
+# archive.py: API endpoints exposing the add/remove/update/list_all functions from main.py
+from typing import List, Optional
+from enum import Enum
+from pydantic import BaseModel
+from ninja import Router
+from main import (
+    add,
+    remove,
+    update,
+    list_all,
+    ONLY_NEW,
+)  # Assuming these functions are defined in main.py
+
+
+# Schemas
+
+# Closed set of values accepted by the optional `status` field of
+# UpdateSchema and ListAllSchema. Subclassing `str` makes each member
+# compare equal to its plain string value.
+class StatusChoices(str, Enum):
+    indexed = 'indexed'
+    archived = 'archived'
+    unarchived = 'unarchived'
+    present = 'present'
+    valid = 'valid'
+    invalid = 'invalid'
+    duplicate = 'duplicate'
+    orphaned = 'orphaned'
+    corrupted = 'corrupted'
+    unrecognized = 'unrecognized'
+
+
+# Request body for POST /add. api_add() forwards every field to add()
+# under the same keyword name; only `urls` is required.
+class AddURLSchema(BaseModel):
+    urls: List[str]
+    tag: str = ""
+    depth: int = 0
+    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
+    update_all: bool = False
+    index_only: bool = False
+    overwrite: bool = False
+    init: bool = False
+    extractors: str = ""
+    parser: str = "auto"
+
+
+# Request body for POST /remove. api_remove() forwards every field to
+# remove() under the same keyword name; all fields are optional.
+class RemoveURLSchema(BaseModel):
+    yes: bool = False
+    delete: bool = False
+    before: Optional[float] = None
+    after: Optional[float] = None
+    filter_type: str = "exact"
+    filter_patterns: Optional[List[str]] = None
+
+
+# Request body for POST /update. api_update() forwards every field to
+# update() under the same keyword name; all fields are optional.
+class UpdateSchema(BaseModel):
+    resume: Optional[float] = None
+    only_new: Optional[bool] = None
+    index_only: Optional[bool] = False
+    overwrite: Optional[bool] = False
+    before: Optional[float] = None
+    after: Optional[float] = None
+    status: Optional[StatusChoices] = None
+    filter_type: Optional[str] = 'exact'
+    filter_patterns: Optional[List[str]] = None
+    extractors: Optional[str] = ""
+
+
+# Request body for POST /list_all. api_list_all() forwards every field to
+# list_all() under the same keyword name, then inspects `json`, `html` and
+# `csv` (in that order) to label the format of the response.
+# NOTE(review): the `json` field shares its name with a pydantic BaseModel
+# method -- confirm the installed pydantic version accepts this field name.
+class ListAllSchema(BaseModel):
+    filter_patterns: Optional[List[str]] = None
+    filter_type: str = 'exact'
+    status: Optional[StatusChoices] = None
+    after: Optional[float] = None
+    before: Optional[float] = None
+    sort: Optional[str] = None
+    csv: Optional[str] = None
+    json: bool = False
+    html: bool = False
+    with_headers: bool = False
+
+
+# API Router
+router = Router()
+
+
+@router.post("/add", response={200: dict})
+def api_add(request, payload: AddURLSchema):
+    # Hand the validated request body to add() and wrap the outcome in a
+    # status envelope; any exception is reported in-band as an "error" status.
+    try:
+        added = add(
+            urls=payload.urls,
+            tag=payload.tag,
+            depth=payload.depth,
+            update=payload.update,
+            update_all=payload.update_all,
+            index_only=payload.index_only,
+            overwrite=payload.overwrite,
+            init=payload.init,
+            extractors=payload.extractors,
+            parser=payload.parser,
+        )
+        # Currently the add function returns a list of ALL items in the DB, ideally only return new items
+        response = {
+            "status": "success",
+            "message": "URLs added successfully.",
+            "result": str(added),
+        }
+    except Exception as err:
+        # Covers failures raised by add() as well as by building the response
+        response = {"status": "error", "message": str(err)}
+    return response
+
+
+@router.post("/remove", response={200: dict})
+def api_remove(request, payload: RemoveURLSchema):
+    # Hand the validated request body to remove() and wrap the outcome in a
+    # status envelope; any exception is reported in-band as an "error" status.
+    try:
+        removed = remove(
+            yes=payload.yes,
+            delete=payload.delete,
+            before=payload.before,
+            after=payload.after,
+            filter_type=payload.filter_type,
+            filter_patterns=payload.filter_patterns,
+        )
+        response = {
+            "status": "success",
+            "message": "URLs removed successfully.",
+            "result": removed,
+        }
+    except Exception as err:
+        # Covers failures raised by remove() or during processing
+        response = {"status": "error", "message": str(err)}
+    return response
+
+
+@router.post("/update", response={200: dict})
+def api_update(request, payload: UpdateSchema):
+    # Hand the validated request body to update() and wrap the outcome in a
+    # status envelope; any exception is reported in-band as an "error" status.
+    try:
+        updated = update(
+            resume=payload.resume,
+            only_new=payload.only_new,
+            index_only=payload.index_only,
+            overwrite=payload.overwrite,
+            before=payload.before,
+            after=payload.after,
+            status=payload.status,
+            filter_type=payload.filter_type,
+            filter_patterns=payload.filter_patterns,
+            extractors=payload.extractors,
+        )
+        response = {
+            "status": "success",
+            "message": "Archive updated successfully.",
+            "result": updated,
+        }
+    except Exception as err:
+        # Covers failures raised by update() or during processing
+        response = {"status": "error", "message": str(err)}
+    return response
+
+
+@router.post("/list_all", response={200: dict})
+def api_list_all(request, payload: ListAllSchema):
+    # Hand the validated request body to list_all() and label the response with
+    # the first requested output format (json, then html, then csv).
+    try:
+        listing = list_all(
+            filter_patterns=payload.filter_patterns,
+            filter_type=payload.filter_type,
+            status=payload.status,
+            after=payload.after,
+            before=payload.before,
+            sort=payload.sort,
+            csv=payload.csv,
+            json=payload.json,
+            html=payload.html,
+            with_headers=payload.with_headers,
+        )
+        # TODO: This is kind of bad, make the format a choice field
+        requested_formats = (
+            ("json", payload.json),
+            ("html", payload.html),
+            ("csv", payload.csv),
+        )
+        for format_name, requested in requested_formats:
+            if requested:
+                return {"status": "success", "format": format_name, "data": listing}
+
+        # no explicit format was requested
+        return {
+            "status": "success",
+            "message": "List generated successfully.",
+            "data": listing,
+        }
+    except Exception as err:
+        # Covers failures raised by list_all() or during processing
+        return {"status": "error", "message": str(err)}
--- a/archivebox/api/auth.py
+++ b/archivebox/api/auth.py
@ -0,0 +1,48 @@
+from django.contrib.auth import authenticate
+from ninja import Form, Router, Schema
+from ninja.security import HttpBearer
+
+from api.models import Token
+
+router = Router()
+
+
+class GlobalAuth(HttpBearer):
+    """Bearer-token authentication backed by the ``Token`` model."""
+
+    def authenticate(self, request, token):
+        """
+        Return the user that owns ``token``, or ``None`` when the token is
+        unknown or its ``expiry`` time has already passed.
+        """
+        # imported locally so this block does not rely on a module-level import
+        from django.utils import timezone
+
+        try:
+            token_obj = Token.objects.get(token=token)
+        except Token.DoesNotExist:
+            return None
+
+        # a token whose expiry is unset never expires
+        if token_obj.expiry is not None and token_obj.expiry <= timezone.now():
+            return None
+
+        return token_obj.user
+
+
+# Request body for POST /authenticate. get_token() passes `email` to Django's
+# authenticate() as the `username` argument.
+class AuthSchema(Schema):
+    email: str
+    password: str
+
+
+@router.post("/authenticate", auth=None)  # overriding global auth
+def get_token(request, auth_data: AuthSchema):
+    # Exchange a username/password pair for a newly created API token.
+    user = authenticate(username=auth_data.email, password=auth_data.password)
+    if not user:
+        return {"error": "Invalid credentials"}
+
+    # Assuming a user can have multiple tokens and you want to create a new one every time
+    issued = Token.objects.create(user=user)
+    return {"token": issued.token, "expires": issued.expiry_as_iso8601}
+
+
+# Request body for POST /validate_token: the token string to be checked.
+class TokenValidationSchema(Schema):
+    token: str
+
+
+@router.post("/validate_token", auth=None) # No authentication required for this endpoint
+def validate_token(request, token_data: TokenValidationSchema):
+    # Report whether the submitted token currently authenticates a user.
+    # GlobalAuth.authenticate() already turns an unknown token into None, so
+    # Token.DoesNotExist can never reach this function and needs no handler.
+    user = GlobalAuth().authenticate(request, token_data.token)
+    return {"status": "valid" if user else "invalid"}
--- a/archivebox/api/migrations/0001_initial.py
+++ b/archivebox/api/migrations/0001_initial.py
@ -0,0 +1,28 @@
+# Generated by Django 3.1.14 on 2024-04-09 18:52
+
+import api.models
+from django.conf import settings
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    """Initial migration for the ``api`` app: creates the ``Token`` table."""
+
+    initial = True
+
+    dependencies = [
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Token',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                # hex_uuid is defined in api/models.py, which is the module this file imports
+                ('token', models.CharField(default=api.models.hex_uuid, max_length=32, unique=True)),
+                ('created', models.DateTimeField(auto_now_add=True)),
+                ('expiry', models.DateTimeField(blank=True, null=True)),
+                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tokens', to=settings.AUTH_USER_MODEL)),
+            ],
+        ),
+    ]
--- a/archivebox/api/migrations/init.py
+++ b/archivebox/api/migrations/init.py
--- a/archivebox/api/models.py
+++ b/archivebox/api/models.py
@ -0,0 +1,30 @@
+import uuid
+from datetime import timedelta
+
+from django.conf import settings
+from django.db import models
+from django.utils import timezone
+from django.utils.translation import gettext_lazy as _
+
+def hex_uuid():
+    """Return a freshly generated random UUID as a 32-character hex string."""
+    fresh_uuid = uuid.uuid4()
+    return fresh_uuid.hex
+
+
+class Token(models.Model):
+    """API access token owned by a user, with an optional expiry time."""
+
+    user = models.ForeignKey(
+        settings.AUTH_USER_MODEL,
+        on_delete=models.CASCADE,
+        related_name="tokens",
+    )
+    token = models.CharField(max_length=32, default=hex_uuid, unique=True)
+    created = models.DateTimeField(auto_now_add=True)
+    expiry = models.DateTimeField(null=True, blank=True)
+
+    @property
+    def expiry_as_iso8601(self):
+        """Expiry time as an ISO 8601 string; 100 years from now when no expiry is set."""
+        if self.expiry:
+            return self.expiry.isoformat()
+
+        far_future = timezone.now() + timedelta(days=365 * 100)
+        return far_future.isoformat()
+
+    def __str__(self):
+        return self.token
--- a/archivebox/api/tests.py
+++ b/archivebox/api/tests.py
@ -0,0 +1,27 @@
+from django.test import TestCase
+from ninja.testing import TestClient
+from archivebox.api.archive import router as archive_router
+
+class ArchiveBoxAPITestCase(TestCase):
+    """Smoke tests: every archive endpoint answers HTTP 200 with a "success" status."""
+
+    def setUp(self):
+        self.client = TestClient(archive_router)
+
+    def _assert_post_succeeds(self, path, payload):
+        """POST ``payload`` to ``path`` and require a 200 response whose status is exactly "success"."""
+        response = self.client.post(path, json=payload)
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.json()["status"], "success")
+
+    def test_add_endpoint(self):
+        self._assert_post_succeeds("/add", {"urls": ["http://example.com"], "tag": "test"})
+
+    def test_remove_endpoint(self):
+        self._assert_post_succeeds("/remove", {"filter_patterns": ["http://example.com"]})
+
+    def test_update_endpoint(self):
+        self._assert_post_succeeds("/update", {})
+
+    def test_list_all_endpoint(self):
+        # exact comparison, matching the other endpoint tests
+        self._assert_post_succeeds("/list_all", {})
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@ -68,6 +68,7 @@ INSTALLED_APPS = [


    'core',
+    'api',
    
    # Plugins
    
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@ -8,6 +8,18 @@ from django.views.generic.base import RedirectView

 from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView

+from ninja import NinjaAPI
+from api.auth import GlobalAuth
+
+# GlobalAuth is installed as the API-wide auth handler; individual routes
+# opt out by passing auth=None (see api/auth.py).
+api = NinjaAPI(auth=GlobalAuth())
+# Routers are given as dotted-path strings rather than imported objects.
+api.add_router("/auth/", "api.auth.router")
+api.add_router("/archive/", "api.archive.router")
+
+# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
+# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
+# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
+
+
 # print('DEBUG', settings.DEBUG)

 urlpatterns = [
@ -35,6 +47,8 @@ urlpatterns = [
    path('accounts/', include('django.contrib.auth.urls')),
    path('admin/', archivebox_admin.urls),
    
+    path("api/", api.urls),
+
    path('health/', HealthCheckView.as_view(), name='healthcheck'),
    path('error/', lambda _: 1/0),

--- a/archivebox/index.sqlite3
+++ b/archivebox/index.sqlite3
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@ -494,12 +494,12 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
    if delete:
        file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
        print(
-            f'    {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
+            f'    {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' +
            f'    ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
        )
    else:
        print(
-            '    Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
+            '    Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' +
            '    (Pass --delete if you also want to permanently delete the data folders)'
        )

@ -638,17 +638,15 @@ def printable_folder_status(name: str, folder: Dict) -> str:

@enforce_types
 def printable_dependency_version(name: str, dependency: Dict) -> str:
-    version = None
+    color, symbol, note, version = 'red', 'X', 'invalid', '?'
+
    if dependency['enabled']:
        if dependency['is_valid']:
-            color, symbol, note, version = 'green', '√', 'valid', ''
+            color, symbol, note = 'green', '√', 'valid'

            parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
            if parsed_version_num:
                version = f'v{parsed_version_num[0]}'
-
-        if not version:
-            color, symbol, note, version = 'red', 'X', 'invalid', '?'
    else:
        color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'

--- a/archivebox/package-lock.json
+++ b/archivebox/package-lock.json
--- a/archivebox/package.json
+++ b/archivebox/package.json
@ -1,6 +1,6 @@
 {
  "name": "archivebox",
-  "version": "0.7.3",
+  "version": "0.8.0",
  "description": "ArchiveBox: The self-hosted internet archive",
  "author": "Nick Sweeting <archivebox-npm@sweeting.me>",
  "repository": "github:ArchiveBox/ArchiveBox",
@ -8,6 +8,6 @@
  "dependencies": {
    "@postlight/parser": "^2.2.3",
    "readability-extractor": "github:ArchiveBox/readability-extractor",
-    "single-file-cli": "^1.1.46"
+    "single-file-cli": "^1.1.54"
  }
 }
--- a/archivebox/util.py
+++ b/archivebox/util.py
@ -57,19 +57,57 @@ short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
 ts_to_date_str = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
 ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()

+COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')

+
+# https://mathiasbynens.be/demo/url-regex
 URL_REGEX = re.compile(
-    r'(?=('
-    r'http[s]?://'                    # start matching from allowed schemes
-    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
-    r'|[-_$@.&+!*\(\),]'           #    or allowed symbols (keep hyphen first to match literal hyphen)
-    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  #    or allowed unicode bytes
-    r'[^\]\[\(\)<>"\'\s]+'          # stop parsing at these symbols
+    r'(?=('                           +
+    r'http[s]?://'                    +  # start matching from allowed schemes
+    r'(?:[a-zA-Z]|[0-9]'              +  # followed by allowed alphanum characters
+    r'|[-_$@.&+!*\(\),]'              +  #   or allowed symbols (keep hyphen first to match literal hyphen)
+    r'|[^\u0000-\u007F])+'            +  #   or allowed unicode bytes
+    r'[^\]\[<>"\'\s]+'                +  # stop parsing at these symbols
    r'))',
-    re.IGNORECASE,
+    re.IGNORECASE | re.UNICODE,
 )

-COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
+def parens_are_matched(string: str, open_char='(', close_char=')'):
+    """Return True when every open_char in string is closed by a later close_char, with no stray closers."""
+    depth = 0
+    for char in string:
+        if char == open_char:
+            depth += 1
+            continue
+        if char == close_char:
+            depth -= 1
+            # a closer with nothing left to close can never be repaired later
+            if depth < 0:
+                return False
+    return depth == 0
+
+def fix_url_from_markdown(url_str: str) -> str:
+    """
+    strip dangling trailing parens (and anything after them) that leak into a
+    regex-parsed url from markdown link syntax, e.g.
+      input:  https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
+      result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def
+    the input is returned unchanged if the shortened form no longer matches URL_REGEX
+    """
+    candidate = url_str
+
+    # drop trailing characters one by one until the parens balance,
+    # e.g. /a(b)c).x(y)z -> /a(b)c
+    while not parens_are_matched(candidate):
+        candidate = candidate[:-1]
+
+    # only accept the shortened form if it still looks like a url
+    still_valid = URL_REGEX.search(candidate) is not None
+    return candidate if still_valid else url_str
+
+def find_all_urls(urls_str: str):
+    """Lazily yield every URL_REGEX match in urls_str, each cleaned up by fix_url_from_markdown."""
+    raw_matches = URL_REGEX.findall(urls_str)
+    yield from map(fix_url_from_markdown, raw_matches)
+

 def is_static_file(url: str):
    # TODO: the proper way is with MIME type detection + ext, not only extension
@ -403,3 +441,48 @@ class ExtendedEncoder(pyjson.JSONEncoder):

        return pyjson.JSONEncoder.default(self, obj)

+
+### URL PARSING TESTS / ASSERTIONS
+# they run at runtime because I like having them inline in this file,
+# I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
+# and these assertions are basically instant, so not a big performance cost to do it on startup
+
+# the sample inputs must be full http(s) urls: fix_url_from_markdown() only returns the
+# trimmed form when it still matches URL_REGEX, otherwise it returns its input unchanged
+assert fix_url_from_markdown('http://example.com/a(b)c).x(y)z') == 'http://example.com/a(b)c'
+assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'
+
+URL_REGEX_TESTS = [
+    ('https://example.com', ['https://example.com']),
+    ('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),
+
+    ('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
+    ('<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc', ['https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ']),
+
+    ('///a',                                                []),
+    ('http://',                                             []),
+    ('http://../',                                          ['http://../']),
+    ('http://-error-.invalid/',                             ['http://-error-.invalid/']),
+    ('https://a(b)c+1#2?3&4/',                              ['https://a(b)c+1#2?3&4/']),
+    ('http://उदाहरण.परीक्षा',                                   ['http://उदाहरण.परीक्षा']),
+    ('http://例子.测试',                                     ['http://例子.测试']),
+    ('http://➡.ws/䨹 htps://abc.1243?234',                  ['http://➡.ws/䨹']),
+    ('http://⌘.ws">https://exa+mple.com//:abc ',            ['http://⌘.ws', 'https://exa+mple.com//:abc']),
+    ('http://مثال.إختبار/abc?def=ت&ب=abc#abc=234',          ['http://مثال.إختبار/abc?def=ت&ب=abc#abc=234']),
+    ('http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c\'om', ['http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c']),
+    
+    ('http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', ['http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', 'http://ex.co:19/a?_d=4#-a=2.3']),
+    ('http://code.google.com/events/#&product=browser',     ['http://code.google.com/events/#&product=browser']),
+    ('http://foo.bar?q=Spaces should be encoded',           ['http://foo.bar?q=Spaces']),
+    ('http://foo.com/blah_(wikipedia)#c(i)t[e]-1',          ['http://foo.com/blah_(wikipedia)#c(i)t']),
+    ('http://foo.com/(something)?after=parens',             ['http://foo.com/(something)?after=parens']),
+    ('http://foo.com/unicode_(✪)_in_parens) abc',           ['http://foo.com/unicode_(✪)_in_parens']),
+    ('http://foo.bar/?q=Test%20URL-encoded%20stuff',        ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
+
+    ('[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff',   ['http://a.b/?q=(Test)%20U']),
+    ('[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123',  ['http://a.b/?q=(Test)%20U', 'https://abc+123']),
+    ('[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3',  ['http://a.b/?q=(Test)%20U', 'https://a(b)c+12']),
+    ('[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3',  ['http://a.b/?q=(Test)a', 'https://a(b)c+12']),
+    ('http://foo.bar/?q=Test%20URL-encoded%20stuff',        ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
+]
+# run every sample through find_all_urls() and report the offending case on failure
+for urls_str, expected_url_matches in URL_REGEX_TESTS:
+    url_matches = list(find_all_urls(urls_str))
+    assert url_matches == expected_url_matches, (
+        f'FAILED URL_REGEX CHECK! input={urls_str!r} expected={expected_url_matches!r} got={url_matches!r}'
+    )
--- a/archivebox/vendor/requirements.txt
+++ b/archivebox/vendor/requirements.txt
@ -0,0 +1,6 @@
+# this folder contains vendored versions of these packages
+
+atomicwrites==1.4.0
+pocket==0.3.7
+django-taggit==1.3.0
+base32-crockford==0.3.0