From c76c50e71f1a0b894f194263e50f0534dd122ac2 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Thu, 5 Sep 2024 21:41:49 -0700
Subject: [PATCH] add HTTP byte range request support to media file serving

---
 archivebox/cli/__init__.py      |   2 +-
 archivebox/core/serve_static.py | 195 ++++++++++++++++++++++++++++++++
 archivebox/core/urls.py         |  13 +--
 archivebox/core/views.py        |   5 +-
 4 files changed, 207 insertions(+), 8 deletions(-)
 create mode 100644 archivebox/core/serve_static.py

diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index 204267d7..6a0106a0 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -68,7 +68,7 @@ def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: It
         else:
             return tries
 
-    raise Exception('Background threads failed to exit after {tries}s: {threads_summary}')
+    raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
 
 
 def list_subcommands() -> Dict[str, str]:
diff --git a/archivebox/core/serve_static.py b/archivebox/core/serve_static.py
new file mode 100644
index 00000000..15bf1a2f
--- /dev/null
+++ b/archivebox/core/serve_static.py
@@ -0,0 +1,195 @@
+import os
+import stat
+import posixpath
+import mimetypes
+from pathlib import Path
+
+from django.contrib.staticfiles import finders
+from django.views import static
+from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpResponseNotModified
+from django.utils._os import safe_join
+from django.utils.http import http_date
+from django.utils.translation import gettext as _
+
+
+def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False):
+    """
+    Overrides Django's built-in django.views.static.serve function to support byte range requests.
+    This allows you to do things like seek into the middle of a huge mp4 or WACZ without downloading the whole file.
+    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
+
+    Returns 304 when If-Modified-Since is satisfied, 206 for a single satisfiable byte range,
+    416 when the requested range starts past the end of the file, and 200 with the full body
+    otherwise (including for malformed or multi-range Range headers, which are ignored).
+    Raises Http404 if the path does not exist, or is a directory and show_indexes is False.
+    """
+    assert document_root
+    path = posixpath.normpath(path).lstrip("/")
+    fullpath = Path(safe_join(document_root, path))
+    if fullpath.is_dir():
+        if show_indexes:
+            return static.directory_index(path, fullpath)
+        raise Http404(_("Directory indexes are not allowed here."))
+    if not fullpath.exists():
+        raise Http404(_("“%(path)s” does not exist") % {"path": fullpath})
+
+    # Respect the If-Modified-Since header.
+    statobj = fullpath.stat()
+    if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
+        return HttpResponseNotModified()
+
+    content_type, encoding = mimetypes.guess_type(str(fullpath))
+    content_type = content_type or "application/octet-stream"
+
+    # Work out the requested byte range (if any) *before* opening the file, so
+    # that the early 416 return below cannot leak an open file handle.
+    is_regular_file = stat.S_ISREG(statobj.st_mode)
+    size = statobj.st_size
+    byte_range = None
+    if is_regular_file and "HTTP_RANGE" in request.META:
+        ranges = parse_range_header(request.META["HTTP_RANGE"], size)
+        # only handle syntactically valid headers, that are simple (no
+        # multipart byteranges)
+        if ranges is not None and len(ranges) == 1:
+            start, stop = ranges[0]
+            if start >= size:
+                # requested range not satisfiable: nothing to serve past EOF
+                response = HttpResponse(status=416)
+                response["Content-Range"] = "bytes */%d" % size
+                return response
+            # a last-byte-pos beyond EOF means "up to the end of the file"
+            byte_range = (start, min(stop, size))
+
+    # setup response object
+    ranged_file = RangedFileReader(open(fullpath, "rb"))
+    response = StreamingHttpResponse(ranged_file, content_type=content_type)
+    response.headers["Last-Modified"] = http_date(statobj.st_mtime)
+
+    if is_regular_file:
+        response["Content-Length"] = size
+        response["Accept-Ranges"] = "bytes"
+        response["X-Django-Ranges-Supported"] = "1"
+    # handle byte-range requests by serving chunk of file
+    if byte_range is not None:
+        start, stop = byte_range
+        ranged_file.start = start
+        ranged_file.stop = stop
+        response["Content-Range"] = "bytes %d-%d/%d" % (start, stop - 1, size)
+        response["Content-Length"] = stop - start
+        response.status_code = 206
+    if encoding:
+        response.headers["Content-Encoding"] = encoding
+    return response
+
+
+def serve_static(request, path, **kwargs):
+    """
+    Serve static files below a given point in the directory structure or
+    from locations inferred from the staticfiles finders.
+
+    To use, put a URL pattern such as::
+
+        from django.contrib.staticfiles import views
+
+        path('<path:path>', views.serve)
+
+    in your URLconf.
+
+    It uses the django.views.static.serve() view to serve the found files.
+    """
+
+    normalized_path = posixpath.normpath(path).lstrip("/")
+    absolute_path = finders.find(normalized_path)
+    if not absolute_path:
+        if path.endswith("/") or path == "":
+            raise Http404("Directory indexes are not allowed here.")
+        raise Http404("'%s' could not be found" % path)
+    document_root, path = os.path.split(absolute_path)
+    return serve_static_with_byterange_support(request, path, document_root=document_root, **kwargs)
+
+
+def parse_range_header(header, resource_size):
+    """
+    Parses a range header into a list of two-tuples (start, stop) where `start`
+    is the starting byte of the range (inclusive) and `stop` is the ending byte
+    position of the range (exclusive).
+    Returns None if the value of the header is not syntactically valid, or if
+    any range in it is empty or inverted (start >= stop). `stop` is not clamped
+    to `resource_size`; an explicit last-byte-pos past the end is returned as-is.
+    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
+    """
+    if not header or "=" not in header:
+        return None
+
+    ranges = []
+    units, range_ = header.split("=", 1)
+    units = units.strip().lower()
+
+    if units != "bytes":
+        return None
+
+    for val in range_.split(","):
+        val = val.strip()
+        if "-" not in val:
+            return None
+
+        try:
+            if val.startswith("-"):
+                # suffix-byte-range-spec: this form specifies the last N bytes of an
+                # entity-body
+                start = max(resource_size + int(val), 0)
+                stop = resource_size
+            else:
+                # byte-range-spec: first-byte-pos "-" [last-byte-pos]
+                first, last = val.split("-", 1)
+                start = int(first)
+                # the +1 is here since we want the stopping point to be exclusive, whereas in
+                # the HTTP spec, the last-byte-pos is inclusive
+                stop = int(last) + 1 if last else resource_size
+        except ValueError:
+            # non-numeric byte positions (e.g. "bytes=a-b" or a bare "-") are invalid
+            return None
+        if start >= stop:
+            return None
+
+        ranges.append((start, stop))
+
+    return ranges
+
+
+class RangedFileReader:
+    """
+    Wraps a file like object with an iterator that runs over part (or all) of
+    the file defined by start and stop. Blocks of block_size will be returned
+    from the starting position, up to, but not including the stop point.
+    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
+    """
+
+    block_size = 8192
+
+    def __init__(self, file_like, start=0, stop=float("inf"), block_size=None):
+        self.f = file_like
+        self.block_size = block_size or RangedFileReader.block_size
+        self.start = start
+        self.stop = stop
+
+    def __iter__(self):
+        self.f.seek(self.start)
+        position = self.start
+        while position < self.stop:
+            data = self.f.read(min(self.block_size, self.stop - position))
+            if not data:
+                break
+
+            yield data
+            # advance by what was actually read: read() may return fewer bytes
+            # than requested, and the final block is usually a partial one
+            position += len(data)
+
+    def close(self):
+        """
+        Closes the wrapped file. StreamingHttpResponse calls close() on its
+        content iterator when the response is finished, so the file handle is
+        released deterministically instead of whenever it is garbage collected.
+        """
+        self.f.close()
diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py
index ab9bd275..266dace0 100644
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@@ -1,14 +1,13 @@
 __package__ = 'archivebox.core'
 
-from django.urls import path, include
+from django.urls import path, re_path, include
 from django.views import static
-from django.contrib.staticfiles.urls import staticfiles_urlpatterns
 from django.conf import settings
 from django.views.generic.base import RedirectView
 
 from .admin import archivebox_admin
 from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
-
+from .serve_static import serve_static
 
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
 # from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
@@ -18,13 +17,16 @@ from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthC
 # print('DEBUG', settings.DEBUG)
 
 urlpatterns = [
-    path('public/', PublicIndexView.as_view(), name='public-index'),
+    re_path(r"^static/(?P<path>.*)$", serve_static),
+    # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
 
     path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
     path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
 
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
 
+    path('public/', PublicIndexView.as_view(), name='public-index'),
+
     path('archive/', RedirectView.as_view(url='/')),
     path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
 
@@ -41,7 +43,7 @@ urlpatterns = [
     path("api/", include('api.urls'), name='api'),
 
     path('health/', HealthCheckView.as_view(), name='healthcheck'),
-    path('error/', lambda *_: 1/0),
+    path('error/', lambda *_: 1/0),  # type: ignore
 
     # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
 
@@ -49,7 +51,6 @@ urlpatterns = [
     path('index.json', static.serve, {'document_root': settings.CONFIG.OUTPUT_DIR, 'path': 'index.json'}),
     path('', HomepageView.as_view(), name='Home'),
 ]
-urlpatterns += staticfiles_urlpatterns()
 
 if settings.DEBUG_TOOLBAR:
     urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 89082ace..ec084e99 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -46,6 +46,7 @@ from ..main import add
 from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
 from ..search import query_search_index
 from ..extractors.wget import wget_output_path
+from .serve_static import serve_static_with_byterange_support
 
 
 class HomepageView(View):
@@ -197,7 +198,9 @@ class SnapshotView(View):
                         # if they requested snapshot index, serve live rendered template instead of static html
                         response = self.render_live_index(request, snapshot)
                     else:
-                        response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
+                        response = serve_static_with_byterange_support(
+                            request, archivefile, document_root=snapshot.link_dir, show_indexes=True,
+                        )
                     response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                     return response
                 except Snapshot.DoesNotExist: