mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
cache dir size, snapshot icons, tags str, and title in django cache
This commit is contained in:
parent
51440ede3a
commit
8b236b9367
3 changed files with 83 additions and 69 deletions
|
@@ -5,9 +5,11 @@ import uuid
|
||||||
from django.db import models, transaction
|
from django.db import models, transaction
|
||||||
from django.utils.functional import cached_property
|
from django.utils.functional import cached_property
|
||||||
from django.utils.text import slugify
|
from django.utils.text import slugify
|
||||||
|
from django.core.cache import cache
|
||||||
from django.db.models import Case, When, Value, IntegerField
|
from django.db.models import Case, When, Value, IntegerField
|
||||||
|
|
||||||
from ..config import ARCHIVE_DIR
|
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
|
||||||
|
from ..system import get_dir_size
|
||||||
from ..util import parse_date, base_url, hashurl
|
from ..util import parse_date, base_url, hashurl
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
|
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
|
||||||
|
@@ -111,7 +113,9 @@ class Snapshot(models.Model):
|
||||||
return load_link_details(self.as_link())
|
return load_link_details(self.as_link())
|
||||||
|
|
||||||
def tags_str(self) -> str:
|
def tags_str(self) -> str:
|
||||||
return ','.join(self.tags.order_by('name').values_list('name', flat=True))
|
cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
|
||||||
|
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
|
||||||
|
return cache.get_or_set(cache_key, calc_tags_str)
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def bookmarked(self):
|
def bookmarked(self):
|
||||||
|
@@ -148,10 +152,15 @@ class Snapshot(models.Model):
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def archive_size(self):
|
def archive_size(self):
|
||||||
try:
|
cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
|
||||||
return get_dir_size(self.link_dir)[0]
|
|
||||||
except Exception:
|
def calc_dir_size():
|
||||||
return 0
|
try:
|
||||||
|
return get_dir_size(self.link_dir)[0]
|
||||||
|
except Exception:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
return cache.get_or_set(cache_key, calc_dir_size)
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def history(self):
|
def history(self):
|
||||||
|
|
|
@@ -6,6 +6,7 @@ from collections import defaultdict
|
||||||
from typing import List, Optional, Iterator, Mapping
|
from typing import List, Optional, Iterator, Mapping
|
||||||
|
|
||||||
from django.utils.html import format_html, mark_safe
|
from django.utils.html import format_html, mark_safe
|
||||||
|
from django.core.cache import cache
|
||||||
|
|
||||||
from .schema import Link
|
from .schema import Link
|
||||||
from ..system import atomic_write
|
from ..system import atomic_write
|
||||||
|
@@ -115,74 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
|
||||||
|
|
||||||
|
|
||||||
def snapshot_icons(snapshot) -> str:
|
def snapshot_icons(snapshot) -> str:
|
||||||
from core.models import EXTRACTORS
|
cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
|
||||||
# start = datetime.now()
|
|
||||||
|
def calc_snapshot_icons():
|
||||||
|
from core.models import EXTRACTORS
|
||||||
|
# start = datetime.now()
|
||||||
|
|
||||||
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
|
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
|
||||||
link = snapshot.as_link()
|
link = snapshot.as_link()
|
||||||
path = link.archive_path
|
path = link.archive_path
|
||||||
canon = link.canonical_outputs()
|
canon = link.canonical_outputs()
|
||||||
output = ""
|
output = ""
|
||||||
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
|
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
|
||||||
icons = {
|
icons = {
|
||||||
"singlefile": "❶",
|
"singlefile": "❶",
|
||||||
"wget": "🆆",
|
"wget": "🆆",
|
||||||
"dom": "🅷",
|
"dom": "🅷",
|
||||||
"pdf": "📄",
|
"pdf": "📄",
|
||||||
"screenshot": "💻",
|
"screenshot": "💻",
|
||||||
"media": "📼",
|
"media": "📼",
|
||||||
"git": "🅶",
|
"git": "🅶",
|
||||||
"archive_org": "🏛",
|
"archive_org": "🏛",
|
||||||
"readability": "🆁",
|
"readability": "🆁",
|
||||||
"mercury": "🅼",
|
"mercury": "🅼",
|
||||||
"warc": "📦"
|
"warc": "📦"
|
||||||
}
|
}
|
||||||
exclude = ["favicon", "title", "headers", "archive_org"]
|
exclude = ["favicon", "title", "headers", "archive_org"]
|
||||||
# Missing specific entry for WARC
|
# Missing specific entry for WARC
|
||||||
|
|
||||||
extractor_outputs = defaultdict(lambda: None)
|
extractor_outputs = defaultdict(lambda: None)
|
||||||
for extractor, _ in EXTRACTORS:
|
for extractor, _ in EXTRACTORS:
|
||||||
for result in archive_results:
|
for result in archive_results:
|
||||||
if result.extractor == extractor and result:
|
if result.extractor == extractor and result:
|
||||||
extractor_outputs[extractor] = result
|
extractor_outputs[extractor] = result
|
||||||
|
|
||||||
for extractor, _ in EXTRACTORS:
|
for extractor, _ in EXTRACTORS:
|
||||||
if extractor not in exclude:
|
if extractor not in exclude:
|
||||||
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
||||||
# Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
|
# Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
|
||||||
# if existing:
|
# if existing:
|
||||||
# existing = (Path(path) / existing)
|
# existing = (Path(path) / existing)
|
||||||
# if existing.is_file():
|
# if existing.is_file():
|
||||||
# existing = True
|
# existing = True
|
||||||
# elif existing.is_dir():
|
# elif existing.is_dir():
|
||||||
# existing = any(existing.glob('*.*'))
|
# existing = any(existing.glob('*.*'))
|
||||||
output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
|
output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
|
||||||
extractor, icons.get(extractor, "?"))
|
extractor, icons.get(extractor, "?"))
|
||||||
if extractor == "wget":
|
if extractor == "wget":
|
||||||
# warc isn't technically it's own extractor, so we have to add it after wget
|
# warc isn't technically it's own extractor, so we have to add it after wget
|
||||||
|
|
||||||
# get from db (faster but less thurthful)
|
# get from db (faster but less thurthful)
|
||||||
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
||||||
# get from filesystem (slower but more accurate)
|
# get from filesystem (slower but more accurate)
|
||||||
# exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
|
# exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
|
||||||
output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
|
output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
|
||||||
|
|
||||||
if extractor == "archive_org":
|
if extractor == "archive_org":
|
||||||
# The check for archive_org is different, so it has to be handled separately
|
# The check for archive_org is different, so it has to be handled separately
|
||||||
|
|
||||||
# get from db (faster)
|
# get from db (faster)
|
||||||
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
||||||
# get from filesystem (slower)
|
# get from filesystem (slower)
|
||||||
# target_path = Path(path) / "archive.org.txt"
|
# target_path = Path(path) / "archive.org.txt"
|
||||||
# exists = target_path.exists()
|
# exists = target_path.exists()
|
||||||
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
|
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
|
||||||
"archive_org", icons.get("archive_org", "?"))
|
"archive_org", icons.get("archive_org", "?"))
|
||||||
|
|
||||||
result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
|
result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
|
||||||
# end = datetime.now()
|
# end = datetime.now()
|
||||||
# print(((end - start).total_seconds()*1000) // 1, 'ms')
|
# print(((end - start).total_seconds()*1000) // 1, 'ms')
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# return cache.get_or_set(cache_key, calc_snapshot_icons)
|
return cache.get_or_set(cache_key, calc_snapshot_icons)
|
||||||
|
# return calc_snapshot_icons()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union
|
||||||
|
|
||||||
from dataclasses import dataclass, asdict, field, fields
|
from dataclasses import dataclass, asdict, field, fields
|
||||||
|
|
||||||
|
from django.utils.functional import cached_property
|
||||||
|
|
||||||
from ..system import get_dir_size
|
from ..system import get_dir_size
|
||||||
|
|
||||||
|
@@ -133,7 +134,6 @@ class Link:
|
||||||
updated: Optional[datetime] = None
|
updated: Optional[datetime] = None
|
||||||
schema: str = 'Link'
|
schema: str = 'Link'
|
||||||
|
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
return f'[{self.timestamp}] {self.url} "{self.title}"'
|
return f'[{self.timestamp}] {self.url} "{self.title}"'
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue