Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 06:34:25 -04:00)

feat: Refactor add method to use querysets

commit be520d137a (parent 6a2e6aad2f)
4 changed files with 71 additions and 44 deletions
@@ -9,6 +9,7 @@ from ..index.schema import Link
 from ..index import (
     load_link_details,
     write_link_details,
+    write_main_index,
 )
 from ..util import enforce_types
 from ..logging_util import (
@@ -128,24 +129,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
 
     return link
 
 
 @enforce_types
-def archive_links(links: List[Link], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
-    if not links:
+def archive_links(all_links: any, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
+    if type(all_links) is list:
+        num_links: int = len(all_links)
+        get_link = lambda x: x
+        get_iter = lambda x: x
+    else:
+        num_links: int = all_links.count()
+        get_link = lambda x: x.as_link()
+        get_iter = lambda x: x.iterator()
+
+    if num_links == 0:
         return []
 
-    log_archiving_started(len(links))
+    log_archiving_started(num_links)
     idx: int = 0
-    link: Link = links[0]
     try:
-        for idx, link in enumerate(links):
-            archive_link(link, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
+        for link in get_iter(all_links):
+            idx += 1
+            to_archive = get_link(link)
+            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
     except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp)
+        log_archiving_paused(num_links, idx, link.timestamp)
         raise SystemExit(0)
     except BaseException:
         print()
         raise
 
-    log_archiving_finished(len(links))
-    return links
+    log_archiving_finished(num_links)
+    return all_links
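How the refactored archive_links treats a plain list of Links and a Django QuerySet of Snapshots the same way can be shown without Django at all. The sketch below is purely illustrative: FakeLink, FakeSnapshot, and FakeQuerySet are hypothetical stand-ins, and only their count()/iterator()/as_link() surface mirrors what the real code relies on.

```python
# Illustrative sketch of the list-vs-QuerySet dispatch used by archive_links.
# FakeLink/FakeSnapshot/FakeQuerySet are stand-ins for the real objects; only
# their count()/iterator()/as_link() surface matches what the code touches.
from dataclasses import dataclass
from typing import Any, List


@dataclass
class FakeLink:
    url: str


class FakeSnapshot:
    def __init__(self, url: str) -> None:
        self.url = url

    def as_link(self) -> FakeLink:   # Snapshot rows convert to Link objects
        return FakeLink(url=self.url)


class FakeQuerySet:
    def __init__(self, rows: List[FakeSnapshot]) -> None:
        self._rows = rows

    def count(self) -> int:          # QuerySet.count() asks the DB for a count
        return len(self._rows)

    def iterator(self):              # QuerySet.iterator() streams rows lazily
        return iter(self._rows)


def describe(all_links: Any) -> List[str]:
    # Same shape as the refactored archive_links: normalize both input kinds
    # behind get_link/get_iter so the loop body stays identical.
    if type(all_links) is list:
        num_links = len(all_links)
        get_link = lambda x: x
        get_iter = lambda x: x
    else:
        num_links = all_links.count()
        get_link = lambda x: x.as_link()
        get_iter = lambda x: x.iterator()

    print(f'archiving {num_links} links')
    return [get_link(item).url for item in get_iter(all_links)]


assert describe([FakeLink('https://example.com')]) == ['https://example.com']
assert describe(FakeQuerySet([FakeSnapshot('https://example.com')])) == ['https://example.com']
```

Normalising both inputs behind get_link/get_iter keeps the loop body identical, and the .iterator() path streams rows from the database instead of materialising every Snapshot in memory at once.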
@@ -11,6 +11,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 from urllib.parse import urlparse
+from django.db.models import QuerySet
 
 from ..util import (
     scheme,
@@ -133,7 +134,6 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
 
     return list(links)
 
-
 @enforce_types
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
@@ -165,15 +165,6 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
             link = merge_links(unique_urls[link.url], link)
         unique_urls[link.url] = link
 
-    # unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    # for link in unique_urls.values():
-    #     closest_non_duplicate_ts = lowest_uniq_timestamp(unique_timestamps, link.timestamp)
-    #     if closest_non_duplicate_ts != link.timestamp:
-    #         link = link.overwrite(timestamp=closest_non_duplicate_ts)
-    #         Snapshot.objects.filter(url=link.url).update(timestamp=link.timestamp)
-    #     unique_timestamps[link.timestamp] = link
-
-    # return unique_timestamps.values()
     return unique_urls.values()
 
 
@@ -245,11 +236,7 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
             os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
         if finished:
-            with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
-                write_json_main_index(links, out_dir=out_dir)
-
-            with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
-                write_html_main_index(links, out_dir=out_dir, finished=finished)
+            write_static_index(links, out_dir=out_dir)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsisntencies from an ungraceful exit.')
@@ -260,7 +247,14 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
 
     log_indexing_process_finished()
 
+@enforce_types
+def write_static_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+        write_json_main_index(links)
+    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+        write_html_main_index(links, out_dir=out_dir, finished=True)
 
+@enforce_types
 def get_empty_snapshot_queryset(out_dir: str=OUTPUT_DIR):
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
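write_static_index pulls the timed JSON and HTML writes out of write_main_index so the same pair of writes can be reused at the end of add(). The sketch below only shows that shape with stubbed writers; timed_update is a guess at what a timed_index_update-style context manager looks like, not ArchiveBox's actual implementation, and the index filenames here are placeholders.

```python
# Rough sketch of the write_static_index pattern, with stubbed writers and a
# guessed timing helper. Names and filenames are illustrative only.
import os
import time
from contextlib import contextmanager
from typing import List


@contextmanager
def timed_update(path: str):
    start = time.time()
    try:
        yield
    finally:
        print(f'    √ {path} ({time.time() - start:.2f}s)')


def write_json_main_index(links: List[dict]) -> None:   # stub for illustration
    print(f'      writing {len(links)} links as JSON')


def write_html_main_index(links: List[dict], out_dir: str, finished: bool) -> None:  # stub
    print(f'      writing {len(links)} links as HTML (finished={finished})')


def write_static_index(links: List[dict], out_dir: str = '.') -> None:
    # Mirrors the new helper: one timed pass per static index format.
    with timed_update(os.path.join(out_dir, 'index.json')):
        write_json_main_index(links)
    with timed_update(os.path.join(out_dir, 'index.html')):
        write_html_main_index(links, out_dir=out_dir, finished=True)


write_static_index([{'url': 'https://example.com'}])
```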
@@ -306,27 +300,47 @@ def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> T
 
     return new_links
 
+@enforce_types
+def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
+    """
+    Look for urls in the index, and merge them too
+    """
+    unique_urls: OrderedDict[str, Link] = OrderedDict()
+
+    for link in links:
+        index_link = snapshots.filter(url=link.url)
+        if index_link:
+            link = merge_links(index_link[0].as_link(), link)
+
+        unique_urls[link.url] = link
+
+    return unique_urls.values()
+
 @enforce_types
-def dedupe_links(existing_links: List[Link],
-                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
+def dedupe_links(snapshots: QuerySet,
+                 new_links: List[Link]) -> List[Link]:
+    """
+    The validation of links happened at a different stage. This method will
+    focus on actual deduplication and timestamp fixing.
+    """
+
     # merge existing links in out_dir and new links
-    all_links = validate_links(existing_links + new_links)
-    all_link_urls = {link.url for link in existing_links}
-
+    dedup_links = fix_duplicate_links_in_index(snapshots, new_links)
 
     new_links = [
         link for link in new_links
-        if link.url not in all_link_urls
+        if not snapshots.filter(url=link.url).exists()
     ]
 
-    all_links_deduped = {link.url: link for link in all_links}
+    dedup_links_dict = {link.url: link for link in dedup_links}
 
+    # Replace links in new_links with the dedup version
     for i in range(len(new_links)):
-        if new_links[i].url in all_links_deduped.keys():
-            new_links[i] = all_links_deduped[new_links[i].url]
+        if new_links[i].url in dedup_links_dict.keys():
+            new_links[i] = dedup_links_dict[new_links[i].url]
     log_deduping_finished(len(new_links))
 
-    return all_links, new_links
+    return new_links
 
 ### Link Details Index
 
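dedupe_links now consults the Snapshot queryset directly rather than a pre-loaded list of existing links. The following self-contained sketch mimics that flow with a hypothetical InMemorySnapshots stand-in (only the filter(url=...)[0].as_link() / .exists() surface used by the new code is modelled) and a deliberately simplified merge_links.

```python
# Sketch of the new dedupe flow against an index queryset. InMemorySnapshots
# and FilterResult are stand-ins; merge_links is simplified for illustration.
from collections import OrderedDict
from dataclasses import dataclass, replace
from typing import List


@dataclass(frozen=True)
class Link:
    url: str
    title: str = ''


@dataclass(frozen=True)
class Snapshot:
    url: str
    title: str = ''

    def as_link(self) -> Link:
        return Link(url=self.url, title=self.title)


class FilterResult(list):
    def exists(self) -> bool:
        return bool(self)


class InMemorySnapshots:
    def __init__(self, rows: List[Snapshot]) -> None:
        self._rows = rows

    def filter(self, url: str) -> FilterResult:
        return FilterResult(row for row in self._rows if row.url == url)


def merge_links(existing: Link, incoming: Link) -> Link:
    # Simplified merge: keep whichever title is non-empty (the real
    # merge_links is more involved).
    return replace(incoming, title=incoming.title or existing.title)


def dedupe_links(snapshots: InMemorySnapshots, new_links: List[Link]) -> List[Link]:
    # 1) merge incoming links with any matching rows already in the index
    deduped: "OrderedDict[str, Link]" = OrderedDict()
    for link in new_links:
        hit = snapshots.filter(url=link.url)
        if hit:
            link = merge_links(hit[0].as_link(), link)
        deduped[link.url] = link
    # 2) keep only links whose URL is not already in the index
    fresh = [link for link in new_links if not snapshots.filter(url=link.url).exists()]
    # 3) swap in the merged versions where available
    return [deduped.get(link.url, link) for link in fresh]


snapshots = InMemorySnapshots([Snapshot(url='https://old.example', title='Old')])
print(dedupe_links(snapshots, [Link(url='https://old.example'), Link(url='https://new.example')]))
# -> only https://new.example survives; https://old.example is already indexed
```

Links that already exist in the index are merged for bookkeeping but dropped from new_links, so only genuinely new URLs proceed to archiving.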
@@ -40,9 +40,11 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
         for link in links:
             info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
             try:
-                info['timestamp'] = Snapshot.objects.get(url=link.url).timestamp
+                info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
             except Snapshot.DoesNotExist:
-                pass
+                while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
+                    info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+
             Snapshot.objects.update_or_create(url=link.url, defaults=info)
 
 @enforce_types
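Rather than silently passing when a URL has no existing Snapshot row, the SQL index writer now bumps the imported timestamp until no other row claims it, since timestamps have to stay unique in the index. A minimal sketch of that loop, with a plain set standing in for the Snapshot.objects.filter(timestamp=...).exists() check:

```python
# Minimal sketch of the timestamp de-collision loop. Timestamps are stored as
# strings of floats, so a collision is resolved by bumping the value by 1.0
# until no existing row claims it.
taken_timestamps = {'1590000000.0', '1590000001.0'}  # stand-in for existing Snapshot rows

def uniquify(timestamp: str, taken: set) -> str:
    while timestamp in taken:                       # analogous to .filter(timestamp=...).exists()
        timestamp = str(float(timestamp) + 1.0)     # bump until the timestamp is free
    return timestamp

assert uniquify('1590000000.0', taken_timestamps) == '1590000002.0'
assert uniquify('1589999999.0', taken_timestamps) == '1589999999.0'
```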
@@ -30,6 +30,7 @@ from .index import (
     parse_links_from_source,
     dedupe_links,
     write_main_index,
+    write_static_index,
     link_matches_filter,
     get_indexed_folders,
     get_archived_folders,
@@ -520,7 +521,7 @@ def add(urls: Union[str, List[str]],
     check_data_folder(out_dir=out_dir)
     check_dependencies()
     new_links: List[Link] = []
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
+    all_links = load_main_index(out_dir=out_dir)
 
     log_importing_started(urls=urls, depth=depth, index_only=index_only)
     if isinstance(urls, str):
@@ -541,8 +542,10 @@ def add(urls: Union[str, List[str]],
                 new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
 
     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
-    all_links, new_links = dedupe_links(all_links, imported_links)
-    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+    new_links = dedupe_links(all_links, imported_links)
+
+    write_main_index(links=new_links, out_dir=out_dir, finished=not new_links)
+    all_links = load_main_index(out_dir=out_dir)
 
     if index_only:
         return all_links
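The net effect in add() is that the main index is loaded as a queryset, only the deduplicated new links are written, and the index is then reloaded so all_links reflects what is actually in the database. A condensed, purely illustrative sketch of that ordering; every helper below is a stub standing in for the real ArchiveBox function of the same name:

```python
# Condensed sketch of the new ordering inside add(); all helpers are stubs.
def load_main_index(out_dir='.'):
    # now returns a Snapshot queryset (a plain list stands in here)
    return ['existing snapshot rows']

def dedupe_links(all_links, imported_links):
    # dedupes the imported links against the index queryset
    return [link for link in imported_links if link not in all_links]

def write_main_index(links, out_dir='.', finished=False):
    print(f'wrote {len(links)} new links (finished={finished})')

all_links = load_main_index()                               # what is already archived
imported_links = ['https://example.com']                    # parsed from the user's input
new_links = dedupe_links(all_links, imported_links)         # only genuinely new URLs survive
write_main_index(links=new_links, finished=not new_links)   # write just the new rows
all_links = load_main_index()                               # reload so all_links matches the DB
```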
@@ -555,12 +558,9 @@ def add(urls: Union[str, List[str]],
     elif new_links:
         archive_links(new_links, overwrite=False, out_dir=out_dir)
     else:
-        # nothing was updated, don't bother re-saving the index
         return all_links
 
-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    write_static_index([link.as_link() for link in all_links], out_dir=out_dir)
     return all_links
 
 @enforce_types