feat: Refactor add method to use querysets

Authored by Cristian on 2020-08-21 09:57:29 -05:00, committed by Cristian Vargas
parent 6a2e6aad2f
commit be520d137a
4 changed files with 71 additions and 44 deletions


@@ -9,6 +9,7 @@ from ..index.schema import Link
 from ..index import (
     load_link_details,
     write_link_details,
+    write_main_index,
 )
 from ..util import enforce_types
 from ..logging_util import (
@@ -128,24 +129,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
     return link

 @enforce_types
-def archive_links(links: List[Link], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
-    if not links:
+def archive_links(all_links: any, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
+    if type(all_links) is list:
+        num_links: int = len(all_links)
+        get_link = lambda x: x
+        get_iter = lambda x: x
+    else:
+        num_links: int = all_links.count()
+        get_link = lambda x: x.as_link()
+        get_iter = lambda x: x.iterator()
+
+    if num_links == 0:
         return []

-    log_archiving_started(len(links))
+    log_archiving_started(num_links)
     idx: int = 0
-    link: Link = links[0]
     try:
-        for idx, link in enumerate(links):
-            archive_link(link, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
+        for link in get_iter(all_links):
+            idx += 1
+            to_archive = get_link(link)
+            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
     except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp)
+        log_archiving_paused(num_links, idx, link.timestamp)
         raise SystemExit(0)
     except BaseException:
         print()
         raise

-    log_archiving_finished(len(links))
-    return links
+    log_archiving_finished(num_links)
+    return all_links
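The refactored archive_links() accepts either a plain list of Links or a Snapshot QuerySet, choosing len() and identity accessors for lists and .count()/.iterator()/.as_link() otherwise. Below is a minimal standalone sketch of that dispatch; FakeSnapshot and FakeQuerySet are illustrative stand-ins, not ArchiveBox classes, and only the .count(), .iterator(), and .as_link() calls mirror the real function.

# Sketch only: list-or-QuerySet dispatch in the style of the new archive_links().
from typing import Any, List


class FakeSnapshot:
    def __init__(self, url: str):
        self.url = url

    def as_link(self) -> str:
        # The real Snapshot.as_link() returns a Link object; a plain string
        # is enough to show the shape of the dispatch here.
        return self.url


class FakeQuerySet:
    def __init__(self, rows: List[FakeSnapshot]):
        self._rows = rows

    def count(self) -> int:
        return len(self._rows)

    def iterator(self):
        return iter(self._rows)


def iter_links(all_links: Any):
    # Same branching as the patched archive_links()
    if type(all_links) is list:
        num_links = len(all_links)
        get_link = lambda x: x             # items are already Link-like
        get_iter = lambda x: x             # a list is its own iterable
    else:
        num_links = all_links.count()      # COUNT(*) on a real QuerySet
        get_link = lambda x: x.as_link()   # Snapshot row -> Link
        get_iter = lambda x: x.iterator()  # stream rows instead of caching them all

    print(f"archiving {num_links} links")
    for row in get_iter(all_links):
        yield get_link(row)


print(list(iter_links(["https://a.example", "https://b.example"])))
print(list(iter_links(FakeQuerySet([FakeSnapshot("https://c.example")]))))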


@@ -11,6 +11,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 from urllib.parse import urlparse
+from django.db.models import QuerySet

 from ..util import (
     scheme,
@@ -133,7 +134,6 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
     return list(links)

 @enforce_types
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
@@ -165,15 +165,6 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
             link = merge_links(unique_urls[link.url], link)
         unique_urls[link.url] = link

-    # unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    # for link in unique_urls.values():
-    #     closest_non_duplicate_ts = lowest_uniq_timestamp(unique_timestamps, link.timestamp)
-    #     if closest_non_duplicate_ts != link.timestamp:
-    #         link = link.overwrite(timestamp=closest_non_duplicate_ts)
-    #         Snapshot.objects.filter(url=link.url).update(timestamp=link.timestamp)
-    #     unique_timestamps[link.timestamp] = link
-    # return unique_timestamps.values()
-
     return unique_urls.values()
@@ -245,11 +236,7 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
             os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes

         if finished:
-            with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
-                write_json_main_index(links, out_dir=out_dir)
-
-            with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
-                write_html_main_index(links, out_dir=out_dir, finished=finished)
+            write_static_index(links, out_dir=out_dir)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsisntencies from an ungraceful exit.')
@@ -260,7 +247,14 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=

     log_indexing_process_finished()

+@enforce_types
+def write_static_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+        write_json_main_index(links)
+    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+        write_html_main_index(links, out_dir=out_dir, finished=True)
+
 @enforce_types
 def get_empty_snapshot_queryset(out_dir: str=OUTPUT_DIR):
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
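For context, here is an illustrative standalone sketch of the shape of the extracted write_static_index() helper: each writer call is wrapped in a context manager that times the write. timed_update and the two writer stubs below are simplified stand-ins, not ArchiveBox's timed_index_update, write_json_main_index, or write_html_main_index.

import os
import time
from contextlib import contextmanager
from typing import List


@contextmanager
def timed_update(out_path: str):
    # Simplified stand-in: report how long each index file took to write.
    start = time.monotonic()
    try:
        yield
    finally:
        print(f"    √ {out_path} written in {time.monotonic() - start:.3f}s")


def write_json_main_index(links: List[str], out_dir: str = ".") -> None:
    pass  # stub: the real function renders index.json from the links


def write_html_main_index(links: List[str], out_dir: str = ".", finished: bool = False) -> None:
    pass  # stub: the real function renders index.html from the links


def write_static_index(links: List[str], out_dir: str = ".") -> None:
    # Same shape as the helper extracted in this hunk: one timed write per format.
    with timed_update(os.path.join(out_dir, "index.json")):
        write_json_main_index(links)
    with timed_update(os.path.join(out_dir, "index.html")):
        write_html_main_index(links, out_dir=out_dir, finished=True)


write_static_index(["https://example.com"])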
@@ -306,27 +300,47 @@ def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> T

     return new_links

+@enforce_types
+def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
+    """
+    Look for urls in the index, and merge them too
+    """
+    unique_urls: OrderedDict[str, Link] = OrderedDict()
+
+    for link in links:
+        index_link = snapshots.filter(url=link.url)
+        if index_link:
+            link = merge_links(index_link[0].as_link(), link)
+
+        unique_urls[link.url] = link
+
+    return unique_urls.values()
+
 @enforce_types
-def dedupe_links(existing_links: List[Link],
-                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
+def dedupe_links(snapshots: QuerySet,
+                 new_links: List[Link]) -> List[Link]:
+    """
+    The validation of links happened at a different stage. This method will
+    focus on actual deduplication and timestamp fixing.
+    """

     # merge existing links in out_dir and new links
-    all_links = validate_links(existing_links + new_links)
-    all_link_urls = {link.url for link in existing_links}
+    dedup_links = fix_duplicate_links_in_index(snapshots, new_links)

     new_links = [
         link for link in new_links
-        if link.url not in all_link_urls
+        if not snapshots.filter(url=link.url).exists()
     ]

-    all_links_deduped = {link.url: link for link in all_links}
+    dedup_links_dict = {link.url: link for link in dedup_links}
+
+    # Replace links in new_links with the dedup version
     for i in range(len(new_links)):
-        if new_links[i].url in all_links_deduped.keys():
-            new_links[i] = all_links_deduped[new_links[i].url]
+        if new_links[i].url in dedup_links_dict.keys():
+            new_links[i] = dedup_links_dict[new_links[i].url]
+
     log_deduping_finished(len(new_links))

-    return all_links, new_links
+    return new_links

 ### Link Details Index
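dedupe_links() now checks incoming urls directly against the Snapshot QuerySet instead of a pre-loaded list of every known Link. A minimal standalone sketch of that flow follows; FakeIndex stands in for the QuerySet (only the truthiness/emptiness of .filter(url=...) mirrors the real calls) and the merge is deliberately simplified.

# Sketch only: dedupe a batch of new links against an existing index.
from collections import OrderedDict
from typing import Dict, List, NamedTuple


class Link(NamedTuple):
    url: str
    title: str


class FakeIndex:
    """Illustrative stand-in for snapshots (a Snapshot QuerySet)."""
    def __init__(self, rows: Dict[str, Link]):
        self.rows = rows                                   # url -> already-indexed Link

    def filter(self, url: str) -> List[Link]:
        return [self.rows[url]] if url in self.rows else []


def dedupe_against_index(snapshots: FakeIndex, new_links: List[Link]) -> List[Link]:
    # merge each incoming link with whatever the index already knows about its url
    merged = OrderedDict()
    for link in new_links:
        indexed = snapshots.filter(url=link.url)
        if indexed:
            # simplified merge: prefer the new title, fall back to the indexed one
            link = Link(url=link.url, title=link.title or indexed[0].title)
        merged[link.url] = link

    # only urls that are missing from the index count as genuinely new
    truly_new = [link for link in new_links if not snapshots.filter(url=link.url)]

    # hand back the merged version of each genuinely new link
    return [merged[link.url] for link in truly_new]


index = FakeIndex({"https://a.example": Link("https://a.example", "A (already indexed)")})
batch = [Link("https://a.example", ""), Link("https://b.example", "B")]
print(dedupe_against_index(index, batch))   # -> [Link(url='https://b.example', title='B')]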


@@ -40,9 +40,11 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
         for link in links:
             info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
             try:
-                info['timestamp'] = Snapshot.objects.get(url=link.url).timestamp
+                info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
             except Snapshot.DoesNotExist:
-                pass
+                while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
+                    info["timestamp"] = str(float(info["timestamp"]) + 1.0)
             Snapshot.objects.update_or_create(url=link.url, defaults=info)

 @enforce_types
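The new except-branch bumps a colliding timestamp by 1.0 until no existing Snapshot uses it, keeping timestamps unique in the index. A standalone sketch of that loop, with an in-memory set standing in for the Snapshot.objects.filter(timestamp=...).exists() check:

# Sketch only: make a string timestamp unique by bumping it until it is unused.
def unique_timestamp(timestamp: str, existing: set) -> str:
    # ArchiveBox stores timestamps as strings of unix-epoch floats,
    # so the bump goes through float() and back to str().
    while timestamp in existing:
        timestamp = str(float(timestamp) + 1.0)
    return timestamp


existing_timestamps = {"1598022649.0", "1598022650.0"}
print(unique_timestamp("1598022649.0", existing_timestamps))  # -> "1598022651.0"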


@@ -30,6 +30,7 @@ from .index import (
     parse_links_from_source,
     dedupe_links,
     write_main_index,
+    write_static_index,
     link_matches_filter,
     get_indexed_folders,
     get_archived_folders,
@@ -520,7 +521,7 @@ def add(urls: Union[str, List[str]],
     check_data_folder(out_dir=out_dir)
     check_dependencies()
     new_links: List[Link] = []
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
+    all_links = load_main_index(out_dir=out_dir)

     log_importing_started(urls=urls, depth=depth, index_only=index_only)
     if isinstance(urls, str):
@@ -541,8 +542,10 @@ def add(urls: Union[str, List[str]],
                 new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)

     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
-    all_links, new_links = dedupe_links(all_links, imported_links)
-    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+    new_links = dedupe_links(all_links, imported_links)
+    write_main_index(links=new_links, out_dir=out_dir, finished=not new_links)
+    all_links = load_main_index(out_dir=out_dir)

     if index_only:
         return all_links
@@ -555,12 +558,9 @@ def add(urls: Union[str, List[str]],
     elif new_links:
         archive_links(new_links, overwrite=False, out_dir=out_dir)
     else:
-        # nothing was updated, don't bother re-saving the index
         return all_links

-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    write_static_index([link.as_link() for link in all_links], out_dir=out_dir)

     return all_links

 @enforce_types
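Taken together, add() now writes only the new links to the SQL index, reloads the full index as a QuerySet, archives just the new links, and renders the static JSON/HTML index once at the end. Below is a condensed standalone sketch of that ordering; every function is a simplified stand-in for its ArchiveBox namesake, with a plain dict playing the role of the SQL index.

# Sketch only: the reordered add() flow with in-memory stand-ins.
from typing import Dict, List

SQL_INDEX: Dict[str, dict] = {"https://old.example": {"archived": True}}   # pre-existing index


def load_main_index() -> Dict[str, dict]:
    return SQL_INDEX                              # the real call returns a Snapshot QuerySet


def dedupe_links(index: Dict[str, dict], imported: List[str]) -> List[str]:
    return [url for url in imported if url not in index]


def write_main_index(new_links: List[str]) -> None:
    for url in new_links:                         # only the *new* links are written now,
        SQL_INDEX[url] = {"archived": False}      # instead of rewriting every known link


def archive_links(new_links: List[str]) -> None:
    for url in new_links:
        SQL_INDEX[url]["archived"] = True         # placeholder for the actual extractors


def write_static_index(index: Dict[str, dict]) -> None:
    print("rendering JSON/HTML index for", sorted(index))   # rendered once, at the end


def add(urls: List[str]) -> Dict[str, dict]:
    all_links = load_main_index()
    new_links = dedupe_links(all_links, urls)     # dedupe against the DB index, not a big list
    write_main_index(new_links)
    all_links = load_main_index()                 # reload so the index stays the source of truth
    archive_links(new_links)
    write_static_index(all_links)
    return all_links


add(["https://old.example", "https://new.example"])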