diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 000159e3..567e1bf3 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -9,6 +9,7 @@ from ..index.schema import Link
 from ..index import (
     load_link_details,
     write_link_details,
+    write_main_index,
 )
 from ..util import enforce_types
 from ..logging_util import (
@@ -128,24 +129,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
 
     return link
 
-
 @enforce_types
-def archive_links(links: List[Link], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
-    if not links:
+def archive_links(all_links: any, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
+
+    if type(all_links) is list:
+        num_links: int = len(all_links)
+        get_link = lambda x: x
+        get_iter = lambda x: x
+    else:
+        num_links: int = all_links.count()
+        get_link = lambda x: x.as_link()
+        get_iter = lambda x: x.iterator()
+
+    if num_links == 0:
         return []
 
-    log_archiving_started(len(links))
+    log_archiving_started(num_links)
     idx: int = 0
-    link: Link = links[0]
     try:
-        for idx, link in enumerate(links):
-            archive_link(link, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
+        for link in get_iter(all_links):
+            idx += 1
+            to_archive = get_link(link)
+            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
     except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp)
+        log_archiving_paused(num_links, idx, link.timestamp)
         raise SystemExit(0)
     except BaseException:
         print()
         raise
-    log_archiving_finished(len(links))
-    return links
+    log_archiving_finished(num_links)
+    return all_links
 
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 6e59609f..82c07007 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -11,6 +11,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 from urllib.parse import urlparse
+from django.db.models import QuerySet
 
 from ..util import (
     scheme,
@@ -133,7 +134,6 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
 
     return list(links)
 
-
 @enforce_types
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
@@ -165,15 +165,6 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
             link = merge_links(unique_urls[link.url], link)
         unique_urls[link.url] = link
 
-    # unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    # for link in unique_urls.values():
-    #     closest_non_duplicate_ts = lowest_uniq_timestamp(unique_timestamps, link.timestamp)
-    #     if closest_non_duplicate_ts != link.timestamp:
-    #         link = link.overwrite(timestamp=closest_non_duplicate_ts)
-    #         Snapshot.objects.filter(url=link.url).update(timestamp=link.timestamp)
-    #     unique_timestamps[link.timestamp] = link
-
-    # return unique_timestamps.values()
     return unique_urls.values()
 
 
@@ -245,11 +236,7 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
             os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
             if finished:
-                with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
-                    write_json_main_index(links, out_dir=out_dir)
-
-                with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
-                    write_html_main_index(links, out_dir=out_dir, finished=finished)
+                write_static_index(links, out_dir=out_dir)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsisntencies from an ungraceful exit.')
@@ -260,7 +247,14 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
 
     log_indexing_process_finished()
 
+@enforce_types
+def write_static_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+        write_json_main_index(links)
+    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+        write_html_main_index(links, out_dir=out_dir, finished=True)
 
+@enforce_types
 def get_empty_snapshot_queryset(out_dir: str=OUTPUT_DIR):
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
@@ -306,27 +300,47 @@ def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> T
 
     return new_links
 
+@enforce_types
+def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
+    """
+    Look for urls in the index, and merge them too
+    """
+    unique_urls: OrderedDict[str, Link] = OrderedDict()
+
+    for link in links:
+        index_link = snapshots.filter(url=link.url)
+        if index_link:
+            link = merge_links(index_link[0].as_link(), link)
+
+        unique_urls[link.url] = link
+
+    return unique_urls.values()
 
 @enforce_types
-def dedupe_links(existing_links: List[Link],
-                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
-
+def dedupe_links(snapshots: QuerySet,
+                 new_links: List[Link]) -> List[Link]:
+    """
+    The validation of links happened at a different stage. This method will
+    focus on actual deduplication and timestamp fixing.
+    """
+
     # merge existing links in out_dir and new links
-    all_links = validate_links(existing_links + new_links)
-    all_link_urls = {link.url for link in existing_links}
+    dedup_links = fix_duplicate_links_in_index(snapshots, new_links)
 
     new_links = [
         link for link in new_links
-        if link.url not in all_link_urls
+        if not snapshots.filter(url=link.url).exists()
     ]
 
-    all_links_deduped = {link.url: link for link in all_links}
+    dedup_links_dict = {link.url: link for link in dedup_links}
+
+    # Replace links in new_links with the dedup version
     for i in range(len(new_links)):
-        if new_links[i].url in all_links_deduped.keys():
-            new_links[i] = all_links_deduped[new_links[i].url]
+        if new_links[i].url in dedup_links_dict.keys():
+            new_links[i] = dedup_links_dict[new_links[i].url]
 
     log_deduping_finished(len(new_links))
 
-    return all_links, new_links
+    return new_links
 
 
 ### Link Details Index
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index 183aeef8..232de407 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -40,9 +40,11 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
         for link in links:
             info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
             try:
-                info['timestamp'] = Snapshot.objects.get(url=link.url).timestamp
+                info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
             except Snapshot.DoesNotExist:
-                pass
+                while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
+                    info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+
             Snapshot.objects.update_or_create(url=link.url, defaults=info)
 
 @enforce_types
diff --git a/archivebox/main.py b/archivebox/main.py
index 07c340ad..6f34f91d 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -30,6 +30,7 @@ from .index import (
     parse_links_from_source,
     dedupe_links,
     write_main_index,
+    write_static_index,
     link_matches_filter,
     get_indexed_folders,
     get_archived_folders,
@@ -520,7 +521,7 @@ def add(urls: Union[str, List[str]],
     check_data_folder(out_dir=out_dir)
     check_dependencies()
     new_links: List[Link] = []
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
+    all_links = load_main_index(out_dir=out_dir)
 
     log_importing_started(urls=urls, depth=depth, index_only=index_only)
     if isinstance(urls, str):
@@ -541,8 +542,10 @@ def add(urls: Union[str, List[str]],
             new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
 
     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
-    all_links, new_links = dedupe_links(all_links, imported_links)
-    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+    new_links = dedupe_links(all_links, imported_links)
+
+    write_main_index(links=new_links, out_dir=out_dir, finished=not new_links)
+    all_links = load_main_index(out_dir=out_dir)
 
     if index_only:
         return all_links
@@ -555,12 +558,9 @@ def add(urls: Union[str, List[str]],
     elif new_links:
         archive_links(new_links, overwrite=False, out_dir=out_dir)
     else:
-        # nothing was updated, don't bother re-saving the index
        return all_links
 
-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    write_static_index([link.as_link() for link in all_links], out_dir=out_dir)
 
     return all_links
 
 @enforce_types