Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-17 00:24:26 -04:00)
refactor: update command is functional
This commit is contained in:
parent de3c82730c
commit d92083b928
4 changed files with 91 additions and 99 deletions
@@ -88,7 +88,7 @@ def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[I
         details = {"history": {}}
         write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
     else:
-        details = list(load_snapshot_details(snapshot))
+        details = load_snapshot_details(snapshot)
 
     #log_link_archiving_started(link, out_dir, is_new)
     stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
@@ -42,6 +42,7 @@ from .html import (
     write_html_snapshot_details,
 )
 from .json import (
+    load_json_snapshot_details,
     parse_json_snapshot_details,
     write_json_snapshot_details,
 )
@@ -318,9 +319,9 @@ def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model
     """check for an existing link archive in the given directory,
     and load+merge it into the given link dict
     """
-    out_dir = out_dir or snapshot.snapshot_dir
+    out_dir = out_dir or Path(snapshot.snapshot_dir)
 
-    existing_snapshot = parse_json_snapshot_details(out_dir)
+    existing_snapshot = load_json_snapshot_details(out_dir)
     if existing_snapshot:
         return merge_snapshots(existing_snapshot, snapshot)
 
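The docstring above describes a load-then-merge flow: read whatever details are already on disk and fold them into the in-memory snapshot. A toy dict-based sketch of that direction of merging (the merge rule here is an assumption for illustration, not the real merge_snapshots implementation):

from typing import Optional

def merge_details(existing: Optional[dict], current: dict) -> dict:
    # start from what's on disk, let in-memory fields win, but keep accumulated history
    if not existing:
        return current
    merged = {**existing, **current}
    merged["history"] = {**existing.get("history", {}), **current.get("history", {})}
    return merged

on_disk = {"url": "https://example.com", "history": {"wget": ["ok"]}}
in_memory = {"url": "https://example.com", "title": "Example", "history": {}}
print(merge_details(on_disk, in_memory))  # history loaded from disk is preserved
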
@@ -379,56 +380,41 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
     return search_filter(snapshots, filter_patterns, filter_type)
 
 
-def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """indexed links without checking archive status or data directory validity"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in links
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in snapshots}
 
-def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """indexed links that are archived with a valid data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in filter(is_archived, links)
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_archived, snapshots)}
 
-def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in filter(is_unarchived, links)
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_unarchived, snapshots)}
 
-def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that actually exist in the archive/ folder"""
     from core.models import Snapshot
 
     all_folders = {}
 
     for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
-            link = None
+            snapshot = None
             try:
-                link = parse_json_link_details(entry.path)
+                snapshot = parse_json_snapshot_details(entry.path)
             except Exception:
                 pass
 
-            all_folders[entry.name] = link
+            all_folders[entry.name] = snapshot
 
     return all_folders
 
-def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
-    return {
-        link.link_dir: link
-        for link in filter(is_valid, links)
-    }
+    return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_valid, snapshots)}
 
-def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
     duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
     orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
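The hunk above replaces the Link-based list comprehensions with direct dict comprehensions over Snapshot rows, keyed by each snapshot's data directory. A minimal standalone sketch of that pattern, using a hypothetical FakeSnapshot stand-in rather than the real Django model:

from dataclasses import dataclass
from typing import Callable, Dict, Iterable, Optional

@dataclass
class FakeSnapshot:
    # hypothetical stand-in for core.models.Snapshot, just for illustration
    snapshot_dir: str
    is_archived: bool

def folders_by_status(snapshots: Iterable[FakeSnapshot],
                      predicate: Callable[[FakeSnapshot], bool]) -> Dict[str, Optional[FakeSnapshot]]:
    # same shape as get_archived_folders()/get_unarchived_folders() after the refactor:
    # map each snapshot's data directory to the snapshot object itself
    return {snapshot.snapshot_dir: snapshot for snapshot in filter(predicate, snapshots)}

snaps = [FakeSnapshot("archive/1611111111", True), FakeSnapshot("archive/1622222222", False)]
archived = folders_by_status(snaps, lambda s: s.is_archived)
unarchived = folders_by_status(snaps, lambda s: not s.is_archived)
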
@@ -437,7 +423,7 @@ def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
     return {**duplicate, **orphaned, **corrupted, **unrecognized}
 
 
-def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
@@ -450,91 +436,92 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
     )
 
     for path in chain(snapshots.iterator(), data_folders):
-        link = None
+        snapshot = None
         if type(path) is not str:
-            path = path.as_link().link_dir
+            path = path.snapshot_dir
 
         try:
-            link = parse_json_link_details(path)
+            snapshot = parse_json_snapshot_details(path)
         except Exception:
             pass
 
-        if link:
-            # link folder has same timestamp as different link folder
-            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
-            if by_timestamp[link.timestamp] > 1:
-                duplicate_folders[path] = link
+        if snapshot:
+            # snapshot folder has same timestamp as different link folder
+            by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1
+            if by_timestamp[snapshot.timestamp] > 1:
+                duplicate_folders[path] = snapshot
 
             # link folder has same url as different link folder
-            by_url[link.url] = by_url.get(link.url, 0) + 1
-            if by_url[link.url] > 1:
-                duplicate_folders[path] = link
+            by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1
+            if by_url[snapshot.url] > 1:
+                duplicate_folders[path] = snapshot
     return duplicate_folders
 
-def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that contain a valid index but aren't listed in the main index"""
     orphaned_folders = {}
 
     for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
-            link = None
+            snapshot = None
             try:
-                link = parse_json_link_details(str(entry))
+                snapshot = parse_json_snapshot_details(str(entry))
             except Exception:
                 pass
 
-            if link and not snapshots.filter(timestamp=entry.name).exists():
+            if snapshot and not snapshots.filter(timestamp=entry.name).exists():
                 # folder is a valid link data dir with index details, but it's not in the main index
-                orphaned_folders[str(entry)] = link
+                orphaned_folders[str(entry)] = snapshot
 
     return orphaned_folders
 
-def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
     for snapshot in snapshots.iterator():
-        link = snapshot.as_link()
-        if is_corrupt(link):
-            corrupted[link.link_dir] = link
+        if is_corrupt(snapshot):
+            corrupted[snapshot.snapshot_dir] = snapshot
     return corrupted
 
-def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
-    unrecognized_folders: Dict[str, Optional[Link]] = {}
+    unrecognized_folders: Dict[str, Optional[Model]] = {}
 
     for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             index_exists = (entry / "index.json").exists()
-            link = None
+            snapshot = None
             try:
-                link = parse_json_link_details(str(entry))
+                snapshot = parse_json_snapshot_details(str(entry))
             except KeyError:
                 # Try to fix index
                 if index_exists:
-                    try:
-                        # Last attempt to repair the detail index
-                        link_guessed = parse_json_link_details(str(entry), guess=True)
-                        write_json_link_details(link_guessed, out_dir=str(entry))
-                        link = parse_json_link_details(str(entry))
-                    except Exception:
-                        pass
+                    pass
+                    # TODO: Implement the `guess` bit for snapshots
+                    # try:
+                    #     link_guessed = parse_json_snapshot_details(str(entry), guess=True)
+                    #     write_json_snapshot_details(link_guessed, out_dir=str(entry))
+                    #     link = parse_json_link_details(str(entry))
+                    # except Exception:
+                    #     pass
 
-            if index_exists and link is None:
+            if index_exists and snapshot is None:
                 # index exists but it's corrupted or unparseable
-                unrecognized_folders[str(entry)] = link
+                unrecognized_folders[str(entry)] = snapshot
 
             elif not index_exists:
                 # link details index doesn't exist and the folder isn't in the main index
                 timestamp = entry.name
                 if not snapshots.filter(timestamp=timestamp).exists():
-                    unrecognized_folders[str(entry)] = link
+                    unrecognized_folders[str(entry)] = snapshot
 
     return unrecognized_folders
 
 
-def is_valid(link: Link) -> bool:
-    dir_exists = Path(link.link_dir).exists()
-    index_exists = (Path(link.link_dir) / "index.json").exists()
+def is_valid(snapshot: Model) -> bool:
+    dir_exists = Path(snapshot.snapshot_dir).exists()
+    index_exists = (Path(snapshot.snapshot_dir) / "index.json").exists()
     if not dir_exists:
         # unarchived links are not included in the valid list
         return False
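For reference, the duplicate-detection logic above boils down to counting how many folders share a timestamp or a URL and flagging a folder as soon as its count exceeds one. A standalone sketch of that counting approach (the snapshot objects here are hypothetical stand-ins, not the real model):

from dataclasses import dataclass
from typing import Dict, List

@dataclass
class Snap:
    # hypothetical stand-in with only the fields the dedup logic needs
    path: str
    timestamp: str
    url: str

def find_duplicates(snaps: List[Snap]) -> Dict[str, Snap]:
    by_timestamp: Dict[str, int] = {}
    by_url: Dict[str, int] = {}
    duplicate_folders: Dict[str, Snap] = {}
    for snap in snaps:
        # a folder is a duplicate as soon as a second folder shares its timestamp...
        by_timestamp[snap.timestamp] = by_timestamp.get(snap.timestamp, 0) + 1
        if by_timestamp[snap.timestamp] > 1:
            duplicate_folders[snap.path] = snap
        # ...or its URL
        by_url[snap.url] = by_url.get(snap.url, 0) + 1
        if by_url[snap.url] > 1:
            duplicate_folders[snap.path] = snap
    return duplicate_folders

dupes = find_duplicates([
    Snap("archive/1600000000", "1600000000", "https://example.com"),
    Snap("archive/1600000000_copy", "1600000000", "https://example.com/other"),
])
# only the second folder is flagged, because the first one merely raised the count to 1
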
@@ -542,29 +529,30 @@ def is_valid(link: Link) -> bool:
         return False
     if dir_exists and index_exists:
         try:
-            parsed_link = parse_json_link_details(link.link_dir, guess=True)
-            return link.url == parsed_link.url
+            # TODO: review if the `guess` was necessary here
+            parsed_snapshot = parse_json_snapshot_details(snapshot.snapshot_dir)
+            return snapshot.url == parsed_snapshot.url
         except Exception:
             pass
     return False
 
-def is_corrupt(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
+def is_corrupt(snapshot: Model) -> bool:
+    if not Path(snapshot.snapshot_dir).exists():
         # unarchived links are not considered corrupt
         return False
 
-    if is_valid(link):
+    if is_valid(snapshot):
         return False
 
     return True
 
-def is_archived(link: Link) -> bool:
-    return is_valid(link) and link.is_archived
+def is_archived(snapshot: Model) -> bool:
+    return is_valid(snapshot) and snapshot.is_archived
 
-def is_unarchived(link: Link) -> bool:
-    if not Path(link.link_dir).exists():
+def is_unarchived(snapshot: Model) -> bool:
+    if not Path(snapshot.snapshot_dir).exists():
         return True
-    return not link.is_archived
+    return not snapshot.is_archived
 
 
 def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
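The four predicates above layer on each other: a folder is valid when its directory and index.json exist and the indexed URL matches, corrupt when the directory exists but the folder is not valid, and archived/unarchived simply combine validity with the model's is_archived flag. A rough filesystem-only sketch of the valid/corrupt layering (the paths and the details loader are hypothetical, for illustration only):

import json
from pathlib import Path

def load_details(snapshot_dir: Path) -> dict:
    # hypothetical loader: read the per-snapshot index.json
    return json.loads((snapshot_dir / "index.json").read_text())

def is_valid_dir(snapshot_dir: Path, expected_url: str) -> bool:
    if not snapshot_dir.exists() or not (snapshot_dir / "index.json").exists():
        return False
    try:
        return load_details(snapshot_dir).get("url") == expected_url
    except Exception:
        return False

def is_corrupt_dir(snapshot_dir: Path, expected_url: str) -> bool:
    # only folders that exist on disk can be corrupt
    return snapshot_dir.exists() and not is_valid_dir(snapshot_dir, expected_url)
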
@@ -574,22 +562,22 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    link = parse_json_link_details(entry.path)
+                    snapshot = parse_json_snapshot_details(entry.path)
                 except KeyError:
-                    link = None
-                if not link:
+                    snapshot = None
+                if not snapshot:
                     continue
 
                 if not entry.path.endswith(f'/{link.timestamp}'):
-                    dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
+                    dest = out_dir / ARCHIVE_DIR_NAME / snapshot.timestamp
                     if dest.exists():
                         cant_fix.append(entry.path)
                     else:
                         shutil.move(entry.path, dest)
                         fixed.append(dest)
                         timestamp = entry.path.rsplit('/', 1)[-1]
-                        assert link.link_dir == entry.path
-                        assert link.timestamp == timestamp
-                        write_json_link_details(link, out_dir=entry.path)
+                        assert snapshot.snapshot_dir == entry.path
+                        assert snapshot.timestamp == timestamp
+                        write_json_snapshot_details(snapshot, out_dir=entry.path)
 
     return fixed, cant_fix
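fix_invalid_folder_locations walks the archive data directory and, for any folder whose name does not match the timestamp recorded in its own index, moves it to a folder named after that timestamp unless the destination already exists. A simplified sketch of that move-or-skip decision (the index reader and layout here are assumptions, not the real helpers):

import json
import shutil
from pathlib import Path
from typing import List, Tuple

def relocate_misnamed_folders(archive_dir: Path) -> Tuple[List[Path], List[Path]]:
    fixed, cant_fix = [], []
    for entry in archive_dir.iterdir():
        index = entry / "index.json"
        if not (entry.is_dir() and index.exists()):
            continue
        timestamp = json.loads(index.read_text()).get("timestamp")
        if not timestamp or entry.name == timestamp:
            continue  # already in the right place (or no timestamp recorded)
        dest = archive_dir / timestamp
        if dest.exists():
            cant_fix.append(entry)  # can't fix: target folder name already taken
        else:
            shutil.move(str(entry), str(dest))
            fixed.append(dest)
    return fixed, cant_fix
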
@@ -91,7 +91,7 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
 
 @enforce_types
-def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
+def load_json_snapshot_details(out_dir: Path) -> Optional[Model]:
     """
     Loads the detail from the local json index
     """
@@ -99,7 +99,10 @@ def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
-                return pyjson.load(f)
+                output = pyjson.load(f)
+                if "history" not in output.keys():
+                    output["history"] = {}
+                return output
             except pyjson.JSONDecodeError:
                 pass
     return None
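The change above makes load_json_snapshot_details backfill an empty "history" dict when the on-disk index lacks one, instead of returning the raw parse. A standalone sketch of that behavior (file name and return shape are assumptions based on the diff, not the real module):

import json
from pathlib import Path
from typing import Optional

def load_details_with_history(out_dir: Path) -> Optional[dict]:
    index = out_dir / "index.json"
    if index.exists():
        try:
            output = json.loads(index.read_text(encoding="utf-8"))
            # guarantee downstream code can always iterate details["history"]
            output.setdefault("history", {})
            return output
        except json.JSONDecodeError:
            pass
    return None
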
@@ -109,7 +112,7 @@ def load_snapshot_details(out_Dir: Path) -> Optional[Model]:
 def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[dict]:
     """read through all the archive data folders and return the parsed links"""
 
-    for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
+    for entry in os.scandir(Path(out_dir)):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
@@ -9,7 +9,7 @@ from datetime import date
 
 from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Model
 
 from .cli import (
     list_subcommands,
@@ -689,15 +689,16 @@ def update(resume: Optional[float]=None,
            extractors: str="",
            out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Import any new links from subscriptions and retry any previously failed/skipped links"""
+    from core.models import Snapshot
 
     check_data_folder(out_dir=out_dir)
     check_dependencies()
-    new_links: List[Link] = [] # TODO: Remove input argument: only_new
+    new_links: List[Snapshot] = [] # TODO: Remove input argument: only_new
 
     extractors = extractors.split(",") if extractors else []
 
     # Step 1: Filter for selected_links
-    matching_snapshots = list_links(
+    matching_snapshots = list_snapshots(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
@@ -705,15 +706,15 @@ def update(resume: Optional[float]=None,
     )
 
     matching_folders = list_folders(
-        links=matching_snapshots,
+        snapshots=matching_snapshots,
         status=status,
         out_dir=out_dir,
     )
     all_links = [link for link in matching_folders.values() if link]
 
     if index_only:
-        for link in all_links:
-            write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True)
+        for snapshot in all_snapshots:
+            write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=True)
         index_links(all_links, out_dir=out_dir)
         return all_links
 
@@ -797,7 +798,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
 
 
 @enforce_types
-def list_links(snapshots: Optional[QuerySet]=None,
+def list_snapshots(snapshots: Optional[QuerySet]=None,
                filter_patterns: Optional[List[str]]=None,
                filter_type: str='exact',
                after: Optional[float]=None,
@@ -820,9 +821,9 @@ def list_links(snapshots: Optional[QuerySet]=None,
     return all_snapshots
 
 @enforce_types
-def list_folders(links: List[Link],
+def list_folders(snapshots: List[Model],
                  status: str,
-                 out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+                 out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
 
     check_data_folder(out_dir=out_dir)
 
@@ -840,7 +841,7 @@ def list_folders(links: List[Link],
     }
 
     try:
-        return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
+        return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
     except KeyError:
         raise ValueError('Status not recognized.')
 
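list_folders dispatches on the status string through a STATUS_FUNCTIONS mapping and converts an unknown key into a ValueError. A minimal sketch of that dispatch pattern (the handler names here are placeholders, not the real status functions):

from typing import Callable, Dict

def handle_indexed(snapshots, out_dir=".") -> Dict[str, object]:
    return {}  # placeholder handler

def handle_archived(snapshots, out_dir=".") -> Dict[str, object]:
    return {}  # placeholder handler

STATUS_FUNCTIONS: Dict[str, Callable] = {
    "indexed": handle_indexed,
    "archived": handle_archived,
}

def list_folders(snapshots, status: str, out_dir: str = ".") -> Dict[str, object]:
    try:
        return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
    except KeyError:
        raise ValueError('Status not recognized.')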