Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-16 16:14:28 -04:00)

fix: json index was missing base_url field

Parent: 11b08a063d
Commit: 1ce6130202

4 changed files with 15 additions and 20 deletions
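After this change, each snapshot's JSON entry is built from the keys tuple ('id', 'url', 'timestamp', 'title', 'tags', 'updated', 'base_url'), with the id cast to a plain string. A rough, purely illustrative sketch of the resulting shape (field values are invented, not taken from a real archive):

snapshot_entry = {
    "id": "5f0f6b2e-0000-0000-0000-000000000000",  # UUID rendered as a string
    "url": "http://example.com/page",
    "timestamp": "1600000000.0",
    "title": "Example page",
    "tags": "tag1,tag2",             # tags_str() output rather than the ManyToMany manager
    "updated": None,
    "base_url": "example.com/page",  # URL without the scheme, per the new test below
}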
@@ -83,7 +83,7 @@ class Snapshot(models.Model):
     updated = models.DateTimeField(null=True, blank=True, db_index=True)
     tags = models.ManyToManyField(Tag)
 
-    keys = ('url', 'timestamp', 'title', 'tags', 'updated')
+    keys = ('id', 'url', 'timestamp', 'title', 'tags', 'updated', 'base_url')
 
     def __repr__(self) -> str:
         title = self.title or '-'
@@ -109,11 +109,14 @@ class Snapshot(models.Model):
 
     def as_json(self, *args) -> dict:
         args = args or self.keys
-        return {
+        output = {
             key: getattr(self, key)
             if key != 'tags' else self.tags_str()
             for key in args
         }
+        if "id" in output.keys():
+            output["id"] = str(output["id"])
+        return output
 
 
     def as_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
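Read as a whole, the patched method comes out roughly as below (reconstructed from the hunk above; the only behavioural change besides the wider keys tuple is the str() cast on the UUID primary key, which keeps the dict JSON-serializable):

    def as_json(self, *args) -> dict:
        # Defaults to the class-level keys tuple, which now includes 'id' and 'base_url'.
        args = args or self.keys
        output = {
            key: getattr(self, key)
            if key != 'tags' else self.tags_str()
            for key in args
        }
        # UUID objects are not JSON-serializable; store the id as a plain string.
        if "id" in output.keys():
            output["id"] = str(output["id"])
        return output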
@@ -269,16 +272,6 @@ class Snapshot(models.Model):
         })
         return canonical
 
-    def _asdict(self):
-        return {
-            "id": str(self.id),
-            "url": self.url,
-            "timestamp": self.timestamp,
-            "title": self.title,
-            "added": self.added,
-            "updated": self.updated,
-        }
-
     def save_tags(self, tags=()):
         tags_id = []
         for tag in tags:
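The deleted _asdict() helper already cast self.id to str, and as_json() now does the same, so nothing is lost by removing it. A minimal standalone illustration of why the cast matters (plain uuid/json, independent of Django):

import json
import uuid

snapshot_id = uuid.uuid4()
# json.dumps({"id": snapshot_id})             # raises TypeError: Object of type UUID is not JSON serializable
print(json.dumps({"id": str(snapshot_id)}))   # works once the id is a plain string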
@@ -87,7 +87,7 @@ def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) ->
 
     out_dir = out_dir or snapshot.snapshot_dir
     path = Path(out_dir) / JSON_INDEX_FILENAME
-    atomic_write(str(path), snapshot._asdict())
+    atomic_write(str(path), snapshot.as_json())
 
 
 @enforce_types
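After this one-line change the details writer takes its payload straight from the model, roughly as below (reconstructed from the hunk; the -> None return annotation is assumed since the hunk header is truncated, and it is likewise assumed that ArchiveBox's atomic_write JSON-encodes a dict before writing it to disk):

def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
    # The model's as_json() now emits 'id' and 'base_url', so the on-disk
    # index.json picks up the new fields with no extra code here.
    out_dir = out_dir or snapshot.snapshot_dir
    path = Path(out_dir) / JSON_INDEX_FILENAME
    atomic_write(str(path), snapshot.as_json())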
@@ -353,7 +353,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     orphaned_json_snapshots = {
         snapshot.url: snapshot
         for snapshot in parse_json_main_index(out_dir)
-        if not all_links.filter(url=link.url).exists()
+        if not all_snapshots.filter(url=link.url).exists()
     }
     if orphaned_json_snapshots:
         pending_snapshots.update(orphaned_json_snapshots)
@@ -381,7 +381,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     }
     if invalid_folders:
         print(' {lightyellow}! Skipped adding {} invalid snapshot data directories.{reset}'.format(len(invalid_folders), **ANSI))
-        print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
+        print(' X ' + '\n X '.join(f'{folder} {snapshot}' for folder, snapshot in invalid_folders.items()))
         print()
         print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
         print(' archivebox status')
@@ -394,7 +394,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
     else:
-        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
+        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} snapshots).{reset}'.format(len(all_snapshots), **ANSI))
     print()
     print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
     print(' archivebox server # then visit http://127.0.0.1:8000')
@@ -577,16 +577,16 @@ def add(urls: Union[str, List[str]],
     for new_snapshot in new_snapshots:
         # TODO: Check if we need to add domain to the Snapshot model
         downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir)
-        new_snapshots_depth += parse_links_from_source(downloaded_file, root_url=new_snapshot.url)
+        new_snapshots_depth += parse_snapshots_from_source(downloaded_file, root_url=new_snapshot.url)
 
     imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth]
     new_snapshots = filter_new_urls(all_snapshots, imported_snapshots)
 
     write_main_index(snapshots=new_snapshots, out_dir=out_dir)
-    all_links = load_main_index(out_dir=out_dir)
+    all_snapshots = load_main_index(out_dir=out_dir)
 
     if index_only:
-        return all_links
+        return all_snapshots
 
     # Run the archive methods for each link
     archive_kwargs = {
@@ -35,6 +35,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extrac
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
 
+    assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
 
 
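The new assertion pins base_url for the test fixture URL http://127.0.0.1:8080/static/example.com.html to the same URL minus its scheme. A hypothetical one-liner showing that relationship (the real derivation lives in ArchiveBox's URL helpers, which this diff does not touch):

url = "http://127.0.0.1:8080/static/example.com.html"
base_url = url.split("://", 1)[-1]  # illustrative only: strip the scheme
assert base_url == "127.0.0.1:8080/static/example.com.html"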
@@ -90,4 +91,5 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     assert (archived_item_path / "warc").exists()
     assert not (archived_item_path / "singlefile.html").exists()