diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 183bbb14..f94cd68a 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -532,7 +532,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
 
 @admin.register(Tag, site=archivebox_admin)
 class TagAdmin(ABIDModelAdmin):
-    list_display = ('abid', 'name', 'created', 'created_by', 'num_snapshots', 'snapshots')
+    list_display = ('created', 'created_by', 'abid', 'name', 'num_snapshots', 'snapshots')
     sort_fields = ('name', 'slug', 'abid', 'created_by', 'created')
     readonly_fields = ('slug', 'abid', 'created', 'modified', 'API', 'num_snapshots', 'snapshots')
     search_fields = ('abid', 'name', 'slug')
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index a262bba6..036ff73c 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -107,7 +107,7 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
     return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore]
 
 @enforce_types
-def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
+def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
@@ -115,7 +115,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
     try:
         snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
     except Snapshot.DoesNotExist:
-        snapshot = write_link_to_sql_index(link)
+        snapshot = write_link_to_sql_index(link, created_by_id=created_by_id)
 
     active_methods = get_archive_methods_for_link(link)
 
@@ -154,7 +154,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     log_archive_method_finished(result)
                     write_search_index(link=link, texts=result.index_texts)
                     ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
-                                                 output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
+                                                 output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status, created_by_id=snapshot.created_by_id)
 
 
                     # bump the updated time on the main Snapshot here, this is critical
@@ -213,7 +213,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
     return link
 
 @enforce_types
-def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]:
+def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> List[Link]:
 
     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
@@ -232,7 +232,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
         for link in all_links:
             idx += 1
             to_archive = get_link(link)
-            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir))
+            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir), created_by_id=created_by_id)
     except KeyboardInterrupt:
         log_archiving_paused(num_links, idx, link.timestamp)
         raise SystemExit(0)
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index 3e9ddc77..10c1525d 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -70,7 +70,7 @@ def write_link_to_sql_index(link: Link, created_by_id: int | None=None):
                     'cmd_version': entry.get('cmd_version') or 'unknown',
                     'pwd': entry['pwd'],
                     'status': entry['status'],
-                    'created_by_id': created_by_id,
+                    'created_by_id': snapshot.created_by_id,
                 }
             )
         else:
@@ -85,7 +85,7 @@ def write_link_to_sql_index(link: Link, created_by_id: int | None=None):
                     'cmd_version': entry.cmd_version or 'unknown',
                     'pwd': entry.pwd,
                     'status': entry.status,
-                    'created_by_id': created_by_id,
+                    'created_by_id': snapshot.created_by_id,
                 }
             )
 
diff --git a/archivebox/main.py b/archivebox/main.py
index b2bc1ce4..b36fb3dd 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -566,7 +566,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
 
 
 @enforce_types
-def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
+def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR, created_by_id: int | None=None) -> List[Link]:
     """
     Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
     You can run this to archive single pages without needing to create a whole collection with archivebox init.
@@ -580,7 +580,7 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
         raise SystemExit(2)
 
     methods = extractors.split(",") if extractors else ignore_methods(['title'])
-    archive_link(oneshot_link[0], out_dir=out_dir, methods=methods)
+    archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, created_by_id=created_by_id)
     return oneshot_link
 
 @enforce_types
@@ -659,13 +659,14 @@ def add(urls: Union[str, List[str]],
     if index_only:
         # mock archive all the links using the fake index_only extractor method in order to update their state
        if overwrite:
-            archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir)
+            archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
         else:
-            archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir)
+            archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
     else:
         # fully run the archive extractor methods for each link
         archive_kwargs = {
             "out_dir": out_dir,
+            "created_by_id": created_by_id,
         }
         if extractors:
             archive_kwargs["methods"] = extractors
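
Taken together, these hunks thread a `created_by_id` attribution from the CLI entrypoints through `archive_links` and `archive_link` down to `write_link_to_sql_index`. Note the deliberate asymmetry: the caller-supplied id is only used to stamp the Snapshot, while `ArchiveResult` rows copy `snapshot.created_by_id`, so results always match their parent Snapshot's owner. A minimal sketch of how a caller might exercise the new parameter (the user lookup and URL are illustrative assumptions, not part of this diff):

```python
# Illustrative sketch only -- assumes a configured ArchiveBox collection
# and an existing Django user named 'admin'; neither is created by this diff.
from django.contrib.auth import get_user_model

from archivebox.main import oneshot

User = get_user_model()
admin_user = User.objects.get(username='admin')  # hypothetical user

# created_by_id flows: oneshot() -> archive_link() -> write_link_to_sql_index(),
# which stamps the Snapshot; each ArchiveResult row is then created with
# snapshot.created_by_id rather than the caller-supplied value.
oneshot('https://example.com', created_by_id=admin_user.pk)
```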