From fea0b89dbe7a31163ada86c9c99b2902d9e5df9a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 27 Mar 2021 03:57:05 -0400 Subject: [PATCH] add tag cli option --- archivebox/cli/archivebox_add.py | 7 ++++++ archivebox/core/models.py | 3 +++ archivebox/main.py | 42 ++++++++++++++++++++++---------- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 7266a571..a96888b0 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -22,6 +22,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional add_help=True, formatter_class=SmartFormatter, ) + parser.add_argument( + '--tag', '-t', + type=str, + default='', + help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3", + ) parser.add_argument( '--update-all', #'-n', action='store_true', @@ -89,6 +95,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional add( urls=stdin_urls or urls, depth=command.depth, + tag=command.tag, update_all=command.update_all, index_only=command.index_only, overwrite=command.overwrite, diff --git a/archivebox/core/models.py b/archivebox/core/models.py index bdc58302..d3495826 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -33,8 +33,11 @@ class Tag(models.Model): Based on django-taggit model """ name = models.CharField(unique=True, blank=False, max_length=100) + + # slug is autoset on save from name, never set it manually slug = models.SlugField(unique=True, blank=True, max_length=100) + class Meta: verbose_name = "Tag" verbose_name_plural = "Tags" diff --git a/archivebox/main.py b/archivebox/main.py index 64870297..3b52a179 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -561,6 +561,7 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR): @enforce_types def add(urls: Union[str, List[str]], + tag: str='', depth: int=0, update_all: bool=not 
ONLY_NEW, index_only: bool=False, @@ -570,6 +571,8 @@ def add(urls: Union[str, List[str]], out_dir: Path=OUTPUT_DIR) -> List[Link]: """Add a new URL or list of URLs to your archive""" + from core.models import Tag + assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' extractors = extractors.split(",") if extractors else [] @@ -602,31 +605,44 @@ def add(urls: Union[str, List[str]], new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) + new_links = dedupe_links(all_links, imported_links) write_main_index(links=new_links, out_dir=out_dir) all_links = load_main_index(out_dir=out_dir) + # add any tags to imported links + tags = [ + Tag.objects.get_or_create(name=name.strip())[0] + for name in tag.split(',') + if name.strip() + ] + if tags: + for link in imported_links: + link.as_snapshot().tags.add(*tags) + + if index_only: + # mock archive all the links using the fake index_only extractor method in order to update their state if overwrite: archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir) else: archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir) - return all_links + else: + # fully run the archive extractor methods for each link + archive_kwargs = { + "out_dir": out_dir, + } + if extractors: + archive_kwargs["methods"] = extractors - # Run the archive methods for each link - archive_kwargs = { - "out_dir": out_dir, - } - if extractors: - archive_kwargs["methods"] = extractors + if update_all: + archive_links(all_links, overwrite=overwrite, **archive_kwargs) + elif overwrite: + archive_links(imported_links, overwrite=True, **archive_kwargs) + elif new_links: + archive_links(new_links, overwrite=False, **archive_kwargs) - if update_all: - archive_links(all_links, overwrite=overwrite, **archive_kwargs) - elif overwrite: - archive_links(imported_links,
overwrite=True, **archive_kwargs) - elif new_links: - archive_links(new_links, overwrite=False, **archive_kwargs) return all_links