mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-09 12:21:57 -04:00
restore missing archivebox_update work
This commit is contained in:
parent
52446b86ba
commit
f8e2f7c753
1 changed files with 65 additions and 131 deletions
|
@ -1,13 +1,13 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
__package__ = 'archivebox.cli'
|
__package__ = 'archivebox.cli'
|
||||||
__command__ = 'archivebox update'
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import argparse
|
|
||||||
from typing import List, Optional, IO
|
|
||||||
|
|
||||||
from archivebox.misc.util import docstring
|
import rich_click as click
|
||||||
|
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from archivebox.misc.util import enforce_types, docstring
|
||||||
from archivebox.index import (
|
from archivebox.index import (
|
||||||
LINK_FILTERS,
|
LINK_FILTERS,
|
||||||
get_indexed_folders,
|
get_indexed_folders,
|
||||||
|
@ -21,8 +21,66 @@ from archivebox.index import (
|
||||||
get_corrupted_folders,
|
get_corrupted_folders,
|
||||||
get_unrecognized_folders,
|
get_unrecognized_folders,
|
||||||
)
|
)
|
||||||
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
|
||||||
# from ..main import update
|
|
||||||
|
@enforce_types
def update(filter_patterns: Iterable[str]=(),
           only_new: bool=False,
           index_only: bool=False,
           resume: float | None=None,
           overwrite: bool=False,
           before: float | None=None,
           after: float | None=None,
           status: str='indexed',
           filter_type: str='exact',
           extract: str="") -> None:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # NOTE(review): the filter/status/extract arguments are accepted for CLI
    # compatibility but are not consumed here -- the orchestrator decides what
    # work to perform. Confirm before relying on any of them taking effect.

    # Django must be configured before the workers package can be imported.
    from archivebox.config.django import setup_django
    setup_django()

    from workers.orchestrator import Orchestrator

    # Run the worker orchestrator in the foreground; exit_on_idle=False keeps
    # it alive even once the work queue drains.
    Orchestrator(exit_on_idle=False).start()
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
@click.option('--status', type=click.Choice([
    'indexed', 'archived', 'unarchived',
    'present', 'valid', 'invalid',
    'duplicate', 'orphaned', 'corrupted', 'unrecognized'
]), default='indexed', help=f'''
Update only links or data directories that have the given status:
    indexed       {get_indexed_folders.__doc__} (the default)
    archived      {get_archived_folders.__doc__}
    unarchived    {get_unarchived_folders.__doc__}

    present       {get_present_folders.__doc__}
    valid         {get_valid_folders.__doc__}
    invalid       {get_invalid_folders.__doc__}

    duplicate     {get_duplicate_folders.__doc__}
    orphaned      {get_orphaned_folders.__doc__}
    corrupted     {get_corrupted_folders.__doc__}
    unrecognized  {get_unrecognized_folders.__doc__}
''')
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # Thin CLI shim: every click option/argument maps 1:1 onto an update()
    # keyword parameter, so the whole kwargs dict is forwarded unchanged.
    cli_options = dict(kwargs)
    update(**cli_options)


if __name__ == "__main__":
    main()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -103,127 +161,3 @@ from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
||||||
# # Step 4: Re-write links index with updated titles, icons, and resources
|
# # Step 4: Re-write links index with updated titles, icons, and resources
|
||||||
# all_links = load_main_index(out_dir=out_dir)
|
# all_links = load_main_index(out_dir=out_dir)
|
||||||
# return all_links
|
# return all_links
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def update():
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # Django setup must happen before importing the workers package.
    from archivebox.config.django import setup_django
    setup_django()

    from workers.orchestrator import Orchestrator

    # Run the orchestrator in the foreground until interrupted
    # (exit_on_idle=False keeps it running after the queue empties).
    Orchestrator(exit_on_idle=False).start()
|
|
||||||
|
|
||||||
|
|
||||||
@docstring(update.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # Legacy argparse entrypoint for `archivebox update`.
    # Parses the full historical CLI surface, but (see the note near the
    # bottom) the parsed options are currently discarded and update() is
    # invoked with no arguments.
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=update.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,  # project formatter that preserves manual line breaks in help text
    )
    parser.add_argument(
        '--only-new', #'-n',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    parser.add_argument(
        '--index-only', #'-o',
        action='store_true',
        help="Update the main index without archiving any content",
    )
    parser.add_argument(
        '--resume', #'-r',
        type=float,
        help='Resume the update process from a given timestamp',
        default=None,
    )
    parser.add_argument(
        '--overwrite', #'-x',
        action='store_true',
        help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
    )
    parser.add_argument(
        '--before', #'-b',
        type=float,
        help="Update only links bookmarked before the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--after', #'-a',
        type=float,
        help="Update only links bookmarked after the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--status',
        type=str,
        choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
        default='indexed',
        # Help text is assembled from the index helpers' docstrings so the CLI
        # stays in sync with the definitions in archivebox.index.
        help=(
            'Update only links or data directories that have the given status\n'
            f'    indexed       {get_indexed_folders.__doc__} (the default)\n'
            f'    archived      {get_archived_folders.__doc__}\n'
            f'    unarchived    {get_unarchived_folders.__doc__}\n'
            '\n'
            f'    present       {get_present_folders.__doc__}\n'
            f'    valid         {get_valid_folders.__doc__}\n'
            f'    invalid       {get_invalid_folders.__doc__}\n'
            '\n'
            f'    duplicate     {get_duplicate_folders.__doc__}\n'
            f'    orphaned      {get_orphaned_folders.__doc__}\n'
            f'    corrupted     {get_corrupted_folders.__doc__}\n'
            f'    unrecognized  {get_unrecognized_folders.__doc__}\n'
        )
    )
    parser.add_argument(
        '--filter-type', '-t',
        type=str,
        choices=(*LINK_FILTERS.keys(), 'search'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )
    parser.add_argument(
        'filter_patterns',
        nargs='*',
        type=str,
        default=None,
        help='Update only URLs matching these filter patterns.'
    )
    parser.add_argument(
        "--extract",
        type=str,
        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
             This does not take precedence over the configuration",
        default=""
    )
    command = parser.parse_args(args or ())

    # Fall back to reading filter patterns from stdin when none were given as
    # CLI arguments (supports piping URLs into `archivebox update`).
    filter_patterns_str = None
    if not command.filter_patterns:
        filter_patterns_str = accept_stdin(stdin)

    # NOTE(review): the parsed options (command.*, filter_patterns_str, pwd)
    # are NOT forwarded -- update() is called with no arguments, so all CLI
    # flags above are effectively ignored here. The intended forwarding call
    # is preserved below for reference.
    update()

    # update(
    #     resume=command.resume,
    #     only_new=command.only_new,
    #     index_only=command.index_only,
    #     overwrite=command.overwrite,
    #     filter_patterns_str=filter_patterns_str,
    #     filter_patterns=command.filter_patterns,
    #     filter_type=command.filter_type,
    #     status=command.status,
    #     after=command.after,
    #     before=command.before,
    #     out_dir=Path(pwd) if pwd else DATA_DIR,
    #     extractors=command.extract,
    # )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue