mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 14:44:29 -04:00)

commit d3bfa98a91 (parent 354a63ccd4): fix depth flag and tweak logging
7 changed files with 156 additions and 127 deletions
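
In short: archivebox add now accepts any number of URLs as CLI arguments or on stdin (but not both), --depth is validated to 0 or 1, and the import/dedupe/crawl steps log through new helpers. As a rough usage sketch of the reworked Python API (not part of the commit itself; the argument values are illustrative):

    # sketch only: mirrors the new add() signature shown in the diff below
    from archivebox.main import add

    add(
        urls=['https://example.com', 'https://example.com/some/rss/feed.xml'],
        depth=1,           # 0 = archive only the given URLs, 1 = also crawl pages one hop out
        update_all=False,  # archive just the newly added links
        index_only=False,  # True would stop after writing the index
    )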
@@ -106,9 +106,19 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
     if command.help or command.subcommand is None:
         command.subcommand = 'help'
-    if command.version:
+    elif command.version:
         command.subcommand = 'version'

+    if command.subcommand not in ('help', 'version', 'status'):
+        from ..cli.logging import log_cli_command
+
+        log_cli_command(
+            subcommand=command.subcommand,
+            subcommand_args=command.subcommand_args,
+            stdin=stdin,
+            pwd=pwd or OUTPUT_DIR
+        )
+
     run_subcommand(
         subcommand=command.subcommand,
         subcommand_args=command.subcommand_args,

@@ -10,7 +10,7 @@ from typing import List, Optional, IO

 from ..main import add, docstring
 from ..config import OUTPUT_DIR, ONLY_NEW
-from .logging import SmartFormatter, accept_stdin
+from .logging import SmartFormatter, accept_stdin, stderr


 @docstring(add.__doc__)
@@ -33,12 +33,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Add the links to the main index without archiving them",
     )
     parser.add_argument(
-        'import_path',
-        nargs='?',
+        'urls',
+        nargs='*',
         type=str,
         default=None,
         help=(
-            'URL or path to local file to start the archiving process from. e.g.:\n'
+            'URLs or paths to archive e.g.:\n'
            ' https://getpocket.com/users/USERNAME/feed/all\n'
            ' https://example.com/some/rss/feed.xml\n'
            ' https://example.com\n'
@@ -50,25 +50,21 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         "--depth",
         action="store",
         default=0,
-        choices=[0,1],
+        choices=[0, 1],
         type=int,
         help="Recursively archive all linked pages up to this many hops away"
     )
     command = parser.parse_args(args or ())
-    import_string = accept_stdin(stdin)
-    if import_string and command.import_path:
+    urls = command.urls
+    stdin_urls = accept_stdin(stdin)
+    if (stdin_urls and urls) or (not stdin and not urls):
         stderr(
-            '[X] You should pass an import path or a page url as an argument or in stdin but not both\n',
+            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
             color='red',
         )
         raise SystemExit(2)
-    elif import_string:
-        import_path = import_string
-    else:
-        import_path = command.import_path

     add(
-        url=import_path,
+        urls=stdin_urls or urls,
         depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,

@@ -5,10 +5,12 @@ import os
 import sys
 import time
 import argparse
+import logging
+import signal
+from multiprocessing import Process

 from datetime import datetime
 from dataclasses import dataclass
-from multiprocessing import Process
 from typing import Optional, List, Dict, Union, IO

 from ..index.schema import Link, ArchiveResult
@@ -23,11 +25,11 @@ from ..config import (
     SHOW_PROGRESS,
     TERM_WIDTH,
     OUTPUT_DIR,
+    SOURCES_DIR_NAME,
     HTML_INDEX_FILENAME,
     stderr,
 )


 @dataclass
 class RuntimeStats:
     """mutable stats counter for logging archiving timing info to CLI output"""
@@ -98,9 +100,9 @@ class TimedProgress:

         if SHOW_PROGRESS:
             # terminate if we havent already terminated
             if self.p is not None:
                 self.p.terminate()
-            self.p = None
+                self.p.join()
+                self.p.close()

             # clear whole terminal line
             try:
@@ -145,28 +147,51 @@ def progress_bar(seconds: int, prefix: str='') -> None:
                 seconds,
             ))
             sys.stdout.flush()
-    except KeyboardInterrupt:
+    except (KeyboardInterrupt, BrokenPipeError):
         print()
         pass


+def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
+    from ..config import VERSION, ANSI
+    cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
+    stdin_hint = ' < /dev/stdin' if not stdin.isatty() else ''
+    print('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
+        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        VERSION=VERSION,
+        cmd=cmd,
+        stdin_hint=stdin_hint,
+        **ANSI,
+    ))
+    print('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI))
+    print()

 ### Parsing Stage

-def log_parsing_started(source_file: str):
-    start_ts = datetime.now()
-    _LAST_RUN_STATS.parse_start_ts = start_ts
-    print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
-        start_ts.strftime('%Y-%m-%d %H:%M:%S'),
-        source_file.rsplit('/', 1)[-1],

+def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
+    _LAST_RUN_STATS.parse_start_ts = datetime.now()
+    print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
+        _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        len(urls) if isinstance(urls, list) else len(urls.split('\n')),
+        depth,
+        ' (index only)' if index_only else '',
+        **ANSI,
+    ))
+
+def log_source_saved(source_file: str):
+    print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))

-def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
-    end_ts = datetime.now()
-    _LAST_RUN_STATS.parse_end_ts = end_ts
-    print(' > Parsed {} links as {} ({} new links added)'.format(num_parsed, parser_name, num_new_links))
+def log_parsing_finished(num_parsed: int, parser_name: str):
+    _LAST_RUN_STATS.parse_end_ts = datetime.now()
+    print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
+
+def log_deduping_finished(num_new_links: int):
+    print(' > Found {} new URLs not already in index'.format(num_new_links))
+
+
+def log_crawl_started(new_links):
+    print('{lightblue}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))

 ### Indexing Stage

@@ -174,7 +199,7 @@ def log_indexing_process_started(num_links: int):
     start_ts = datetime.now()
     _LAST_RUN_STATS.index_start_ts = start_ts
     print()
-    print('{green}[*] [{}] Writing {} links to main index...{reset}'.format(
+    print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         num_links,
         **ANSI,
@@ -209,7 +234,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]=None):
             **ANSI,
         ))
     else:
-        print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format(
+        print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format(
             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
             num_links,
             **ANSI,

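For reference, the new log_cli_command and log_importing_started helpers above print lines of the following shape (timestamp, version, path, and counts are illustrative, not taken from the commit):

    [i] [2020-01-01 12:00:00] ArchiveBox v0.4.x: archivebox add --depth=1 https://example.com
     > /path/to/archive
    [+] [2020-01-01 12:00:00] Adding 2 links to index (crawl depth=1)...
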
@@ -2,7 +2,7 @@ __package__ = 'archivebox.extractors'

 import os

-from typing import Optional
+from typing import Optional, List
 from datetime import datetime

 from ..index.schema import Link
@@ -13,6 +13,9 @@ from ..index import (
 )
 from ..util import enforce_types
 from ..cli.logging import (
+    log_archiving_started,
+    log_archiving_paused,
+    log_archiving_finished,
     log_link_archiving_started,
     log_link_archiving_finished,
     log_archive_method_started,
@@ -103,3 +106,25 @@ def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None)
             raise

     return link
+
+
+@enforce_types
+def archive_links(links: List[Link], out_dir: Optional[str]=None) -> List[Link]:
+    if not links:
+        return []
+
+    log_archiving_started(len(links))
+    idx: int = 0
+    link: Link = links[0]
+    try:
+        for idx, link in enumerate(links):
+            archive_link(link, out_dir=link.link_dir)
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp)
+        raise SystemExit(0)
+    except BaseException:
+        print()
+        raise
+
+    log_archiving_finished(len(links))
+    return links

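The archive_links() helper added above centralizes the per-link loop, the Ctrl+C "pause" handling, and the start/finish logging that the later hunks remove from add() and update(). A hedged sketch of calling it directly, assuming an existing archive (the data directory path is hypothetical):

    # sketch only: load_main_index and archive_links are the functions shown in this diff
    from archivebox.index import load_main_index
    from archivebox.extractors import archive_links

    links = load_main_index(out_dir='/path/to/archive')    # hypothetical data dir
    archive_links(list(links), out_dir='/path/to/archive')
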
@@ -33,8 +33,8 @@ from ..cli.logging import (
     log_indexing_process_finished,
     log_indexing_started,
     log_indexing_finished,
-    log_parsing_started,
     log_parsing_finished,
+    log_deduping_finished,
 )

 from .schema import Link, ArchiveResult
@@ -268,20 +268,31 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:

     return None


 @enforce_types
-def import_new_links(existing_links: List[Link],
-                     import_path: str,
-                     out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
+def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]:

     from ..parsers import parse_links

     new_links: List[Link] = []

     # parse and validate the import file
-    log_parsing_started(import_path)
-    raw_links, parser_name = parse_links(import_path)
+    raw_links, parser_name = parse_links(source_path)
     new_links = validate_links(raw_links)

+    if parser_name:
+        num_parsed = len(raw_links)
+        log_parsing_finished(num_parsed, parser_name)
+
+    return new_links
+
+
+@enforce_types
+def dedupe_links(existing_links: List[Link],
+                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
+
+    from ..parsers import parse_links
+
     # merge existing links in out_dir and new links
     all_links = validate_links(existing_links + new_links)
     all_link_urls = {link.url for link in existing_links}
@@ -290,11 +301,7 @@ def import_new_links(existing_links: List[Link],
         link for link in new_links
         if link.url not in all_link_urls
     ]

-    if parser_name:
-        num_parsed = len(raw_links)
-        num_new_links = len(all_links) - len(existing_links)
-        log_parsing_finished(num_parsed, num_new_links, parser_name)
+    log_deduping_finished(len(new_links))

     return all_links, new_links

@@ -4,8 +4,7 @@ import os
 import sys
 import shutil

-from typing import Dict, List, Optional, Iterable, IO
-
+from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices

 from .cli import (
@@ -17,16 +16,17 @@ from .cli import (
     archive_cmds,
 )
 from .parsers import (
-    save_stdin_to_sources,
-    save_file_to_sources,
+    save_text_as_source,
+    save_file_as_source,
 )
 from .index.schema import Link
-from .util import enforce_types, docstring
+from .util import enforce_types, docstring # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     links_after_timestamp,
     load_main_index,
-    import_new_links,
+    parse_links_from_source,
+    dedupe_links,
     write_main_index,
     link_matches_filter,
     get_indexed_folders,
@@ -51,7 +51,7 @@ from .index.sql import (
     apply_migrations,
 )
 from .index.html import parse_html_main_index
-from .extractors import archive_link
+from .extractors import archive_links
 from .config import (
     stderr,
     ConfigDict,
@@ -91,9 +91,8 @@ from .config import (
 from .cli.logging import (
     TERM_WIDTH,
     TimedProgress,
-    log_archiving_started,
-    log_archiving_paused,
-    log_archiving_finished,
+    log_importing_started,
+    log_crawl_started,
     log_removal_started,
     log_removal_finished,
     log_list_started,
@@ -496,57 +495,53 @@ def status(out_dir: str=OUTPUT_DIR) -> None:


 @enforce_types
-def add(url: str,
+def add(urls: Union[str, List[str]],
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
         out_dir: str=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""

+    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+
+    # Load list of links from the existing index
     check_data_folder(out_dir=out_dir)
-
-    base_path = save_stdin_to_sources(url, out_dir=out_dir)
-    if depth == 1:
-        depth_path = save_file_to_sources(url, out_dir=out_dir)
     check_dependencies()
-
-    # Step 1: Load list of links from the existing index
-    #         merge in and dedupe new links from import_path
     all_links: List[Link] = []
     new_links: List[Link] = []
     all_links = load_main_index(out_dir=out_dir)
-    all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir)
-    if depth == 1:
-        all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir)
-        new_links = new_links + new_links_depth

+    log_importing_started(urls=urls, depth=depth, index_only=index_only)
+    if isinstance(urls, str):
+        # save verbatim stdin to sources
+        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
+    elif isinstance(urls, list):
+        # save verbatim args to sources
+        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
+
+    new_links += parse_links_from_source(write_ahead_log)
+    all_links, new_links = dedupe_links(all_links, new_links)
+    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)

-    # Step 2: Write updated index with deduped old and new links back to disk
-    write_main_index(links=all_links, out_dir=out_dir)
+    # If we're going one level deeper, download each link and look for more links
+    if new_links and depth == 1:
+        log_crawl_started(new_links)
+        for new_link in new_links:
+            downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
+            new_links += parse_links_from_source(downloaded_file)
+        all_links, new_links = dedupe_links(all_links, new_links)
+        write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)

     if index_only:
         return all_links

-    # Step 3: Run the archive methods for each link
-    links = all_links if update_all else new_links
-    log_archiving_started(len(links))
-    idx: int = 0
-    link: Link = None # type: ignore
-    try:
-        for idx, link in enumerate(links):
-            archive_link(link, out_dir=link.link_dir)
-
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise
-
-    log_archiving_finished(len(links))
+    # Run the archive methods for each link
+    to_archive = all_links if update_all else new_links
+    archive_links(to_archive, out_dir=out_dir)

     # Step 4: Re-write links index with updated titles, icons, and resources
+    if to_archive:
         all_links = load_main_index(out_dir=out_dir)
         write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
     return all_links
@@ -671,23 +666,8 @@ def update(resume: Optional[float]=None,
         return all_links

     # Step 3: Run the archive methods for each link
-    links = new_links if only_new else all_links
-    log_archiving_started(len(links), resume)
-    idx: int = 0
-    link: Link = None # type: ignore
-    try:
-        for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, overwrite=overwrite, out_dir=link.link_dir)
-
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise
-
-    log_archiving_finished(len(links))
+    to_archive = new_links if only_new else all_links
+    archive_links(to_archive, out_dir=out_dir)

     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links = load_main_index(out_dir=out_dir)

@@ -29,7 +29,7 @@ from ..util import (
     URL_REGEX,
 )
 from ..index.schema import Link
-from ..cli.logging import pretty_path, TimedProgress
+from ..cli.logging import pretty_path, TimedProgress, log_source_saved
 from .pocket_html import parse_pocket_html_export
 from .pinboard_rss import parse_pinboard_rss_export
 from .shaarli_rss import parse_shaarli_rss_export
@@ -83,36 +83,22 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:


 @enforce_types
-def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
-    check_data_folder(out_dir=out_dir)
-
-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
+def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: str=OUTPUT_DIR) -> str:
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(ts=ts))
     atomic_write(source_path, raw_text)
+    log_source_saved(source_file=source_path)
     return source_path


 @enforce_types
-def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
+def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: str=OUTPUT_DIR) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
     check_data_folder(out_dir=out_dir)

-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts))

     if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
         # Source is a URL that needs to be downloaded
-        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
         print('{}[*] [{}] Downloading {}{}'.format(
             ANSI['green'],
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -140,7 +126,7 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI

     atomic_write(source_path, raw_source_text)

-    print(' > {}'.format(pretty_path(source_path)))
+    log_source_saved(source_file=source_path)

     return source_path