mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
save all imports to sources dir
This commit is contained in:
parent
cc3d1e9cc9
commit
4c8e45b8d7
3 changed files with 36 additions and 33 deletions
|
@ -43,8 +43,8 @@ from .config import (
|
|||
)
|
||||
from .util import (
|
||||
enforce_types,
|
||||
save_remote_source,
|
||||
save_stdin_source,
|
||||
handle_stdin_import,
|
||||
handle_file_import,
|
||||
)
|
||||
from .logs import (
|
||||
log_archiving_started,
|
||||
|
@ -160,12 +160,12 @@ def main(args=None) -> None:
|
|||
print_help()
|
||||
raise SystemExit(1)
|
||||
|
||||
import_path = save_stdin_source(stdin_raw_text)
|
||||
import_path = handle_stdin_import(stdin_raw_text)
|
||||
|
||||
### Handle ingesting urls from a remote file/feed
|
||||
### Handle ingesting url from a remote file/feed
|
||||
# (e.g. if an RSS feed URL is used as the import path)
|
||||
if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
||||
import_path = save_remote_source(import_path)
|
||||
if import_path:
|
||||
import_path = handle_file_import(import_path)
|
||||
|
||||
### Run the main archive update process
|
||||
update_archive_data(import_path=import_path, resume=resume)
|
||||
|
|
|
@ -90,7 +90,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
|
|||
if is_new:
|
||||
os.makedirs(link_dir)
|
||||
|
||||
link = load_json_link_index(link, link_dir)
|
||||
link = load_json_link_index(link, link_dir=link_dir)
|
||||
log_link_archiving_started(link, link_dir, is_new)
|
||||
link = link.overwrite(updated=datetime.now())
|
||||
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
|
||||
|
@ -103,7 +103,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
|
|||
if should_run(link, link_dir):
|
||||
log_archive_method_started(method_name)
|
||||
|
||||
result = method_function(link, link_dir)
|
||||
result = method_function(link=link, link_dir=link_dir)
|
||||
|
||||
link.history[method_name].append(result)
|
||||
|
||||
|
|
|
@ -187,7 +187,7 @@ def check_url_parsing_invariants() -> None:
|
|||
### Random Helpers
|
||||
|
||||
@enforce_types
|
||||
def save_stdin_source(raw_text: str) -> str:
|
||||
def handle_stdin_import(raw_text: str) -> str:
|
||||
if not os.path.exists(SOURCES_DIR):
|
||||
os.makedirs(SOURCES_DIR)
|
||||
|
||||
|
@ -195,14 +195,12 @@ def save_stdin_source(raw_text: str) -> str:
|
|||
|
||||
source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
|
||||
|
||||
with open(source_path, 'w', encoding='utf-8') as f:
|
||||
f.write(raw_text)
|
||||
|
||||
atomic_write(raw_text, source_path)
|
||||
return source_path
|
||||
|
||||
|
||||
@enforce_types
|
||||
def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
|
||||
def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
|
||||
"""download a given url's content into output/sources/domain-<timestamp>.txt"""
|
||||
|
||||
if not os.path.exists(SOURCES_DIR):
|
||||
|
@ -210,30 +208,35 @@ def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
|
|||
|
||||
ts = str(datetime.now().timestamp()).split('.', 1)[0]
|
||||
|
||||
source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
|
||||
source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))
|
||||
|
||||
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
||||
source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
|
||||
print('{}[*] [{}] Downloading {}{}'.format(
|
||||
ANSI['green'],
|
||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
url,
|
||||
path,
|
||||
ANSI['reset'],
|
||||
))
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
downloaded_xml = download_url(url, timeout=timeout)
|
||||
raw_source_text = download_url(path, timeout=timeout)
|
||||
timer.end()
|
||||
except Exception as e:
|
||||
timer.end()
|
||||
print('{}[!] Failed to download {}{}\n'.format(
|
||||
ANSI['red'],
|
||||
url,
|
||||
path,
|
||||
ANSI['reset'],
|
||||
))
|
||||
print(' ', e)
|
||||
raise SystemExit(1)
|
||||
|
||||
with open(source_path, 'w', encoding='utf-8') as f:
|
||||
f.write(downloaded_xml)
|
||||
else:
|
||||
with open(path, 'r') as f:
|
||||
raw_source_text = f.read()
|
||||
|
||||
atomic_write(raw_source_text, source_path)
|
||||
|
||||
print(' > {}'.format(pretty_path(source_path)))
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue