mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-14 15:14:31 -04:00
save all imports to sources dir
This commit is contained in:
parent cc3d1e9cc9
commit 4c8e45b8d7

3 changed files with 36 additions and 33 deletions
@@ -43,8 +43,8 @@ from .config import (
 )
 from .util import (
     enforce_types,
-    save_remote_source,
-    save_stdin_source,
+    handle_stdin_import,
+    handle_file_import,
 )
 from .logs import (
     log_archiving_started,
@@ -160,12 +160,12 @@ def main(args=None) -> None:
         print_help()
         raise SystemExit(1)
 
-    import_path = save_stdin_source(stdin_raw_text)
+    import_path = handle_stdin_import(stdin_raw_text)
 
-    ### Handle ingesting urls from a remote file/feed
+    ### Handle ingesting url from a remote file/feed
     # (e.g. if an RSS feed URL is used as the import path)
-    if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        import_path = save_remote_source(import_path)
+    if import_path:
+        import_path = handle_file_import(import_path)
 
     ### Run the main archive update process
     update_archive_data(import_path=import_path, resume=resume)
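Note: the main() hunk above is the heart of the change. Previously only stdin text and remote URLs were copied into output/sources/, while local file imports were passed straight through; now every non-empty import path goes through handle_file_import(), so a snapshot of the input always lands in the sources dir. A minimal sketch of the resulting flow, with the CLI plumbing stripped out (the handler and update function names come from this diff, the wrapper function is illustrative only):

def ingest(stdin_raw_text=None, import_path=None, resume=None):
    # stdin text is written to output/sources/stdin-<timestamp>.txt first
    if stdin_raw_text:
        import_path = handle_stdin_import(stdin_raw_text)

    # any import path (remote URL or local file) is copied into output/sources/
    if import_path:
        import_path = handle_file_import(import_path)

    # the archiver then only ever reads from the saved copy
    update_archive_data(import_path=import_path, resume=resume)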
@@ -90,7 +90,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
         if is_new:
             os.makedirs(link_dir)
 
-        link = load_json_link_index(link, link_dir)
+        link = load_json_link_index(link, link_dir=link_dir)
         log_link_archiving_started(link, link_dir, is_new)
         link = link.overwrite(updated=datetime.now())
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
@@ -103,7 +103,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
             if should_run(link, link_dir):
                 log_archive_method_started(method_name)
 
-                result = method_function(link, link_dir)
+                result = method_function(link=link, link_dir=link_dir)
 
                 link.history[method_name].append(result)
 
@@ -187,7 +187,7 @@ def check_url_parsing_invariants() -> None:
 ### Random Helpers
 
 @enforce_types
-def save_stdin_source(raw_text: str) -> str:
+def handle_stdin_import(raw_text: str) -> str:
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
 
@@ -195,14 +195,12 @@ def save_stdin_source(raw_text: str) -> str:
 
     source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
 
-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(raw_text)
-
+    atomic_write(raw_text, source_path)
     return source_path
 
 
 @enforce_types
-def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
+def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
 
     if not os.path.exists(SOURCES_DIR):
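Note: atomic_write() is called here but not defined in this diff; it presumably lives in .util alongside the other helpers. A minimal sketch of the write-then-rename pattern such a helper usually stands for (the function below is a hypothetical illustration, not the project's implementation):

import os
import tempfile

def atomic_write_sketch(contents: str, dest_path: str) -> None:
    """Write contents to a temp file in the same directory, then rename it over dest_path."""
    dest_dir = os.path.dirname(dest_path) or '.'
    with tempfile.NamedTemporaryFile('w', dir=dest_dir, delete=False, encoding='utf-8') as f:
        f.write(contents)
        f.flush()
        os.fsync(f.fileno())
    # rename is atomic on POSIX when source and destination are on the same filesystem,
    # so readers never observe a partially written sources file
    os.replace(f.name, dest_path)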
@@ -210,30 +208,35 @@ def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
 
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
 
-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
+    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))
 
-    print('{}[*] [{}] Downloading {}{}'.format(
-        ANSI['green'],
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        url,
-        ANSI['reset'],
-    ))
-    timer = TimedProgress(timeout, prefix='      ')
-    try:
-        downloaded_xml = download_url(url, timeout=timeout)
-        timer.end()
-    except Exception as e:
-        timer.end()
-        print('{}[!] Failed to download {}{}\n'.format(
-            ANSI['red'],
-            url,
-            ANSI['reset'],
-        ))
-        print('    ', e)
-        raise SystemExit(1)
+    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
+        print('{}[*] [{}] Downloading {}{}'.format(
+            ANSI['green'],
+            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            path,
+            ANSI['reset'],
+        ))
+        timer = TimedProgress(timeout, prefix='      ')
+        try:
+            raw_source_text = download_url(path, timeout=timeout)
+            timer.end()
+        except Exception as e:
+            timer.end()
+            print('{}[!] Failed to download {}{}\n'.format(
+                ANSI['red'],
+                path,
+                ANSI['reset'],
+            ))
+            print('    ', e)
+            raise SystemExit(1)
 
-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(downloaded_xml)
+    else:
+        with open(path, 'r') as f:
+            raw_source_text = f.read()
+
+    atomic_write(raw_source_text, source_path)
 
     print('    > {}'.format(pretty_path(source_path)))
 
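Note: after the rewrite above, both branches of handle_file_import() end in the same atomic_write() call, so remote and local imports leave the same kind of artifact behind in output/sources/. A hedged usage sketch of the two cases (the URLs and paths below are made up for illustration):

# remote feed: downloaded via download_url(), saved as output/sources/<domain>-<timestamp>.txt
source_path = handle_file_import('https://example.com/feed.rss')

# local export file: read from disk, saved as output/sources/<basename>-<timestamp>.txt
source_path = handle_file_import('/home/user/bookmarks.html')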