add pipenv, schedule cmd, logs dir, and lots more

Nick Sweeting 2019-04-18 21:09:54 -04:00
parent 4f869f235f
commit 39a0ab3013
20 changed files with 820 additions and 188 deletions

Pipfile (new file, +22 lines)

@@ -0,0 +1,22 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
ipdb = "*"
flake8 = "*"
mypy = "*"
django-stubs = "*"
setuptools = "*"

[packages]
dataclasses = "*"
base32-crockford = "*"
django = "*"
youtube-dl = "*"
python-crontab = "*"
croniter = "*"

[requires]
python_version = ">=3.6"

Pipfile.lock (generated, new file, +314 lines)

@@ -0,0 +1,314 @@
{
"_meta": {
"hash": {
"sha256": "7f25fb9c97e469fdb787e755c5756e2be4b0b649e3c5ad8feb17200b32d3bb36"
},
"pipfile-spec": 6,
"requires": {
"python_version": ">=3.6"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"base32-crockford": {
"hashes": [
"sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969",
"sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"
],
"index": "pypi",
"version": "==0.3.0"
},
"croniter": {
"hashes": [
"sha256:625949cbd38a0b2325295591940dfa5fa0dfca41d03150ae0284a924e0be10f0",
"sha256:66b6a9c6b2d1a85d4af51453b2328be775a173e688b69eb3a96a7ec752ba77a3"
],
"index": "pypi",
"version": "==0.3.29"
},
"dataclasses": {
"hashes": [
"sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f",
"sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84"
],
"index": "pypi",
"version": "==0.6"
},
"django": {
"hashes": [
"sha256:7c3543e4fb070d14e10926189a7fcf42ba919263b7473dceaefce34d54e8a119",
"sha256:a2814bffd1f007805b19194eb0b9a331933b82bd5da1c3ba3d7b7ba16e06dc4b"
],
"index": "pypi",
"version": "==2.2"
},
"python-crontab": {
"hashes": [
"sha256:91ce4b245ee5e5c117aa0b21b485bc43f2d80df854a36e922b707643f50d7923"
],
"index": "pypi",
"version": "==2.3.6"
},
"python-dateutil": {
"hashes": [
"sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
"sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
],
"version": "==2.8.0"
},
"pytz": {
"hashes": [
"sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
"sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141"
],
"version": "==2019.1"
},
"six": {
"hashes": [
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
],
"version": "==1.12.0"
},
"sqlparse": {
"hashes": [
"sha256:40afe6b8d4b1117e7dff5504d7a8ce07d9a1b15aeeade8a2d10f130a834f8177",
"sha256:7c3dca29c022744e95b547e867cee89f4fce4373f3549ccd8797d8eb52cdb873"
],
"version": "==0.3.0"
},
"youtube-dl": {
"hashes": [
"sha256:0d25459093870bf560bccafe9015e59402d7de1b2c956593623ba4c2840153e5",
"sha256:ea0824ae9a166059ec754c267480198a074bd899c20b2ba497809bac099cde2e"
],
"index": "pypi",
"version": "==2019.4.17"
}
},
"develop": {
"appnope": {
"hashes": [
"sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
"sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"
],
"markers": "sys_platform == 'darwin'",
"version": "==0.1.0"
},
"backcall": {
"hashes": [
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
],
"version": "==0.1.0"
},
"decorator": {
"hashes": [
"sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
"sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
],
"version": "==4.4.0"
},
"django-stubs": {
"hashes": [
"sha256:9c06a4b28fc8c18f6abee4f199f8ee29cb5cfcecf349e912ded31cb3526ea2b6",
"sha256:9ef230843a24b5d74f2ebd4c60f9bea09c21911bc119d0325e8bb47e2f495e70"
],
"index": "pypi",
"version": "==0.12.1"
},
"entrypoints": {
"hashes": [
"sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19",
"sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"
],
"version": "==0.3"
},
"flake8": {
"hashes": [
"sha256:859996073f341f2670741b51ec1e67a01da142831aa1fdc6242dbf88dffbe661",
"sha256:a796a115208f5c03b18f332f7c11729812c8c3ded6c46319c59b53efd3819da8"
],
"index": "pypi",
"version": "==3.7.7"
},
"ipdb": {
"hashes": [
"sha256:dce2112557edfe759742ca2d0fee35c59c97b0cc7a05398b791079d78f1519ce"
],
"index": "pypi",
"version": "==0.12"
},
"ipython": {
"hashes": [
"sha256:b038baa489c38f6d853a3cfc4c635b0cda66f2864d136fe8f40c1a6e334e2a6b",
"sha256:f5102c1cd67e399ec8ea66bcebe6e3968ea25a8977e53f012963e5affeb1fe38"
],
"markers": "python_version >= '3.4'",
"version": "==7.4.0"
},
"ipython-genutils": {
"hashes": [
"sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
"sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
],
"version": "==0.2.0"
},
"jedi": {
"hashes": [
"sha256:2bb0603e3506f708e792c7f4ad8fc2a7a9d9c2d292a358fbbd58da531695595b",
"sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c"
],
"version": "==0.13.3"
},
"mccabe": {
"hashes": [
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
],
"version": "==0.6.1"
},
"mypy": {
"hashes": [
"sha256:2afe51527b1f6cdc4a5f34fc90473109b22bf7f21086ba3e9451857cf11489e6",
"sha256:56a16df3e0abb145d8accd5dbb70eba6c4bd26e2f89042b491faa78c9635d1e2",
"sha256:5764f10d27b2e93c84f70af5778941b8f4aa1379b2430f85c827e0f5464e8714",
"sha256:5bbc86374f04a3aa817622f98e40375ccb28c4836f36b66706cf3c6ccce86eda",
"sha256:6a9343089f6377e71e20ca734cd8e7ac25d36478a9df580efabfe9059819bf82",
"sha256:6c9851bc4a23dc1d854d3f5dfd5f20a016f8da86bcdbb42687879bb5f86434b0",
"sha256:b8e85956af3fcf043d6f87c91cbe8705073fc67029ba6e22d3468bfee42c4823",
"sha256:b9a0af8fae490306bc112229000aa0c2ccc837b49d29a5c42e088c132a2334dd",
"sha256:bbf643528e2a55df2c1587008d6e3bda5c0445f1240dfa85129af22ae16d7a9a",
"sha256:c46ab3438bd21511db0f2c612d89d8344154c0c9494afc7fbc932de514cf8d15",
"sha256:f7a83d6bd805855ef83ec605eb01ab4fa42bcef254b13631e451cbb44914a9b0"
],
"index": "pypi",
"version": "==0.701"
},
"mypy-extensions": {
"hashes": [
"sha256:37e0e956f41369209a3d5f34580150bcacfabaa57b33a15c0b25f4b5725e0812",
"sha256:b16cabe759f55e3409a7d231ebd2841378fb0c27a5d1994719e340e4f429ac3e"
],
"version": "==0.4.1"
},
"parso": {
"hashes": [
"sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33",
"sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376"
],
"version": "==0.4.0"
},
"pexpect": {
"hashes": [
"sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
"sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
],
"markers": "sys_platform != 'win32'",
"version": "==4.7.0"
},
"pickleshare": {
"hashes": [
"sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
"sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
],
"version": "==0.7.5"
},
"prompt-toolkit": {
"hashes": [
"sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
"sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
"sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
],
"version": "==2.0.9"
},
"ptyprocess": {
"hashes": [
"sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
"sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
],
"version": "==0.6.0"
},
"pycodestyle": {
"hashes": [
"sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56",
"sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c"
],
"version": "==2.5.0"
},
"pyflakes": {
"hashes": [
"sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0",
"sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2"
],
"version": "==2.1.1"
},
"pygments": {
"hashes": [
"sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a",
"sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"
],
"version": "==2.3.1"
},
"six": {
"hashes": [
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
],
"version": "==1.12.0"
},
"traitlets": {
"hashes": [
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
"sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
],
"version": "==4.3.2"
},
"typed-ast": {
"hashes": [
"sha256:04894d268ba6eab7e093d43107869ad49e7b5ef40d1a94243ea49b352061b200",
"sha256:16616ece19daddc586e499a3d2f560302c11f122b9c692bc216e821ae32aa0d0",
"sha256:252fdae740964b2d3cdfb3f84dcb4d6247a48a6abe2579e8029ab3be3cdc026c",
"sha256:2af80a373af123d0b9f44941a46df67ef0ff7a60f95872412a145f4500a7fc99",
"sha256:2c88d0a913229a06282b285f42a31e063c3bf9071ff65c5ea4c12acb6977c6a7",
"sha256:2ea99c029ebd4b5a308d915cc7fb95b8e1201d60b065450d5d26deb65d3f2bc1",
"sha256:3d2e3ab175fc097d2a51c7a0d3fda442f35ebcc93bb1d7bd9b95ad893e44c04d",
"sha256:4766dd695548a15ee766927bf883fb90c6ac8321be5a60c141f18628fb7f8da8",
"sha256:56b6978798502ef66625a2e0f80cf923da64e328da8bbe16c1ff928c70c873de",
"sha256:5cddb6f8bce14325b2863f9d5ac5c51e07b71b462361fd815d1d7706d3a9d682",
"sha256:644ee788222d81555af543b70a1098f2025db38eaa99226f3a75a6854924d4db",
"sha256:64cf762049fc4775efe6b27161467e76d0ba145862802a65eefc8879086fc6f8",
"sha256:68c362848d9fb71d3c3e5f43c09974a0ae319144634e7a47db62f0f2a54a7fa7",
"sha256:6c1f3c6f6635e611d58e467bf4371883568f0de9ccc4606f17048142dec14a1f",
"sha256:b213d4a02eec4ddf622f4d2fbc539f062af3788d1f332f028a2e19c42da53f15",
"sha256:bb27d4e7805a7de0e35bd0cb1411bc85f807968b2b0539597a49a23b00a622ae",
"sha256:c9d414512eaa417aadae7758bc118868cd2396b0e6138c1dd4fda96679c079d3",
"sha256:f0937165d1e25477b01081c4763d2d9cdc3b18af69cb259dd4f640c9b900fe5e",
"sha256:fb96a6e2c11059ecf84e6741a319f93f683e440e341d4489c9b161eca251cf2a",
"sha256:fc71d2d6ae56a091a8d94f33ec9d0f2001d1cb1db423d8b4355debfe9ce689b7"
],
"version": "==1.3.4"
},
"typing-extensions": {
"hashes": [
"sha256:07b2c978670896022a43c4b915df8958bec4a6b84add7f2c87b2b728bda3ba64",
"sha256:f3f0e67e1d42de47b5c67c32c9b26641642e9170fe7e292991793705cd5fef7c",
"sha256:fb2cd053238d33a8ec939190f30cfd736c00653a85a2919415cecf7dc3d9da71"
],
"version": "==3.7.2"
},
"wcwidth": {
"hashes": [
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
"sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
],
"version": "==0.1.7"
}
}
}

View file

@@ -1,30 +1,59 @@
 __package__ = 'archivebox.cli'
 
 import os
+from typing import Dict
 from importlib import import_module
 
 CLI_DIR = os.path.dirname(os.path.abspath(__file__))
 
-required_attrs = ('__package__', '__command__', '__description__', 'main')
+# these common commands will appear sorted before any others for ease-of-use
+display_first = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')
 
-order = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')
+# every imported command module must have these properties in order to be valid
+required_attrs = ('__package__', '__command__', 'main')
+
+# basic checks to make sure imported files are valid subcommands
+is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py')
+is_valid_cli_module = lambda module, subcommand: (
+    all(hasattr(module, attr) for attr in required_attrs)
+    and module.__command__.split(' ')[-1] == subcommand
+)
 
-def list_subcommands():
+def list_subcommands() -> Dict[str, str]:
+    """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
     COMMANDS = []
     for filename in os.listdir(CLI_DIR):
-        if filename.startswith('archivebox_') and filename.endswith('.py'):
+        if is_cli_module(filename):
             subcommand = filename.replace('archivebox_', '').replace('.py', '')
             module = import_module('.archivebox_{}'.format(subcommand), __package__)
+            assert is_valid_cli_module(module, subcommand)
+            COMMANDS.append((subcommand, module.__description__))  # type: ignore
+            globals()[subcommand] = module.main
+            module.main.__doc__ = module.__description__
 
-            assert all(hasattr(module, attr) for attr in required_attrs)
-            assert module.__command__.split(' ')[-1] == subcommand
-            COMMANDS.append((subcommand, module.__description__))
+    display_order = lambda cmd: (
+        display_first.index(cmd[0])
+        if cmd[0] in display_first else
+        100 + len(cmd[0])
+    )
 
-    return dict(sorted(COMMANDS, key=lambda cmd: order.index(cmd[0]) if cmd[0] in order else 10 + len(cmd[0])))
+    return dict(sorted(COMMANDS, key=display_order))
 
-def run_subcommand(subcommand: str, args=None):
+def run_subcommand(subcommand: str, args=None) -> None:
+    """run a given ArchiveBox subcommand with the given list of args"""
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
-    return module.main(args)  # type: ignore
+    module.main(args)  # type: ignore
+
+SUBCOMMANDS = list_subcommands()
+
+__all__ = (
+    'SUBCOMMANDS',
+    'list_subcommands',
+    'run_subcommand',
+    *SUBCOMMANDS.keys(),
+)
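For reference, a small standalone sketch (not part of the commit) of how the new display_order key sorts subcommands: the common commands listed in display_first come first in their listed order, and everything else follows sorted by name length.

display_first = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')

def display_order(cmd):
    # cmd is a (subcommand_name, description) tuple, as in COMMANDS above
    name = cmd[0]
    return display_first.index(name) if name in display_first else 100 + len(name)

commands = [('schedule', '...'), ('add', '...'), ('shell', '...'), ('help', '...')]
print([name for name, _ in sorted(commands, key=display_order)])
# -> ['help', 'add', 'shell', 'schedule']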

View file

@@ -82,5 +82,6 @@ def main(args=None, stdin=None):
         only_new=command.only_new,
     )
 
 if __name__ == '__main__':
     main()

View file

@@ -4,7 +4,6 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox init'
 __description__ = 'Initialize a new ArchiveBox collection in the current directory'
 
-import os
 import sys
 import argparse

View file

@@ -0,0 +1,194 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox schedule'
__description__ = 'Set ArchiveBox to run regularly at a specific time'

import os
import sys
import argparse

from datetime import datetime
from crontab import CronTab, CronSlices

from ..legacy.util import reject_stdin
from ..legacy.config import (
    OUTPUT_DIR,
    LOGS_DIR,
    ARCHIVEBOX_BINARY,
    USER,
    ANSI,
    stderr,
)


CRON_COMMENT = 'archivebox_schedule'


def main(args=None):
    args = sys.argv[1:] if args is None else args

    parser = argparse.ArgumentParser(
        prog=__command__,
        description=__description__,
        add_help=True,
    )
    parser.add_argument(
        '--quiet', '-q',
        action='store_true',
        help=("Don't warn about storage space."),
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--add', # '-a',
        action='store_true',
        help='Add a new scheduled ArchiveBox update job to cron',
    )
    parser.add_argument(
        '--every', # '-e',
        type=str,
        default='daily',
        help='Run ArchiveBox once every [timeperiod] (hour/day/week/month/year or cron format e.g. "0 0 * * *")',
    )
    group.add_argument(
        '--clear', # '-c'
        action='store_true',
        help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
    )
    group.add_argument(
        '--show', # '-s'
        action='store_true',
        help=("Print a list of currently active ArchiveBox cron jobs"),
    )
    group.add_argument(
        '--foreground', '-f',
        action='store_true',
        help=("Launch ArchiveBox as a long-running foreground task "
              "instead of using cron."),
    )
    group.add_argument(
        '--run-all', # '-a',
        action='store_true',
        help='Run all the scheduled jobs once immediately, independent of their configured schedules',
    )
    parser.add_argument(
        'import_path',
        nargs='?',
        type=str,
        default=None,
        help=("Check this path and import any new links on every run "
              "(can be either local file or remote URL)"),
    )
    command = parser.parse_args(args)
    reject_stdin(__command__)

    os.makedirs(LOGS_DIR, exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_jobs(cron)
    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if command.foreground or command.run_all:
        if command.import_path or (not existing_jobs):
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
            stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
            raise SystemExit(1)
        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
        if command.run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f' > {job.command}')
                    sys.stdout.flush()
                    job.run()
                    sys.stdout.write(f'\r{job.command}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)
        if command.foreground:
            try:
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)

    elif command.show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
            stderr(' To schedule a new job, run:')
            stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
        raise SystemExit(0)

    elif command.clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)

    elif command.every:
        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
        cmd = [
            'cd',
            quoted(OUTPUT_DIR),
            '&&',
            quoted(ARCHIVEBOX_BINARY),
            *(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),
            '2>&1',
            '>',
            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
            set_every = getattr(new_job.every(), command.every)
            set_every()
        elif CronSlices.is_valid(command.every):
            new_job.setall(command.every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
            stderr(' It must be one of minute/hour/day/week/month')
            stderr(' or a quoted cron-format schedule like:')
            stderr(' archivebox init --every=day https://example.com/some/rss/feed.xml')
            stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not command.quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr(f' Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr(' Make sure you have enough storage space available to hold all the data.')
            stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
        raise SystemExit(0)


def dedupe_jobs(cron: CronTab) -> CronTab:
    deduped = set()
    for job in list(cron):
        unique_tuple = (str(job.slices), job.command)
        if unique_tuple not in deduped:
            deduped.add(unique_tuple)
        cron.remove(job)

    for schedule, command in deduped:
        job = cron.new(command=command, comment=CRON_COMMENT)
        job.setall(schedule)
        job.enable()

    return cron


if __name__ == '__main__':
    main()
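For reference, a minimal sketch (not part of this commit) of the python-crontab calls the schedule command builds on. The command string here is a hypothetical stand-in, not the exact one assembled above from OUTPUT_DIR, ARCHIVEBOX_BINARY, and LOGS_DIR.

from crontab import CronTab, CronSlices

CRON_COMMENT = 'archivebox_schedule'

cron = CronTab(user=True)                       # the current user's crontab
job = cron.new(command='archivebox update', comment=CRON_COMMENT)

schedule = '0 0 * * *'                          # e.g. passed via --every="0 0 * * *"
assert CronSlices.is_valid(schedule)            # same validation the command performs
job.setall(schedule)

cron.write()                                    # persist the new entry

# jobs are later found (and deduped) by their comment tag:
print([str(j) for j in cron.find_comment(CRON_COMMENT)])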

View file

@@ -7,7 +7,7 @@ __description__ = 'Run the ArchiveBox HTTP server'
 import sys
 import argparse
 
-from ..legacy.config import setup_django
+from ..legacy.config import setup_django, OUTPUT_DIR
 from ..legacy.util import reject_stdin
 
@@ -29,7 +29,7 @@ def main(args=None):
     command = parser.parse_args(args)
     reject_stdin(__command__)
 
-    setup_django()
+    setup_django(OUTPUT_DIR)
     from django.core.management import call_command
     call_command("runserver", *command.runserver_args)

View file

@@ -7,7 +7,7 @@ __description__ = 'Enter an interactive ArchiveBox Django shell'
 import sys
 import argparse
 
-from ..legacy.config import setup_django
+from ..legacy.config import setup_django, OUTPUT_DIR
 from ..legacy.util import reject_stdin
 
@@ -22,7 +22,7 @@ def main(args=None):
     parser.parse_args(args)
     reject_stdin(__command__)
 
-    setup_django()
+    setup_django(OUTPUT_DIR)
    from django.core.management import call_command
    call_command("shell_plus")

View file

@@ -5,10 +5,8 @@ import os
 SECRET_KEY = '---------------- not a valid secret key ! ----------------'
 DEBUG = True
 
-OUTPUT_DIR = os.path.abspath(os.curdir)
-DATABASE_DIR_NAME = 'database'
-DATABASE_FILE_NAME = 'database.sqlite3'
-DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
+OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir))
+DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3')
 
 INSTALLED_APPS = [
 
@@ -38,7 +36,7 @@ ROOT_URLCONF = 'core.urls'
 TEMPLATES = [
     {
         'BACKEND': 'django.template.backends.django.DjangoTemplates',
-        'DIRS': ['templates'],
+        'DIRS': ['themes'],
         'APP_DIRS': True,
         'OPTIONS': {
            'context_processors': [
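A quick sketch (not part of the commit) of what the new settings lines mean in practice: the collection folder is read from the OUTPUT_DIR environment variable, which setup_django() now exports before Django loads, and the SQLite index lives directly inside that folder as index.sqlite3.

import os

os.environ['OUTPUT_DIR'] = '/data/archivebox'   # hypothetical collection folder

OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir))
DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3')
print(DATABASE_FILE)   # -> /data/archivebox/index.sqlite3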

View file

@@ -1,15 +0,0 @@
import os
import sys
PYTHON_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(PYTHON_DIR)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")
import django
django.setup()
from django.conf import settings
DATABASE_FILE = settings.DATABASE_FILE

View file

@@ -60,7 +60,6 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
 YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
 CHROME_BINARY = os.getenv('CHROME_BINARY', None)
 
 # ******************************************************************************
 
 ### Terminal Configuration
 
@@ -84,6 +83,7 @@ def stderr(*args):
     sys.stderr.write(' '.join(str(a) for a in args) + '\n')
 
 USER = getpass.getuser() or os.getlogin()
+ARCHIVEBOX_BINARY = sys.argv[0]
 
 REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))
 if OUTPUT_DIR:
 
@@ -91,14 +91,15 @@ if OUTPUT_DIR:
 else:
     OUTPUT_DIR = os.path.abspath(os.curdir)
 
+SQL_INDEX_FILENAME = 'index.sqlite3'
+JSON_INDEX_FILENAME = 'index.json'
+HTML_INDEX_FILENAME = 'index.html'
 ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
-DATABASE_DIR_NAME = 'database'
-DATABASE_FILE_NAME = 'database.sqlite3'
+LOGS_DIR_NAME = 'logs'
 ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
 SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
-DATABASE_DIR = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME)
-DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
+LOGS_DIR = os.path.join(OUTPUT_DIR, LOGS_DIR_NAME)
 PYTHON_DIR = os.path.join(REPO_DIR, 'archivebox')
 LEGACY_DIR = os.path.join(PYTHON_DIR, 'legacy')
 
@@ -126,9 +127,10 @@ if USER == 'root':
     raise SystemExit(1)
 
 ### Check Python environment
-python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
-if python_vers < 3.6:
-    stderr('{}[X] Python version is not new enough: {} (>3.6 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
+PYTHON_BINARY = sys.executable
+PYTHON_VERSION = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
+if float(PYTHON_VERSION) < 3.6:
+    stderr('{}[X] Python version is not new enough: {} (>3.6 is required){}'.format(ANSI['red'], PYTHON_VERSION, ANSI['reset']))
     stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
     raise SystemExit(1)
 
@@ -150,6 +152,7 @@ if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
 def bin_version(binary: str) -> Optional[str]:
     """check the presence and return valid version line of a specified binary"""
+    global HAS_INVALID_DEPENDENCIES
     binary = os.path.expanduser(binary)
     try:
 
@@ -223,12 +226,17 @@ def find_chrome_data_dir() -> Optional[str]:
     return None
 
-def setup_django():
+def setup_django(out_dir: str=OUTPUT_DIR, check_db=False):
     import django
     sys.path.append(PYTHON_DIR)
+    os.environ.setdefault('OUTPUT_DIR', out_dir)
     os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
     django.setup()
+    if check_db:
+        assert os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)), (
+            f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {out_dir}')
 
 # ******************************************************************************
 # ************************ Environment & Dependencies **************************
 # ******************************************************************************
 
@@ -338,16 +346,16 @@ try:
             'enabled': True,
             'is_valid': os.path.exists(SOURCES_DIR),
         },
+        'LOGS_DIR': {
+            'path': os.path.abspath(LOGS_DIR),
+            'enabled': True,
+            'is_valid': os.path.exists(LOGS_DIR),
+        },
         'ARCHIVE_DIR': {
             'path': os.path.abspath(ARCHIVE_DIR),
             'enabled': True,
             'is_valid': os.path.exists(ARCHIVE_DIR),
         },
-        'DATABASE_DIR': {
-            'path': os.path.abspath(DATABASE_DIR),
-            'enabled': True,
-            'is_valid': os.path.exists(DATABASE_FILE),
-        },
         'CHROME_USER_DATA_DIR': {
             'path': CHROME_USER_DATA_DIR and os.path.abspath(CHROME_USER_DATA_DIR),
             'enabled': USE_CHROME and CHROME_USER_DATA_DIR,
 
@@ -361,6 +369,12 @@ try:
     }
 
     DEPENDENCIES = {
+        'PYTHON_BINARY': {
+            'path': PYTHON_BINARY,
+            'version': PYTHON_VERSION,
+            'enabled': True,
+            'is_valid': bool(DJANGO_VERSION),
+        },
         'DJANGO_BINARY': {
             'path': DJANGO_BINARY,
             'version': DJANGO_VERSION,
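A hedged usage sketch (not part of the commit, and assuming the module imports as archivebox.legacy.config) of the new setup_django() signature: callers pass the collection folder explicitly, and index readers can additionally ask it to verify that index.sqlite3 already exists there.

from archivebox.legacy.config import OUTPUT_DIR, setup_django

setup_django(OUTPUT_DIR)                   # e.g. what the `server` and `shell` subcommands now do
setup_django(OUTPUT_DIR, check_db=True)    # e.g. what the SQL index reader/writer now does

from django.conf import settings
print(settings.DATABASE_FILE)              # resolves to <OUTPUT_DIR>/index.sqlite3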

View file

@@ -1,13 +1,17 @@
+__package__ = 'archivebox.legacy'
+
 import os
 import json
 
 from typing import List, Tuple, Optional, Iterable
 from collections import OrderedDict
+from contextlib import contextmanager
 
 from .schema import Link, ArchiveResult
 from .config import (
-    DATABASE_DIR,
-    DATABASE_FILE_NAME,
+    SQL_INDEX_FILENAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
     OUTPUT_DIR,
     TIMEOUT,
     URL_BLACKLIST_PTN,
 
@@ -35,14 +39,13 @@ from .util import (
 from .parse import parse_links
 from .logs import (
     log_indexing_process_started,
+    log_indexing_process_finished,
     log_indexing_started,
     log_indexing_finished,
     log_parsing_started,
     log_parsing_finished,
 )
 
 ### Link filtering and checking
 
 @enforce_types
 
@@ -117,7 +120,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
     links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
 
     if not links:
-        stderr('{red}[X] No links found in index.json{reset}'.format(**ANSI))
+        stderr('{red}[X] No links found in index.{reset}'.format(**ANSI))
         stderr(' To add a link to your archive, run:')
         stderr(" archivebox add 'https://example.com'")
         stderr()
 
@@ -204,49 +207,54 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
 ### Main Links Index
 
+@contextmanager
+@enforce_types
+def timed_index_update(out_path: str):
+    log_indexing_started(out_path)
+    timer = TimedProgress(TIMEOUT * 2, prefix=' ')
+    try:
+        yield
+    finally:
+        timer.end()
+
+    assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
+    log_indexing_finished(out_path)
+
 @enforce_types
 def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
     """create index.html file for a given list of links"""
 
-    log_indexing_process_started()
+    log_indexing_process_started(len(links))
 
-    log_indexing_started(DATABASE_DIR, DATABASE_FILE_NAME)
-    timer = TimedProgress(TIMEOUT * 2, prefix=' ')
-    try:
-        write_sql_main_index(links)
-    finally:
-        timer.end()
-    log_indexing_finished(DATABASE_DIR, DATABASE_FILE_NAME)
+    with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+        write_sql_main_index(links, out_dir=out_dir)
 
-    log_indexing_started(out_dir, 'index.json')
-    timer = TimedProgress(TIMEOUT * 2, prefix=' ')
-    try:
-        write_json_main_index(links, out_dir=out_dir)
-    finally:
-        timer.end()
-    log_indexing_finished(out_dir, 'index.json')
+    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+        write_json_main_index(links, out_dir=out_dir)
 
-    log_indexing_started(out_dir, 'index.html')
-    timer = TimedProgress(TIMEOUT * 2, prefix=' ')
-    try:
-        write_html_main_index(links, out_dir=out_dir, finished=finished)
-    finally:
-        timer.end()
-    log_indexing_finished(out_dir, 'index.html')
+    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+        write_html_main_index(links, out_dir=out_dir, finished=finished)
+
+    log_indexing_process_finished()
 
 @enforce_types
-def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
+def load_main_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
 
-    existing_links: List[Link] = []
-    if out_dir:
-        existing_links = list(parse_json_main_index(out_dir))
-        existing_sql_links = list(parse_sql_main_index())
-        assert set(l.url for l in existing_links) == set(l['url'] for l in existing_sql_links)
+    all_links: List[Link] = []
+    all_links = list(parse_json_main_index(out_dir))
+    links_from_sql = list(parse_sql_main_index())
+    assert set(l.url for l in all_links) == set(l['url'] for l in links_from_sql)
+
+    return all_links
+
+@enforce_types
+def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
 
     new_links: List[Link] = []
-    if import_path:
-        # parse and validate the import file
-        log_parsing_started(import_path)
-        raw_links, parser_name = parse_links(import_path)
+
+    # parse and validate the import file
+    log_parsing_started(import_path)
+    raw_links, parser_name = parse_links(import_path)
 
@@ -255,7 +263,7 @@ def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) ->
     # merge existing links in out_dir and new links
     all_links = list(validate_links(existing_links + new_links))
 
-    if import_path and parser_name:
+    if parser_name:
         num_parsed = len(raw_links)
         num_new_links = len(all_links) - len(existing_links)
         log_parsing_finished(num_parsed, num_new_links, parser_name)
 
@@ -323,9 +331,3 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
         return merge_links(existing_link, link)
 
     return link
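For illustration, a simplified standalone sketch (not ArchiveBox code) of the pattern introduced by timed_index_update(): each index writer runs inside one context manager that logs the target path, lets the caller write the file, then verifies the file actually exists. The real version also wraps the yield in a TimedProgress timer.

import os
from contextlib import contextmanager

@contextmanager
def timed_index_update(out_path: str):
    print(f' > {out_path}')              # stand-in for log_indexing_started()
    yield                                # caller writes the index file inside the with-block
    assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
    print(f' √ {out_path}')              # stand-in for log_indexing_finished()

with timed_index_update('/tmp/index.json'):
    with open('/tmp/index.json', 'w') as f:
        f.write('{"links": []}')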

View file

@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from typing import Optional, List
 
 from .schema import Link, ArchiveResult
-from .config import ANSI, OUTPUT_DIR
+from .config import ANSI, OUTPUT_DIR, IS_TTY
 
 @dataclass
 
@@ -42,7 +42,7 @@ def pretty_path(path: str) -> str:
 def log_parsing_started(source_file: str):
     start_ts = datetime.now()
     _LAST_RUN_STATS.parse_start_ts = start_ts
-    print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
+    print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         source_file.rsplit('/', 1)[-1],
         **ANSI,
 
@@ -56,22 +56,26 @@ def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
 ### Indexing Stage
 
-def log_indexing_process_started():
+def log_indexing_process_started(num_links: int):
     start_ts = datetime.now()
     _LAST_RUN_STATS.index_start_ts = start_ts
     print()
-    print('{green}[*] [{}] Saving main index files...{reset}'.format(
+    print('{green}[*] [{}] Updating {} links in main index...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        num_links,
         **ANSI,
     ))
 
-def log_indexing_started(out_dir: str, out_file: str):
-    sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))
-
-def log_indexing_finished(out_dir: str, out_file: str):
+def log_indexing_process_finished():
     end_ts = datetime.now()
     _LAST_RUN_STATS.index_end_ts = end_ts
-    print('\r{}/{}'.format(out_dir, out_file))
+
+def log_indexing_started(out_path: str):
+    if IS_TTY:
+        sys.stdout.write(f' > {out_path}')
+
+def log_indexing_finished(out_path: str):
+    print(f'\r{out_path}')
 
 ### Archiving Stage
 
@@ -108,7 +112,7 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
     print(' To view your archive, open:')
     print(' {}/index.html'.format(OUTPUT_DIR))
     print(' Continue archiving where you left off by running:')
-    print(' archivebox {}'.format(timestamp))
+    print(' archivebox update --resume={}'.format(timestamp))
 
 def log_archiving_finished(num_links: int):
     end_ts = datetime.now()
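A small sketch (not part of the commit) of the TTY-aware progress output used by the new log_indexing_started/finished pair: on an interactive terminal the pending path is written without a newline, then overwritten in place with a carriage return once the write completes. Here IS_TTY is assumed to come from sys.stdout.isatty(), mirroring the flag imported from config.

import sys

IS_TTY = sys.stdout.isatty()   # assumption: stand-in for the IS_TTY flag from config

out_path = '/tmp/index.json'   # hypothetical index path
if IS_TTY:
    sys.stdout.write(f' > {out_path}')
    sys.stdout.flush()
print(f'\r{out_path}')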

View file

@@ -9,6 +9,7 @@ from .util import enforce_types, TimedProgress
 from .index import (
     links_after_timestamp,
     load_main_index,
+    import_new_links,
     write_main_index,
 )
 from .archive_methods import archive_link
 
@@ -19,8 +20,9 @@ from .config import (
     OUTPUT_DIR,
     SOURCES_DIR,
     ARCHIVE_DIR,
-    DATABASE_DIR,
-    DATABASE_FILE,
+    LOGS_DIR,
+    JSON_INDEX_FILENAME,
+    SQL_INDEX_FILENAME,
     check_dependencies,
     check_data_folder,
     setup_django,
 
@@ -36,60 +38,85 @@ from .logs import (
 )
 
+ALLOWED_IN_OUTPUT_DIR = {
+    '.DS_Store',
+    '.venv',
+    'venv',
+    'virtualenv',
+    '.virtualenv',
+    'sources',
+    'archive',
+    'logs',
+    'static',
+}
+
 @enforce_types
 def init():
     os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-    harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv', 'sources', 'archive', 'database', 'logs', 'static'}
-    is_empty = not len(set(os.listdir(OUTPUT_DIR)) - harmless_files)
-    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
+    is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
+    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
 
     if is_empty:
-        stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
-        write_main_index([], out_dir=OUTPUT_DIR, finished=True)
+        print('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
+        print('{green}----------------------------------------------------------------{reset}'.format(**ANSI))
    else:
        if existing_index:
-            stderr('{green}[√] You already have an ArchiveBox collection in the current folder.{reset}'.format(**ANSI))
-            stderr(f' {OUTPUT_DIR}')
-            stderr(f' > index.html')
-            stderr(f' > index.json')
+            print('{green}[√] You already have an ArchiveBox collection in the current folder.{reset}'.format(**ANSI))
+            print('{green}----------------------------------------------------------------{reset}'.format(**ANSI))
+            print(f' {OUTPUT_DIR}')
        else:
            stderr(
-                ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
+                ("{red}[X] This folder appears to have non-ArchiveBox files in it. You must run 'archivebox init' inside a completely empty directory.{reset}"
                 "\n\n"
                 " {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
-                " just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
+                " just cd into the folder and run 'archivebox update' to pick up where you left off.\n\n"
                 " (Always make sure your data folder is backed up first before updating ArchiveBox)"
                ).format(OUTPUT_DIR, **ANSI)
            )
            raise SystemExit(1)
 
     os.makedirs(SOURCES_DIR, exist_ok=True)
-    stderr(f' > sources/')
+    print(f' > {SOURCES_DIR}')
 
     os.makedirs(ARCHIVE_DIR, exist_ok=True)
-    stderr(f' > archive/')
+    print(f' > {ARCHIVE_DIR}')
 
-    os.makedirs(DATABASE_DIR, exist_ok=True)
-    setup_django()
+    os.makedirs(LOGS_DIR, exist_ok=True)
+    print(f' > {LOGS_DIR}')
+
+    print('\n{green}[+] Running Django migrations...{reset}'.format(**ANSI))
+    setup_django(OUTPUT_DIR, check_db=False)
     from django.core.management import call_command
-    from django.contrib.auth.models import User
-    stderr(f' > database/')
+    from django.conf import settings
+    assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
+    print(f' {settings.DATABASE_FILE}')
 
-    stderr('\n{green}[+] Running Django migrations...{reset}'.format(**ANSI))
     call_command("makemigrations", interactive=False)
     call_command("migrate", interactive=False)
 
-    if not User.objects.filter(is_superuser=True).exists():
-        stderr('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
-        call_command("createsuperuser", interactive=True)
+    assert os.path.exists(settings.DATABASE_FILE)
 
-    stderr('\n{green}------------------------------------------------------------{reset}'.format(**ANSI))
-    stderr('{green}[√] Done. ArchiveBox collection is set up in current folder.{reset}'.format(**ANSI))
-    stderr(' To add new links, you can run:')
-    stderr(" archivebox add 'https://example.com'")
-    stderr()
-    stderr(' For more usage and examples, run:')
-    stderr(' archivebox help')
+    # from django.contrib.auth.models import User
+    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
+    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
+    #     call_command("createsuperuser", interactive=True)
+
+    if existing_index:
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
+    else:
+        write_main_index([], out_dir=OUTPUT_DIR)
+
+    print('\n{green}----------------------------------------------------------------{reset}'.format(**ANSI))
+    print('{green}[√] Done. ArchiveBox collection is set up in the current folder.{reset}'.format(**ANSI))
+    print(' To add new links, you can run:')
+    print(" archivebox add 'https://example.com'")
+    print()
+    print(' For more usage and examples, run:')
+    print(' archivebox help')
 
@@ -102,7 +129,11 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
 
     # Step 1: Load list of links from the existing index
     # merge in and dedupe new links from import_path
-    all_links, new_links = load_main_index(out_dir=OUTPUT_DIR, import_path=import_path)
+    all_links: List[Link] = []
+    new_links: List[Link] = []
+    all_links = load_main_index(out_dir=OUTPUT_DIR)
+    if import_path:
+        all_links, new_links = import_new_links(all_links, import_path)
 
     # Step 2: Write updated index with deduped old and new links back to disk
     write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
 
@@ -127,7 +158,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
     log_archiving_finished(len(links))
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+    all_links = load_main_index(out_dir=OUTPUT_DIR)
     write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
     return all_links
 
@@ -152,7 +183,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
 def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
                       after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
 
-    all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+    all_links = load_main_index(out_dir=OUTPUT_DIR)
 
     for link in all_links:
         if after is not None and float(link.timestamp) < after:
 
@@ -198,7 +229,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
     timer = TimedProgress(360, prefix=' ')
     try:
         to_keep = []
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         for link in all_links:
             should_remove = (
                 (after is not None and float(link.timestamp) < after)
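A hedged sketch (not part of the commit, assuming the package imports as archivebox.legacy) of the new update flow after import_new_links() was split out of load_main_index(): load the existing collection first, then merge an optional import source. The feed URL below is a hypothetical example.

from archivebox.legacy.index import load_main_index, import_new_links, write_main_index
from archivebox.legacy.config import OUTPUT_DIR

all_links = load_main_index(out_dir=OUTPUT_DIR)          # existing JSON index, cross-checked against SQL

import_path = 'https://example.com/some/rss/feed.xml'    # hypothetical feed to pull in
all_links, new_links = import_new_links(all_links, import_path)

write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
print(f'{len(new_links)} new links added')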

View file

@@ -13,6 +13,7 @@ from ..config import (
     GIT_SHA,
     FOOTER_INFO,
     ARCHIVE_DIR_NAME,
+    HTML_INDEX_FILENAME,
 )
 from ..util import (
     enforce_types,
 
@@ -44,7 +45,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished:
     copy_and_overwrite(join(TEMPLATES_DIR, 'static'), join(out_dir, 'static'))
 
     rendered_html = main_index_template(links, finished=finished)
-    atomic_write(rendered_html, join(out_dir, 'index.html'))
+    atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
 
 @enforce_types
 
@@ -100,7 +101,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
     out_dir = out_dir or link.link_dir
 
     rendered_html = link_details_template(link)
-    atomic_write(rendered_html, join(out_dir, 'index.html'))
+    atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
 
 @enforce_types

View file

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.legacy.storage'
 
 import os
+import sys
 import json
 
 from datetime import datetime
 
@@ -10,12 +11,33 @@ from ..schema import Link, ArchiveResult
 from ..config import (
     VERSION,
     OUTPUT_DIR,
+    FOOTER_INFO,
+    GIT_SHA,
+    DEPENDENCIES,
+    JSON_INDEX_FILENAME,
 )
 from ..util import (
     enforce_types,
     atomic_write,
 )
 
+MAIN_INDEX_HEADER = {
+    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
+    'schema': 'archivebox.legacy.storage.json',
+    'copyright_info': FOOTER_INFO,
+    'meta': {
+        'project': 'ArchiveBox',
+        'cmd': sys.argv,
+        'version': VERSION,
+        'git_sha': GIT_SHA,
+        'website': 'https://ArchiveBox.io',
+        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
+        'source': 'https://github.com/pirate/ArchiveBox',
+        'issues': 'https://github.com/pirate/ArchiveBox/issues',
+        'dependencies': DEPENDENCIES,
+    },
+}
+
 ### Main Links Index
 
@@ -23,7 +45,7 @@ from ..util import (
 def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     """parse a archive index json file and return the list of links"""
 
-    index_path = os.path.join(out_dir, 'index.json')
+    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
             links = json.load(f)['links']
 
@@ -46,18 +68,13 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     if links and links[0].sources:
         assert isinstance(links[0].sources[0], str)
 
-    path = os.path.join(out_dir, 'index.json')
-
-    index_json = {
-        'info': 'ArchiveBox Index',
-        'source': 'https://github.com/pirate/ArchiveBox',
-        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
-        'version': VERSION,
+    main_index_json = {
+        **MAIN_INDEX_HEADER,
         'num_links': len(links),
         'updated': datetime.now(),
         'links': links,
     }
-    atomic_write(index_json, path)
+    atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
 
 ### Link Details Index
 
@@ -67,7 +84,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
     """write a json file with some info about the link"""
 
     out_dir = out_dir or link.link_dir
-    path = os.path.join(out_dir, 'index.json')
+    path = os.path.join(out_dir, JSON_INDEX_FILENAME)
 
     atomic_write(link._asdict(extended=True), path)
 
@@ -75,7 +92,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
 @enforce_types
 def parse_json_link_details(out_dir: str) -> Optional[Link]:
     """load the json link index from a given directory"""
-    existing_index = os.path.join(out_dir, 'index.json')
+    existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
             link_json = json.load(f)
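Roughly, the main index.json written after this change has the following shape (shown here as a Python dict, not part of the commit): the static MAIN_INDEX_HEADER fields plus the per-write fields appended by write_json_main_index(). Placeholder values stand in for FOOTER_INFO, VERSION, and the other config-derived fields.

main_index_json = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.legacy.storage.json',
    'copyright_info': '...',        # FOOTER_INFO from config
    'meta': {
        'project': 'ArchiveBox',
        'version': '...',           # VERSION, git_sha, cmd, dependencies, etc.
    },
    'num_links': 0,
    'updated': '2019-04-18T21:09:54',
    'links': [],
}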

View file

@@ -4,14 +4,14 @@ from typing import List, Iterator
 from ..schema import Link
 from ..util import enforce_types
-from ..config import setup_django
+from ..config import setup_django, OUTPUT_DIR
 
 ### Main Links Index
 
 @enforce_types
-def parse_sql_main_index() -> Iterator[Link]:
-    setup_django()
+def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
+    setup_django(out_dir, check_db=True)
     from core.models import Page
 
     return (
 
@@ -20,8 +20,8 @@ def parse_sql_main_index() -> Iterator[Link]:
     )
 
 @enforce_types
-def write_sql_main_index(links: List[Link]) -> None:
-    setup_django()
+def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+    setup_django(out_dir, check_db=True)
     from core.models import Page
 
     for link in links:
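A hedged round-trip sketch (not part of the commit, assuming the module lives at archivebox.legacy.storage.sql) of the new out_dir-aware signatures: both functions call setup_django(out_dir, check_db=True) internally, so the collection's index.sqlite3 must already exist.

from archivebox.legacy.storage.sql import write_sql_main_index, parse_sql_main_index
from archivebox.legacy.config import OUTPUT_DIR

links = []                                        # e.g. List[Link] loaded from the JSON index
write_sql_main_index(links, out_dir=OUTPUT_DIR)   # upserts Page rows for each link
rows = list(parse_sql_main_index(OUTPUT_DIR))     # yields dict-like rows keyed at least by 'url'
print(len(rows))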

View file

@@ -27,6 +27,11 @@ os.environ.update(TEST_CONFIG)
 from .legacy.main import init
 from .legacy.index import load_main_index
+from .legacy.config import (
+    SQL_INDEX_FILENAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
+)
 
 from .cli import (
     archivebox_init,
 
@@ -55,12 +60,12 @@ and example14.badb
 <or>htt://example15.badc</that>
 '''
 
+stdout = sys.stdout
+stderr = sys.stderr
+
 @contextmanager
 def output_hidden(show_failing=True):
-    stdout = sys.stdout
-    stderr = sys.stderr
     if not HIDE_CLI_OUTPUT:
         yield
         return
 
@@ -100,6 +105,11 @@ class TestInit(unittest.TestCase):
         with output_hidden():
             archivebox_init.main([])
 
+        assert os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
+        assert os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
+        assert os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
+        assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
+
     def test_conflicting_init(self):
         with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
             f.write('test')
 
@@ -108,9 +118,25 @@ class TestInit(unittest.TestCase):
             with output_hidden(show_failing=False):
                 archivebox_init.main([])
             assert False, 'Init should have exited with an exception'
+        except SystemExit:
+            pass
+
+        assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
+        assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
+        assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
+
+        try:
+            load_main_index(out_dir=OUTPUT_DIR)
+            assert False, 'load_main_index should raise an exception when no index is present'
         except:
             pass
 
+    def test_no_dirty_state(self):
+        with output_hidden():
+            init()
+        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+        with output_hidden():
+            init()
+
 class TestAdd(unittest.TestCase):
     def setUp(self):
 
@@ -125,7 +151,7 @@ class TestAdd(unittest.TestCase):
         with output_hidden():
             archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 30
 
     def test_add_arg_file(self):
 
@@ -136,7 +162,7 @@ class TestAdd(unittest.TestCase):
         with output_hidden():
             archivebox_add.main([test_file])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 12
         os.remove(test_file)
 
@@ -144,7 +170,7 @@ class TestAdd(unittest.TestCase):
         with output_hidden():
             archivebox_add.main([], stdin=test_urls)
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 12
 
@@ -155,29 +181,29 @@ class TestRemove(unittest.TestCase):
             init()
             archivebox_add.main([], stdin=test_urls)
 
-    def tearDown(self):
-        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+    # def tearDown(self):
+    #     shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
 
     def test_remove_exact(self):
         with output_hidden():
             archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 11
 
     def test_remove_regex(self):
         with output_hidden():
             archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', 'http(s)?:\/\/(.+\.)?(example\d\.com)'])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 4
 
     def test_remove_domain(self):
         with output_hidden():
             archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 10
 
     def test_remove_none(self):
 
@@ -190,4 +216,7 @@ class TestRemove(unittest.TestCase):
 if __name__ == '__main__':
+    if '--verbose' in sys.argv or '-v' in sys.argv:
+        HIDE_CLI_OUTPUT = False
+
     unittest.main()

View file

@@ -1,17 +0,0 @@
dataclasses
django
base32-crockford
setuptools
ipdb
mypy
django-stubs
flake8
#wpull
#pywb
#pyppeteer
#GitPython
#youtube-dl
#archivenow
#requests

View file

@@ -31,7 +31,7 @@ setuptools.setup(
         'Bug Tracker': 'https://github.com/pirate/ArchiveBox/issues',
         'Roadmap': 'https://github.com/pirate/ArchiveBox/wiki/Roadmap',
         'Changelog': 'https://github.com/pirate/ArchiveBox/wiki/Changelog',
-        'Donations': 'https://github.com/pirate/ArchiveBox/wiki/Donations',
+        'Patreon': 'https://github.com/pirate/ArchiveBox/wiki/Donations',
     },
     packages=setuptools.find_packages(),
     python_requires='>=3.6',
 
@@ -40,6 +40,15 @@ setuptools.setup(
         "base32-crockford==0.3.0",
         "django==2.2",
         "django-extensions==2.1.6",
+        "youtube-dl",
+
+        # Some/all of these will likely be added in the future:
+        # wpull
+        # pywb
+        # pyppeteer
+        # archivenow
+        # requests
     ],
     entry_points={
         'console_scripts': [