Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 22:54:27 -04:00)
cleanup ARCHIVE_DIR paths
commit c90f4bfd5b
parent 46ea65d4f2
3 changed files with 8 additions and 5 deletions
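
In short: the hardcoded os.path.join(OUTPUT_DIR, 'archive', ...) fragments scattered across the code are replaced by a single ARCHIVE_DIR constant defined once in config.py. A minimal before/after sketch of the pattern (the REPO_DIR value below is a hypothetical stand-in for the real repo location):

    import os

    # Constants as defined in config.py after this commit
    # (the REPO_DIR value here is a hypothetical stand-in).
    REPO_DIR = '/home/user/bookmark-archiver'
    OUTPUT_DIR = os.path.join(REPO_DIR, 'output')
    ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')

    # Before: each call site rebuilt the 'archive' fragment by hand.
    before = os.path.join(OUTPUT_DIR, 'archive', '1512345678')
    # After: all call sites share the one constant.
    after = os.path.join(ARCHIVE_DIR, '1512345678')
    assert before == after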
@@ -27,6 +27,7 @@ from config import (
     CHROME_USER_DATA_DIR,
     TIMEOUT,
     ANSI,
+    ARCHIVE_DIR,
 )
 from util import (
     check_dependencies,
@@ -50,7 +51,7 @@ def archive_links(archive_path, links, source=None, resume=None):
 
     try:
         for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(archive_path, 'archive', link['timestamp'])
+            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
             archive_link(link_dir, link)
 
     except (KeyboardInterrupt, SystemExit, Exception) as e:
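
After this hunk, each link's output folder is derived from the global ARCHIVE_DIR plus the link's bookmark timestamp, rather than from the archive_path argument. A minimal sketch of the resulting path (the link dict and repo path are hypothetical; real entries carry more fields):

    import os

    ARCHIVE_DIR = '/home/user/bookmark-archiver/output/archive'  # hypothetical resolved value

    link = {'timestamp': '1512345678', 'url': 'https://example.com/page'}

    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
    print(link_dir)  # /home/user/bookmark-archiver/output/archive/1512345678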
@@ -36,6 +36,7 @@ FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted
 REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
 
 OUTPUT_DIR = os.path.join(REPO_DIR, 'output')
+ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')
 SOURCES_DIR = os.path.join(OUTPUT_DIR, 'sources')
 
 PYTHON_PATH = os.path.join(REPO_DIR, 'archiver')
@@ -16,6 +16,7 @@ from config import (
     REPO_DIR,
     SOURCES_DIR,
     OUTPUT_DIR,
+    ARCHIVE_DIR,
     TIMEOUT,
     TERM_WIDTH,
     SHOW_PROGRESS,
@@ -262,7 +263,7 @@ def find_link(folder, links):
     timestamp = folder.split('.')[0]
     for link in links:
         if link['timestamp'].startswith(timestamp):
-            if link['domain'] in os.listdir(os.path.join(OUTPUT_DIR, 'archive', folder)):
+            if link['domain'] in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
                 return link  # careful now, this isn't safe for most ppl
         if link['domain'] in parse_url(folder):
             return link
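
For context, find_link maps an archive folder name back to an entry in the link index by timestamp prefix, then confirms the match via the domain subfolder on disk. A filesystem-free sketch of the timestamp part (the links list is hypothetical; the split suggests folder names may carry a suffix after a '.'):

    # Hypothetical index entries; real entries carry more fields.
    links = [
        {'timestamp': '1512345678', 'domain': 'example.com'},
        {'timestamp': '1512349999', 'domain': 'other.org'},
    ]

    folder = '1512345678.0'            # possible suffix after the '.'
    timestamp = folder.split('.')[0]   # '1512345678'

    match = next(l for l in links if l['timestamp'].startswith(timestamp))
    print(match['domain'])  # example.com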
@@ -271,7 +272,7 @@ def find_link(folder, links):
 
 def parse_url(folder):
     """for a given archive folder, figure out what url it's for"""
-    link_json = os.path.join(OUTPUT_DIR, 'archive', folder, 'index.json')
+    link_json = os.path.join(ARCHIVE_DIR, folder, 'index.json')
     if os.path.exists(link_json):
         with open(link_json, 'r') as f:
             try:
@@ -282,7 +283,7 @@ def parse_url(folder):
         except ValueError:
             print('File contains invalid JSON: {}!'.format(link_json))
 
-    archive_org_txt = os.path.join(OUTPUT_DIR, 'archive', folder, 'archive.org.txt')
+    archive_org_txt = os.path.join(ARCHIVE_DIR, folder, 'archive.org.txt')
     if os.path.exists(archive_org_txt):
         with open(archive_org_txt, 'r') as f:
             original_link = f.read().strip().split('/http', 1)[-1]
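
The split('/http', 1)[-1] in the last context line is worth a worked example: it drops everything up to and including the first literal '/http', so prefixing 'http' back onto the result reconstructs the original URL. A sketch, assuming archive.org.txt holds a standard Wayback Machine snapshot URL (hypothetical contents below):

    contents = 'https://web.archive.org/web/20170101000000/https://example.com/page\n'

    # If '/http' were absent, [-1] would simply return the whole string.
    original_link = contents.strip().split('/http', 1)[-1]
    print(original_link)            # s://example.com/page
    print('http' + original_link)   # https://example.com/page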
@@ -417,7 +418,7 @@ def wget_output_path(link, look_in=None):
     # instead of trying to emulate it here, we just look in the output folder
     # to see what html file wget actually created as the output
     wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
-    look_in = os.path.join(OUTPUT_DIR, 'archive', link['timestamp'], *wget_folder)
+    look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
 
     if look_in and os.path.exists(look_in):
         html_files = [
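
The look_in computation mirrors the host/path folder tree that wget writes under each timestamp folder. A worked sketch, assuming base_url is the link's URL without its scheme (all values hypothetical):

    import os

    ARCHIVE_DIR = '/home/user/bookmark-archiver/output/archive'  # hypothetical resolved value

    link = {'timestamp': '1512345678', 'base_url': 'example.com/blog/post'}

    # Drop the final path segment, then split the rest into folder components.
    wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')   # ['example.com', 'blog']
    look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
    print(look_in)  # /home/user/bookmark-archiver/output/archive/1512345678/example.com/blog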