def _extract_link_url(line: str) -> str:
    """Return the URL found on an index line that contains class="link-url".

    The href of the tag carrying the marker class is preferred. If no such
    tag/href can be located, fall back to the first double-quoted value on
    the line, which is what the previous implementation always returned.
    """
    import re  # local import so the file-level import block is left untouched

    # Restrict the search to the single tag that carries the marker class so
    # that attributes of other tags on the same line cannot be picked up.
    marked_tag = re.search(r'<[^<>]*\bclass="link-url"[^<>]*>', line)
    if marked_tag is not None:
        href = re.search(r'\shref="([^"]*)"', marked_tag.group(0))
        if href is not None:
            return href.group(1)

    # Legacy behaviour: the caller guarantees the marker (and therefore at
    # least two double quotes) is present, so index 1 always exists.
    return line.split('"')[1]


@enforce_types
def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
    """Parse an archive's html index file and lazily yield the urls in it.

    Args:
        out_dir: directory expected to contain HTML_INDEX_FILENAME.

    Yields:
        One url string per line of the index that contains the
        class="link-url" marker, in file order.

    If the index file does not exist, nothing is yielded.
    """
    index_path = os.path.join(out_dir, HTML_INDEX_FILENAME)
    if not os.path.exists(index_path):
        # A bare return ends the generator; a returned value would only end
        # up in StopIteration.value, which iterating callers never see.
        return

    with open(index_path, 'r', encoding='utf-8') as index_file:
        for line in index_file:
            if 'class="link-url"' in line:
                yield _extract_link_url(line)