From e91cdfbc88249e10be6990ebc5fa3263d185aa70 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@nicksweeting.com>
Date: Wed, 24 Apr 2019 11:37:51 -0400
Subject: [PATCH] add rudimentary method to parse back html index into urls

---
 archivebox/legacy/storage/html.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/archivebox/legacy/storage/html.py b/archivebox/legacy/storage/html.py
index 9eefb15a..545c06de 100644
--- a/archivebox/legacy/storage/html.py
+++ b/archivebox/legacy/storage/html.py
@@ -3,7 +3,7 @@ __package__ = 'archivebox.legacy.storage'
 import os
 
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Iterator
 
 from ..schema import Link
 from ..config import (
@@ -39,6 +39,18 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 
 ### Main Links Index
 
+@enforce_types
+def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
+    """lazily yield one value per link-url line of the html index in out_dir (yields nothing if the file is missing)"""
+
+    index_path = os.path.join(out_dir, HTML_INDEX_FILENAME)
+    if os.path.exists(index_path):
+        with open(index_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if 'class="link-url"' in line:  # plain substring match per line, no html parsing
+                    yield line.split('"')[1]  # text between the first two double quotes; assumed to be the url -- TODO confirm against the index template
+    return ()  # inside a generator this only ends iteration; the () is never yielded
+
 @enforce_types
 def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
     """write the html link index to a given path"""