diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index d4e09aa3..f3057271 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -202,4 +202,9 @@ def wget_output_path(link: Link) -> Optional[str]: if search_dir.is_dir(): return domain(link.url).replace(":", "+") + # fallback to just the domain dir without port + search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0] + if search_dir.is_dir(): + return domain(link.url).split(":", 1)[0] + return None diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 85972993..0a9b39c5 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -379,11 +379,15 @@ class Link: output_paths = ( domain(self.url), + 'output.html', 'output.pdf', 'screenshot.png', - 'output.html', + 'singlefile.html', + 'readability/content.html', + 'mercury/content.html', + 'htmltotext.txt', 'media', - 'singlefile.html' + 'git', ) return any(