mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
check for non html files from wget
This commit is contained in:
parent
c6f0b8e6b3
commit
9764a8ed9b
1 changed file with 11 additions and 0 deletions
|
@@ -175,11 +175,22 @@ def wget_output_path(link: Link) -> Optional[str]:
|
||||||
if html_files:
|
if html_files:
|
||||||
return str(html_files[0].relative_to(link.link_dir))
|
return str(html_files[0].relative_to(link.link_dir))
|
||||||
|
|
||||||
|
# sometimes wget'd URLs have no ext and return non-html
|
||||||
|
# e.g. /some/example/rss/all -> some RSS XML content
|
||||||
|
# /some/other/url.o4g -> some binary content with an unrecognized ext
|
||||||
|
# test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
|
||||||
|
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
|
||||||
|
for file_present in os.listdir(search_dir):
|
||||||
|
if file_present == last_part_of_url:
|
||||||
|
return os.path.join(path_from_link_dir, file_present)
|
||||||
|
|
||||||
# Move up one directory level
|
# Move up one directory level
|
||||||
search_dir = search_dir.parent
|
search_dir = search_dir.parent
|
||||||
|
|
||||||
if str(search_dir) == link.link_dir:
|
if str(search_dir) == link.link_dir:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
|
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
|
||||||
if not search_dir.is_dir():
|
if not search_dir.is_dir():
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue