mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
use globbing to find wget output path
This commit is contained in:
parent
e6fa16e13a
commit
846c966c4d
1 changed file with 12 additions and 8 deletions
|
@@ -134,9 +134,7 @@ def wget_output_path(link: Link) -> Optional[str]:
|
||||||
|
|
||||||
See docs on wget --adjust-extension (-E)
|
See docs on wget --adjust-extension (-E)
|
||||||
"""
|
"""
|
||||||
if is_static_file(link.url):
|
|
||||||
return without_scheme(without_fragment(link.url))
|
|
||||||
|
|
||||||
# Wget downloads can save in a number of different ways depending on the url:
|
# Wget downloads can save in a number of different ways depending on the url:
|
||||||
# https://example.com
|
# https://example.com
|
||||||
# > example.com/index.html
|
# > example.com/index.html
|
||||||
|
@@ -187,7 +185,7 @@ def wget_output_path(link: Link) -> Optional[str]:
|
||||||
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
|
last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
|
||||||
for file_present in search_dir.iterdir():
|
for file_present in search_dir.iterdir():
|
||||||
if file_present == last_part_of_url:
|
if file_present == last_part_of_url:
|
||||||
return str(search_dir / file_present)
|
return str((search_dir / file_present).relative_to(link.link_dir))
|
||||||
|
|
||||||
# Move up one directory level
|
# Move up one directory level
|
||||||
search_dir = search_dir.parent
|
search_dir = search_dir.parent
|
||||||
|
@@ -195,10 +193,16 @@ def wget_output_path(link: Link) -> Optional[str]:
|
||||||
if str(search_dir) == link.link_dir:
|
if str(search_dir) == link.link_dir:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# check for staticfiles
|
||||||
|
base_url = without_scheme(without_fragment(link.url))
|
||||||
|
domain_dir = Path(domain(link.url).replace(":", "+"))
|
||||||
|
files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
|
||||||
|
if files_within:
|
||||||
|
return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
|
||||||
|
|
||||||
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
|
# fallback to just the domain dir
|
||||||
if not search_dir.is_dir():
|
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
|
||||||
return str(search_dir.relative_to(link.link_dir))
|
if search_dir.is_dir():
|
||||||
|
return domain(link.url).replace(":", "+")
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue