mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
more fixes for wget_output_path
Some checks failed
CodeQL / Analyze (${{ matrix.language }}) (none, python) (push) Has been cancelled
Build Debian package / build (push) Has been cancelled
Build Docker image / buildx (push) Has been cancelled
Build Homebrew package / build (push) Has been cancelled
Build GitHub Pages website / build (push) Has been cancelled
Build GitHub Pages website / deploy (push) Has been cancelled
Run linters / lint (push) Has been cancelled
Build Pip package / build (push) Has been cancelled
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Has been cancelled
Run tests / docker_tests (push) Has been cancelled
Some checks failed
CodeQL / Analyze (${{ matrix.language }}) (none, python) (push) Has been cancelled
Build Debian package / build (push) Has been cancelled
Build Docker image / buildx (push) Has been cancelled
Build Homebrew package / build (push) Has been cancelled
Build GitHub Pages website / build (push) Has been cancelled
Build GitHub Pages website / deploy (push) Has been cancelled
Run linters / lint (push) Has been cancelled
Build Pip package / build (push) Has been cancelled
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Has been cancelled
Run tests / docker_tests (push) Has been cancelled
This commit is contained in:
parent
f2729c9dc7
commit
4c5a3fba8b
2 changed files with 5 additions and 8 deletions
|
@ -174,13 +174,12 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
|
||||||
|
|
||||||
# check for literally any file present that isn't an empty folder
|
# check for literally any file present that isn't an empty folder
|
||||||
domain_dir = Path(domain(link.url).replace(":", "+"))
|
domain_dir = Path(domain(link.url).replace(":", "+"))
|
||||||
files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
|
files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
|
||||||
if files_within:
|
if files_within:
|
||||||
return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
|
return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
|
||||||
|
|
||||||
# abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
|
# abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
|
||||||
# that it's better we just pretend it doesn't exist
|
# that it's better we just pretend it doesn't exist
|
||||||
|
|
||||||
# this is why ArchiveBox specializes in REDUNDANTLY saving copies of sites with multiple different tools
|
# this is why ArchiveBox specializes in REDUNDANTLY saving copies of sites with multiple different tools
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -243,26 +242,24 @@ def wget_output_path(link: Link) -> Optional[str]:
|
||||||
try:
|
try:
|
||||||
output_path = unsafe_wget_output_path(link)
|
output_path = unsafe_wget_output_path(link)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
# print(err)
|
|
||||||
pass # better to pretend it just failed to download than expose gnarly OSErrors to users
|
pass # better to pretend it just failed to download than expose gnarly OSErrors to users
|
||||||
|
|
||||||
|
|
||||||
# check for unprintable unicode characters
|
# check for unprintable unicode characters
|
||||||
# https://github.com/ArchiveBox/ArchiveBox/issues/1373
|
# https://github.com/ArchiveBox/ArchiveBox/issues/1373
|
||||||
if output_path:
|
if output_path:
|
||||||
safe_path = output_path.encode('utf-8', 'replace').decode()
|
safe_path = output_path.encode('utf-8', 'replace').decode()
|
||||||
|
|
||||||
if output_path != safe_path:
|
if output_path != safe_path:
|
||||||
# contains unprintable unicode characters that will break other parts of archivebox
|
# contains unprintable unicode characters that will break other parts of archivebox
|
||||||
# better to pretend it doesn't exist and fall back to parent dir than crash archivebox
|
# better to pretend it doesn't exist and fall back to parent dir than crash archivebox
|
||||||
output_path = None
|
output_path = None
|
||||||
|
|
||||||
|
|
||||||
# check for a path that is just too long to safely handle across different OS's
|
# check for a path that is just too long to safely handle across different OS's
|
||||||
# https://github.com/ArchiveBox/ArchiveBox/issues/549
|
# https://github.com/ArchiveBox/ArchiveBox/issues/549
|
||||||
if output_path and len(output_path) > 250:
|
if output_path and len(output_path) > 250:
|
||||||
output_path = None
|
output_path = None
|
||||||
|
|
||||||
|
if output_path:
|
||||||
|
return output_path
|
||||||
|
|
||||||
# fallback to just the domain dir
|
# fallback to just the domain dir
|
||||||
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
|
search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
|
||||||
|
@ -274,5 +271,4 @@ def wget_output_path(link: Link) -> Optional[str]:
|
||||||
if search_dir.is_dir():
|
if search_dir.is_dir():
|
||||||
return domain(link.url).split(":", 1)[0]
|
return domain(link.url).split(":", 1)[0]
|
||||||
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
|
@ -15,7 +15,8 @@ croniter==2.0.5
|
||||||
cryptography==42.0.7
|
cryptography==42.0.7
|
||||||
dateparser==1.2.0
|
dateparser==1.2.0
|
||||||
decorator==5.1.1
|
decorator==5.1.1
|
||||||
django==5.0.4
|
django==5.0.5
|
||||||
|
django-admin-data-views==0.3.1
|
||||||
django-auth-ldap==4.8.0
|
django-auth-ldap==4.8.0
|
||||||
django-extensions==3.2.3
|
django-extensions==3.2.3
|
||||||
django-ninja==1.1.0
|
django-ninja==1.1.0
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue