mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
Save the URL as the title for static files or non-HTML files
This commit is contained in:
parent
24e24934f7
commit
385daf9af8
1 changed file with 5 additions and 4 deletions
|
@@ -62,9 +62,6 @@ class TitleParser(HTMLParser):
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
if is_static_file(link.url):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# if link already has valid title, skip it
|
# if link already has valid title, skip it
|
||||||
if not overwrite and link.title and not link.title.lower().startswith('http'):
|
if not overwrite and link.title and not link.title.lower().startswith('http'):
|
||||||
return False
|
return False
|
||||||
|
@@ -113,7 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
|
||||||
timestamp=link.timestamp)\
|
timestamp=link.timestamp)\
|
||||||
.update(title=output)
|
.update(title=output)
|
||||||
else:
|
else:
|
||||||
raise ArchiveError('Unable to detect page title')
|
# if no content was returned, don't save a title (because it might be a temporary error)
|
||||||
|
if not html:
|
||||||
|
raise ArchiveError('Unable to detect page title')
|
||||||
|
# output = html[:128] # use first bit of content as the title
|
||||||
|
output = link.base_url # use the filename as the title (better UX)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue