mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-19 01:15:11 -04:00
htmldecode downloaded sources before parsing for links
This commit is contained in:
parent
fc0d064ddf
commit
e3ac4c2405
1 changed file with 4 additions and 6 deletions
|
@ -23,12 +23,14 @@ from ..config import (
|
||||||
)
|
)
|
||||||
from ..util import (
|
from ..util import (
|
||||||
basename,
|
basename,
|
||||||
|
htmldecode,
|
||||||
download_url,
|
download_url,
|
||||||
enforce_types,
|
enforce_types,
|
||||||
URL_REGEX,
|
URL_REGEX,
|
||||||
)
|
)
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..logging_util import TimedProgress, log_source_saved
|
from ..logging_util import TimedProgress, log_source_saved
|
||||||
|
|
||||||
from .pocket_html import parse_pocket_html_export
|
from .pocket_html import parse_pocket_html_export
|
||||||
from .pinboard_rss import parse_pinboard_rss_export
|
from .pinboard_rss import parse_pinboard_rss_export
|
||||||
from .shaarli_rss import parse_shaarli_rss_export
|
from .shaarli_rss import parse_shaarli_rss_export
|
||||||
|
@ -126,15 +128,11 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
|
||||||
|
|
||||||
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
||||||
# Source is a URL that needs to be downloaded
|
# Source is a URL that needs to be downloaded
|
||||||
print('{}[*] [{}] Downloading {}{}'.format(
|
print(f' > Downloading {path} contents')
|
||||||
ANSI['green'],
|
|
||||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
path,
|
|
||||||
ANSI['reset'],
|
|
||||||
))
|
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
raw_source_text = download_url(path, timeout=timeout)
|
raw_source_text = download_url(path, timeout=timeout)
|
||||||
|
raw_source_text = htmldecode(raw_source_text)
|
||||||
timer.end()
|
timer.end()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue