split up utils into separate files

Nick Sweeting 2019-04-30 23:13:04 -04:00
parent daf5951897
commit 95007d9137
23 changed files with 820 additions and 759 deletions

View file

@@ -5,16 +5,11 @@ import os
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, DEVNULL, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
DEVNULL,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
VERSION,
@@ -24,6 +19,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress
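
The same reorganization repeats in every extractor file below: helpers that used to be pulled from the monolithic ..util module now come from the module that owns them — run/PIPE/DEVNULL/chmod_file from ..system, TimedProgress from ..cli.logging, and ArchiveError from ..index.schema — while pure helpers like enforce_types and is_static_file stay in ..util. A rough before/after sketch of the pattern (names taken from the hunks in this diff, not a complete file listing):

    # before: everything funneled through util.py
    from ..util import (
        enforce_types,
        TimedProgress,
        run,
        PIPE,
        DEVNULL,
        is_static_file,
        ArchiveError,
        chmod_file,
    )

    # after: each helper imported from the module that now owns it
    from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
    from ..system import run, PIPE, DEVNULL, chmod_file
    from ..util import enforce_types, is_static_file
    from ..cli.logging import TimedProgress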

View file

@@ -4,22 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_DOM,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress

View file

@@ -5,14 +5,8 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
domain,
run,
PIPE,
chmod_file,
)
from ..system import chmod_file, run, PIPE
from ..util import enforce_types, domain
from ..config import (
TIMEOUT,
SAVE_FAVICON,
@@ -20,6 +14,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY,
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@@ -4,15 +4,11 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
domain,
extension,
without_query,
@@ -26,6 +22,7 @@ from ..config import (
GIT_DOMAINS,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress

View file

@@ -4,15 +4,11 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
MEDIA_TIMEOUT,
@@ -21,6 +17,7 @@ from ..config import (
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@@ -4,23 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_PDF,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@@ -4,22 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_SCREENSHOT,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress

View file

@@ -1,14 +1,14 @@
__package__ = 'archivebox.extractors'
import re
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
TimedProgress,
is_static_file,
ArchiveError,
fetch_page_title,
download_url,
htmldecode,
)
from ..config import (
TIMEOUT,
@@ -16,6 +16,14 @@ from ..config import (
CURL_BINARY,
CURL_VERSION,
)
from ..cli.logging import TimedProgress
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'      # start matching text after <title> tag
    r'(.[^<>]+)',      # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
@enforce_types
@@ -44,7 +52,9 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
output = fetch_page_title(link.url, timeout=timeout, progress=False)
html = download_url(link.url, timeout=timeout)
match = re.search(HTML_TITLE_REGEX, html)
output = htmldecode(match.group(1).strip()) if match else None
if not output:
raise ArchiveError('Unable to detect page title')
except Exception as err:
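
The title extractor no longer relies on a fetch_page_title() helper from ..util; save_title() now downloads the page with download_url() and applies HTML_TITLE_REGEX itself. A minimal standalone sketch of that regex-then-decode step (html.unescape stands in for the project's htmldecode() helper here, and the sample markup is made up):

    import re
    from html import unescape   # stand-in for ..util.htmldecode in this sketch

    HTML_TITLE_REGEX = re.compile(
        r'<title.*?>'    # start matching text after <title> tag
        r'(.[^<>]+)',    # get everything up to these symbols
        re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
    )

    def extract_title(html: str):
        # same regex + decode flow save_title() runs on the downloaded page
        match = re.search(HTML_TITLE_REGEX, html)
        return unescape(match.group(1).strip()) if match else None

    print(extract_title('<html><head><title>Example &amp; Co</title></head></html>'))
    # -> Example & Co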

View file

@@ -1,18 +1,22 @@
__package__ = 'archivebox.extractors'
import os
import re
from typing import Optional
from datetime import datetime
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
wget_output_path,
ArchiveError,
is_static_file,
without_scheme,
without_fragment,
without_query,
path,
domain,
urldecode,
)
from ..config import (
TIMEOUT,
@@ -26,7 +30,7 @@ from ..config import (
WGET_USER_AGENT,
COOKIES_FILE,
)
from ..cli.logging import TimedProgress
@enforce_types
@@ -121,3 +125,72 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status=status,
**timer.stats,
)
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
if is_static_file(link.url):
return without_scheme(without_fragment(link.url))
    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
    # Since the wget algorithm for -E (appending .html) is incredibly complex
    # and there's no way to get the computed output path from wget,
    # rather than trying to reverse-engineer how it calculates the path,
    # we just look in the output folder and read the filename wget used from the filesystem
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = os.path.join(
        link.link_dir,
        domain(link.url),
        urldecode(full_path),
    )

    for _ in range(4):
        if os.path.exists(search_dir):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
                ]
                if html_files:
                    path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
                    return os.path.join(path_from_link_dir, html_files[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        if search_dir == link.link_dir:
            break

    return None
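
As a worked illustration of the fallback search above (all paths are hypothetical; the real values depend on the link's output folder): for a URL like https://example.com/abc/?v=1, wget -E typically writes example.com/abc/index.html?v=1.html, so the search starts in <link_dir>/example.com/abc and climbs at most four levels back toward link_dir until it finds an html-like file. A self-contained sketch of that lookup using only the standard library (POSIX paths assumed):

    import os, re, tempfile

    # fake output tree mimicking what wget -E might produce for https://example.com/abc/?v=1
    link_dir = tempfile.mkdtemp()
    out_dir = os.path.join(link_dir, 'example.com', 'abc')
    os.makedirs(out_dir)
    open(os.path.join(out_dir, 'index.html?v=1.html'), 'w').close()

    # same idea as wget_output_path(): start at the deepest expected folder,
    # look for an html-ish file, and walk up toward link_dir if it isn't there
    search_dir, result = out_dir, None
    for _ in range(4):
        if os.path.isdir(search_dir):
            html_files = [f for f in os.listdir(search_dir)
                          if re.search(r'.+\.[Ss]?[Hh][Tt][Mm][Ll]?$', f, re.I)]
            if html_files:
                result = os.path.join(search_dir.split(link_dir)[-1].strip('/'), html_files[0])
                break
        search_dir = search_dir.rsplit('/', 1)[0]
        if search_dir == link_dir:
            break

    print(result)   # -> example.com/abc/index.html?v=1.html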