mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-06-02 23:59:52 -04:00
split up utils into separate files
This commit is contained in:
parent
daf5951897
commit
95007d9137
23 changed files with 820 additions and 759 deletions
|
@ -5,16 +5,11 @@ import os
|
|||
from typing import Optional, List, Dict, Tuple
|
||||
from collections import defaultdict
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, DEVNULL, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
DEVNULL,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
VERSION,
|
||||
|
@ -24,6 +19,7 @@ from ..config import (
|
|||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY
|
||||
)
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -4,22 +4,19 @@ import os
|
|||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chrome_args,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_DOM,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -5,14 +5,8 @@ import os
|
|||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
domain,
|
||||
run,
|
||||
PIPE,
|
||||
chmod_file,
|
||||
)
|
||||
from ..system import chmod_file, run, PIPE
|
||||
from ..util import enforce_types, domain
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_FAVICON,
|
||||
|
@ -20,6 +14,7 @@ from ..config import (
|
|||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
)
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
|
|
@ -4,15 +4,11 @@ import os
|
|||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chmod_file,
|
||||
domain,
|
||||
extension,
|
||||
without_query,
|
||||
|
@ -26,6 +22,7 @@ from ..config import (
|
|||
GIT_DOMAINS,
|
||||
CHECK_SSL_VALIDITY
|
||||
)
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -4,15 +4,11 @@ import os
|
|||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
MEDIA_TIMEOUT,
|
||||
|
@ -21,6 +17,7 @@ from ..config import (
|
|||
YOUTUBEDL_VERSION,
|
||||
CHECK_SSL_VALIDITY
|
||||
)
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
|
|
@ -4,23 +4,19 @@ import os
|
|||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chrome_args,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_PDF,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
|
|
@ -4,22 +4,19 @@ import os
|
|||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
chrome_args,
|
||||
chmod_file,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_SCREENSHOT,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
is_static_file,
|
||||
ArchiveError,
|
||||
fetch_page_title,
|
||||
download_url,
|
||||
htmldecode,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
|
@ -16,6 +16,14 @@ from ..config import (
|
|||
CURL_BINARY,
|
||||
CURL_VERSION,
|
||||
)
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
||||
# Pattern used to pull the text of the first <title> element out of raw HTML.
# Group 1 holds the title text (not stripped, not HTML-decoded).
#
# The opening tag is matched with [^>]* rather than a lazy .*? so the match
# cannot run past the end of the <title ...> tag, and the capture group no
# longer starts with a bare "." -- under DOTALL that dot could swallow a "<",
# which made one-character or empty titles capture markup such as "</title".
HTML_TITLE_REGEX = re.compile(
    r'<title[^>]*>'                   # opening <title> tag, with any attributes
    r'([^<>]+)',                      # title text, up to the next tag boundary
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
@ -44,7 +52,9 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
|
|||
status = 'succeeded'
|
||||
timer = TimedProgress(timeout, prefix=' ')
|
||||
try:
|
||||
output = fetch_page_title(link.url, timeout=timeout, progress=False)
|
||||
html = download_url(link.url, timeout=timeout)
|
||||
match = re.search(HTML_TITLE_REGEX, html)
|
||||
output = htmldecode(match.group(1).strip()) if match else None
|
||||
if not output:
|
||||
raise ArchiveError('Unable to detect page title')
|
||||
except Exception as err:
|
||||
|
|
|
@ -1,18 +1,22 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, PIPE
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
TimedProgress,
|
||||
run,
|
||||
PIPE,
|
||||
wget_output_path,
|
||||
ArchiveError,
|
||||
is_static_file,
|
||||
without_scheme,
|
||||
without_fragment,
|
||||
without_query,
|
||||
path,
|
||||
domain,
|
||||
urldecode,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
|
@ -26,7 +30,7 @@ from ..config import (
|
|||
WGET_USER_AGENT,
|
||||
COOKIES_FILE,
|
||||
)
|
||||
|
||||
from ..cli.logging import TimedProgress
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
@ -121,3 +125,72 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
|
|||
status=status,
|
||||
**timer.stats,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
    """Calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension (-E)

    For URLs that `is_static_file` accepts, the result is the URL with its
    scheme and fragment removed. Otherwise the link's output folder is
    searched on disk, starting at <link_dir>/<domain>/<url path> and walking
    up at most 4 directory levels, and the first HTML-looking file found
    (in sorted name order) is returned as a path built from the part of the
    search directory after `link.link_dir`.

    Returns None when no matching file is found.
    """

    if is_static_file(link.url):
        return without_scheme(without_fragment(link.url))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html

    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html?v=zzVa_tX1OiI.html

    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html?v=zzVa_tX1OiI.html

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc

    # Since the wget algorithm for -E (appending .html) is incredibly complex
    # and there's no way to get the computed output path from wget,
    # in order to avoid having to reverse-engineer how they calculate it,
    # we just look in the output folder and read the filename wget used from the filesystem
    full_path = without_fragment(without_query(path(link.url))).strip('/')
    search_dir = os.path.join(
        link.link_dir,
        domain(link.url),
        urldecode(full_path),
    )

    for _ in range(4):
        # isdir() is False for paths that do not exist, so no separate exists() check is needed
        if os.path.isdir(search_dir):
            # os.listdir() returns entries in arbitrary order; sort them so the
            # chosen file is the same on every run and on every filesystem
            html_files = sorted(
                f for f in os.listdir(search_dir)
                if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
            )
            if html_files:
                path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
                return os.path.join(path_from_link_dir, html_files[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        # Stop once the search has climbed back up to the link folder itself
        if search_dir == link.link_dir:
            break

    return None
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue