Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-21 02:15:10 -04:00
Merge remote-tracking branch 'upstream/master'
Commit a3705e31c6

3 changed files with 6 additions and 3 deletions
@@ -33,6 +33,7 @@ from config import (
     WGET_USER_AGENT,
     CHECK_SSL_VALIDITY,
     COOKIES_FILE,
+    WGET_AUTO_COMPRESSION,
 )
 from util import (
     domain,
@@ -224,10 +225,10 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT):
         '--backup-converted',
         '--span-hosts',
         '--no-parent',
-        '--compression=auto',
         '-e', 'robots=off',
         '--restrict-file-names=unix',
         '--timeout={}'.format(timeout),
+        *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
         *(() if FETCH_WARC else ('--timestamping',)),
         *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
         *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
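The hunk above replaces the hard-coded '--compression=auto' argument with a conditional splat, so the flag is only appended to the wget command line when the detected WGET_AUTO_COMPRESSION setting is truthy. A minimal standalone sketch of the pattern, with values assumed purely for illustration:

# Sketch only: WGET_AUTO_COMPRESSION is hard-coded here; in ArchiveBox it is
# detected at startup (see the config change below).
WGET_AUTO_COMPRESSION = True

CMD = [
    'wget',
    '--timeout=60',
    # The one-element tuple is unpacked into the list only when the flag is
    # truthy; otherwise the empty tuple unpacks to nothing and no arg is added.
    *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
]
print(CMD)  # ['wget', '--timeout=60', '--compression=auto']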
@@ -74,6 +74,7 @@ TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates')
 CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
 USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
 USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
+WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode)

 ########################### Environment & Dependencies #########################

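The new WGET_AUTO_COMPRESSION line probes the installed wget once at startup: it runs wget with '--compression=auto --help' and treats a zero exit code as support for the flag. A minimal standalone sketch of the same check, assuming wget is on PATH (the names here are for illustration only, not the project's config API):

from subprocess import run, DEVNULL

WGET_BINARY = 'wget'  # assumed; ArchiveBox reads this from its config/environment

# Older wget builds exit non-zero when given the unknown --compression option,
# so a zero return code means the flag is supported.
supports_auto_compression = run(
    [WGET_BINARY, '--compression=auto', '--help'],
    stdout=DEVNULL,
    stderr=DEVNULL,
).returncode == 0

print(supports_auto_compression)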
@@ -154,7 +154,8 @@ def parse_rss_export(rss_file):
     """Parse RSS XML-format files into links"""

     rss_file.seek(0)
-    items = rss_file.read().split('</item>\n<item>')
+    items = rss_file.read().split('<item>')
+    items = items[1:] if items else []
     for item in items:
         # example item:
         # <item>
@@ -166,7 +167,7 @@ def parse_rss_export(rss_file):
         # </item>

         trailing_removed = item.split('</item>', 1)[0]
-        leading_removed = trailing_removed.split('<item>', 1)[-1]
+        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
         rows = leading_removed.split('\n')

         def get_row(key):
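The parse_rss_export change splits the feed on '<item>' and drops the preamble before the first item, instead of splitting on the exact sequence '</item>\n<item>', which only matched when items were separated by a single newline. A small sketch with a hypothetical one-line feed shows the difference:

# Hypothetical feed text, everything on one line with no newlines between items.
rss = '<rss><channel><item><title>a</title></item>  <item><title>b</title></item></channel></rss>'

old_items = rss.split('</item>\n<item>')  # -> 1 chunk: the old separator never matches here
new_items = rss.split('<item>')[1:]       # -> 2 chunks, one per <item>, channel preamble dropped
print(len(old_items), len(new_items))     # 1 2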