From 4e69d2c9e14bbbc4597731fdc349f5461a726b54 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly <benmuthalaly@gmail.com> Date: Wed, 21 Feb 2024 15:13:06 -0600 Subject: [PATCH 01/19] Add `EXTRA_*_ARGS` for wget, curl, and singlefile --- archivebox/config.py | 8 +++++++- archivebox/extractors/archive_org.py | 13 ++++++++++--- archivebox/extractors/favicon.py | 18 ++++++++++++++---- archivebox/extractors/headers.py | 14 ++++++++++---- archivebox/extractors/singlefile.py | 25 +++++++++---------------- archivebox/extractors/title.py | 13 ++++++++++--- archivebox/extractors/wget.py | 15 +++++++++++---- archivebox/util.py | 17 +++++++++++++++++ 8 files changed, 88 insertions(+), 35 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 1edd2eeb..ebb939a4 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -187,12 +187,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--no-parent', '-e', 'robots=off', ]}, + 'WGET_EXTRA_ARGS': {'type': list, 'default': None}, 'CURL_ARGS': {'type': list, 'default': ['--silent', '--location', '--compressed' ]}, + 'CURL_EXTRA_ARGS': {'type': list, 'default': None}, 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, - 'SINGLEFILE_ARGS': {'type': list, 'default' : None}, + 'SINGLEFILE_ARGS': {'type': list, 'default': None}, + 'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None}, 'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'}, }, @@ -530,6 +533,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None}, 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)}, 'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []}, + 'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []}, 'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']}, 'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']}, @@ -540,12 +544,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, + 'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []}, 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, 'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []}, + 'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []}, 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index a0883113..93730f26 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -10,10 +10,12 @@ from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, + dedupe, ) from ..config import ( TIMEOUT, CURL_ARGS, + CURL_EXTRA_ARGS, CHECK_SSL_VALIDITY, SAVE_ARCHIVE_DOT_ORG, CURL_BINARY, @@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= 
output: ArchiveOutput = 'archive.org.txt' archive_org_url = None submit_url = 'https://web.archive.org/save/{}'.format(link.url) - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), submit_url, ] status = 'succeeded' diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 5baafc17..3b41f349 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -6,13 +6,18 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..system import chmod_file, run -from ..util import enforce_types, domain +from ..util import ( + enforce_types, + domain, + dedupe, +) from ..config import ( TIMEOUT, SAVE_FAVICON, FAVICON_PROVIDER, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_VERSION, CHECK_SSL_VALIDITY, CURL_USER_AGENT, @@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) out_dir = out_dir or link.link_dir output: ArchiveOutput = 'favicon.ico' - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--max-time', str(timeout), '--output', str(output), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), FAVICON_PROVIDER.format(domain(link.url)), ] status = 'failed' diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 91dcb8e3..3828de93 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -9,11 +9,13 @@ from ..system import atomic_write from ..util import ( enforce_types, get_headers, + dedupe, ) from ..config import ( TIMEOUT, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_USER_AGENT, CURL_VERSION, CHECK_SSL_VALIDITY, @@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') - - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), link.url, ] try: diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index e50b3932..b2119119 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -11,6 +11,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + dedupe, ) from ..config import ( TIMEOUT, @@ -18,6 +19,7 @@ from ..config import ( DEPENDENCIES, SINGLEFILE_VERSION, SINGLEFILE_ARGS, + SINGLEFILE_EXTRA_ARGS, CHROME_BINARY, ) from ..logging_util import TimedProgress @@ -46,11 +48,6 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - options = [ - *SINGLEFILE_ARGS, - '--browser-executable-path={}'.format(CHROME_BINARY), - browser_args, - ] # Deduplicate options (single-file doesn't 
like when you use the same option two times) # @@ -58,19 +55,15 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most # specificity, therefore the user sets it with a lot intent, therefore it should take precedence # kind of like the ergonomic principle of lexical scope in programming languages. - seen_option_names = [] - def test_seen(argument): - option_name = argument.split("=")[0] - if option_name in seen_option_names: - return False - else: - seen_option_names.append(option_name) - return True - deduped_options = list(filter(test_seen, options)) - + options = [ + '--browser-executable-path={}'.format(CHROME_BINARY), + browser_args, + *SINGLEFILE_EXTRA_ARGS, + *SINGLEFILE_ARGS, + ] cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], - *deduped_options, + *dedupe(*options), link.url, output, ] diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 6b0e37f6..b2b65af2 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -10,6 +10,7 @@ from ..util import ( enforce_types, download_url, htmldecode, + dedupe, ) from ..config import ( TIMEOUT, @@ -17,6 +18,7 @@ from ..config import ( SAVE_TITLE, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_VERSION, CURL_USER_AGENT, ) @@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - from core.models import Snapshot output: ArchiveOutput = None - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index f3057271..d50409b6 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -15,9 +15,11 @@ from ..util import ( path, domain, urldecode, + dedupe, ) from ..config import ( WGET_ARGS, + WGET_EXTRA_ARGS, TIMEOUT, SAVE_WGET, SAVE_WARC, @@ -55,10 +57,8 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html output: ArchiveOutput = None - cmd = [ - WGET_BINARY, - # '--server-response', # print headers for better error parsing - *WGET_ARGS, + # earlier options take precedence + options = [ '--timeout={}'.format(timeout), *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), @@ -68,6 +68,13 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), + # '--server-response', # print headers for better error parsing + *WGET_EXTRA_ARGS, + *WGET_ARGS, + ] + cmd = [ + WGET_BINARY, + *dedupe(*options), link.url, ] diff --git a/archivebox/util.py b/archivebox/util.py index 5321081c..6b31c86e 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -317,6 +317,23 @@ def ansi_to_html(text): return COLOR_REGEX.sub(single_sub, text) +@enforce_types +def dedupe(*options: List[str]) -> List[str]: + """ + Deduplicates the given options. 
Options that come earlier in the list clobber + later conflicting options. + """ + seen_option_names = [] + def test_seen(argument): + option_name = argument.split("=")[0] + if option_name in seen_option_names: + return False + else: + seen_option_names.append(option_name) + return True + return list(filter(test_seen, options)) + + class AttributeDict(dict): """Helper to allow accessing dict values via Example.key or Example['key']""" From ab8f395e0a4104dd01385be3d8fcea082a6987ee Mon Sep 17 00:00:00 2001 From: Ben Muthalaly <benmuthalaly@gmail.com> Date: Fri, 23 Feb 2024 15:40:31 -0600 Subject: [PATCH 02/19] Add `YOUTUBEDL_EXTRA_ARGS` --- archivebox/config.py | 1 + archivebox/extractors/media.py | 12 +++++++++--- archivebox/extractors/wget.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index ebb939a4..00e3b9f0 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -176,6 +176,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--add-metadata', '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']), ]}, + 'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None}, 'WGET_ARGS': {'type': list, 'default': ['--no-verbose', diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 7d73024f..862bb758 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -8,11 +8,13 @@ from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, + dedupe, ) from ..config import ( MEDIA_TIMEOUT, SAVE_MEDIA, YOUTUBEDL_ARGS, + YOUTUBEDL_EXTRA_ARGS, YOUTUBEDL_BINARY, YOUTUBEDL_VERSION, CHECK_SSL_VALIDITY @@ -39,11 +41,15 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME output: ArchiveOutput = 'media' output_path = out_dir / output output_path.mkdir(exist_ok=True) - cmd = [ - YOUTUBEDL_BINARY, - *YOUTUBEDL_ARGS, + options = [ *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} + *YOUTUBEDL_EXTRA_ARGS, + *YOUTUBEDL_ARGS, + ] + cmd = [ + YOUTUBEDL_BINARY, + *dedupe(*options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index d50409b6..5209cde9 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -69,7 +69,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), # '--server-response', # print headers for better error parsing - *WGET_EXTRA_ARGS, + *WGET_EXTRA_ARGS, *WGET_ARGS, ] cmd = [ From 4d9c5a7b4b0bc0f490b6d8928878853fad363d16 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly <benmuthalaly@gmail.com> Date: Fri, 23 Feb 2024 18:40:03 -0600 Subject: [PATCH 03/19] Add `CHROME_EXTRA_ARGS` Also fix `YOUTUBEDL_EXTRA_ARGS`. 
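
A rough sketch of how the new `dedupe()` helper from the first patch is intended to combine the built-in `*_ARGS` with the user-supplied `*_EXTRA_ARGS` across these `EXTRA_ARGS` patches; at this point in the series earlier entries win, so the extractors list the extra args first (a later patch flips that order). The config values here are hypothetical, for illustration only:

    from archivebox.util import dedupe

    # hypothetical config values, not the real defaults
    WGET_ARGS = ['--no-verbose', '--adjust-extension', '--restrict-file-names=windows']
    WGET_EXTRA_ARGS = ['--restrict-file-names=nocontrol']  # hypothetical user override

    # earlier options take precedence, so the user-supplied extras are listed first
    options = [*WGET_EXTRA_ARGS, *WGET_ARGS]
    print(dedupe(*options))
    # -> ['--restrict-file-names=nocontrol', '--no-verbose', '--adjust-extension']
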
--- archivebox/config.py | 4 ++++ archivebox/util.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 00e3b9f0..f8e56036 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -152,6 +152,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'CHROME_TIMEOUT': {'type': int, 'default': 0}, 'CHROME_HEADLESS': {'type': bool, 'default': True}, 'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']}, + 'CHROME_EXTRA_ARGS': {'type': list, 'default': None}, + 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [ '--restrict-filenames', '--trim-filenames', '128', @@ -568,6 +570,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None}, 'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']}, 'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []}, + 'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []}, 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()}, 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])}, @@ -589,6 +592,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)}, 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, 'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)}, + 'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []}, 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}}, 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}}, } diff --git a/archivebox/util.py b/archivebox/util.py index 6b31c86e..18ca08aa 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -227,7 +227,11 @@ def chrome_args(**options) -> List[str]: # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/ - from .config import CHROME_OPTIONS, CHROME_VERSION + from .config import ( + CHROME_OPTIONS, + CHROME_VERSION, + CHROME_EXTRA_ARGS, + ) options = {**CHROME_OPTIONS, **options} @@ -279,8 +283,10 @@ def chrome_args(**options) -> List[str]: if options['CHROME_USER_DATA_DIR']: cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - - return cmd_args + + cmd_args += CHROME_EXTRA_ARGS + + return dedupe(*cmd_args) def chrome_cleanup(): """ From 22f9a289d399de5dda1de624ef92f93969f1473e Mon Sep 17 00:00:00 2001 From: jim winstead <jimw@trainedmonkey.com> Date: Sun, 25 Feb 2024 12:34:51 -0800 Subject: [PATCH 04/19] Use feedparser for RSS parsing in generic_rss and pinboard_rss parsers The feedparser packages has 20 years of history and is very good at parsing RSS and Atom, so use that instead of ad-hoc regex and XML parsing. The medium_rss and shaarli_rss parsers weren't touched because they are probably unnecessary. (The special parse for pinboard is just needing because of how tags work.) Doesn't include tests because I haven't figured out how to run them in the docker development setup. 
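
As a rough illustration of what the rewrite leans on: feedparser normalizes both RSS and Atom into the same entry shape, so a single code path can read links, titles, timestamps, and tags from either format. The sample feed string below is made up for illustration:

    from time import mktime
    from feedparser import parse as feedparser

    feed = feedparser(
        '<rss version="2.0"><channel><item>'
        '<title>Example post</title>'
        '<link>https://example.com/post</link>'
        '<pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>'
        '</item></channel></rss>'
    )
    for entry in feed.entries:
        print(entry.title, entry.link)
        tags = ','.join(tag.term for tag in getattr(entry, 'tags', []))
        # parsed dates come back as time.struct_time; the parsers convert them with mktime()
        ts = mktime(entry.updated_parsed) if getattr(entry, 'updated_parsed', None) else None
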
Fixes #1171 --- archivebox/parsers/generic_rss.py | 44 ++++++++++++------------------ archivebox/parsers/pinboard_rss.py | 39 ++++++++++---------------- pyproject.toml | 1 + 3 files changed, 34 insertions(+), 50 deletions(-) diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py index 4bd04967..005da688 100644 --- a/archivebox/parsers/generic_rss.py +++ b/archivebox/parsers/generic_rss.py @@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers' from typing import IO, Iterable -from datetime import datetime +from time import mktime +from feedparser import parse as feedparser from ..index.schema import Link from ..util import ( htmldecode, - enforce_types, - str_between, + enforce_types ) @enforce_types @@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: """Parse RSS XML-format files into links""" rss_file.seek(0) - items = rss_file.read().split('<item>') - items = items[1:] if items else [] - for item in items: - # example item: - # <item> - # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title> - # <category>Unread</category> - # <link>https://blog.sessionstack.com/how-javascript-works-inside</link> - # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid> - # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate> - # </item> + feed = feedparser(rss_file.read()) + for item in feed.entries: + url = item.link + title = item.title + time = mktime(item.updated_parsed) - trailing_removed = item.split('</item>', 1)[0] - leading_removed = trailing_removed.split('<item>', 1)[-1].strip() - rows = leading_removed.split('\n') + try: + tags = ','.join(map(lambda tag: tag.term, item.tags)) + except AttributeError: + tags = '' - def get_row(key): - return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] - - url = str_between(get_row('link'), '<link>', '</link>') - ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>') - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") - title = str_between(get_row('title'), '<![CDATA[', ']]').strip() + if url is None: + # Yielding a Link with no URL will + # crash on a URL validation assertion + continue yield Link( url=htmldecode(url), - timestamp=str(time.timestamp()), + timestamp=str(time), title=htmldecode(title) or None, - tags=None, + tags=tags, sources=[rss_file.name], ) diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py index d12b219c..8c4dbb16 100644 --- a/archivebox/parsers/pinboard_rss.py +++ b/archivebox/parsers/pinboard_rss.py @@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers' from typing import IO, Iterable -from datetime import datetime, timezone - -from xml.etree import ElementTree +from time import mktime +from feedparser import parse as feedparser from ..index.schema import Link from ..util import ( htmldecode, - enforce_types, + enforce_types ) - @enforce_types def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: """Parse Pinboard RSS feed files into links""" rss_file.seek(0) - root = ElementTree.parse(rss_file).getroot() - items = root.findall("{http://purl.org/rss/1.0/}item") - for item in items: - find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore + feed = feedparser(rss_file.read()) + for item in feed.entries: + url = item.link + # title will start with "[priv] " if pin was marked private. useful? 
+ title = item.title + time = mktime(item.updated_parsed) - url = find("{http://purl.org/rss/1.0/}link") - tags = find("{http://purl.org/dc/elements/1.1/}subject") - title = find("{http://purl.org/rss/1.0/}title") - ts_str = find("{http://purl.org/dc/elements/1.1/}date") + # all tags are in one entry.tags with spaces in it. annoying! + try: + tags = item.tags[0].term.replace(' ', ',') + except AttributeError: + tags = '' if url is None: # Yielding a Link with no URL will # crash on a URL validation assertion continue - # Pinboard includes a colon in its date stamp timezone offsets, which - # Python can't parse. Remove it: - if ts_str and ts_str[-3:-2] == ":": - ts_str = ts_str[:-3]+ts_str[-2:] - - if ts_str: - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - else: - time = datetime.now(timezone.utc) - yield Link( url=htmldecode(url), - timestamp=str(time.timestamp()), + timestamp=str(time), title=htmldecode(title) or None, tags=htmldecode(tags) or None, sources=[rss_file.name], diff --git a/pyproject.toml b/pyproject.toml index 0907858b..cb18a911 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "dateparser>=1.0.0", "django-extensions>=3.0.3", "django>=3.1.3,<3.2", + "feedparser>=6.0.11", "ipython>5.0.0", "mypy-extensions>=0.4.3", "python-crontab>=2.5.1", From 68326a60ee20e2a8831ae86e9867b352e0f74ca6 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly <benmuthalaly@gmail.com> Date: Tue, 27 Feb 2024 15:30:31 -0600 Subject: [PATCH 05/19] Add cookies file to http request in `download_url` --- archivebox/util.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 5321081c..2e1e4907 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -3,6 +3,7 @@ __package__ = 'archivebox' import re import requests import json as pyjson +import http.cookiejar from typing import List, Optional, Any from pathlib import Path @@ -164,13 +165,26 @@ def parse_date(date: Any) -> Optional[datetime]: @enforce_types def download_url(url: str, timeout: int=None) -> str: """Download the contents of a remote url and return the text""" - from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT + from .config import ( + TIMEOUT, + CHECK_SSL_VALIDITY, + WGET_USER_AGENT, + COOKIES_FILE, + ) timeout = timeout or TIMEOUT + + cookie_jar = http.cookiejar.MozillaCookieJar() + if COOKIES_FILE is not None: + cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True) + else: + cookie_jar = None + response = requests.get( url, headers={'User-Agent': WGET_USER_AGENT}, verify=CHECK_SSL_VALIDITY, timeout=timeout, + cookies=cookie_jar, ) content_type = response.headers.get('Content-Type', '') From 178e676e0f27704b1ead99c554f8a65426bc9ca8 Mon Sep 17 00:00:00 2001 From: jim winstead <jimw@trainedmonkey.com> Date: Tue, 27 Feb 2024 14:48:19 -0800 Subject: [PATCH 06/19] Fix JSON parser by not always mangling the input Rather than by assuming the JSON file we are parsing has junk at the beginning (which maybe only used to happen?), try parsing it as-is first, and then fall back to trying again after skipping the first line Fixes #1347 --- archivebox/parsers/generic_json.py | 19 ++++++-- tests/mock_server/templates/example.json | 1 + tests/mock_server/templates/example.json.bad | 2 + tests/test_add.py | 50 ++++++++++++++++++++ 4 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 tests/mock_server/templates/example.json create mode 100644 tests/mock_server/templates/example.json.bad diff --git 
a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py index daebb7c4..d8df70c3 100644 --- a/archivebox/parsers/generic_json.py +++ b/archivebox/parsers/generic_json.py @@ -18,9 +18,16 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: json_file.seek(0) - # sometimes the first line is a comment or filepath, so we get everything after the first { - json_file_json_str = '{' + json_file.read().split('{', 1)[-1] - links = json.loads(json_file_json_str) + try: + links = json.load(json_file) + except json.decoder.JSONDecodeError: + # sometimes the first line is a comment or other junk, so try without + json_file.seek(0) + first_line = json_file.readline() + #print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '') + links = json.load(json_file) + # we may fail again, which means we really don't know what to do + json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') for link in links: @@ -59,11 +66,15 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: elif link.get('name'): title = link['name'].strip() + tags = '' + if link.get('tags'): + tags = link.get('tags').replace(' ',',') + yield Link( url=htmldecode(url), timestamp=ts_str, title=htmldecode(title) or None, - tags=htmldecode(link.get('tags')) or '', + tags=htmldecode(tags), sources=[json_file.name], ) diff --git a/tests/mock_server/templates/example.json b/tests/mock_server/templates/example.json new file mode 100644 index 00000000..512febe5 --- /dev/null +++ b/tests/mock_server/templates/example.json @@ -0,0 +1 @@ +[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}] diff --git a/tests/mock_server/templates/example.json.bad b/tests/mock_server/templates/example.json.bad new file mode 100644 index 00000000..88d77757 --- /dev/null +++ b/tests/mock_server/templates/example.json.bad @@ -0,0 +1,2 @@ +this line would cause problems but --parser=json will actually skip it +[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}] diff --git a/tests/test_add.py b/tests/test_add.py index 331178fe..062de11e 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -91,3 +91,53 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process): assert (archived_item_path / "warc").exists() assert not (archived_item_path / "singlefile.html").exists() + +def test_json(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.json', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=json"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to 
another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=json"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags From ccabda4c7d17f064feb413e9268b7d0c4f02029f Mon Sep 17 00:00:00 2001 From: jim winstead <jimw@trainedmonkey.com> Date: Wed, 28 Feb 2024 17:38:49 -0800 Subject: [PATCH 07/19] Handle list of tags in JSON, and be more clever about comma vs. space --- archivebox/parsers/generic_json.py | 11 ++++++++--- tests/mock_server/templates/example.json | 7 ++++++- tests/test_add.py | 7 +++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py index d8df70c3..9d12a4ef 100644 --- a/archivebox/parsers/generic_json.py +++ b/archivebox/parsers/generic_json.py @@ -66,9 +66,14 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: elif link.get('name'): title = link['name'].strip() - tags = '' - if link.get('tags'): - tags = link.get('tags').replace(' ',',') + # if we have a list, join it with commas + tags = link.get('tags') + if type(tags) == list: + tags = ','.join(tags) + elif type(tags) == str: + # if there's no comma, assume it was space-separated + if ',' not in tags: + tags = tags.replace(' ', ',') yield Link( url=htmldecode(url), diff --git a/tests/mock_server/templates/example.json b/tests/mock_server/templates/example.json index 512febe5..6ee15597 100644 --- a/tests/mock_server/templates/example.json +++ b/tests/mock_server/templates/example.json @@ -1 +1,6 @@ -[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}] +[ +{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}, +{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"}, +{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 
2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]}, +{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"} +] diff --git a/tests/test_add.py b/tests/test_add.py index 062de11e..dd1307bb 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -110,12 +110,19 @@ def test_json(tmp_path, process, disable_extractors_dict): urls = list(map(lambda x: x[0], urls)) assert "http://127.0.0.1:8080/static/example.com.html" in urls + assert "http://127.0.0.1:8080/static/iana.org.html" in urls + assert "http://127.0.0.1:8080/static/shift_jis.html" in urls + assert "http://127.0.0.1:8080/static/title_og_with_html" in urls # if the following URL appears, we must have fallen back to another parser assert not "http://www.example.com/should-not-exist" in urls tags = list(map(lambda x: x[0], tags)) assert "Tag1" in tags assert "Tag2" in tags + assert "Tag3" in tags + assert "Tag4 with Space" in tags + assert "Tag5" in tags + assert "Tag6 with Space" in tags def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict): with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f: From fe11e1c2f47487b419497bac38aafbd433ed689a Mon Sep 17 00:00:00 2001 From: Nick Sweeting <git@sweeting.me> Date: Wed, 28 Feb 2024 18:19:44 -0800 Subject: [PATCH 08/19] check if COOKIE_FILE is file --- archivebox/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 2e1e4907..9b570ec9 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -174,7 +174,7 @@ def download_url(url: str, timeout: int=None) -> str: timeout = timeout or TIMEOUT cookie_jar = http.cookiejar.MozillaCookieJar() - if COOKIES_FILE is not None: + if COOKIES_FILE and Path(COOKIES_FILE).is_file(): cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True) else: cookie_jar = None From 89ab18c772b482a92ee8c3c9b4a7e93b80593d93 Mon Sep 17 00:00:00 2001 From: jim winstead <jimw@trainedmonkey.com> Date: Thu, 29 Feb 2024 18:15:06 -0800 Subject: [PATCH 09/19] Add generic_jsonl parser Resolves #1369 --- archivebox/parsers/__init__.py | 2 + archivebox/parsers/generic_json.py | 110 +++++++++--------- archivebox/parsers/generic_jsonl.py | 34 ++++++ .../templates/example-single.jsonl | 1 + tests/mock_server/templates/example.jsonl | 4 + tests/test_add.py | 70 +++++++++++ 6 files changed, 168 insertions(+), 53 deletions(-) create mode 100644 archivebox/parsers/generic_jsonl.py create mode 100644 tests/mock_server/templates/example-single.jsonl create mode 100644 tests/mock_server/templates/example.jsonl diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index c6f2f382..0cd39d8a 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -44,6 +44,7 @@ from . import medium_rss from . import netscape_html from . import generic_rss from . import generic_json +from . import generic_jsonl from . import generic_html from . import generic_txt from . 
import url_list @@ -63,6 +64,7 @@ PARSERS = { netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER), generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER), generic_json.KEY: (generic_json.NAME, generic_json.PARSER), + generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER), generic_html.KEY: (generic_html.NAME, generic_html.PARSER), # Catchall fallback parser diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py index 9d12a4ef..8b64f55e 100644 --- a/archivebox/parsers/generic_json.py +++ b/archivebox/parsers/generic_json.py @@ -11,6 +11,60 @@ from ..util import ( enforce_types, ) +# This gets used by generic_jsonl, too +def jsonObjectToLink(link: str, source: str): + json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') + + # example line + # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] + # Parse URL + url = link.get('href') or link.get('url') or link.get('URL') + if not url: + raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') + + # Parse the timestamp + ts_str = str(datetime.now(timezone.utc).timestamp()) + if link.get('timestamp'): + # chrome/ff histories use a very precise timestamp + ts_str = str(link['timestamp'] / 10000000) + elif link.get('time'): + ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) + elif link.get('created_at'): + ts_str = str(json_date(link['created_at']).timestamp()) + elif link.get('created'): + ts_str = str(json_date(link['created']).timestamp()) + elif link.get('date'): + ts_str = str(json_date(link['date']).timestamp()) + elif link.get('bookmarked'): + ts_str = str(json_date(link['bookmarked']).timestamp()) + elif link.get('saved'): + ts_str = str(json_date(link['saved']).timestamp()) + + # Parse the title + title = None + if link.get('title'): + title = link['title'].strip() + elif link.get('description'): + title = link['description'].replace(' — Readability', '').strip() + elif link.get('name'): + title = link['name'].strip() + + # if we have a list, join it with commas + tags = link.get('tags') + if type(tags) == list: + tags = ','.join(tags) + elif type(tags) == str: + # if there's no comma, assume it was space-separated + if ',' not in tags: + tags = tags.replace(' ', ',') + + return Link( + url=htmldecode(url), + timestamp=ts_str, + title=htmldecode(title) or None, + tags=htmldecode(tags), + sources=[source], + ) @enforce_types def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: @@ -20,6 +74,8 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: try: links = json.load(json_file) + if type(links) != list: + raise Exception('JSON parser expects list of objects, maybe this is JSONL?') except json.decoder.JSONDecodeError: # sometimes the first line is a comment or other junk, so try without json_file.seek(0) @@ -28,61 +84,9 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: links = json.load(json_file) # we may fail again, which means we really don't know what to do - json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') - for link in links: - # example line - # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title 
here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] if link: - # Parse URL - url = link.get('href') or link.get('url') or link.get('URL') - if not url: - raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') - - # Parse the timestamp - ts_str = str(datetime.now(timezone.utc).timestamp()) - if link.get('timestamp'): - # chrome/ff histories use a very precise timestamp - ts_str = str(link['timestamp'] / 10000000) - elif link.get('time'): - ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) - elif link.get('created_at'): - ts_str = str(json_date(link['created_at']).timestamp()) - elif link.get('created'): - ts_str = str(json_date(link['created']).timestamp()) - elif link.get('date'): - ts_str = str(json_date(link['date']).timestamp()) - elif link.get('bookmarked'): - ts_str = str(json_date(link['bookmarked']).timestamp()) - elif link.get('saved'): - ts_str = str(json_date(link['saved']).timestamp()) - - # Parse the title - title = None - if link.get('title'): - title = link['title'].strip() - elif link.get('description'): - title = link['description'].replace(' — Readability', '').strip() - elif link.get('name'): - title = link['name'].strip() - - # if we have a list, join it with commas - tags = link.get('tags') - if type(tags) == list: - tags = ','.join(tags) - elif type(tags) == str: - # if there's no comma, assume it was space-separated - if ',' not in tags: - tags = tags.replace(' ', ',') - - yield Link( - url=htmldecode(url), - timestamp=ts_str, - title=htmldecode(title) or None, - tags=htmldecode(tags), - sources=[json_file.name], - ) - + yield jsonObjectToLink(link,json_file.name) KEY = 'json' NAME = 'Generic JSON' diff --git a/archivebox/parsers/generic_jsonl.py b/archivebox/parsers/generic_jsonl.py new file mode 100644 index 00000000..8ee94b28 --- /dev/null +++ b/archivebox/parsers/generic_jsonl.py @@ -0,0 +1,34 @@ +__package__ = 'archivebox.parsers' + +import json + +from typing import IO, Iterable +from datetime import datetime, timezone + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, +) + +from .generic_json import jsonObjectToLink + +def parse_line(line: str): + if line.strip() != "": + return json.loads(line) + +@enforce_types +def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: + """Parse JSONL format bookmarks export files""" + + json_file.seek(0) + + links = [ parse_line(line) for line in json_file ] + + for link in links: + if link: + yield jsonObjectToLink(link,json_file.name) + +KEY = 'jsonl' +NAME = 'Generic JSONL' +PARSER = parse_generic_jsonl_export diff --git a/tests/mock_server/templates/example-single.jsonl b/tests/mock_server/templates/example-single.jsonl new file mode 100644 index 00000000..492c906d --- /dev/null +++ b/tests/mock_server/templates/example-single.jsonl @@ -0,0 +1 @@ +{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"} diff --git a/tests/mock_server/templates/example.jsonl b/tests/mock_server/templates/example.jsonl new file mode 100644 index 00000000..de0b3b5c --- /dev/null +++ b/tests/mock_server/templates/example.jsonl @@ -0,0 +1,4 @@ 
+{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"} +{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"} +{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]} +{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"} diff --git a/tests/test_add.py b/tests/test_add.py index dd1307bb..baeac4e9 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -148,3 +148,73 @@ def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict): tags = list(map(lambda x: x[0], tags)) assert "Tag1" in tags assert "Tag2" in tags + +def test_jsonl(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.jsonl', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=jsonl"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + assert "http://127.0.0.1:8080/static/iana.org.html" in urls + assert "http://127.0.0.1:8080/static/shift_jis.html" in urls + assert "http://127.0.0.1:8080/static/title_og_with_html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + assert "Tag3" in tags + assert "Tag4 with Space" in tags + assert "Tag5" in tags + assert "Tag6 with Space" in tags + +def test_jsonl_single(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=jsonl"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +# make sure that JSON parser rejects a single line of JSONL which is valid +# JSON but not our 
expected format +def test_json_single(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=json"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + assert 'expects list of objects' in arg_process.stderr.decode("utf-8") From 1f828d94410eded4e23ee8778a2d6151a4c89c8c Mon Sep 17 00:00:00 2001 From: jim winstead <jimw@trainedmonkey.com> Date: Fri, 1 Mar 2024 11:22:28 -0800 Subject: [PATCH 10/19] Add tests for generic_rss and pinboard_rss parsers --- tests/mock_server/templates/example.atom | 24 +++++++++ tests/mock_server/templates/example.rss | 32 +++++++++++ tests/test_add.py | 68 ++++++++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 tests/mock_server/templates/example.atom create mode 100644 tests/mock_server/templates/example.rss diff --git a/tests/mock_server/templates/example.atom b/tests/mock_server/templates/example.atom new file mode 100644 index 00000000..9d71abb1 --- /dev/null +++ b/tests/mock_server/templates/example.atom @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="utf-8"?> +<feed + xml:lang="en" + xmlns="http://www.w3.org/2005/Atom" +> + <id>http://www.example.com/</id> + <title>Example of an Atom feed</title> + <link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" /> + <link rel="alternate" type="text/html" href="http://www.example.com/" /> + <author> + <name>Jim Winstead</name> + </author> + <updated>2024-02-26T03:18:26Z</updated> + <entry> + <title>Example</title> + <link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" /> + <id>tag:example.com,2024-02-25:3319</id> + <updated>2024-02-26T03:18:26Z</updated> + <published>2024-02-25T19:18:25-08:00</published> + <category term="Tag1" scheme="http://example.com/archive" /> + <category term="Tag2" scheme="http://example.com/archive" /> + <content type="html">This is some <b>content</b></content> + </entry> +</feed> diff --git a/tests/mock_server/templates/example.rss b/tests/mock_server/templates/example.rss new file mode 100644 index 00000000..d47a5a38 --- /dev/null +++ b/tests/mock_server/templates/example.rss @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="utf-8"?> +<rss version="2.0" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:admin="http://webns.net/mvcb/" + xmlns:content="http://purl.org/rss/1.0/modules/content/" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> +<channel> + <title>Sample Feed</title> + <link>http://example.org/</link> + <description>For documentation only</description> + <dc:language>en-us</dc:language> + <dc:creator>Nobody (nobody@example.org)</dc:creator> + <dc:rights>Public domain</dc:rights> + <dc:date>2024-02-26T17:28:12-08:00</dc:date> + <admin:generatorAgent rdf:resource="http://www.example.org/"/> + <admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/> + + <item> + <title>First!</title> + <link>http://127.0.0.1:8080/static/example.com.html</link> + <guid isPermaLink="false">just-an@example.org</guid> + <description> + This has a description. 
+ </description> + <dc:subject>Tag1 Tag2</dc:subject> + <dc:date>2024-02-26T17:28:12-08:00</dc:date> + <content:encoded><![CDATA[ + This has a <b>description</b>.]]> + </content:encoded> + </item> +</channel> +</rss> diff --git a/tests/test_add.py b/tests/test_add.py index 331178fe..9675f361 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -91,3 +91,71 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process): assert (archived_item_path / "warc").exists() assert not (archived_item_path / "singlefile.html").exists() + +def test_generic_rss(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=rss"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://purl.org/dc/elements/1.1/" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1 Tag2" in tags + +def test_pinboard_rss(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=pinboard_rss"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +def test_atom(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=rss"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.w3.org/2005/Atom" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags From 9f462a87a8f021b5497dd75208b044dbe1c4ce40 Mon Sep 17 00:00:00 2001 From: jim winstead <jimw@trainedmonkey.com> Date: Sun, 25 Feb 2024 12:34:51 -0800 Subject: [PATCH 11/19] Use feedparser for RSS parsing in generic_rss and pinboard_rss parsers The feedparser packages has 20 years of history and is very good at parsing RSS and Atom, so use that instead of ad-hoc regex and XML parsing. The medium_rss and shaarli_rss parsers weren't touched because they are probably unnecessary. (The special parse for pinboard is just needing because of how tags work.) Doesn't include tests because I haven't figured out how to run them in the docker development setup. 
Fixes #1171 --- archivebox/parsers/generic_rss.py | 44 ++++++++++++------------------ archivebox/parsers/pinboard_rss.py | 39 ++++++++++---------------- pyproject.toml | 1 + 3 files changed, 34 insertions(+), 50 deletions(-) diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py index 4bd04967..005da688 100644 --- a/archivebox/parsers/generic_rss.py +++ b/archivebox/parsers/generic_rss.py @@ -2,13 +2,13 @@ __package__ = 'archivebox.parsers' from typing import IO, Iterable -from datetime import datetime +from time import mktime +from feedparser import parse as feedparser from ..index.schema import Link from ..util import ( htmldecode, - enforce_types, - str_between, + enforce_types ) @enforce_types @@ -16,35 +16,27 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: """Parse RSS XML-format files into links""" rss_file.seek(0) - items = rss_file.read().split('<item>') - items = items[1:] if items else [] - for item in items: - # example item: - # <item> - # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title> - # <category>Unread</category> - # <link>https://blog.sessionstack.com/how-javascript-works-inside</link> - # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid> - # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate> - # </item> + feed = feedparser(rss_file.read()) + for item in feed.entries: + url = item.link + title = item.title + time = mktime(item.updated_parsed) - trailing_removed = item.split('</item>', 1)[0] - leading_removed = trailing_removed.split('<item>', 1)[-1].strip() - rows = leading_removed.split('\n') + try: + tags = ','.join(map(lambda tag: tag.term, item.tags)) + except AttributeError: + tags = '' - def get_row(key): - return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] - - url = str_between(get_row('link'), '<link>', '</link>') - ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>') - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") - title = str_between(get_row('title'), '<![CDATA[', ']]').strip() + if url is None: + # Yielding a Link with no URL will + # crash on a URL validation assertion + continue yield Link( url=htmldecode(url), - timestamp=str(time.timestamp()), + timestamp=str(time), title=htmldecode(title) or None, - tags=None, + tags=tags, sources=[rss_file.name], ) diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py index d12b219c..8c4dbb16 100644 --- a/archivebox/parsers/pinboard_rss.py +++ b/archivebox/parsers/pinboard_rss.py @@ -2,50 +2,41 @@ __package__ = 'archivebox.parsers' from typing import IO, Iterable -from datetime import datetime, timezone - -from xml.etree import ElementTree +from time import mktime +from feedparser import parse as feedparser from ..index.schema import Link from ..util import ( htmldecode, - enforce_types, + enforce_types ) - @enforce_types def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: """Parse Pinboard RSS feed files into links""" rss_file.seek(0) - root = ElementTree.parse(rss_file).getroot() - items = root.findall("{http://purl.org/rss/1.0/}item") - for item in items: - find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore + feed = feedparser(rss_file.read()) + for item in feed.entries: + url = item.link + # title will start with "[priv] " if pin was marked private. useful? 
+ title = item.title + time = mktime(item.updated_parsed) - url = find("{http://purl.org/rss/1.0/}link") - tags = find("{http://purl.org/dc/elements/1.1/}subject") - title = find("{http://purl.org/rss/1.0/}title") - ts_str = find("{http://purl.org/dc/elements/1.1/}date") + # all tags are in one entry.tags with spaces in it. annoying! + try: + tags = item.tags[0].term.replace(' ', ',') + except AttributeError: + tags = '' if url is None: # Yielding a Link with no URL will # crash on a URL validation assertion continue - # Pinboard includes a colon in its date stamp timezone offsets, which - # Python can't parse. Remove it: - if ts_str and ts_str[-3:-2] == ":": - ts_str = ts_str[:-3]+ts_str[-2:] - - if ts_str: - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - else: - time = datetime.now(timezone.utc) - yield Link( url=htmldecode(url), - timestamp=str(time.timestamp()), + timestamp=str(time), title=htmldecode(title) or None, tags=htmldecode(tags) or None, sources=[rss_file.name], diff --git a/pyproject.toml b/pyproject.toml index 0907858b..cb18a911 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "dateparser>=1.0.0", "django-extensions>=3.0.3", "django>=3.1.3,<3.2", + "feedparser>=6.0.11", "ipython>5.0.0", "mypy-extensions>=0.4.3", "python-crontab>=2.5.1", From e7119adb0b1ff4b950bd61f88a69f8cf9f8ed145 Mon Sep 17 00:00:00 2001 From: jim winstead <jimw@trainedmonkey.com> Date: Fri, 1 Mar 2024 11:27:59 -0800 Subject: [PATCH 12/19] Add tests for generic_rss and pinboard_rss parsers --- tests/mock_server/templates/example.atom | 24 ++++++++++++ tests/mock_server/templates/example.rss | 32 ++++++++++++++++ tests/test_add.py | 49 ++++++++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100644 tests/mock_server/templates/example.atom create mode 100644 tests/mock_server/templates/example.rss diff --git a/tests/mock_server/templates/example.atom b/tests/mock_server/templates/example.atom new file mode 100644 index 00000000..9d71abb1 --- /dev/null +++ b/tests/mock_server/templates/example.atom @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="utf-8"?> +<feed + xml:lang="en" + xmlns="http://www.w3.org/2005/Atom" +> + <id>http://www.example.com/</id> + <title>Example of an Atom feed</title> + <link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" /> + <link rel="alternate" type="text/html" href="http://www.example.com/" /> + <author> + <name>Jim Winstead</name> + </author> + <updated>2024-02-26T03:18:26Z</updated> + <entry> + <title>Example</title> + <link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" /> + <id>tag:example.com,2024-02-25:3319</id> + <updated>2024-02-26T03:18:26Z</updated> + <published>2024-02-25T19:18:25-08:00</published> + <category term="Tag1" scheme="http://example.com/archive" /> + <category term="Tag2" scheme="http://example.com/archive" /> + <content type="html">This is some <b>content</b></content> + </entry> +</feed> diff --git a/tests/mock_server/templates/example.rss b/tests/mock_server/templates/example.rss new file mode 100644 index 00000000..d47a5a38 --- /dev/null +++ b/tests/mock_server/templates/example.rss @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="utf-8"?> +<rss version="2.0" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:admin="http://webns.net/mvcb/" + xmlns:content="http://purl.org/rss/1.0/modules/content/" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> +<channel> + <title>Sample Feed</title> + 
<link>http://example.org/</link>
+  <description>For documentation only</description>
+  <dc:language>en-us</dc:language>
+  <dc:creator>Nobody (nobody@example.org)</dc:creator>
+  <dc:rights>Public domain</dc:rights>
+  <dc:date>2024-02-26T17:28:12-08:00</dc:date>
+  <admin:generatorAgent rdf:resource="http://www.example.org/"/>
+  <admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>
+
+  <item>
+    <title>First!</title>
+    <link>http://127.0.0.1:8080/static/example.com.html</link>
+    <guid isPermaLink="false">just-an@example.org</guid>
+    <description>
+      This has a description.
+    </description>
+    <dc:subject>Tag1 Tag2</dc:subject>
+    <dc:date>2024-02-26T17:28:12-08:00</dc:date>
+    <content:encoded><![CDATA[
+      This has a <b>description</b>.]]>
+    </content:encoded>
+  </item>
+</channel>
+</rss>
diff --git a/tests/test_add.py b/tests/test_add.py
index dd1307bb..37f666c9 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -148,3 +148,71 @@ def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
     tags = list(map(lambda x: x[0], tags))
     assert "Tag1" in tags
     assert "Tag2" in tags
+
+def test_generic_rss(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://purl.org/dc/elements/1.1/" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1 Tag2" in tags
+
+def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.rss', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags
+
+def test_atom(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.atom', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=rss"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://www.w3.org/2005/Atom" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags

From 741ff5f1a864ab6833ed98da5ff7a3be79cbee52 Mon Sep 17 00:00:00 2001
From: jim winstead <jimw@trainedmonkey.com>
Date: Fri, 1 Mar 2024 11:27:59 -0800
Subject: [PATCH 13/19] Make it a little easier to run specific tests

Changes ./bin/test.sh to pass its command line options through to pytest,
and defaults to only running tests in the tests/ directory instead of
searching everywhere and excluding a few directories, which was more
error-prone.

Also keeps the mock_server used in testing quiet so access log entries
don't appear on stdout.
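The mock server used by these tests is a small Bottle app, and Bottle's built-in run() accepts a quiet flag that suppresses its per-request access log. A minimal sketch of a quiet fixture server along those lines (the route and fixture directory here are illustrative, not the project's actual server.py):

    from bottle import Bottle, static_file, run

    app = Bottle()

    @app.route('/static/<filename>')
    def serve_static(filename):
        # serve the HTML fixtures that the add/parse tests archive against
        return static_file(filename, root='./templates/')

    if __name__ == '__main__':
        # quiet=True drops the per-request access log lines from stdout,
        # keeping pytest output readable
        run(app, host='localhost', port=8080, quiet=True)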
--- bin/test.sh | 2 +- pyproject.toml | 2 ++ tests/mock_server/server.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/test.sh b/bin/test.sh index f9ea3575..515806bb 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -14,4 +14,4 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" source "$DIR/.venv/bin/activate" -pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist +pytest -s --basetemp=tests/out "$@" diff --git a/pyproject.toml b/pyproject.toml index 0907858b..f5f7dc4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,8 @@ lint = "./bin/lint.sh" test = "./bin/test.sh" # all = {composite = ["lint mypackage/", "test -v tests/"]} +[tool.pytest.ini_options] +testpaths = [ "tests" ] [project.scripts] archivebox = "archivebox.cli:main" diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py index 4283574f..39abd80c 100644 --- a/tests/mock_server/server.py +++ b/tests/mock_server/server.py @@ -50,4 +50,4 @@ def redirect_to_static(filename): def start(): - run(host='localhost', port=8080) \ No newline at end of file + run(host='localhost', port=8080, quiet=True) From d74ddd42ae104004e656929036c55f972a9d63d4 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly <benmuthalaly@gmail.com> Date: Fri, 1 Mar 2024 14:50:32 -0600 Subject: [PATCH 14/19] Flip dedupe precedence order --- archivebox/extractors/archive_org.py | 6 +++--- archivebox/extractors/favicon.py | 6 +++--- archivebox/extractors/headers.py | 6 +++--- archivebox/extractors/media.py | 5 +++-- archivebox/extractors/singlefile.py | 14 ++++---------- archivebox/extractors/title.py | 6 +++--- archivebox/extractors/wget.py | 6 +++--- archivebox/util.py | 24 +++++++++++------------- 8 files changed, 33 insertions(+), 40 deletions(-) diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 93730f26..0d45534a 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -46,14 +46,14 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= output: ArchiveOutput = 'archive.org.txt' archive_org_url = None submit_url = 'https://web.archive.org/save/{}'.format(link.url) - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 3b41f349..fffa3d16 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -39,14 +39,14 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) out_dir = out_dir or link.link_dir output: ArchiveOutput = 'favicon.ico' - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--max-time', str(timeout), '--output', str(output), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 3828de93..9be14331 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -42,14 +42,14 @@ def 
save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 862bb758..a6d4e81f 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -41,11 +41,12 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME output: ArchiveOutput = 'media' output_path = out_dir / output output_path.mkdir(exist_ok=True) + # later options take precedence options = [ + *YOUTUBEDL_ARGS, + *YOUTUBEDL_EXTRA_ARGS, *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} - *YOUTUBEDL_EXTRA_ARGS, - *YOUTUBEDL_ARGS, ] cmd = [ YOUTUBEDL_BINARY, diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index b2119119..5021a6cc 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -48,18 +48,12 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - - # Deduplicate options (single-file doesn't like when you use the same option two times) - # - # NOTE: Options names that come first clobber conflicting names that come later - # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most - # specificity, therefore the user sets it with a lot intent, therefore it should take precedence - # kind of like the ergonomic principle of lexical scope in programming languages. 
+ # later options take precedence options = [ - '--browser-executable-path={}'.format(CHROME_BINARY), - browser_args, - *SINGLEFILE_EXTRA_ARGS, *SINGLEFILE_ARGS, + *SINGLEFILE_EXTRA_ARGS, + browser_args, + '--browser-executable-path={}'.format(CHROME_BINARY), ] cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index b2b65af2..4f34ca81 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -104,13 +104,13 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - from core.models import Snapshot output: ArchiveOutput = None - # earlier options take precedence + # later options take precedence options = [ + *CURL_ARGS, + *CURL_EXTRA_ARGS, '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), - *CURL_EXTRA_ARGS, - *CURL_ARGS, ] cmd = [ CURL_BINARY, diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 5209cde9..885e31f5 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -57,8 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html output: ArchiveOutput = None - # earlier options take precedence + # later options take precedence options = [ + *WGET_ARGS, + *WGET_EXTRA_ARGS, '--timeout={}'.format(timeout), *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), @@ -69,8 +71,6 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), # '--server-response', # print headers for better error parsing - *WGET_EXTRA_ARGS, - *WGET_ARGS, ] cmd = [ WGET_BINARY, diff --git a/archivebox/util.py b/archivebox/util.py index 18ca08aa..10ceebd4 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -240,6 +240,8 @@ def chrome_args(**options) -> List[str]: cmd_args = [options['CHROME_BINARY']] + cmd_args += CHROME_EXTRA_ARGS + if options['CHROME_HEADLESS']: chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1]) if chrome_major_version >= 111: @@ -284,7 +286,6 @@ def chrome_args(**options) -> List[str]: if options['CHROME_USER_DATA_DIR']: cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - cmd_args += CHROME_EXTRA_ARGS return dedupe(*cmd_args) @@ -324,20 +325,17 @@ def ansi_to_html(text): @enforce_types -def dedupe(*options: List[str]) -> List[str]: +def dedupe(*options: str) -> List[str]: """ - Deduplicates the given options. Options that come earlier in the list clobber - later conflicting options. + Deduplicates the given options. Options that come later clobber earlier + conflicting options. 
""" - seen_option_names = [] - def test_seen(argument): - option_name = argument.split("=")[0] - if option_name in seen_option_names: - return False - else: - seen_option_names.append(option_name) - return True - return list(filter(test_seen, options)) + deduped = {} + + for option in options: + deduped[option.split('=')[0]] = option + + return list(deduped.values()) class AttributeDict(dict): From 4686da91e6b11661c0e57397fe86886416d965d5 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly <benmuthalaly@gmail.com> Date: Tue, 5 Mar 2024 01:48:35 -0600 Subject: [PATCH 15/19] Fix cookies being set incorrectly --- archivebox/util.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/archivebox/util.py b/archivebox/util.py index 2e1e4907..461141c3 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -166,25 +166,25 @@ def parse_date(date: Any) -> Optional[datetime]: def download_url(url: str, timeout: int=None) -> str: """Download the contents of a remote url and return the text""" from .config import ( - TIMEOUT, - CHECK_SSL_VALIDITY, - WGET_USER_AGENT, - COOKIES_FILE, + TIMEOUT, + CHECK_SSL_VALIDITY, + WGET_USER_AGENT, + COOKIES_FILE, ) timeout = timeout or TIMEOUT + session = requests.Session() - cookie_jar = http.cookiejar.MozillaCookieJar() - if COOKIES_FILE is not None: - cookie_jar.load(COOKIES_FILE, ignore_discard=True, ignore_expires=True) - else: - cookie_jar = None + if COOKIES_FILE and Path(COOKIES_FILE).is_file(): + cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE) + cookie_jar.load(ignore_discard=True, ignore_expires=True) + for cookie in cookie_jar: + session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path) - response = requests.get( + response = session.get( url, headers={'User-Agent': WGET_USER_AGENT}, verify=CHECK_SSL_VALIDITY, timeout=timeout, - cookies=cookie_jar, ) content_type = response.headers.get('Content-Type', '') From d8cf09c21e2d6e3ece8a7e5c93d537596c3687d0 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly <benmuthalaly@gmail.com> Date: Tue, 5 Mar 2024 21:13:45 -0600 Subject: [PATCH 16/19] Remove unnecessary variable length args for dedupe --- archivebox/extractors/archive_org.py | 2 +- archivebox/extractors/favicon.py | 2 +- archivebox/extractors/headers.py | 2 +- archivebox/extractors/media.py | 2 +- archivebox/extractors/singlefile.py | 2 +- archivebox/extractors/title.py | 2 +- archivebox/extractors/wget.py | 2 +- archivebox/util.py | 4 ++-- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 0d45534a..245315f1 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -57,7 +57,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= ] cmd = [ CURL_BINARY, - *dedupe(*options), + *dedupe(options), submit_url, ] status = 'succeeded' diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index fffa3d16..f793f8df 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -50,7 +50,7 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ] cmd = [ CURL_BINARY, - *dedupe(*options), + *dedupe(options), FAVICON_PROVIDER.format(domain(link.url)), ] status = 'failed' diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 9be14331..975787ad 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ 
-53,7 +53,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ] cmd = [ CURL_BINARY, - *dedupe(*options), + *dedupe(options), link.url, ] try: diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index a6d4e81f..ad4c9c4b 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -50,7 +50,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME ] cmd = [ YOUTUBEDL_BINARY, - *dedupe(*options), + *dedupe(options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 5021a6cc..553c9f8d 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -57,7 +57,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO ] cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], - *dedupe(*options), + *dedupe(options), link.url, output, ] diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 4f34ca81..5decc52c 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -114,7 +114,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - ] cmd = [ CURL_BINARY, - *dedupe(*options), + *dedupe(options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 885e31f5..07471e29 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -74,7 +74,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ] cmd = [ WGET_BINARY, - *dedupe(*options), + *dedupe(options), link.url, ] diff --git a/archivebox/util.py b/archivebox/util.py index 10ceebd4..e1707049 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -287,7 +287,7 @@ def chrome_args(**options) -> List[str]: cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - return dedupe(*cmd_args) + return dedupe(cmd_args) def chrome_cleanup(): """ @@ -325,7 +325,7 @@ def ansi_to_html(text): @enforce_types -def dedupe(*options: str) -> List[str]: +def dedupe(options: List[str]) -> List[str]: """ Deduplicates the given options. Options that come later clobber earlier conflicting options. 
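The net effect of patches 14 and 16 is a simpler contract for dedupe(): it takes a single list, and later entries overwrite earlier entries that share an option name, so extractor defaults come first and user-supplied *_EXTRA_ARGS come later. A small standalone sketch of that behavior, with the helper body copied from the diff above (minus the @enforce_types decorator) and made-up option values:

    from typing import List

    def dedupe(options: List[str]) -> List[str]:
        """Deduplicate options; later options clobber earlier conflicting ones."""
        deduped = {}
        for option in options:
            # key on the option name, so '--max-time=30' and '--max-time=120' collide
            deduped[option.split('=')[0]] = option
        return list(deduped.values())

    # Defaults first, user-supplied extras later, so the extras win on conflicts:
    print(dedupe(['--max-time=30', '--location', '--max-time=120']))
    # -> ['--max-time=120', '--location']

This is why the extractors now build their option lists as [*ARGS, *EXTRA_ARGS, ...computed flags]: whatever the user puts in the EXTRA settings wins over the shipped defaults, while the flags the extractor itself computes still come last.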
From f4deb97f59abffae4faa5f93a5108c9f28cb09f3 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly <benmuthalaly@gmail.com> Date: Tue, 5 Mar 2024 21:15:38 -0600 Subject: [PATCH 17/19] Add `ARGS` and `EXTRA_ARGS` for Mercury extractor --- archivebox/config.py | 4 ++++ archivebox/extractors/mercury.py | 14 ++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index f8e56036..64b07931 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -199,6 +199,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, 'SINGLEFILE_ARGS': {'type': list, 'default': None}, 'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None}, + 'MERCURY_ARGS': {'type': list, 'default': ['--format=text']}, + 'MERCURY_EXTRA_ARGS': {'type': list, 'default': None}, 'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'}, }, @@ -561,6 +563,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750 + 'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []}, + 'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []}, 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index e7d20362..a0f38434 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -11,13 +11,15 @@ from ..system import run, atomic_write from ..util import ( enforce_types, is_static_file, - + dedupe, ) from ..config import ( TIMEOUT, SAVE_MERCURY, DEPENDENCIES, MERCURY_VERSION, + MERCURY_ARGS, + MERCURY_EXTRA_ARGS, ) from ..logging_util import TimedProgress @@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) timer = TimedProgress(timeout, prefix=' ') try: output_folder.mkdir(exist_ok=True) - - # Get plain text version of article + # later options take precedence + options = [ + *MERCURY_ARGS, + *MERCURY_EXTRA_ARGS, + ] + # By default, get plain text version of article cmd = [ DEPENDENCIES['MERCURY_BINARY']['path'], link.url, - "--format=text" + *dedupe(options) ] result = run(cmd, cwd=out_dir, timeout=timeout) try: From 3512dc7e606e67b126100dc8bb2d56874c9025c5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting <git@sweeting.me> Date: Thu, 14 Mar 2024 00:58:45 -0700 Subject: [PATCH 18/19] Disable searching for existing chrome user profiles by default --- archivebox/config.py | 53 ++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 1edd2eeb..fad2db53 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -500,7 +500,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME}, 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME}, 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()}, - 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if 
c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None + 'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None}, 'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)}, 'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)}, 'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories @@ -910,27 +910,36 @@ def find_chrome_binary() -> Optional[str]: def find_chrome_data_dir() -> Optional[str]: """find any installed chrome user data directories in the default locations""" - # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev - # make sure data dir finding precedence order always matches binary finding order - default_profile_paths = ( - '~/.config/chromium', - '~/Library/Application Support/Chromium', - '~/AppData/Local/Chromium/User Data', - '~/.config/chrome', - '~/.config/google-chrome', - '~/Library/Application Support/Google/Chrome', - '~/AppData/Local/Google/Chrome/User Data', - '~/.config/google-chrome-stable', - '~/.config/google-chrome-beta', - '~/Library/Application Support/Google/Chrome Canary', - '~/AppData/Local/Google/Chrome SxS/User Data', - '~/.config/google-chrome-unstable', - '~/.config/google-chrome-dev', - ) - for path in default_profile_paths: - full_path = Path(path).resolve() - if full_path.exists(): - return full_path + # deprecated because this is DANGEROUS, do not re-implement/uncomment this behavior. + + # Going forward we want to discourage people from using their main chrome profile for archiving. + # Session tokens, personal data, and cookies are often returned in server responses, + # when they get archived, they are essentially burned as anyone who can view the archive + # can use that data to masquerade as the logged-in user that did the archiving. + # For this reason users should always create dedicated burner profiles for archiving and not use + # their daily driver main accounts. 
+ + # # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev + # # make sure data dir finding precedence order always matches binary finding order + # default_profile_paths = ( + # '~/.config/chromium', + # '~/Library/Application Support/Chromium', + # '~/AppData/Local/Chromium/User Data', + # '~/.config/chrome', + # '~/.config/google-chrome', + # '~/Library/Application Support/Google/Chrome', + # '~/AppData/Local/Google/Chrome/User Data', + # '~/.config/google-chrome-stable', + # '~/.config/google-chrome-beta', + # '~/Library/Application Support/Google/Chrome Canary', + # '~/AppData/Local/Google/Chrome SxS/User Data', + # '~/.config/google-chrome-unstable', + # '~/.config/google-chrome-dev', + # ) + # for path in default_profile_paths: + # full_path = Path(path).resolve() + # if full_path.exists(): + # return full_path return None def wget_supports_compression(config): From 5478d13d5254a2443a3a32645a6bb3118bfa7b8a Mon Sep 17 00:00:00 2001 From: jim winstead <jimw@trainedmonkey.com> Date: Thu, 29 Feb 2024 18:15:06 -0800 Subject: [PATCH 19/19] Add generic_jsonl parser Resolves #1369 --- archivebox/parsers/__init__.py | 2 + archivebox/parsers/generic_json.py | 110 +++++++++--------- archivebox/parsers/generic_jsonl.py | 34 ++++++ .../templates/example-single.jsonl | 1 + tests/mock_server/templates/example.jsonl | 4 + tests/test_add.py | 70 +++++++++++ 6 files changed, 168 insertions(+), 53 deletions(-) create mode 100644 archivebox/parsers/generic_jsonl.py create mode 100644 tests/mock_server/templates/example-single.jsonl create mode 100644 tests/mock_server/templates/example.jsonl diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index c6f2f382..0cd39d8a 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -44,6 +44,7 @@ from . import medium_rss from . import netscape_html from . import generic_rss from . import generic_json +from . import generic_jsonl from . import generic_html from . import generic_txt from . 
import url_list @@ -63,6 +64,7 @@ PARSERS = { netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER), generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER), generic_json.KEY: (generic_json.NAME, generic_json.PARSER), + generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER), generic_html.KEY: (generic_html.NAME, generic_html.PARSER), # Catchall fallback parser diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py index 9d12a4ef..8b64f55e 100644 --- a/archivebox/parsers/generic_json.py +++ b/archivebox/parsers/generic_json.py @@ -11,6 +11,60 @@ from ..util import ( enforce_types, ) +# This gets used by generic_jsonl, too +def jsonObjectToLink(link: str, source: str): + json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') + + # example line + # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] + # Parse URL + url = link.get('href') or link.get('url') or link.get('URL') + if not url: + raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') + + # Parse the timestamp + ts_str = str(datetime.now(timezone.utc).timestamp()) + if link.get('timestamp'): + # chrome/ff histories use a very precise timestamp + ts_str = str(link['timestamp'] / 10000000) + elif link.get('time'): + ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) + elif link.get('created_at'): + ts_str = str(json_date(link['created_at']).timestamp()) + elif link.get('created'): + ts_str = str(json_date(link['created']).timestamp()) + elif link.get('date'): + ts_str = str(json_date(link['date']).timestamp()) + elif link.get('bookmarked'): + ts_str = str(json_date(link['bookmarked']).timestamp()) + elif link.get('saved'): + ts_str = str(json_date(link['saved']).timestamp()) + + # Parse the title + title = None + if link.get('title'): + title = link['title'].strip() + elif link.get('description'): + title = link['description'].replace(' — Readability', '').strip() + elif link.get('name'): + title = link['name'].strip() + + # if we have a list, join it with commas + tags = link.get('tags') + if type(tags) == list: + tags = ','.join(tags) + elif type(tags) == str: + # if there's no comma, assume it was space-separated + if ',' not in tags: + tags = tags.replace(' ', ',') + + return Link( + url=htmldecode(url), + timestamp=ts_str, + title=htmldecode(title) or None, + tags=htmldecode(tags), + sources=[source], + ) @enforce_types def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: @@ -20,6 +74,8 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: try: links = json.load(json_file) + if type(links) != list: + raise Exception('JSON parser expects list of objects, maybe this is JSONL?') except json.decoder.JSONDecodeError: # sometimes the first line is a comment or other junk, so try without json_file.seek(0) @@ -28,61 +84,9 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: links = json.load(json_file) # we may fail again, which means we really don't know what to do - json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') - for link in links: - # example line - # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title 
here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] if link: - # Parse URL - url = link.get('href') or link.get('url') or link.get('URL') - if not url: - raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') - - # Parse the timestamp - ts_str = str(datetime.now(timezone.utc).timestamp()) - if link.get('timestamp'): - # chrome/ff histories use a very precise timestamp - ts_str = str(link['timestamp'] / 10000000) - elif link.get('time'): - ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) - elif link.get('created_at'): - ts_str = str(json_date(link['created_at']).timestamp()) - elif link.get('created'): - ts_str = str(json_date(link['created']).timestamp()) - elif link.get('date'): - ts_str = str(json_date(link['date']).timestamp()) - elif link.get('bookmarked'): - ts_str = str(json_date(link['bookmarked']).timestamp()) - elif link.get('saved'): - ts_str = str(json_date(link['saved']).timestamp()) - - # Parse the title - title = None - if link.get('title'): - title = link['title'].strip() - elif link.get('description'): - title = link['description'].replace(' — Readability', '').strip() - elif link.get('name'): - title = link['name'].strip() - - # if we have a list, join it with commas - tags = link.get('tags') - if type(tags) == list: - tags = ','.join(tags) - elif type(tags) == str: - # if there's no comma, assume it was space-separated - if ',' not in tags: - tags = tags.replace(' ', ',') - - yield Link( - url=htmldecode(url), - timestamp=ts_str, - title=htmldecode(title) or None, - tags=htmldecode(tags), - sources=[json_file.name], - ) - + yield jsonObjectToLink(link,json_file.name) KEY = 'json' NAME = 'Generic JSON' diff --git a/archivebox/parsers/generic_jsonl.py b/archivebox/parsers/generic_jsonl.py new file mode 100644 index 00000000..8ee94b28 --- /dev/null +++ b/archivebox/parsers/generic_jsonl.py @@ -0,0 +1,34 @@ +__package__ = 'archivebox.parsers' + +import json + +from typing import IO, Iterable +from datetime import datetime, timezone + +from ..index.schema import Link +from ..util import ( + htmldecode, + enforce_types, +) + +from .generic_json import jsonObjectToLink + +def parse_line(line: str): + if line.strip() != "": + return json.loads(line) + +@enforce_types +def parse_generic_jsonl_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: + """Parse JSONL format bookmarks export files""" + + json_file.seek(0) + + links = [ parse_line(line) for line in json_file ] + + for link in links: + if link: + yield jsonObjectToLink(link,json_file.name) + +KEY = 'jsonl' +NAME = 'Generic JSONL' +PARSER = parse_generic_jsonl_export diff --git a/tests/mock_server/templates/example-single.jsonl b/tests/mock_server/templates/example-single.jsonl new file mode 100644 index 00000000..492c906d --- /dev/null +++ b/tests/mock_server/templates/example-single.jsonl @@ -0,0 +1 @@ +{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"} diff --git a/tests/mock_server/templates/example.jsonl b/tests/mock_server/templates/example.jsonl new file mode 100644 index 00000000..de0b3b5c --- /dev/null +++ b/tests/mock_server/templates/example.jsonl @@ -0,0 +1,4 @@ 
+{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"} +{"href":"http://127.0.0.1:8080/static/iana.org.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:43Z","shared":"no","toread":"no","tags":"Tag3,Tag4 with Space"} +{"href":"http://127.0.0.1:8080/static/shift_jis.html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:44Z","shared":"no","toread":"no","tags":["Tag5","Tag6 with Space"]} +{"href":"http://127.0.0.1:8080/static/title_og_with_html","description":"Example 2","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:45Z","shared":"no","toread":"no"} diff --git a/tests/test_add.py b/tests/test_add.py index 972db2e8..c899b320 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -216,3 +216,73 @@ def test_atom(tmp_path, process, disable_extractors_dict): tags = list(map(lambda x: x[0], tags)) assert "Tag1" in tags assert "Tag2" in tags + +def test_jsonl(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example.jsonl', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=jsonl"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + assert "http://127.0.0.1:8080/static/iana.org.html" in urls + assert "http://127.0.0.1:8080/static/shift_jis.html" in urls + assert "http://127.0.0.1:8080/static/title_og_with_html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + assert "Tag3" in tags + assert "Tag4 with Space" in tags + assert "Tag5" in tags + assert "Tag6 with Space" in tags + +def test_jsonl_single(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=jsonl"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url from core_snapshot").fetchall() + tags = c.execute("SELECT name from core_tag").fetchall() + conn.commit() + conn.close() + + urls = list(map(lambda x: x[0], urls)) + assert "http://127.0.0.1:8080/static/example.com.html" in urls + # if the following URL appears, we must have fallen back to another parser + assert not "http://www.example.com/should-not-exist" in urls + + tags = list(map(lambda x: x[0], tags)) + assert "Tag1" in tags + assert "Tag2" in tags + +# make sure that JSON parser rejects a single line of JSONL which is valid +# JSON but not our expected format +def 
test_json_single(tmp_path, process, disable_extractors_dict): + with open('../../mock_server/templates/example-single.jsonl', 'r', encoding='utf-8') as f: + arg_process = subprocess.run( + ["archivebox", "add", "--index-only", "--parser=json"], + stdin=f, + capture_output=True, + env=disable_extractors_dict, + ) + + assert 'expects list of objects' in arg_process.stderr.decode("utf-8")
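
For a sense of the data model the new parser expects: each non-blank line of the input is a complete JSON object of the same shape the generic JSON importer already accepts. A minimal sketch of that per-line reading loop, using made-up entries rather than the project's fixture files:

    import json
    from io import StringIO

    # Two bookmark entries, one per line, plus a blank line that should be skipped.
    export = StringIO(
        '{"href": "https://example.com/a", "time": "2014-06-14T15:51:42Z", "tags": "Tag1 Tag2"}\n'
        '\n'
        '{"url": "https://example.com/b", "title": "Second", "tags": ["Tag3", "Tag4 with Space"]}\n'
    )

    for line in export:
        line = line.strip()
        if not line:
            continue  # blank lines are ignored, matching parse_line() in the patch
        entry = json.loads(line)
        # In the patch, jsonObjectToLink() turns each dict into a Link;
        # here we just show which fields it would read.
        print(entry.get('href') or entry.get('url'), entry.get('tags'))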