diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt
index 14e510a1..ee6a2fc5 100644
--- a/archivebox.egg-info/SOURCES.txt
+++ b/archivebox.egg-info/SOURCES.txt
@@ -70,6 +70,7 @@ archivebox/index/json.py
 archivebox/index/schema.py
 archivebox/index/sql.py
 archivebox/parsers/__init__.py
+archivebox/parsers/generic_html.py
 archivebox/parsers/generic_json.py
 archivebox/parsers/generic_rss.py
 archivebox/parsers/generic_txt.py
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index b7c8ebff..784c879c 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -301,14 +301,14 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
 
 
 @enforce_types
-def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]:
+def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
 
     new_links: List[Link] = []
 
     # parse and validate the import file
-    raw_links, parser_name = parse_links(source_path)
+    raw_links, parser_name = parse_links(source_path, root_url=root_url)
     new_links = validate_links(raw_links)
     if parser_name:
diff --git a/archivebox/main.py b/archivebox/main.py
index 2e17594b..b65c6e64 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -548,7 +548,7 @@ def add(urls: Union[str, List[str]],
 
     # save verbatim args to sources
     write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
-    new_links += parse_links_from_source(write_ahead_log)
+    new_links += parse_links_from_source(write_ahead_log, root_url=None)
 
     # If we're going one level deeper, download each link and look for more links
     new_links_depth = []
@@ -556,9 +556,9 @@ def add(urls: Union[str, List[str]],
         log_crawl_started(new_links)
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file)
+            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
 
-    imported_links = new_links + new_links_depth
+    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
     all_links, new_links = dedupe_links(all_links, imported_links)
     write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index bde71c27..930e1ade 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -11,7 +11,7 @@ import re
 import os
 from io import StringIO
 
-from typing import IO, Tuple, List
+from typing import IO, Tuple, List, Optional
 from datetime import datetime
 
 from ..system import atomic_write
@@ -38,26 +38,29 @@ from .medium_rss import parse_medium_rss_export
 from .netscape_html import parse_netscape_html_export
 from .generic_rss import parse_generic_rss_export
 from .generic_json import parse_generic_json_export
+from .generic_html import parse_generic_html_export
 from .generic_txt import parse_generic_txt_export
 
 PARSERS = (
-    # Specialized parsers
-    ('Pocket HTML', parse_pocket_html_export),
-    ('Pinboard RSS', parse_pinboard_rss_export),
-    ('Shaarli RSS', parse_shaarli_rss_export),
-    ('Medium RSS', parse_medium_rss_export),
-
-    # General parsers
-    ('Netscape HTML', parse_netscape_html_export),
-    ('Generic RSS', parse_generic_rss_export),
-    ('Generic JSON', parse_generic_json_export),
+    # Specialized parsers
+    ('Pocket HTML', parse_pocket_html_export),
+    ('Pinboard RSS', parse_pinboard_rss_export),
+    ('Shaarli RSS', parse_shaarli_rss_export),
+    ('Medium RSS', parse_medium_rss_export),
+
+    # General parsers
+    ('Netscape HTML', parse_netscape_html_export),
+    ('Generic RSS', parse_generic_rss_export),
+    ('Generic JSON', parse_generic_json_export),
+    ('Generic HTML', parse_generic_html_export),
+
+    # Fallback parser
+    ('Plain Text', parse_generic_txt_export),
+)
 
-    # Fallback parser
-    ('Plain Text', parse_generic_txt_export),
-    )
 
 @enforce_types
-def parse_links_memory(urls: List[str]):
+def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
     """
     parse a list of URLS without touching the filesystem
     """
@@ -68,17 +71,16 @@ def parse_links_memory(urls: List[str]):
     file = StringIO()
     file.writelines(urls)
     file.name = "io_string"
-    output = _parse(file, timer)
-
-    if output is not None:
-        return output
-
+    links, parser = run_parser_functions(file, timer, root_url=root_url)
     timer.end()
-    return [], 'Failed to parse'
+
+    if parser is None:
+        return [], 'Failed to parse'
+    return links, parser
 
 
 @enforce_types
-def parse_links(source_file: str) -> Tuple[List[Link], str]:
+def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
     """parse a list of URLs with their metadata from an RSS feed,
     bookmarks export, or text file
     """
@@ -87,28 +89,39 @@
     timer = TimedProgress(TIMEOUT * 4)
     with open(source_file, 'r', encoding='utf-8') as file:
-        output = _parse(file, timer)
-
-    if output is not None:
-        return output
+        links, parser = run_parser_functions(file, timer, root_url=root_url)
 
     timer.end()
-    return [], 'Failed to parse'
+    if parser is None:
+        return [], 'Failed to parse'
+    return links, parser
+
+
+def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
+    most_links: List[Link] = []
+    best_parser_name = None
 
-def _parse(to_parse: IO[str], timer) -> Tuple[List[Link], str]:
     for parser_name, parser_func in PARSERS:
         try:
-            links = list(parser_func(to_parse))
-            if links:
-                timer.end()
-                return links, parser_name
-        except Exception as err:    # noqa
-            pass
+            parsed_links = list(parser_func(to_parse, root_url=root_url))
+            if not parsed_links:
+                raise Exception('no links found')
+
+            # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
+            if len(parsed_links) > len(most_links):
+                most_links = parsed_links
+                best_parser_name = parser_name
+
+        except Exception as err:                                                # noqa
             # Parsers are tried one by one down the list, and the first one
             # that succeeds is used. To see why a certain parser was not used
             # due to error or format incompatibility, uncomment this line:
             # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
             # raise
+            pass
+
+    timer.end()
+    return most_links, best_parser_name
 
 
 @enforce_types
diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py
new file mode 100644
index 00000000..4c632f04
--- /dev/null
+++ b/archivebox/parsers/generic_html.py
@@ -0,0 +1,53 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable, Optional
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    URL_REGEX,
+)
+from html.parser import HTMLParser
+from urllib.parse import urljoin
+
+
+class HrefParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.urls = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "a":
+            for attr, value in attrs:
+                if attr == "href":
+                    self.urls.append(value)
+
+
+@enforce_types
+def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
+    """Parse Generic HTML for href tags and use only the url (support for title coming later)"""
+
+    html_file.seek(0)
+    for line in html_file:
+        parser = HrefParser()
+        # example line
+        #
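
For context, the core technique behind the new parse_generic_html_export is visible in the HrefParser class above: collect every <a href=...> value and, presumably via the imported urljoin, resolve relative hrefs against the root_url of the page the HTML was downloaded from (which is why add() now passes root_url=new_link.url for depth-1 crawls). Below is a small standalone sketch of that idea, not part of the patch itself; the example root_url and HTML line are made up for illustration.

from html.parser import HTMLParser
from urllib.parse import urljoin


class HrefParser(HTMLParser):
    """Collect the href value of every <a> tag fed to the parser (same class as in the patch)."""

    def __init__(self):
        super().__init__()
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for attr, value in attrs:
                if attr == "href":
                    self.urls.append(value)


# Hypothetical usage: root_url is the page the HTML line came from,
# so relative hrefs can be resolved to absolute URLs before archiving.
root_url = "https://example.com/bookmarks/index.html"
html_line = '<li><a href="/post/1">First post</a> <a href="https://other.site/x">external</a></li>'

parser = HrefParser()
parser.feed(html_line)
print([urljoin(root_url, href) for href in parser.urls])
# -> ['https://example.com/post/1', 'https://other.site/x']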