mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
Add parser for Pocket API
Pass a URL like `pocket://Username` to import that username's archived Pocket library. Tokens need to be stored in ArchiveBox.conf with the following keys:

```
POCKET_CONSUMER_KEY = key-from-custom-pocket-app
POCKET_ACCESS_TOKENS = {"YourUsername": "pocket-token-for-app"}
```

`POCKET_ACCESS_TOKENS` MUST be on a single line, or the JSON will be misinterpreted by the parser as a new key/value pair.
This commit is contained in:
parent
04291c4d47
commit
ac7ad9e942
4 changed files with 122 additions and 1 deletions
|
@ -159,6 +159,9 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
|
||||||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
|
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
|
||||||
'NODE_BINARY': {'type': str, 'default': 'node'},
|
'NODE_BINARY': {'type': str, 'default': 'node'},
|
||||||
'CHROME_BINARY': {'type': str, 'default': None},
|
'CHROME_BINARY': {'type': str, 'default': None},
|
||||||
|
|
||||||
|
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
|
||||||
|
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -386,7 +389,7 @@ def load_config_val(key: str,
|
||||||
raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
|
raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
|
||||||
return int(val)
|
return int(val)
|
||||||
|
|
||||||
elif type is list:
|
elif type is list or type is dict:
|
||||||
return json.loads(val)
|
return json.loads(val)
|
||||||
|
|
||||||
raise Exception('Config values can only be str, bool, int or json')
|
raise Exception('Config values can only be str, bool, int or json')
|
||||||
|
|
|
@ -32,6 +32,7 @@ from ..index.schema import Link
|
||||||
from ..logging_util import TimedProgress, log_source_saved
|
from ..logging_util import TimedProgress, log_source_saved
|
||||||
|
|
||||||
from .pocket_html import parse_pocket_html_export
|
from .pocket_html import parse_pocket_html_export
|
||||||
|
from .pocket_api import parse_pocket_api_export
|
||||||
from .pinboard_rss import parse_pinboard_rss_export
|
from .pinboard_rss import parse_pinboard_rss_export
|
||||||
from .wallabag_atom import parse_wallabag_atom_export
|
from .wallabag_atom import parse_wallabag_atom_export
|
||||||
from .shaarli_rss import parse_shaarli_rss_export
|
from .shaarli_rss import parse_shaarli_rss_export
|
||||||
|
@ -44,6 +45,7 @@ from .generic_txt import parse_generic_txt_export
|
||||||
|
|
||||||
PARSERS = (
|
PARSERS = (
|
||||||
# Specialized parsers
|
# Specialized parsers
|
||||||
|
('Pocket API', parse_pocket_api_export),
|
||||||
('Wallabag ATOM', parse_wallabag_atom_export),
|
('Wallabag ATOM', parse_wallabag_atom_export),
|
||||||
('Pocket HTML', parse_pocket_html_export),
|
('Pocket HTML', parse_pocket_html_export),
|
||||||
('Pinboard RSS', parse_pinboard_rss_export),
|
('Pinboard RSS', parse_pinboard_rss_export),
|
||||||
|
|
115
archivebox/parsers/pocket_api.py
Normal file
115
archivebox/parsers/pocket_api.py
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
__package__ = 'archivebox.parsers'
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from typing import IO, Iterable, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
from configparser import ConfigParser
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from pocket import Pocket
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from ..index.schema import Link
|
||||||
|
from ..util import (
|
||||||
|
enforce_types,
|
||||||
|
)
|
||||||
|
from ..config import (
|
||||||
|
SOURCES_DIR
|
||||||
|
)
|
||||||
|
|
||||||
|
# Page size passed as `count` (and used to compute `offset`) on each API request.
_COUNT_PER_PAGE = 500
# File under SOURCES_DIR where the per-username `since` markers are persisted.
_API_DB_PATH = Path(SOURCES_DIR, 'pocket_api.db')

# search for broken protocols that sometimes come from the Pocket API
# (matches a leading `http:/` or `https:/` that is NOT followed by a second slash)
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
|
||||||
|
|
||||||
|
def get_pocket_articles(api: Pocket, since=None, page=0):
    """Yield articles from the Pocket API, one page at a time.

    Requests items with state='archive' sorted 'oldest', starting at page
    `page`, and keeps requesting the following page for as long as a page
    comes back with exactly `_COUNT_PER_PAGE` entries.

    Once a page comes back with a different number of entries, the `since`
    value from that response is stored on `api.last_since` so the caller can
    persist it.
    """
    current_page = page
    while True:
        response, _headers = api.get(
            state='archive',
            sort='oldest',
            since=since,
            count=_COUNT_PER_PAGE,
            offset=current_page * _COUNT_PER_PAGE,
        )

        # 'list' is either a mapping (iterate its values) or already a sequence
        listing = response['list']
        if isinstance(listing, dict):
            listing = listing.values()
        fetched = len(listing)

        yield from listing

        if fetched != _COUNT_PER_PAGE:
            # Short page: nothing more to fetch, remember the sync marker
            api.last_since = response['since']
            return

        current_page += 1
|
||||||
|
|
||||||
|
|
||||||
|
def link_from_article(article: dict, sources: list):
    """Build a Link from a single Pocket API article record.

    The URL is taken from 'resolved_url', falling back to 'given_url'; the
    title from 'resolved_title', then 'given_title', then the URL itself.
    Missing or empty 'resolved_*' / 'given_title' keys fall through to the
    next candidate; 'given_url' and 'time_read' are still required and raise
    KeyError when absent.
    """
    url: str = article.get('resolved_url') or article['given_url']

    # Repair a leading 'http:/' or 'https:/' that is missing its second slash.
    # Only the anchored match is rewritten: a plain str.replace() would also
    # touch every later 'http:/' in the string and turn an embedded
    # '...?to=http://x' into '...?to=http:///x'.
    url = _BROKEN_PROTOCOL_RE.sub(r'\1://', url, count=1)

    title = article.get('resolved_title') or article.get('given_title') or url

    return Link(
        url=url,
        timestamp=article['time_read'],
        title=title,
        tags=article.get('tags'),
        sources=sources,
    )
|
||||||
|
|
||||||
|
def write_since(username: str, since: str):
    """Store `since` for `username` in the ini-style file at `_API_DB_PATH`.

    The value is written as the 'since' option of a section named after the
    username; any existing section with that name is replaced.
    """
    from ..system import atomic_write

    db_path = _API_DB_PATH
    if not db_path.exists():
        atomic_write(db_path, '')

    state = ConfigParser()
    state.optionxform = str  # keep option names as written instead of lowercasing them
    state.read(db_path)

    state[username] = {'since': since}

    with open(db_path, 'w+') as handle:
        state.write(handle)
|
||||||
|
|
||||||
|
def read_since(username: str) -> Optional[str]:
    """Return the stored 'since' value for `username`, or None if there is none.

    Reads the ini-style file at `_API_DB_PATH`, creating it empty first when
    it does not exist yet.
    """
    from ..system import atomic_write

    db_path = _API_DB_PATH
    if not db_path.exists():
        atomic_write(db_path, '')

    state = ConfigParser()
    state.optionxform = str  # keep option names as written instead of lowercasing them
    state.read(db_path)

    return state.get(username, 'since', fallback=None)
|
||||||
|
|
||||||
|
@enforce_types
def should_parse_as_pocket_api(text: str) -> bool:
    """Return True when `text` begins with the `pocket://` prefix."""
    pocket_prefix = 'pocket://'
    return text.startswith(pocket_prefix)
|
||||||
|
|
||||||
|
@enforce_types
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse bookmarks from the Pocket API.

    Each line of `input_buffer` that starts with `pocket://` is expected to
    look like `pocket://<username>`. For every such line the articles returned
    by `get_pocket_articles` are yielded as Links, resuming from the `since`
    value previously stored for that username. After all of a user's articles
    have been consumed, the new `since` value is written back.

    Lines that do not start with `pocket://`, or that carry no word-character
    username after the prefix, are skipped.

    Raises KeyError when the username has no entry in POCKET_ACCESS_TOKENS.
    """

    input_buffer.seek(0)
    # Raw string: '\w' inside a plain literal is an invalid escape sequence.
    username_re = re.compile(r'^pocket://(\w+)')
    for line in input_buffer:
        if not should_parse_as_pocket_api(line):
            continue

        found = username_re.search(line)
        if found is None:
            # 'pocket://' with nothing usable after it: no account to query,
            # and calling .group() on None would raise AttributeError.
            continue

        from ..config import (
            POCKET_CONSUMER_KEY,
            POCKET_ACCESS_TOKENS,
        )

        username = found.group(1)
        api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
        api.last_since = None  # set by get_pocket_articles after the final page

        for article in get_pocket_articles(api, since=read_since(username)):
            yield link_from_article(article, sources=[line])

        write_since(username, api.last_since)
|
1
setup.py
1
setup.py
|
@ -59,6 +59,7 @@ setuptools.setup(
|
||||||
"python-crontab==2.5.1",
|
"python-crontab==2.5.1",
|
||||||
"croniter==0.3.34",
|
"croniter==0.3.34",
|
||||||
"w3lib==1.22.0",
|
"w3lib==1.22.0",
|
||||||
|
"pocket==0.3.6",
|
||||||
# Some/all of these will likely be added in the future:
|
# Some/all of these will likely be added in the future:
|
||||||
# wpull
|
# wpull
|
||||||
# pywb
|
# pywb
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue