Add parser for Pocket API

Pass a URL like `pocket://Username` to import that user's archived Pocket
library. Tokens need to be stored in ArchiveBox.conf with the following keys:

```
POCKET_CONSUMER_KEY = key-from-custom-pocket-app
POCKET_ACCESS_TOKENS = {"YourUsername": "pocket-token-for-app"}
```

`POCKET_ACCESS_TOKENS` MUST be on a single line, or the JSON will be
misinterpreted by the parser as a new key/value pair.
This commit is contained in:
mAAdhaTTah 2020-11-07 14:17:21 -05:00
parent 04291c4d47
commit ac7ad9e942
No known key found for this signature in database
GPG key ID: 4C82AA17660179FF
4 changed files with 122 additions and 1 deletions
archivebox/parsers

View file

@ -0,0 +1,115 @@
__package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable, Optional
from datetime import datetime
from configparser import ConfigParser
from pathlib import Path
from pocket import Pocket
import requests
from ..index.schema import Link
from ..util import (
enforce_types,
)
from ..config import (
SOURCES_DIR
)
# Page size for Pocket API requests: passed as `count` and used to compute the
# `offset` of each page in get_pocket_articles().
_COUNT_PER_PAGE = 500
# ConfigParser-formatted file holding one section per Pocket username with the
# last `since` value (see read_since() / write_since()).
_API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
# search for broken protocols that sometimes come from the Pocket API
# (matches a leading `http:/` or `https:/` that is NOT followed by a second `/`)
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
def get_pocket_articles(api: Pocket, since=None, page=0):
    """Yield every archived article from the Pocket API, page by page.

    Pages of `_COUNT_PER_PAGE` items are requested (oldest first, starting at
    page `page`) until a page comes back with a different number of items.
    At that point the `since` value of that last response is stored on
    `api.last_since` and iteration ends.

    `since` is forwarded unchanged to every `api.get` call.
    """
    current_page = page
    while True:
        response, _headers = api.get(
            state='archive',
            sort='oldest',
            since=since,
            count=_COUNT_PER_PAGE,
            offset=current_page * _COUNT_PER_PAGE,
        )

        # `list` is handled both as a dict (its values are the articles) and
        # as any other sized iterable of articles.
        batch = response['list']
        if isinstance(batch, dict):
            batch = batch.values()
        batch_size = len(batch)

        for item in batch:
            yield item

        if batch_size != _COUNT_PER_PAGE:
            # Anything other than a full page means this was the last one.
            api.last_since = response['since']
            return

        current_page += 1
def link_from_article(article: dict, sources: list):
    """Build a `Link` from one article dict returned by the Pocket API.

    The URL is the article's `resolved_url`, falling back to `given_url` when
    the resolved one is missing or empty. The title is `resolved_title`, then
    `given_title`, then the URL itself. `time_read` becomes the timestamp and
    `tags` (if present) is passed through unchanged.

    Raises:
        KeyError: if the article has neither a usable `resolved_url` nor a
            `given_url` key, or has no `time_read` key.
    """
    # .get() so a missing `resolved_*` key falls back instead of raising.
    url: str = article.get('resolved_url') or article['given_url']

    broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
    if broken_protocol:
        # Repair only the leading `http:/` / `https:/`. A plain str.replace()
        # would also rewrite any correct `http://` found later in the URL
        # (e.g. inside a query string), turning it into `http:///`.
        scheme = broken_protocol.group(1)
        url = f'{scheme}://{url[broken_protocol.end():]}'

    title = article.get('resolved_title') or article.get('given_title') or url

    return Link(
        url=url,
        timestamp=article['time_read'],
        title=title,
        tags=article.get('tags'),
        sources=sources,
    )
def write_since(username: str, since: str):
    """Store `since` under the `username` section of the Pocket state file.

    The file at `_API_DB_PATH` is created empty if it does not exist, loaded
    with ConfigParser, the `username` section is replaced by a single `since`
    entry, and the whole file is written back.
    """
    from ..system import atomic_write

    state_path = _API_DB_PATH
    if not state_path.exists():
        atomic_write(state_path, '')

    state = ConfigParser()
    # Keep option names exactly as written instead of lower-casing them.
    state.optionxform = str
    state.read(state_path)

    state[username] = {'since': since}

    with open(state_path, 'w+') as handle:
        state.write(handle)
def read_since(username: str) -> Optional[str]:
    """Return the `since` value stored for `username`, or None if there is none.

    The state file at `_API_DB_PATH` is created empty first when it does not
    exist yet.
    """
    from ..system import atomic_write

    state_path = _API_DB_PATH
    if not state_path.exists():
        atomic_write(state_path, '')

    state = ConfigParser()
    # Keep option names exactly as written instead of lower-casing them.
    state.optionxform = str
    state.read(state_path)

    stored_since = state.get(username, 'since', fallback=None)
    return stored_since
@enforce_types
def should_parse_as_pocket_api(text: str) -> bool:
    """Return True when `text` begins with the `pocket://` scheme."""
    scheme_prefix = 'pocket://'
    return text.startswith(scheme_prefix)
@enforce_types
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse bookmarks from the Pocket API

    Every line of `input_buffer` of the form `pocket://<username>` triggers an
    import of that user's archived articles: the access token is looked up in
    `POCKET_ACCESS_TOKENS[username]`, articles newer than the stored `since`
    value are fetched and yielded as `Link`s, and the new `since` value is
    saved once all of that user's articles have been consumed.

    Lines that do not start with `pocket://`, or that carry no username after
    it, are skipped.

    Raises:
        KeyError: if no access token is configured for a requested username.
    """
    input_buffer.seek(0)
    # Raw string: the previous non-raw pattern relied on the invalid escape
    # sequences `\/` and `\w`, which Python warns about.
    pattern = re.compile(r'^pocket://(\w+)')

    for line in input_buffer:
        if not should_parse_as_pocket_api(line):
            continue

        match = pattern.search(line)
        if match is None:
            # e.g. a bare `pocket://` with no username: nothing to import.
            continue
        username = match.group(1)

        # Imported only once a Pocket line is actually found, as before.
        from ..config import (
            POCKET_CONSUMER_KEY,
            POCKET_ACCESS_TOKENS,
        )

        api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
        # get_pocket_articles() sets this after fetching the final page.
        api.last_since = None

        for article in get_pocket_articles(api, since=read_since(username)):
            yield link_from_article(article, sources=[line])

        write_since(username, api.last_since)