mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-09 12:21:57 -04:00
move utils and vendored libs into subfolders
This commit is contained in:
parent
8440858751
commit
a0a79cead8
9 changed files with 413 additions and 52 deletions
|
@ -4,34 +4,35 @@ __package__ = 'archivebox.parsers'
|
|||
import re
|
||||
|
||||
from typing import IO, Iterable, Optional
|
||||
from datetime import datetime
|
||||
from configparser import ConfigParser
|
||||
|
||||
from pathlib import Path
|
||||
from pocket import Pocket
|
||||
import requests
|
||||
from ..vendor.pocket import Pocket
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
)
|
||||
from ..util import enforce_types
|
||||
from ..system import atomic_write
|
||||
from ..config import (
|
||||
SOURCES_DIR
|
||||
SOURCES_DIR,
|
||||
POCKET_CONSUMER_KEY,
|
||||
POCKET_ACCESS_TOKENS,
|
||||
)
|
||||
|
||||
_COUNT_PER_PAGE = 500
|
||||
_API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
|
||||
|
||||
COUNT_PER_PAGE = 500
|
||||
API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
|
||||
|
||||
# search for broken protocols that sometimes come from the Pocket API
|
||||
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
|
||||
|
||||
|
||||
def get_pocket_articles(api: Pocket, since=None, page=0):
|
||||
body, headers = api.get(
|
||||
state='archive',
|
||||
sort='oldest',
|
||||
since=since,
|
||||
count=_COUNT_PER_PAGE,
|
||||
offset=page * _COUNT_PER_PAGE,
|
||||
count=COUNT_PER_PAGE,
|
||||
offset=page * COUNT_PER_PAGE,
|
||||
)
|
||||
|
||||
articles = body['list'].values() if isinstance(body['list'], dict) else body['list']
|
||||
|
@ -39,7 +40,7 @@ def get_pocket_articles(api: Pocket, since=None, page=0):
|
|||
|
||||
yield from articles
|
||||
|
||||
if returned_count == _COUNT_PER_PAGE:
|
||||
if returned_count == COUNT_PER_PAGE:
|
||||
yield from get_pocket_articles(api, since=since, page=page + 1)
|
||||
else:
|
||||
api.last_since = body['since']
|
||||
|
@ -60,56 +61,53 @@ def link_from_article(article: dict, sources: list):
|
|||
sources=sources
|
||||
)
|
||||
|
||||
def write_since(username: str, since: str):
    """Record the latest Pocket sync cursor for ``username``.

    The cursor is stored as the ``since`` option of a section named after
    the user in the INI-style file at ``API_DB_PATH``. Sections that are
    already present in the file for other users are read back in first and
    written out again unchanged.

    Args:
        username: Section name under which the cursor is stored.
        since: Cursor value to persist for the next incremental fetch.
    """
    import io

    since_file = ConfigParser()
    # Keep option names case-sensitive; ConfigParser lowercases them by default.
    since_file.optionxform = str
    # ConfigParser.read() silently skips a missing file, so the very first
    # run simply starts from an empty dataset -- no need to pre-create it.
    since_file.read(API_DB_PATH)

    since_file[username] = {
        'since': since
    }

    # Render to memory first and hand the finished text to atomic_write(),
    # so an interrupted write cannot leave a truncated state file behind.
    # NOTE(review): assumes atomic_write() replaces an existing file at the
    # target path -- confirm against ..system.atomic_write.
    rendered = io.StringIO()
    since_file.write(rendered)
    atomic_write(API_DB_PATH, rendered.getvalue())
|
||||
|
||||
def read_since(username: str) -> Optional[str]:
    """Return the Pocket sync cursor stored for ``username``, if any.

    Looks up the ``since`` option in the section named after the user in
    the INI-style file at ``API_DB_PATH``.

    Args:
        username: Section name to look up.

    Returns:
        The stored cursor string, or ``None`` when the file, the user's
        section, or the ``since`` option does not exist.
    """
    config_file = ConfigParser()
    # Keep option names case-sensitive; ConfigParser lowercases them by default.
    config_file.optionxform = str
    # ConfigParser.read() ignores files that do not exist, so a lookup never
    # has to create the state file (and works when the directory is read-only).
    config_file.read(API_DB_PATH)

    return config_file.get(username, 'since', fallback=None)
|
||||
|
||||
|
||||
@enforce_types
def should_parse_as_pocket_api(text: str) -> bool:
    """Tell whether ``text`` begins with the ``pocket://`` pseudo-scheme."""
    scheme_prefix = 'pocket://'
    return text[:len(scheme_prefix)] == scheme_prefix
|
||||
|
||||
|
||||
@enforce_types
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse bookmarks from the Pocket API

    Every line of ``input_buffer`` that starts with ``pocket://`` and is
    followed by a username triggers a fetch through the Pocket client, using
    the access token looked up in ``POCKET_ACCESS_TOKENS`` for that username
    and resuming from the cursor returned by ``read_since``. Other lines are
    ignored.

    Args:
        input_buffer: Text stream to scan; it is rewound before reading.
        **_kwargs: Accepted and ignored.

    Yields:
        One ``Link`` per article returned by ``get_pocket_articles``, with
        the originating input line recorded as its source.

    Raises:
        KeyError: If no entry exists in ``POCKET_ACCESS_TOKENS`` for a
            username found in the input.
    """

    input_buffer.seek(0)
    pattern = re.compile(r"^pocket:\/\/(\w+)")
    for line in input_buffer:
        if not should_parse_as_pocket_api(line):
            continue

        match = pattern.search(line)
        if match is None:
            # 'pocket://' with no username after it: nothing to fetch, so
            # skip the line instead of failing on a missing regex match.
            continue

        username = match.group(1)
        api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
        api.last_since = None

        for article in get_pocket_articles(api, since=read_since(username)):
            yield link_from_article(article, sources=[line])

        # Only persist a cursor when the fetch actually recorded one; a
        # None value cannot be stored and must not clobber an earlier cursor.
        if api.last_since is not None:
            write_since(username, api.last_since)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue