diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index e078bdaf..4c46569f 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -14,7 +14,6 @@ from django import forms from core.models import Snapshot, Tag from core.forms import AddLinkForm, TagField -from core.utils import get_icons from core.mixins import SearchResultsAdminMixin from index.html import snapshot_icons diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 8f48929b..051cf50b 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -3,7 +3,7 @@ __package__ = 'archivebox.core' from django import forms from ..util import URL_REGEX -from .utils_taggit import edit_string_for_tags, parse_tags +from ..vendor.taggit_utils import edit_string_for_tags, parse_tags CHOICES = ( ('0', 'depth = 0 (archive just these URLs)'), diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py index 5327eebb..bf3a292b 100644 --- a/archivebox/parsers/pocket_api.py +++ b/archivebox/parsers/pocket_api.py @@ -4,34 +4,35 @@ __package__ = 'archivebox.parsers' import re from typing import IO, Iterable, Optional -from datetime import datetime from configparser import ConfigParser from pathlib import Path -from pocket import Pocket -import requests +from ..vendor.pocket import Pocket from ..index.schema import Link -from ..util import ( - enforce_types, -) +from ..util import enforce_types +from ..system import atomic_write from ..config import ( - SOURCES_DIR + SOURCES_DIR, + POCKET_CONSUMER_KEY, + POCKET_ACCESS_TOKENS, ) -_COUNT_PER_PAGE = 500 -_API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db' + +COUNT_PER_PAGE = 500 +API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db' # search for broken protocols that sometimes come from the Pocket API _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))') + def get_pocket_articles(api: Pocket, since=None, page=0): body, headers = api.get( state='archive', sort='oldest', since=since, - count=_COUNT_PER_PAGE, - offset=page * _COUNT_PER_PAGE, + count=COUNT_PER_PAGE, + offset=page * COUNT_PER_PAGE, ) articles = body['list'].values() if isinstance(body['list'], dict) else body['list'] @@ -39,7 +40,7 @@ def get_pocket_articles(api: Pocket, since=None, page=0): yield from articles - if returned_count == _COUNT_PER_PAGE: + if returned_count == COUNT_PER_PAGE: yield from get_pocket_articles(api, since=since, page=page + 1) else: api.last_since = body['since'] @@ -60,56 +61,53 @@ def link_from_article(article: dict, sources: list): sources=sources ) -def write_since(username: str, since: str): - from ..system import atomic_write - if not _API_DB_PATH.exists(): - atomic_write(_API_DB_PATH, '') +def write_since(username: str, since: str): + if not API_DB_PATH.exists(): + atomic_write(API_DB_PATH, '') since_file = ConfigParser() since_file.optionxform = str - since_file.read(_API_DB_PATH) + since_file.read(API_DB_PATH) since_file[username] = { 'since': since } - with open(_API_DB_PATH, 'w+') as new: + with open(API_DB_PATH, 'w+') as new: since_file.write(new) -def read_since(username: str) -> Optional[str]: - from ..system import atomic_write - if not _API_DB_PATH.exists(): - atomic_write(_API_DB_PATH, '') +def read_since(username: str) -> Optional[str]: + if not API_DB_PATH.exists(): + atomic_write(API_DB_PATH, '') config_file = ConfigParser() config_file.optionxform = str - config_file.read(_API_DB_PATH) + config_file.read(API_DB_PATH) return config_file.get(username, 'since', fallback=None) + @enforce_types def should_parse_as_pocket_api(text: str) -> bool: return text.startswith('pocket://') + @enforce_types def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]: """Parse bookmarks from the Pocket API""" input_buffer.seek(0) - pattern = re.compile("^pocket:\/\/(\w+)") + pattern = re.compile(r"^pocket:\/\/(\w+)") for line in input_buffer: - if should_parse_as_pocket_api(line): - from ..config import ( - POCKET_CONSUMER_KEY, - POCKET_ACCESS_TOKENS, - ) - username = pattern.search(line).group(1) - api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username]) - api.last_since = None - - for article in get_pocket_articles(api, since=read_since(username)): - yield link_from_article(article, sources=[line]) - - write_since(username, api.last_since) + if should_parse_as_pocket_api(line): + + username = pattern.search(line).group(1) + api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username]) + api.last_since = None + + for article in get_pocket_articles(api, since=read_since(username)): + yield link_from_article(article, sources=[line]) + + write_since(username, api.last_since) diff --git a/archivebox/util.py b/archivebox/util.py index 733fe8f5..5530ab45 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -1,11 +1,11 @@ __package__ = 'archivebox' import re -from pathlib import Path +import requests import json as pyjson - from typing import List, Optional, Any +from pathlib import Path from inspect import signature from functools import wraps from hashlib import sha256 @@ -13,10 +13,9 @@ from urllib.parse import urlparse, quote, unquote from html import escape, unescape from datetime import datetime from dateparser import parse as dateparser - -import requests from requests.exceptions import RequestException, ReadTimeout -from .base32_crockford import encode as base32_encode # type: ignore + +from .vendor.base32_crockford import encode as base32_encode # type: ignore from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding try: diff --git a/archivebox/vendor/__init__.py b/archivebox/vendor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/base32_crockford.py b/archivebox/vendor/base32_crockford.py similarity index 100% rename from archivebox/base32_crockford.py rename to archivebox/vendor/base32_crockford.py diff --git a/archivebox/vendor/pocket.py b/archivebox/vendor/pocket.py new file mode 100644 index 00000000..bd49aa29 --- /dev/null +++ b/archivebox/vendor/pocket.py @@ -0,0 +1,368 @@ +# https://github.com/tapanpandita/pocket/blob/master/pocket.py + +import requests +import json +from functools import wraps + + +class PocketException(Exception): + ''' + Base class for all pocket exceptions + http://getpocket.com/developer/docs/errors + + ''' + pass + + +class InvalidQueryException(PocketException): + pass + + +class AuthException(PocketException): + pass + + +class RateLimitException(PocketException): + ''' + http://getpocket.com/developer/docs/rate-limits + + ''' + pass + + +class ServerMaintenanceException(PocketException): + pass + +EXCEPTIONS = { + 400: InvalidQueryException, + 401: AuthException, + 403: RateLimitException, + 503: ServerMaintenanceException, +} + + +def method_wrapper(fn): + + @wraps(fn) + def wrapped(self, *args, **kwargs): + arg_names = list(fn.__code__.co_varnames) + arg_names.remove('self') + kwargs.update(dict(zip(arg_names, args))) + + url = self.api_endpoints[fn.__name__] + payload = dict([ + (k, v) for k, v in kwargs.items() + if v is not None + ]) + payload.update(self.get_payload()) + + return self.make_request(url, payload) + + return wrapped + + +def bulk_wrapper(fn): + + @wraps(fn) + def wrapped(self, *args, **kwargs): + arg_names = list(fn.__code__.co_varnames) + arg_names.remove('self') + kwargs.update(dict(zip(arg_names, args))) + + wait = kwargs.get('wait', True) + query = dict( + [(k, v) for k, v in kwargs.items() if v is not None] + ) + # TODO: Fix this hack + query['action'] = 'add' if fn.__name__ == 'bulk_add' else fn.__name__ + + if wait: + self.add_bulk_query(query) + return self + else: + url = self.api_endpoints['send'] + payload = { + 'actions': [query], + } + payload.update(self.get_payload()) + return self.make_request( + url, + json.dumps(payload), + headers={'content-type': 'application/json'}, + ) + + return wrapped + + +class Pocket(object): + ''' + This class implements a basic python wrapper around the pocket api. For a + detailed documentation of the methods and what they do please refer the + official pocket api documentation at + http://getpocket.com/developer/docs/overview + + ''' + api_endpoints = dict( + (method, 'https://getpocket.com/v3/%s' % method) + for method in "add,send,get".split(",") + ) + + statuses = { + 200: 'Request was successful', + 400: 'Invalid request, please make sure you follow the ' + 'documentation for proper syntax', + 401: 'Problem authenticating the user', + 403: 'User was authenticated, but access denied due to lack of ' + 'permission or rate limiting', + 503: 'Pocket\'s sync server is down for scheduled maintenance.', + } + + def __init__(self, consumer_key, access_token): + self.consumer_key = consumer_key + self.access_token = access_token + self._bulk_query = [] + + self._payload = { + 'consumer_key': self.consumer_key, + 'access_token': self.access_token, + } + + def get_payload(self): + return self._payload + + def add_bulk_query(self, query): + self._bulk_query.append(query) + + @staticmethod + def _post_request(url, payload, headers): + r = requests.post(url, data=payload, headers=headers) + return r + + @classmethod + def _make_request(cls, url, payload, headers=None): + r = cls._post_request(url, payload, headers) + + if r.status_code > 399: + error_msg = cls.statuses.get(r.status_code) + extra_info = r.headers.get('X-Error') + raise EXCEPTIONS.get(r.status_code, PocketException)( + '%s. %s' % (error_msg, extra_info) + ) + + return r.json() or r.text, r.headers + + @classmethod + def make_request(cls, url, payload, headers=None): + return cls._make_request(url, payload, headers) + + @method_wrapper + def add(self, url, title=None, tags=None, tweet_id=None): + ''' + This method allows you to add a page to a user's list. + In order to use the /v3/add endpoint, your consumer key must have the + "Add" permission. + http://getpocket.com/developer/docs/v3/add + + ''' + + @method_wrapper + def get( + self, state=None, favorite=None, tag=None, contentType=None, + sort=None, detailType=None, search=None, domain=None, since=None, + count=None, offset=None + ): + ''' + This method allows you to retrieve a user's list. It supports + retrieving items changed since a specific time to allow for syncing. + http://getpocket.com/developer/docs/v3/retrieve + + ''' + + @method_wrapper + def send(self, actions): + ''' + This method allows you to make changes to a user's list. It supports + adding new pages, marking pages as read, changing titles, or updating + tags. Multiple changes to items can be made in one request. + http://getpocket.com/developer/docs/v3/modify + + ''' + + @bulk_wrapper + def bulk_add( + self, item_id, ref_id=None, tags=None, time=None, title=None, + url=None, wait=True + ): + ''' + Add a new item to the user's list + http://getpocket.com/developer/docs/v3/modify#action_add + + ''' + + @bulk_wrapper + def archive(self, item_id, time=None, wait=True): + ''' + Move an item to the user's archive + http://getpocket.com/developer/docs/v3/modify#action_archive + + ''' + + @bulk_wrapper + def readd(self, item_id, time=None, wait=True): + ''' + Re-add (unarchive) an item to the user's list + http://getpocket.com/developer/docs/v3/modify#action_readd + + ''' + + @bulk_wrapper + def favorite(self, item_id, time=None, wait=True): + ''' + Mark an item as a favorite + http://getpocket.com/developer/docs/v3/modify#action_favorite + + ''' + + @bulk_wrapper + def unfavorite(self, item_id, time=None, wait=True): + ''' + Remove an item from the user's favorites + http://getpocket.com/developer/docs/v3/modify#action_unfavorite + + ''' + + @bulk_wrapper + def delete(self, item_id, time=None, wait=True): + ''' + Permanently remove an item from the user's account + http://getpocket.com/developer/docs/v3/modify#action_delete + + ''' + + @bulk_wrapper + def tags_add(self, item_id, tags, time=None, wait=True): + ''' + Add one or more tags to an item + http://getpocket.com/developer/docs/v3/modify#action_tags_add + + ''' + + @bulk_wrapper + def tags_remove(self, item_id, tags, time=None, wait=True): + ''' + Remove one or more tags from an item + http://getpocket.com/developer/docs/v3/modify#action_tags_remove + + ''' + + @bulk_wrapper + def tags_replace(self, item_id, tags, time=None, wait=True): + ''' + Replace all of the tags for an item with one or more provided tags + http://getpocket.com/developer/docs/v3/modify#action_tags_replace + + ''' + + @bulk_wrapper + def tags_clear(self, item_id, time=None, wait=True): + ''' + Remove all tags from an item. + http://getpocket.com/developer/docs/v3/modify#action_tags_clear + + ''' + + @bulk_wrapper + def tag_rename(self, item_id, old_tag, new_tag, time=None, wait=True): + ''' + Rename a tag. This affects all items with this tag. + http://getpocket.com/developer/docs/v3/modify#action_tag_rename + + ''' + + def commit(self): + ''' + This method executes the bulk query, flushes stored queries and + returns the response + + ''' + url = self.api_endpoints['send'] + payload = { + 'actions': self._bulk_query, + } + payload.update(self._payload) + self._bulk_query = [] + + return self._make_request( + url, + json.dumps(payload), + headers={'content-type': 'application/json'}, + ) + + @classmethod + def get_request_token( + cls, consumer_key, redirect_uri='http://example.com/', state=None + ): + ''' + Returns the request token that can be used to fetch the access token + + ''' + headers = { + 'X-Accept': 'application/json', + } + url = 'https://getpocket.com/v3/oauth/request' + payload = { + 'consumer_key': consumer_key, + 'redirect_uri': redirect_uri, + } + + if state: + payload['state'] = state + + return cls._make_request(url, payload, headers)[0]['code'] + + @classmethod + def get_credentials(cls, consumer_key, code): + ''' + Fetches access token from using the request token and consumer key + + ''' + headers = { + 'X-Accept': 'application/json', + } + url = 'https://getpocket.com/v3/oauth/authorize' + payload = { + 'consumer_key': consumer_key, + 'code': code, + } + + return cls._make_request(url, payload, headers)[0] + + @classmethod + def get_access_token(cls, consumer_key, code): + return cls.get_credentials(consumer_key, code)['access_token'] + + @classmethod + def get_auth_url(cls, code, redirect_uri='http://example.com'): + auth_url = ('https://getpocket.com/auth/authorize' + '?request_token=%s&redirect_uri=%s' % (code, redirect_uri)) + return auth_url + + @classmethod + def auth( + cls, consumer_key, redirect_uri='http://example.com/', state=None, + ): + ''' + This is a test method for verifying if oauth worked + http://getpocket.com/developer/docs/authentication + + ''' + code = cls.get_request_token(consumer_key, redirect_uri, state) + + auth_url = 'https://getpocket.com/auth/authorize?request_token='\ + '%s&redirect_uri=%s' % (code, redirect_uri) + raw_input( + 'Please open %s in your browser to authorize the app and ' + 'press enter:' % auth_url + ) + + return cls.get_access_token(consumer_key, code) diff --git a/archivebox/core/utils_taggit.py b/archivebox/vendor/taggit_utils.py similarity index 100% rename from archivebox/core/utils_taggit.py rename to archivebox/vendor/taggit_utils.py diff --git a/setup.py b/setup.py index f81edf21..d01b3f65 100755 --- a/setup.py +++ b/setup.py @@ -48,6 +48,11 @@ setuptools.setup( "wheel", ], install_requires=[ + # only add things here that have corresponding apt python3-packages available + # anything added here also needs to be added to our package dependencies in + # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc. + # if there is no apt python3-package equivalent, then vendor it instead in + # ./archivebox/vendor/ "requests==2.24.0", "atomicwrites==1.4.0", "mypy-extensions==0.4.3", @@ -59,12 +64,6 @@ setuptools.setup( "python-crontab==2.5.1", "croniter==0.3.34", "w3lib==1.22.0", - "pocket==0.3.6", - # Some/all of these will likely be added in the future: - # wpull - # pywb - # pyppeteer - # archivenow ], extras_require={ 'dev': [ @@ -81,8 +80,6 @@ setuptools.setup( "bottle", "stdeb", ], - # 'redis': ['redis', 'django-redis'], - # 'pywb': ['pywb', 'redis'], }, packages=[PKG_NAME], include_package_data=True, # see MANIFEST.in