Merge pull request #20 from ilvar/issue-18-parse-browser-bookmarks

#18 - Parse browser bookmarks
This commit is contained in:
Nick Sweeting 2017-06-08 17:06:32 -05:00 committed by GitHub
commit 350803a4c8
2 changed files with 29 additions and 1 deletions

View file

@ -33,6 +33,9 @@ apt update; apt install google-chrome-beta python3 wget
# Check: # Check:
google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed." google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed."
# BeautifulSoup4 + html5lib (the parser used for browser bookmark exports)
sudo pip3 install beautifulsoup4 html5lib
``` ```
**2. Run the archive script:** **2. Run the archive script:**

View file

@ -9,7 +9,9 @@ import os
import sys import sys
import json import json
from datetime import datetime from bs4 import BeautifulSoup
from datetime import datetime, timezone
import time import time
from subprocess import run, PIPE, DEVNULL from subprocess import run, PIPE, DEVNULL
@ -108,6 +110,27 @@ def parse_pinboard_export(html):
info['type'] = get_link_type(info) info['type'] = get_link_type(info)
yield info yield info
def parse_bookmarks_export(html):
    """Yield one link-info dict per ``<a>`` element of a browser bookmarks export.

    ``html`` is handed straight to ``BeautifulSoup`` (``create_archive`` passes
    the opened export file).  The markup is parsed with the ``html5lib``
    backend, which is a separate package from ``beautifulsoup4`` and must be
    installed for this parser to work.

    Each yielded dict has the keys ``url``, ``domain``, ``base_url``, ``time``
    (a ``datetime``), ``timestamp`` (the epoch seconds as a string), ``tags``,
    ``title`` and ``type`` (filled in by ``get_link_type``).

    Anchors without an ``href`` are skipped.  Anchors whose ``add_date``
    attribute is missing or not a usable epoch value are stamped with the
    current time instead of aborting the whole import.
    """
    soup = BeautifulSoup(html, "html5lib")
    for link in soup.find_all('a'):
        url = link.get('href')
        if not url:
            # Nothing to archive for an anchor that carries no target.
            continue

        secs = link.get('add_date')
        try:
            dt = datetime.fromtimestamp(int(secs))
        except (TypeError, ValueError, OverflowError, OSError):
            # add_date absent (None), non-numeric, or out of the platform's
            # supported range: fall back to "now" so the link is still kept.
            dt = datetime.now()
            secs = str(int(dt.timestamp()))

        info = {
            'url': url,
            'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
            'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
            'time': dt,
            'timestamp': secs,
            # Keep tags a string even when the attribute is absent.
            'tags': link.get('tags') or '',
            # link.string is None for empty anchors or anchors with several
            # child nodes; get_text() always returns a str.
            'title': link.get_text().strip(),
        }

        info['type'] = get_link_type(info)
        yield info
### ARCHIVING FUNCTIONS ### ARCHIVING FUNCTIONS
@ -278,6 +301,8 @@ def create_archive(export_file, service, resume=None):
links = parse_pocket_export(f) links = parse_pocket_export(f)
elif service == "pinboard": elif service == "pinboard":
links = parse_pinboard_export(f) links = parse_pinboard_export(f)
elif service == "bookmarks":
links = parse_bookmarks_export(f)
links = list(reversed(sorted(links, key=lambda l: l['timestamp']))) # most recent first links = list(reversed(sorted(links, key=lambda l: l['timestamp']))) # most recent first
if resume: if resume:
links = [link for link in links if link['timestamp'] >= resume] links = [link for link in links if link['timestamp'] >= resume]