mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-14)
Removed BeautifulSoup from dependencies

parent 350803a4c8
commit 666760fe06

2 changed files with 19 additions and 23 deletions
@@ -33,9 +33,6 @@ apt update; apt install google-chrome-beta python3 wget
 
 # Check:
 google-chrome --version && which wget && which python3 && echo "[√] All dependencies installed."
-
-# BeautifulSoup4
-sudo pip3 install beautifulsoup4
 ```
 
 **2. Run the archive script:**
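With BeautifulSoup gone, nothing is left to pip-install: every module visible in this diff (os, sys, json, time, datetime, subprocess, plus re used by the new parser) ships with the Python 3 standard library. A minimal sanity check along those lines (illustrative only, not part of the repo):

```python
# Illustrative check (not in the repo): confirm the script's remaining
# imports resolve on a stock Python 3 install, with no pip packages.
import importlib

for name in ('os', 'sys', 're', 'json', 'time', 'datetime', 'subprocess'):
    importlib.import_module(name)  # raises ModuleNotFoundError if absent

print('[√] stdlib-only, no pip dependencies needed')
```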
archive.py (39 changed lines)
@@ -9,9 +9,7 @@ import os
 import sys
 import json
 
-from bs4 import BeautifulSoup
-
-from datetime import datetime, timezone
+from datetime import datetime
 import time
 
 from subprocess import run, PIPE, DEVNULL
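Note that the apparently unused `timezone` name is dropped along with bs4; the parsers only ever build naive local-time datetimes via `datetime.fromtimestamp()`. A small sketch of what that call returns (the value is hypothetical):

```python
from datetime import datetime

secs = "1503442020"                     # hypothetical ADD_DATE value from an export
dt = datetime.fromtimestamp(int(secs))  # naive datetime in the machine's local zone
print(dt.tzinfo)                        # None -- no timezone info attached
```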
@@ -111,25 +109,26 @@ def parse_pinboard_export(html):
         yield info
 
 def parse_bookmarks_export(html):
-    soup = BeautifulSoup(html, "html5lib")
-    for link in soup.find_all('a'):
-
-        url = link.get('href')
-        secs = link.get('add_date')
-        dt = datetime.fromtimestamp(int(secs))
+    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
+    for line in html:
+        match = pattern.search(line)
+        if match:
+            url = match.group(1)
+            secs = match.group(2)
+            dt = datetime.fromtimestamp(int(secs))
 
-        info = {
-            'url': url,
-            'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
-            'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
-            'time': dt,
-            'timestamp': secs,
-            'tags': link.get('tags'),
-            'title': link.string.strip(),
-        }
+            info = {
+                'url': url,
+                'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
+                'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
+                'time': dt,
+                'timestamp': secs,
+                'tags': "",
+                'title': match.group(3)
+            }
 
-        info['type'] = get_link_type(info)
-        yield info
+            info['type'] = get_link_type(info)
+            yield info
 
 
 ### ACHIVING FUNCTIONS
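For reference, a standalone sketch of how the rewritten parser treats a typical line from a Netscape-format bookmarks export. The pattern is copied from the diff above; the sample line and its values are made up:

```python
import re
from datetime import datetime

# Pattern copied from the new parse_bookmarks_export() above.
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>",
                     re.UNICODE | re.IGNORECASE)

# Hypothetical line from a browser's exported bookmarks.html.
line = '<DT><A HREF="https://example.com/post?id=1" ADD_DATE="1503442020" ICON="data:...">Example Post</A>'

match = pattern.search(line)
if match:
    print(match.group(1))                               # https://example.com/post?id=1
    print(datetime.fromtimestamp(int(match.group(2))))  # 2017-08-22 ... (local time)
    print(match.group(3))                               # Example Post
```

One consequence of the rewrite worth noting: `for line in html:` assumes `html` is an iterable of lines (e.g. an open file object); if a single string were passed, iteration would proceed character by character and the pattern would never match.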