Merge branch 'master' into dev

This commit is contained in:
Nick Sweeting 2019-03-30 15:36:54 -04:00
commit 5d0185b6dd
6 changed files with 41 additions and 8 deletions

View file

@@ -1,5 +1,5 @@
--- ---
name: Bug report name: 🐞 Bug report
about: Create a report to help us improve about: Create a report to help us improve
title: '' title: ''
labels: '' labels: ''

View file

@@ -0,0 +1,15 @@
---
name: 📑 Documentation change
about: Submit a suggestion for the Wiki documentation
title: ''
labels: ''
assignees: ''
---
## Wiki Page URL
## Suggested Edit
...

View file

@@ -1,5 +1,5 @@
--- ---
name: Feature request name: 💡 Feature request
about: Suggest an idea for this project about: Suggest an idea for this project
title: '' title: ''
labels: '' labels: ''

View file

@@ -1,3 +1,5 @@
**IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes, I will close them with great prejudice. The PEP8 checks I don't follow are intentional. PRs for minor bugfixes, typos, etc are fine.**
# Summary # Summary
e.g. This PR fixes ABC or adds the ability to do XYZ... e.g. This PR fixes ABC or adds the ability to do XYZ...

View file

@@ -1,12 +1,14 @@
import os import os
import re
import sys import sys
import shutil import shutil
from typing import Optional from typing import Optional, Pattern
from subprocess import run, PIPE, DEVNULL from subprocess import run, PIPE, DEVNULL
OUTPUT_DIR: str OUTPUT_DIR: str
URL_BLACKLIST: Optional[Pattern[str]]
# ****************************************************************************** # ******************************************************************************
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
@@ -24,6 +26,7 @@ TIMEOUT = int(os.getenv('TIMEOUT', '60'))
MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600')) MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600'))
OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' ) OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' )
FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',)
URL_BLACKLIST = os.getenv('URL_BLACKLIST', None)
FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true' FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true' FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
@@ -58,6 +61,11 @@ CHROME_BINARY = os.getenv('CHROME_BINARY', None)
CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
try:
OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
except Exception:
OUTPUT_DIR = None
# ****************************************************************************** # ******************************************************************************
### Terminal Configuration ### Terminal Configuration
@@ -95,6 +103,9 @@ TEMPLATES_DIR = os.path.join(PYTHON_DIR, 'templates')
if COOKIES_FILE: if COOKIES_FILE:
COOKIES_FILE = os.path.abspath(COOKIES_FILE) COOKIES_FILE = os.path.abspath(COOKIES_FILE)
URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE)
########################### Environment & Dependencies #########################
VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip() VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip()
GIT_SHA = VERSION.split('+')[1] GIT_SHA = VERSION.split('+')[1]

View file

@@ -8,6 +8,9 @@ from .util import (
merge_links, merge_links,
) )
from config import (
URL_BLACKLIST,
)
def validate_links(links: Iterable[Link]) -> Iterable[Link]: def validate_links(links: Iterable[Link]) -> Iterable[Link]:
links = archivable_links(links) # remove chrome://, about:, mailto: etc. links = archivable_links(links) # remove chrome://, about:, mailto: etc.
@@ -22,11 +25,11 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
def archivable_links(links: Iterable[Link]) -> Iterable[Link]: def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
"""remove chrome://, about:// or other schemed links that cant be archived""" """remove chrome://, about:// or other schemed links that cant be archived"""
return ( for link in links:
link scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
for link in links not_blacklisted = (not URL_BLACKLIST.match(link.url)) if URL_BLACKLIST else True
if scheme(link.url) in ('http', 'https', 'ftp') if scheme_is_valid and not_blacklisted:
) yield link
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]: def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
@@ -87,3 +90,5 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
new_timestamp = '{}.{}'.format(timestamp, nonce) new_timestamp = '{}.{}'.format(timestamp, nonce)
return new_timestamp return new_timestamp