From e8808b0a1fbd37b70c98d5220db1cf58b866757e Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@nicksweeting.com>
Date: Fri, 11 Jan 2019 07:02:49 -0500
Subject: [PATCH] add WARC downloading

---
 archivebox/archive_methods.py | 52 +++++++++++++++++++++++++++++++++++
 archivebox/config.py          |  1 +
 2 files changed, 53 insertions(+)

diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index 75a260cd..32fb0bfe 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -17,6 +17,7 @@ from config import (
     FETCH_PDF,
     FETCH_SCREENSHOT,
     FETCH_DOM,
+    FETCH_WARC,
     FETCH_GIT,
     FETCH_MEDIA,
     RESOLUTION,
@@ -106,6 +107,9 @@ def archive_link(link_dir, link, overwrite=True):
     if FETCH_DOM:
         link = fetch_dom(link_dir, link, overwrite=overwrite)
 
+    if FETCH_WARC:
+        link = fetch_warc(link_dir, link, overwrite=overwrite)
+
     if SUBMIT_ARCHIVE_DOT_ORG:
         link = archive_dot_org(link_dir, link, overwrite=overwrite)
 
@@ -495,6 +499,7 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
         'output': output,
     }
 
+
 @attach_result_to_link('git')
 def fetch_git(link_dir, link, timeout=TIMEOUT):
     """download full site using git"""
@@ -530,6 +535,53 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
     }
 
 
+@attach_result_to_link('warc')
+def fetch_warc(link_dir, link, timeout=TIMEOUT):
+    """download link['url'] with wget's --warc-file feature into <link_dir>/warc, returns {'cmd', 'output'} (output is 'warc' or the Exception that was hit)"""
+
+    output = 'warc'  # reported relative to link_dir, the same value the skipped result uses
+    output_path = os.path.join(link_dir, output)
+    if os.path.exists(output_path) and os.listdir(output_path):
+        return {'output': output, 'status': 'skipped'}  # a non-empty warc/ dir counts as already fetched
+
+    os.makedirs(output_path, exist_ok=True)
+    CMD = [
+        'wget',
+        '--warc-file={}'.format(int(datetime.now().timestamp())),
+        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
+        *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate',))),
+        link['url'],
+    ]
+
+    end = progress(timeout, prefix='      ')
+    try:
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)  # wget writes warc/<timestamp>.warc.gz
+        end()
+        # Check for common failure cases
+        if result.returncode > 0:
+            print('        got wget response code {}:'.format(result.returncode))
+            if result.returncode != 8:  # 8 = server issued an error response, reported by the checks below
+                print('\n'.join('          ' + line for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:] if line.strip()))
+            if b'403: Forbidden' in result.stderr:
+                raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
+            if b'404: Not Found' in result.stderr:
+                raise Exception('404 Not Found')
+            if b'ERROR 500: Internal Server Error' in result.stderr:
+                raise Exception('500 Internal Server Error')
+            if result.returncode == 4:  # 4 = network failure
+                raise Exception('Failed warc download')
+    except Exception as e:
+        end()
+        print('        Run to see full output:', 'cd {}; {}'.format(output_path, ' '.join(CMD)))  # cd into warc/, the cwd wget actually ran in
+        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e  # failures are handed back as the output instead of being raised
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }
+
+
 def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
     args = [binary, '--headless']  # '--disable-gpu'
     if not CHROME_SANDBOX:
diff --git a/archivebox/config.py b/archivebox/config.py
index ec634039..38b8b32d 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -21,6 +21,7 @@ FETCH_VIDEO =            os.getenv('FETCH_VIDEO',            'False'
 FETCH_PDF =              os.getenv('FETCH_PDF',              'True'             ).lower() == 'true'
 FETCH_SCREENSHOT =       os.getenv('FETCH_SCREENSHOT',       'True'             ).lower() == 'true'
 FETCH_DOM =              os.getenv('FETCH_DOM',              'True'             ).lower() == 'true'
+FETCH_WARC =             os.getenv('FETCH_WARC',             'True'             ).lower() == 'true'  # gates the fetch_warc step in archive_link; enabled unless the env var is set to something other than "true"
 FETCH_GIT =              os.getenv('FETCH_GIT',              'True'             ).lower() == 'true'
 FETCH_MEDIA =            os.getenv('FETCH_MEDIA',            'False'            ).lower() == 'true'
 FETCH_FAVICON =          os.getenv('FETCH_FAVICON',          'True'             ).lower() == 'true'