From 318b9ae1db223867bdc45d95d3fb9212bf8161d9 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@nicksweeting.com>
Date: Mon, 30 Oct 2017 02:50:37 -0500
Subject: [PATCH] finished manual link merging logic to fix folder conflicts

---
 links.py |  27 +-----------
 util.py  | 125 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 113 insertions(+), 39 deletions(-)

diff --git a/links.py b/links.py
index 22242d17..4365469c 100644
--- a/links.py
+++ b/links.py
@@ -35,8 +35,9 @@ Link {
 from util import (
     domain,
     base_url,
-    get_str_between,
+    str_between,
     get_link_type,
+    merge_links,
 )
 
 
@@ -89,30 +90,6 @@ def sorted_links(links):
     sort_func = lambda link: (link['timestamp'], link['url'])
     return sorted(links, key=sort_func, reverse=True)
 
-
-
-def merge_links(a, b):
-    """deterministially merge two links, favoring longer field values over shorter,
-    and "cleaner" values over worse ones.
-    """
-    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
-    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
-    
-    url = longer('url')
-    longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
-    link = {
-        'timestamp': earlier('timestamp'),
-        'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'tags': longer('tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
-        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
-    }
-    link['type'] = get_link_type(link)
-    return link
-
 def links_after_timestamp(links, timestamp=None):
     if not timestamp:
         yield from links
diff --git a/util.py b/util.py
index 98be2978..74aec6d8 100644
--- a/util.py
+++ b/util.py
@@ -10,6 +10,7 @@ from subprocess import run, PIPE, DEVNULL
 from multiprocessing import Process
 
 from config import (
+    IS_TTY,
     ARCHIVE_PERMISSIONS,
     ARCHIVE_DIR,
     TIMEOUT,
@@ -220,6 +221,27 @@ def get_link_type(link):
         return 'vimeo'
     return None
 
+def merge_links(a, b):
+    """deterministically merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones (on a tie the value from b is used).
+    """
+    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
+    url = longer('url')
+    longest_title = longer('title')
+    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    link = {
+        'timestamp': earlier('timestamp'),
+        'url': url,
+        'domain': domain(url),
+        'base_url': base_url(url),
+        'tags': longer('tags'),
+        'title': longest_title if '://' not in longest_title else cleanest_title,
+        # sorted: bare set order can differ between runs (str hash randomization)
+        'sources': sorted(set(a.get('sources', []) + b.get('sources', []))),
+    }
+    link['type'] = get_link_type(link)
+    return link
 
 def find_link(folder, links):
     """for a given archive folder, find the corresponding link object in links"""
@@ -244,8 +266,13 @@ def parse_url(folder):
     link_json = os.path.join('./html/archive/' + folder, 'index.json')
     if os.path.exists(link_json):
         with open(link_json, 'r') as f:
-            link = json.load(f)
-            return link['base_url']
+            try:
+                link_json = f.read().strip()
+                if link_json:
+                    link = json.loads(link_json)
+                    return link['base_url']
+            except ValueError:
+                print('File contains invalid JSON: {}!'.format(link_json))
 
     archive_org_txt = os.path.join('./html/archive/' + folder, 'archive.org.txt')
     if os.path.exists(archive_org_txt):
@@ -256,15 +283,72 @@ def parse_url(folder):
 
     return ''
 
+def manually_merge_folders(source, target):
+    """prompt for user input to resolve a conflict between two archive folders:
+    'a' keeps source's copy of each clashing file, 'b' keeps target's, and all other
+    files in source move into target; enter keeps both, 'q'/Ctrl-C exits (SystemExit)"""
 
-def merge_folders(folder, link):
+    if not IS_TTY:
+        return
+
+    print('    {} and {} have conflicting files, which do you want to keep?'.format(os.path.basename(source), os.path.basename(target)))
+    print('      - [enter]: do nothing (keep both)')
+    print('      - a:       keep everything from {}'.format(source))
+    print('      - b:       keep everything from {}'.format(target))
+    print('      - q:       quit and resolve the conflict manually')
+    answer = None
+    while answer not in ('', 'a', 'b', 'q'):  # re-prompt on typos (assert is stripped by -O)
+        try:
+            answer = input('> ').strip().lower()
+        except (KeyboardInterrupt, EOFError):
+            answer = 'q'
+
+    if answer == 'q':
+        print('\nJust run Bookmark Archiver again to pick up where you left off.')
+        raise SystemExit(0)
+    elif answer == '':
+        return
+
+    files_in_target = set(os.listdir(target))
+    for file in os.listdir(source):
+        if file in files_in_target:
+            to_delete = target if answer == 'a' else source
+            run(['rm', '-Rf', os.path.join(to_delete, file)])
+        if answer == 'a' or file not in files_in_target:
+            # source's copy still exists (it was kept, or never clashed): move it over
+            run(['mv', os.path.join(source, file), os.path.join(target, file)])
+    if not os.listdir(source):
+        run(['rm', '-Rf', source])
+
+def merge_folders(path, folder, link):
-    """given a folder, merge it to the canonical 'correct' path for the given link object"""
+    """given a folder in path, merge it to the canonical 'correct' path for the given link
+    (path/<link timestamp>); raises ValueError on a url mismatch, False if a plain move fails"""
+    # parse_url() builds the archive path itself, so it needs the bare folder name
     base_url = parse_url(folder)
     if not (base_url in link['base_url']
             or link['base_url'] in base_url):
-        print(base_url, link['base_url'])
-        assert False
-    print('{} > {}'.format(folder, link['timestamp']))
+        raise ValueError('The link does not match the url for this folder.')
+
+    source, target = os.path.join(path, folder), os.path.join(path, link['timestamp'])
+    if not os.path.exists(target):
+        # target doesn't exist so nothing needs merging, simply move A to B
+        if run(['mv', source, target]).returncode:
+            print('Failed to move {} to {}!'.format(source, target))
+            return False
+        return True  # source is gone after the move; listing it below would raise
+
+    # target folder exists, check for conflicting files and attempt manual merge
+    files_in_source = set(os.listdir(source))
+    files_in_target = set(os.listdir(target))
+    if not files_in_source.intersection(files_in_target):
+        # no conflicts, move everything from A to B
+        for file in files_in_source:
+            run(['mv', os.path.join(source, file), os.path.join(target, file)])
+
+    if os.listdir(source):
+        manually_merge_folders(source, target)
+    else:
+        run(['rm', '-R', source])
 
 
 def cleanup_archive(path, links):
@@ -277,15 +361,28 @@ def cleanup_archive(path, links):
     # check each folder for a "domain.com" folder or 
 
     unmatched = []
+    bad_folders = []
+
+    if not os.path.exists(path):
+        return
 
     for folder in os.listdir(path):
-        link = find_link(folder, links)
-        if link is None:
-            unmatched.append(folder)
-            continue
-        
-        if folder != link['timestamp']:
-            merge_folders(folder, link)
+        if not os.listdir(os.path.join(path, folder)):
+            # delete empty folders
+            run(['rm', '-R', os.path.join(path, folder)])
+        else:
+            link = find_link(folder, links)
+            if link is None:
+                unmatched.append(folder)
+                continue
+            
+            if folder != link['timestamp']:
+                bad_folders.append((folder, link))
+    
+    if bad_folders:
+        print('[!] Fixing {} improperly named folders in archive...'.format(len(bad_folders)))
+        for folder, link in bad_folders:
+            merge_folders(path, folder, link)
 
     if unmatched:
         print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))