From bd6d9c165bd0954ea101388f0ea18a81ef7ca186 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 27 Mar 2021 01:01:29 -0400 Subject: [PATCH] enforce utf8 on literally all file operations because windows sucks --- archivebox/cli/tests.py | 12 ++++++------ archivebox/config.py | 8 ++++---- archivebox/extractors/archive_org.py | 2 +- archivebox/extractors/readability.py | 2 +- archivebox/search/utils.py | 2 +- archivebox/system.py | 3 ++- tests/test_add.py | 6 +++--- tests/test_extractors.py | 8 ++++---- tests/test_init.py | 14 +++++++------- 9 files changed, 29 insertions(+), 28 deletions(-) diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py index 5e2e2d96..04c54df8 100644 --- a/archivebox/cli/tests.py +++ b/archivebox/cli/tests.py @@ -75,8 +75,8 @@ def output_hidden(show_failing=True): yield return - sys.stdout = open('stdout.txt', 'w+') - sys.stderr = open('stderr.txt', 'w+') + sys.stdout = open('stdout.txt', 'w+', encoding='utf-8') + sys.stderr = open('stderr.txt', 'w+', encoding='utf-8') try: yield sys.stdout.close() @@ -89,9 +89,9 @@ def output_hidden(show_failing=True): sys.stdout = stdout sys.stderr = stderr if show_failing: - with open('stdout.txt', 'r') as f: + with open('stdout.txt', 'r', encoding='utf-8') as f: print(f.read()) - with open('stderr.txt', 'r') as f: + with open('stderr.txt', 'r', encoding='utf-8') as f: print(f.read()) raise finally: @@ -116,7 +116,7 @@ class TestInit(unittest.TestCase): assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0 def test_conflicting_init(self): - with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+') as f: + with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f: f.write('test') try: @@ -161,7 +161,7 @@ class TestAdd(unittest.TestCase): def test_add_arg_file(self): test_file = Path(OUTPUT_DIR) / 'test.txt' - with open(test_file, 'w+') as f: + with open(test_file, 'w+', encoding='utf-8') as f: f.write(test_urls) with output_hidden(): diff --git a/archivebox/config.py 
b/archivebox/config.py index e9b30061..803e4d19 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -462,7 +462,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: config_file.optionxform = str config_file.read(config_path) - with open(config_path, 'r') as old: + with open(config_path, 'r', encoding='utf-8') as old: atomic_write(f'{config_path}.bak', old.read()) find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0] @@ -490,7 +490,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: else: config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key} - with open(config_path, 'w+') as new: + with open(config_path, 'w+', encoding='utf-8') as new: config_file.write(new) try: @@ -502,7 +502,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: } except: # something went horribly wrong, rever to the previous version - with open(f'{config_path}.bak', 'r') as old: + with open(f'{config_path}.bak', 'r', encoding='utf-8') as old: atomic_write(config_path, old.read()) if Path(f'{config_path}.bak').exists(): @@ -1099,7 +1099,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, from django.conf import settings # log startup message to the error log - with open(settings.ERROR_LOG, "a+") as f: + with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f: command = ' '.join(sys.argv) ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S') f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index 1f382190..a0883113 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -31,7 +31,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr out_dir = out_dir or Path(link.link_dir) if not 
overwrite and (out_dir / 'archive.org.txt').exists(): - # if open(path, 'r').read().strip() != 'None': + # if open(path, 'r', encoding='utf-8').read().strip() != 'None': return False return SAVE_ARCHIVE_DOT_ORG diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index b2e88712..d7c1e303 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -35,7 +35,7 @@ def get_html(link: Link, path: Path) -> str: document = None for source in sources: try: - with open(abs_path / source, "r") as f: + with open(abs_path / source, "r", encoding="utf-8") as f: document = f.read() break except (FileNotFoundError, TypeError): diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py index e6d15455..82d1880e 100644 --- a/archivebox/search/utils.py +++ b/archivebox/search/utils.py @@ -16,7 +16,7 @@ def get_file_result_content(res, extra_path, use_pwd=False): if extra_path: fpath = f'{fpath}/{extra_path}' - with open(fpath, 'r') as file: + with open(fpath, 'r', encoding='utf-8') as file: data = file.read() if data: return [data] diff --git a/archivebox/system.py b/archivebox/system.py index 2191c70a..af0dbbb8 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -37,10 +37,11 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over """Safe atomic write to filesystem by writing to temp file + atomic rename""" mode = 'wb+' if isinstance(contents, bytes) else 'w' + encoding = None if isinstance(contents, bytes) else 'utf-8' # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}') try: - with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f: + with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f: if isinstance(contents, dict): dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) elif isinstance(contents, (bytes, str)): diff --git a/tests/test_add.py b/tests/test_add.py index 
bb15e51b..331178fe 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -33,7 +33,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extrac ) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - with open(archived_item_path / "index.json", "r") as f: + with open(archived_item_path / "index.json", "r", encoding='utf-8') as f: output_json = json.load(f) assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html" @@ -79,7 +79,7 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - with open(archived_item_path / "index.json", "r") as f: + with open(archived_item_path / "index.json", "r", encoding="utf-8") as f: output_json = json.load(f) assert output_json["history"] != {} @@ -90,4 +90,4 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process): archived_item_path = list(tmp_path.glob('archive/**/*'))[0] assert (archived_item_path / "warc").exists() - assert not (archived_item_path / "singlefile.html").exists() \ No newline at end of file + assert not (archived_item_path / "singlefile.html").exists() diff --git a/tests/test_extractors.py b/tests/test_extractors.py index b467f0e1..86b50d51 100644 --- a/tests/test_extractors.py +++ b/tests/test_extractors.py @@ -86,7 +86,7 @@ def test_headers_retrieved(tmp_path, process, disable_extractors_dict): output_file = archived_item_path / "headers.json" assert output_file.exists() headers_file = archived_item_path / 'headers.json' - with open(headers_file) as f: + with open(headers_file, 'r', encoding='utf-8') as f: headers = pyjson.load(f) assert headers['Content-Language'] == 'en' assert headers['Content-Script-Type'] == 'text/javascript' @@ -98,7 +98,7 @@ def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict): capture_output=True, env=disable_extractors_dict) archived_item_path = list(tmp_path.glob("archive/**/*"))[0] output_file = 
archived_item_path / "headers.json" - with open(output_file) as f: + with open(output_file, 'r', encoding='utf-8') as f: headers = pyjson.load(f) assert headers['Content-Language'] == 'en' assert headers['Content-Script-Type'] == 'text/javascript' @@ -110,6 +110,6 @@ def test_headers_400_plus(tmp_path, process, disable_extractors_dict): capture_output=True, env=disable_extractors_dict) archived_item_path = list(tmp_path.glob("archive/**/*"))[0] output_file = archived_item_path / "headers.json" - with open(output_file) as f: + with open(output_file, 'r', encoding='utf-8') as f: headers = pyjson.load(f) - assert headers["Status-Code"] == "200" \ No newline at end of file + assert headers["Status-Code"] == "200" diff --git a/tests/test_init.py b/tests/test_init.py index 50897612..728aedfb 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -28,11 +28,11 @@ def test_add_link(tmp_path, process, disable_extractors_dict): assert "index.json" in [x.name for x in archived_item_path.iterdir()] - with open(archived_item_path / "index.json", "r") as f: + with open(archived_item_path / "index.json", "r", encoding="utf-8") as f: output_json = json.load(f) assert "Example Domain" == output_json['history']['title'][0]['output'] - with open(archived_item_path / "index.html", "r") as f: + with open(archived_item_path / "index.html", "r", encoding="utf-8") as f: output_html = f.read() assert "Example Domain" in output_html @@ -47,7 +47,7 @@ def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict): assert "index.json" in [x.name for x in archived_item_path.iterdir()] - with open(archived_item_path / "index.json", "r") as f: + with open(archived_item_path / "index.json", "r", encoding="utf-8") as f: output_json = json.load(f) assert "Example Domain" == output_json['history']['title'][0]['output'] @@ -75,11 +75,11 @@ def test_collision_urls_different_timestamps(tmp_path, process, disable_extracto first_archive = tmp_path / "archive" / str(min([float(folder) 
for folder in archive_folders])) json_index = str(first_archive / "index.json") - with open(json_index, "r") as f: + with open(json_index, "r", encoding="utf-8") as f: link_details = json.loads(f.read()) link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html" - with open(json_index, "w") as f: + with open(json_index, "w", encoding="utf-8") as f: json.dump(link_details, f) init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict) @@ -98,12 +98,12 @@ def test_collision_timestamps_different_urls(tmp_path, process, disable_extracto archive_folders.remove(first_archive.name) json_index = str(first_archive / "index.json") - with open(json_index, "r") as f: + with open(json_index, "r", encoding="utf-8") as f: link_details = json.loads(f.read()) link_details["timestamp"] = archive_folders[0] - with open(json_index, "w") as f: + with open(json_index, "w", encoding="utf-8") as f: json.dump(link_details, f) init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)