mirror of https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-06-01 15:28:24 -04:00

commit bd6d9c165b (parent 185d2f9f9b)

enforce utf8 on literally all file operations because windows sucks

9 changed files with 29 additions and 28 deletions
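Why this matters: on Windows, open() without an explicit encoding= falls back to the locale code page (commonly cp1252) rather than UTF-8, so a file containing non-ASCII text can raise UnicodeDecodeError or come back garbled when read on another machine. A minimal sketch of the failure mode this commit guards against; the file name and title string below are hypothetical, not taken from the test suite:

import json

# Hypothetical record standing in for an ArchiveBox index.json entry;
# the non-ASCII title is what trips up a cp1252 default on Windows.
details = {"title": "Beispieldomäne 例"}

# Pinning encoding="utf-8" on both the write and the read makes the
# round-trip independent of the platform's locale code page.
with open("index.json", "w", encoding="utf-8") as f:
    json.dump(details, f, ensure_ascii=False)

with open("index.json", "r", encoding="utf-8") as f:
    assert json.load(f)["title"] == details["title"]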
@@ -33,7 +33,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict):
     )
 
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding='utf-8') as f:
         output_json = json.load(f)
     assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
 
@@ -79,7 +79,7 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_dict):
 
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
     assert output_json["history"] != {}
 
@@ -90,4 +90,4 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     assert (archived_item_path / "warc").exists()
-    assert not (archived_item_path / "singlefile.html").exists()
+    assert not (archived_item_path / "singlefile.html").exists()

@@ -86,7 +86,7 @@ def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
     output_file = archived_item_path / "headers.json"
     assert output_file.exists()
     headers_file = archived_item_path / 'headers.json'
-    with open(headers_file) as f:
+    with open(headers_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'
 
@@ -98,7 +98,7 @@ def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):
                              capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
     output_file = archived_item_path / "headers.json"
-    with open(output_file) as f:
+    with open(output_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'
 
@@ -110,6 +110,6 @@ def test_headers_400_plus(tmp_path, process, disable_extractors_dict):
                              capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
     output_file = archived_item_path / "headers.json"
-    with open(output_file) as f:
+    with open(output_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
-    assert headers["Status-Code"] == "200"
+    assert headers["Status-Code"] == "200"

@@ -28,11 +28,11 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
 
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
 
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
     assert "Example Domain" == output_json['history']['title'][0]['output']
 
-    with open(archived_item_path / "index.html", "r") as f:
+    with open(archived_item_path / "index.html", "r", encoding="utf-8") as f:
         output_html = f.read()
     assert "Example Domain" in output_html
 
@@ -47,7 +47,7 @@ def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
 
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
 
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
     assert "Example Domain" == output_json['history']['title'][0]['output']
 
@@ -75,11 +75,11 @@ def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
 
     first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
     json_index = str(first_archive / "index.json")
-    with open(json_index, "r") as f:
+    with open(json_index, "r", encoding="utf-8") as f:
         link_details = json.loads(f.read())
 
     link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html"
-    with open(json_index, "w") as f:
+    with open(json_index, "w", encoding="utf-8") as f:
         json.dump(link_details, f)
 
     init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
 
@@ -98,12 +98,12 @@ def test_collision_timestamps_different_urls(tmp_path, process, disable_extractors_dict):
     archive_folders.remove(first_archive.name)
     json_index = str(first_archive / "index.json")
 
-    with open(json_index, "r") as f:
+    with open(json_index, "r", encoding="utf-8") as f:
         link_details = json.loads(f.read())
 
     link_details["timestamp"] = archive_folders[0]
 
-    with open(json_index, "w") as f:
+    with open(json_index, "w", encoding="utf-8") as f:
         json.dump(link_details, f)
 
     init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
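As an aside, an alternative to pinning encoding= at every call site (and not what this commit does) is Python's UTF-8 mode, available since 3.7, which makes utf-8 the default for open() regardless of the Windows locale:

# UTF-8 mode is enabled via an environment variable or interpreter flag:
#   PYTHONUTF8=1 python -m pytest tests/
#   python -X utf8 -m pytest tests/
import sys
print(sys.flags.utf8_mode)  # prints 1 when UTF-8 mode is active

The explicit per-call encoding keeps the tests deterministic even when that flag is absent, which is presumably why the commit patches each open() individually.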