| 
 | 1 | +import pathlib  | 
 | 2 | +import re  | 
 | 3 | +from typing import Any  | 
 | 4 | + | 
 | 5 | +import pytest  | 
 | 6 | + | 
 | 7 | +from zimscraperlib.zim import Archive, Creator  | 
 | 8 | +from zimscraperlib.zim.dedup import Deduplicator  | 
 | 9 | + | 
 | 10 | + | 
 | 11 | +def test_deduplicator(  | 
 | 12 | +    tmp_path: pathlib.Path,  | 
 | 13 | +    png_image: pathlib.Path,  | 
 | 14 | +    html_file: pathlib.Path,  | 
 | 15 | +    html_str: str,  | 
 | 16 | +    html_str_cn: str,  | 
 | 17 | +):  | 
 | 18 | +    main_path = "welcome"  | 
 | 19 | + | 
 | 20 | +    png_data = png_image.read_bytes()  | 
 | 21 | + | 
 | 22 | +    def add_items(creator_or_deduplicator: Any):  | 
 | 23 | +        creator_or_deduplicator.add_item_for(  | 
 | 24 | +            "welcome1", "wel1", content=html_str, is_front=True  | 
 | 25 | +        )  | 
 | 26 | +        creator_or_deduplicator.add_item_for(  | 
 | 27 | +            "welcome2", "wel2", content=html_str, is_front=True  | 
 | 28 | +        )  | 
 | 29 | +        creator_or_deduplicator.add_item_for(  | 
 | 30 | +            "dedup/welcome3", "wel3", content=html_str, is_front=True  | 
 | 31 | +        )  | 
 | 32 | +        creator_or_deduplicator.add_item_for(  | 
 | 33 | +            "dedup/welcome4", "wel4", content=html_str, is_front=True  | 
 | 34 | +        )  | 
 | 35 | +        creator_or_deduplicator.add_item_for(  | 
 | 36 | +            "prefix/dedup/welcome5", "wel5", content=html_str, is_front=True  | 
 | 37 | +        )  | 
 | 38 | +        creator_or_deduplicator.add_item_for("image1", None, fpath=png_image)  | 
 | 39 | +        creator_or_deduplicator.add_item_for("image2", None, content=png_data)  | 
 | 40 | +        creator_or_deduplicator.add_item_for("dedup/image3", None, fpath=png_image)  | 
 | 41 | +        creator_or_deduplicator.add_item_for("dedup/image4", None, content=png_data)  | 
 | 42 | +        creator_or_deduplicator.add_item_for("dedup/html", None, fpath=html_file)  | 
 | 43 | +        creator_or_deduplicator.add_item_for("dedup/html_cn", None, content=html_str_cn)  | 
 | 44 | +        creator_or_deduplicator.add_item_for(  | 
 | 45 | +            "prefix/dedup/image5", None, content=png_data  | 
 | 46 | +        )  | 
 | 47 | + | 
 | 48 | +    fpath_without_dedup = tmp_path / "zim_without_dedup.zim"  | 
 | 49 | +    with Creator(fpath_without_dedup, main_path).config_dev_metadata() as creator:  | 
 | 50 | +        add_items(creator)  | 
 | 51 | + | 
 | 52 | +    assert fpath_without_dedup.exists()  | 
 | 53 | + | 
 | 54 | +    fpath_with_dedup = tmp_path / "zim_with_dedup.zim"  | 
 | 55 | +    with Creator(fpath_with_dedup, main_path).config_dev_metadata() as creator:  | 
 | 56 | +        deduplicator = Deduplicator(creator)  | 
 | 57 | +        deduplicator.filters.append(re.compile("^foo/.*$"))  | 
 | 58 | +        deduplicator.filters.append(re.compile("^dedup/.*$"))  | 
 | 59 | +        deduplicator.filters.append(re.compile("^bar/.*$"))  | 
 | 60 | +        add_items(deduplicator)  | 
 | 61 | + | 
 | 62 | +        # added_items contains only original items, not the duplicates  | 
 | 63 | +        assert set(deduplicator.added_items.values()) == {  | 
 | 64 | +            "dedup/welcome3",  | 
 | 65 | +            "dedup/image3",  | 
 | 66 | +            "dedup/html_cn",  | 
 | 67 | +        }  | 
 | 68 | + | 
 | 69 | +    assert fpath_with_dedup.exists()  | 
 | 70 | + | 
 | 71 | +    # check that deduplication has a consequence on ZIM size  | 
 | 72 | +    assert (  | 
 | 73 | +        fpath_without_dedup.lstat().st_size - fpath_with_dedup.lstat().st_size  | 
 | 74 | +    ) > 3000  # 3291 as of libzim 9.3  | 
 | 75 | + | 
 | 76 | +    for zim_path in [fpath_with_dedup, fpath_without_dedup]:  | 
 | 77 | +        reader = Archive(zim_path)  | 
 | 78 | + | 
 | 79 | +        assert reader.all_entry_count == 24  | 
 | 80 | + | 
 | 81 | +        for html_path in [  | 
 | 82 | +            "welcome1",  | 
 | 83 | +            "welcome2",  | 
 | 84 | +            "dedup/welcome3",  | 
 | 85 | +            "dedup/welcome4",  | 
 | 86 | +            "prefix/dedup/welcome5",  | 
 | 87 | +            "dedup/html",  | 
 | 88 | +        ]:  | 
 | 89 | +            assert bytes(reader.get_item(html_path).content).decode() == html_str  | 
 | 90 | +        assert bytes(reader.get_item("dedup/html_cn").content).decode() == html_str_cn  | 
 | 91 | + | 
 | 92 | +        for img_path in [  | 
 | 93 | +            "image1",  | 
 | 94 | +            "image2",  | 
 | 95 | +            "dedup/image3",  | 
 | 96 | +            "dedup/image4",  | 
 | 97 | +            "prefix/dedup/image5",  | 
 | 98 | +        ]:  | 
 | 99 | +            assert bytes(reader.get_item(img_path).content) == png_data  | 
 | 100 | + | 
 | 101 | + | 
 | 102 | +def test_missing_content(tmp_path: pathlib.Path):  | 
 | 103 | +    with Creator(tmp_path / "test.zin", "foo").config_dev_metadata() as creator:  | 
 | 104 | +        deduplicator = Deduplicator(creator)  | 
 | 105 | +        deduplicator.filters.append(re.compile(".*"))  | 
 | 106 | +        with pytest.raises(Exception, match="Either content or fpath are mandatory"):  | 
 | 107 | +            deduplicator.add_item_for("welcome", None)  | 
0 commit comments