diff -Nru libdeflate-1.18/.cirrus.yml libdeflate-1.19/.cirrus.yml --- libdeflate-1.18/.cirrus.yml 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/.cirrus.yml 2023-09-16 23:48:03.000000000 +0000 @@ -2,7 +2,7 @@ freebsd_instance: matrix: - image_family: freebsd-12-3 - - image_family: freebsd-13-0 + - image_family: freebsd-13-2 install_script: pkg install -y cmake script: - cmake -B build -DLIBDEFLATE_BUILD_TESTS=1 diff -Nru libdeflate-1.18/.github/workflows/ci.yml libdeflate-1.19/.github/workflows/ci.yml --- libdeflate-1.18/.github/workflows/ci.yml 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/.github/workflows/ci.yml 2023-09-16 23:48:03.000000000 +0000 @@ -93,26 +93,30 @@ run: cc -O2 -Wall -Werror -municode lib/*{,/*}.c programs/{gzip,prog_util,tgetopt}.c -o libdeflate-gzip.exe windows-visualstudio-build-and-test: - name: Build and test (Windows, Visual Studio ${{matrix.toolset}}, ${{matrix.platform.vs}}) + name: Build and test (Windows, ${{matrix.gen}}, ${{matrix.toolset}}, ${{matrix.vs}}) strategy: matrix: - platform: [ {vs: x64, vcpkg: x64-windows}, - {vs: Win32, vcpkg: x86-windows} ] - toolset: [v143, ClangCL] - runs-on: windows-latest + include: + - {os: windows-2022, gen: "Visual Studio 17 2022", toolset: v143, vs: x64, vcpkg: x64-windows} + - {os: windows-2022, gen: "Visual Studio 17 2022", toolset: ClangCL, vs: x64, vcpkg: x64-windows} + - {os: windows-2022, gen: "Visual Studio 17 2022", toolset: v143, vs: Win32, vcpkg: x86-windows} + - {os: windows-2022, gen: "Visual Studio 17 2022", toolset: ClangCL, vs: Win32, vcpkg: x86-windows} + - {os: windows-2019, gen: "Visual Studio 16 2019", toolset: v142, vs: x64, vcpkg: x64-windows} + - {os: windows-2019, gen: "Visual Studio 16 2019", toolset: v142, vs: Win32, vcpkg: x86-windows} + runs-on: ${{matrix.os}} steps: - uses: actions/checkout@v3 - uses: microsoft/setup-msbuild@v1.1 - - run: vcpkg install zlib:${{matrix.platform.vcpkg}} + - run: vcpkg install zlib:${{matrix.vcpkg}} - run: > - echo C:\vcpkg\packages\zlib_${{matrix.platform.vcpkg}}\bin + echo C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\bin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append # Note: as per the CMake documentation, DESTDIR is unsupported on Windows. - run: > - cmake -B build -G "Visual Studio 17 2022" -T ${{matrix.toolset}} - -A ${{matrix.platform.vs}} -DLIBDEFLATE_BUILD_TESTS=1 - -DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS /IC:\vcpkg\packages\zlib_${{matrix.platform.vcpkg}}\include" - -DZLIB_LIBRARY=C:\vcpkg\packages\zlib_${{matrix.platform.vcpkg}}\lib\zlib.lib + cmake -B build -G "${{matrix.gen}}" -T ${{matrix.toolset}} + -A ${{matrix.vs}} -DLIBDEFLATE_BUILD_TESTS=1 + -DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS /IC:\vcpkg\packages\zlib_${{matrix.vcpkg}}\include" + -DZLIB_LIBRARY=C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\lib\zlib.lib -DCMAKE_INSTALL_PREFIX=build\install - run: cmake --build build --verbose --config Debug - run: cmake --install build --verbose --config Debug diff -Nru libdeflate-1.18/CMakeLists.txt libdeflate-1.19/CMakeLists.txt --- libdeflate-1.18/CMakeLists.txt 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/CMakeLists.txt 2023-09-16 23:48:03.000000000 +0000 @@ -38,7 +38,7 @@ option(LIBDEFLATE_FREESTANDING "Build a freestanding library, i.e. a library that doesn't link to any libc functions like malloc(), free(), and memcpy(). Library users will - need to call libdeflate_set_memory_allocator()." OFF) + need to provide a custom memory allocator." 
OFF) option(LIBDEFLATE_BUILD_GZIP "Build the libdeflate-gzip program" ON) option(LIBDEFLATE_BUILD_TESTS "Build the test programs" OFF) option(LIBDEFLATE_USE_SHARED_LIB diff -Nru libdeflate-1.18/NEWS.md libdeflate-1.19/NEWS.md --- libdeflate-1.18/NEWS.md 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/NEWS.md 2023-09-16 23:48:03.000000000 +0000 @@ -1,5 +1,26 @@ # libdeflate release notes +## Version 1.19 + +* Added new functions `libdeflate_alloc_compressor_ex()` and + `libdeflate_alloc_decompressor_ex()`. These functions allow specifying a + custom memory allocator on a per-compressor basis. + +* libdeflate now always generates Huffman codes with at least 2 codewords. This + fixes a compatibility issue where Windows Explorer's ZIP unpacker could not + decompress DEFLATE streams created by libdeflate. libdeflate's behavior was + allowed by the DEFLATE RFC, but not all software was okay with it. In rare + cases, compression ratios can be slightly reduced by this change. + +* Disabled the use of some compiler intrinsics on MSVC versions where they don't + work correctly. + +* libdeflate can now compress up to the exact size of the output buffer. + +* Slightly improved compression performance at levels 1-9. + +* Improved the compression ratio of very short inputs. + ## Version 1.18 * Fixed a bug where the build type didn't default to "Release" when using diff -Nru libdeflate-1.18/README.md libdeflate-1.19/README.md --- libdeflate-1.18/README.md 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/README.md 2023-09-16 23:48:03.000000000 +0000 @@ -118,6 +118,7 @@ * Julia: [LibDeflate.jl](https://github.com/jakobnissen/LibDeflate.jl) * Nim: [libdeflate-nim](https://github.com/gemesa/libdeflate-nim) * Perl: [Gzip::Libdeflate](https://github.com/benkasminbullock/gzip-libdeflate) +* PHP: [ext-libdeflate](https://github.com/pmmp/ext-libdeflate) * Python: [deflate](https://github.com/dcwatson/deflate) * Ruby: [libdeflate-ruby](https://github.com/kaorimatz/libdeflate-ruby) * Rust: [libdeflater](https://github.com/adamkewley/libdeflater) @@ -126,6 +127,11 @@ the authors of libdeflate. Please direct all questions, bugs, and improvements for these bindings to their authors. +Also, unfortunately many of these bindings bundle or pin an old version of +libdeflate. To avoid known issues in old versions and to improve performance, +before using any of these bindings please ensure that the bundled or pinned +version of libdeflate has been upgraded to the latest release. + # DEFLATE vs. zlib vs. 
gzip The DEFLATE format ([rfc1951](https://www.ietf.org/rfc/rfc1951.txt)), the zlib diff -Nru libdeflate-1.18/debian/changelog libdeflate-1.19/debian/changelog --- libdeflate-1.18/debian/changelog 2023-09-27 18:26:03.000000000 +0000 +++ libdeflate-1.19/debian/changelog 2024-01-17 20:25:15.000000000 +0000 @@ -1,3 +1,10 @@ +libdeflate (1.19-0ubuntu1~16.04.sav0) xenial; urgency=medium + + * New upstream release + * d/libdeflate0.symbols: Add new libdeflate_alloc_{,de}compressor_ex + + -- Rob Savoury Wed, 17 Jan 2024 12:25:15 -0800 + libdeflate (1.18-1~16.04.sav0) xenial; urgency=medium * Backport to Xenial diff -Nru libdeflate-1.18/debian/libdeflate0.symbols libdeflate-1.19/debian/libdeflate0.symbols --- libdeflate-1.18/debian/libdeflate0.symbols 2020-05-13 10:09:28.000000000 +0000 +++ libdeflate-1.19/debian/libdeflate0.symbols 2024-01-17 20:18:51.000000000 +0000 @@ -2,7 +2,9 @@ * Build-Depends-Package: libdeflate-dev libdeflate_adler32@Base 1.0 libdeflate_alloc_compressor@Base 1.0 + libdeflate_alloc_compressor_ex@Base 1.19 libdeflate_alloc_decompressor@Base 1.0 + libdeflate_alloc_decompressor_ex@Base 1.19 libdeflate_crc32@Base 1.0 libdeflate_deflate_compress@Base 1.0 libdeflate_deflate_compress_bound@Base 1.0 diff -Nru libdeflate-1.18/lib/deflate_compress.c libdeflate-1.19/lib/deflate_compress.c --- libdeflate-1.18/lib/deflate_compress.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/deflate_compress.c 2023-09-16 23:48:03.000000000 +0000 @@ -285,46 +285,26 @@ }; /* - * A condensed table which maps offset => offset slot as follows: - * - * offset <= 256: deflate_offset_slot[offset] - * offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)] - * - * This table was generated by scripts/gen_offset_slot_map.py. + * Table: 'offset - 1 => offset_slot' for offset <= 256. + * This was generated by scripts/gen_offset_slot_map.py. 
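+ * (Offsets greater than 256 are mapped through this same table by + * deflate_get_offset_slot() below.)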
*/ -static const u8 deflate_offset_slot[512] = { - 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, - 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, - 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, +static const u8 deflate_offset_slot[256] = { + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, - 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, }; /* The order in which precode codeword lengths are stored */ @@ -477,6 +457,9 @@ void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os); + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; + /* The compression level with which this compressor was created */ unsigned compression_level; @@ -603,7 +586,8 @@ /* The current cost model being used */ struct deflate_costs costs; - struct deflate_costs costs_producing_best_true_cost; + /* Saved cost model */ + struct deflate_costs costs_saved; /* * A table that maps match offset to offset slot. This @@ -654,6 +638,23 @@ */ unsigned min_bits_to_use_nonfinal_path; + /* + * The maximum block length, in uncompressed bytes, at + * which to find and consider the optimal match/literal + * list for the static Huffman codes. 
This strategy + * improves the compression ratio produced by static + * Huffman blocks and can discover more cases in which + * static blocks are worthwhile. This helps mostly with + * small blocks, hence why this parameter is a max_len. + * + * Above this block length, static Huffman blocks are + * only used opportunistically. I.e. a static Huffman + * block is only used if a static block using the same + * match/literal list as the optimized dynamic block + * happens to be cheaper than the dynamic block itself. + */ + unsigned max_len_to_optimize_static_block; + } n; /* (n)ear-optimal */ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -701,24 +702,12 @@ */ u8 *next; - /* - * Pointer to near the end of the output buffer. 'next' will never - * exceed this. There are OUTPUT_END_PADDING bytes reserved after this - * to allow branchlessly writing a whole word at this location. - */ + /* Pointer to the end of the output buffer */ u8 *end; -}; -/* - * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be - * present following os->end, in order to not overrun the buffer when generating - * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) - * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, - * to make the compression algorithm produce the same result on all CPU - * architectures (which is sometimes desirable), we have to unconditionally use - * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. - */ -#define OUTPUT_END_PADDING 8 + /* true if the output buffer ran out of space */ + bool overflow; +}; /* * Add some bits to the bitbuffer variable of the output bitstream. The caller @@ -732,21 +721,29 @@ ASSERT(bitcount <= BITBUF_NBITS); \ } while (0) -/* Flush bits from the bitbuffer variable to the output buffer. */ +/* + * Flush bits from the bitbuffer variable to the output buffer. After this, the + * bitbuffer will contain at most 7 bits (a partial byte). + * + * Since deflate_flush_block() verified ahead of time that there is enough space + * remaining before actually writing the block, it's guaranteed that out_next + * won't exceed os->end. However, there might not be enough space remaining to + * flush a whole word, even though that's fastest. Therefore, flush a whole + * word if there is space for it, otherwise flush a byte at a time. + */ #define FLUSH_BITS() \ do { \ - if (UNALIGNED_ACCESS_IS_FAST) { \ + if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ /* Flush a whole word (branchlessly). */ \ put_unaligned_leword(bitbuf, out_next); \ bitbuf >>= bitcount & ~7; \ - out_next += MIN(out_end - out_next, bitcount >> 3); \ + out_next += bitcount >> 3; \ bitcount &= 7; \ } else { \ /* Flush a byte at a time. */ \ while (bitcount >= 8) { \ - *out_next = bitbuf; \ - if (out_next != out_end) \ - out_next++; \ + ASSERT(out_next < os->end); \ + *out_next++ = bitbuf; \ bitcount -= 8; \ bitbuf >>= 8; \ } \ @@ -1335,7 +1332,6 @@ * eventually return the codewords. */ num_used_syms = sort_symbols(num_syms, freqs, lens, A); - /* * 'num_used_syms' is the number of symbols with nonzero frequency. * This may be less than @num_syms. 'num_used_syms' is also the number @@ -1344,30 +1340,34 @@ */ /* - * Handle special cases where only 0 or 1 symbols were used (had nonzero - * frequency). + * A complete Huffman code must contain at least 2 codewords. Yet, it's + * possible that fewer than 2 symbols were used. When this happens, + * it's usually for the offset code (0-1 symbols used). 
But it's also + * theoretically possible for the litlen and pre codes (1 symbol used). + * + * The DEFLATE RFC explicitly allows the offset code to contain just 1 + * codeword, or even be completely empty. But it's silent about the + * other codes. It also doesn't say whether, in the 1-codeword case, + * the codeword (which it says must be 1 bit) is '0' or '1'. + * + * In any case, some DEFLATE decompressors reject these cases. zlib + * generally allows them, but it does reject precodes that have just 1 + * codeword. More problematically, zlib v1.2.1 and earlier rejected + * empty offset codes, and this behavior can also be seen in Windows + * Explorer's ZIP unpacker (supposedly even still in Windows 11). + * + * Other DEFLATE compressors, including zlib, always send at least 2 + * codewords in order to make a complete Huffman code. Therefore, this + * is a case where practice does not entirely match the specification. + * We follow practice by generating 2 codewords of length 1: codeword + * '0' for symbol 0, and codeword '1' for another symbol -- the used + * symbol if it exists and is not symbol 0, otherwise symbol 1. This + * does worsen the compression ratio by having to send an unnecessary + * offset codeword length. But this only affects rare cases such as + * blocks containing all literals, and it only makes a tiny difference. */ - - if (unlikely(num_used_syms == 0)) { - /* - * Code is empty. sort_symbols() already set all lengths to 0, - * so there is nothing more to do. - */ - return; - } - - if (unlikely(num_used_syms == 1)) { - /* - * Only one symbol was used, so we only need one codeword. But - * two codewords are needed to form the smallest complete - * Huffman code, which uses codewords 0 and 1. Therefore, we - * choose another symbol to which to assign a codeword. We use - * 0 (if the used symbol is not 0) or 1 (if the used symbol is - * 0). In either case, the lesser-valued symbol must be - * assigned codeword 0 so that the resulting code is canonical. - */ - - unsigned sym = A[0] & SYMBOL_MASK; + if (unlikely(num_used_syms < 2)) { + unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; unsigned nonzero_idx = sym ? sym : 1; codewords[0] = 0; @@ -1451,20 +1451,30 @@ /* Return the offset slot for the given match offset, using the small map. */ static forceinline unsigned -deflate_get_offset_slot(unsigned offset) +deflate_get_offset_slot(u32 offset) { -#if 1 - if (offset <= 256) - return deflate_offset_slot[offset]; - else - return deflate_offset_slot[256 + ((offset - 1) >> 7)]; -#else /* Branchless version */ - u32 i1 = offset; - u32 i2 = 256 + ((offset - 1) >> 7); - u32 is_small = (s32)(offset - 257) >> 31; + /* + * 1 <= offset <= 32768 here. For 1 <= offset <= 256, + * deflate_offset_slot[offset - 1] gives the slot. + * + * For 257 <= offset <= 32768, we take advantage of the fact that 257 is + * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == + * 128 times larger than each slot [2..16) (since the number of extra + * bits increases by 1 every 2 slots). Thus, the slot is: + * + * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) + * == deflate_offset_slot[((offset - 1) >> 7)] + 14 + * + * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: + * + * deflate_offset_slot[(offset - 1) >> n] + (n << 1) + * + * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with + * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. 
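+ * + * For example, offset == 1000 gives n == (256 - 1000) >> 29 == 7 (the + * subtraction wraps modulo 2^32), so the slot is + * deflate_offset_slot[(1000 - 1) >> 7] + (7 << 1) == 5 + 14 == 19, + * which is indeed the slot whose base offset is 769 (1000 lies in + * [769, 1024]).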
+ */ + unsigned n = (256 - offset) >> 29; - return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)]; -#endif + return deflate_offset_slot[(offset - 1) >> n] + (n << 1); } static unsigned @@ -1711,20 +1721,26 @@ bitbuf_t bitbuf = os->bitbuf; unsigned bitcount = os->bitcount; u8 *out_next = os->next; - u8 * const out_end = os->end; - /* The cost for each block type, in bits */ - u32 dynamic_cost = 0; - u32 static_cost = 0; - u32 uncompressed_cost = 0; + u8 * const out_fast_end = + os->end - MIN(WORDBYTES - 1, os->end - out_next); + /* + * The cost for each block type, in bits. Start with the cost of the + * block header which is 3 bits. + */ + u32 dynamic_cost = 3; + u32 static_cost = 3; + u32 uncompressed_cost = 3; u32 best_cost; struct deflate_codes *codes; unsigned sym; - ASSERT(block_length >= MIN_BLOCK_LENGTH || is_final_block); + ASSERT(block_length >= MIN_BLOCK_LENGTH || + (is_final_block && block_length > 0)); ASSERT(block_length <= MAX_BLOCK_LENGTH); ASSERT(bitcount <= 7); ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); - ASSERT(out_next <= out_end); + ASSERT(out_next <= os->end); + ASSERT(!os->overflow); /* Precompute the precode items and build the precode. */ deflate_precompute_huffman_header(c); @@ -1782,9 +1798,71 @@ UINT16_MAX) - 1)) + (8 * block_length); - /* Choose and output the cheapest type of block. */ - best_cost = MIN(static_cost, uncompressed_cost); - if (dynamic_cost < best_cost) { + /* + * Choose and output the cheapest type of block. If there is a tie, + * prefer uncompressed, then static, then dynamic. + */ + + best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); + + /* If the block isn't going to fit, then stop early. */ + if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { + os->overflow = true; + return; + } + /* + * Else, now we know that the block fits, so no further bounds checks on + * the output buffer are required until the next block. + */ + + if (best_cost == uncompressed_cost) { + /* + * Uncompressed block(s). DEFLATE limits the length of + * uncompressed blocks to UINT16_MAX bytes, so if the length of + * the "block" we're flushing is over UINT16_MAX, we actually + * output multiple blocks. + */ + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = is_final_block; + len = in_end - in_next; + } + /* It was already checked that there is enough space. */ + ASSERT(os->end - out_next >= + DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); + /* + * Output BFINAL (1 bit) and BTYPE (2 bits), then align + * to a byte boundary. + */ + STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); + *out_next++ = (bfinal << bitcount) | bitbuf; + if (bitcount > 5) + *out_next++ = 0; + bitbuf = 0; + bitcount = 0; + /* Output LEN and NLEN, then the data itself. 
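+ * LEN is the stored length and NLEN is its one's complement, as + * required by the uncompressed block format of RFC 1951.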
*/ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + /* Done outputting uncompressed block(s) */ + goto out; + } + + if (best_cost == static_cost) { + /* Static Huffman block */ + codes = &c->static_codes; + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); + FLUSH_BITS(); + } else { const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; const unsigned num_precode_items = c->o.precode.num_items; unsigned precode_sym, precode_item; @@ -1792,7 +1870,6 @@ /* Dynamic Huffman block */ - best_cost = dynamic_cost; codes = &c->codes; STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); ADD_BITS(is_final_block, 1); @@ -1844,54 +1921,6 @@ deflate_extra_precode_bits[precode_sym]); FLUSH_BITS(); } while (++i < num_precode_items); - } else if (static_cost < uncompressed_cost) { - /* Static Huffman block */ - codes = &c->static_codes; - ADD_BITS(is_final_block, 1); - ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); - FLUSH_BITS(); - } else { - /* - * Uncompressed block(s). DEFLATE limits the length of - * uncompressed blocks to UINT16_MAX bytes, so if the length of - * the "block" we're flushing is over UINT16_MAX, we actually - * output multiple blocks. - */ - do { - u8 bfinal = 0; - size_t len = UINT16_MAX; - - if (in_end - in_next <= UINT16_MAX) { - bfinal = is_final_block; - len = in_end - in_next; - } - if (out_end - out_next < - (bitcount + 3 + 7) / 8 + 4 + len) { - /* Not enough output space remaining. */ - out_next = out_end; - goto out; - } - /* - * Output BFINAL (1 bit) and BTYPE (2 bits), then align - * to a byte boundary. - */ - STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); - *out_next++ = (bfinal << bitcount) | bitbuf; - if (bitcount > 5) - *out_next++ = 0; - bitbuf = 0; - bitcount = 0; - /* Output LEN and NLEN, then the data itself. */ - put_unaligned_le16(len, out_next); - out_next += 2; - put_unaligned_le16(~len, out_next); - out_next += 2; - memcpy(out_next, in_next, len); - out_next += len; - in_next += len; - } while (in_next != in_end); - /* Done outputting uncompressed block(s) */ - goto out; } /* Output the literals and matches for a dynamic or static block. */ @@ -1995,13 +2024,12 @@ out: ASSERT(bitcount <= 7); /* - * Assert that the block cost was computed correctly, as + * Assert that the block cost was computed correctly. This is relied on + * above for the bounds check on the output buffer. Also, * libdeflate_deflate_compress_bound() relies on this via the assumption - * that uncompressed blocks will always be used when cheaper. + * that uncompressed blocks will always be used when cheapest. */ - ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == - 3 + best_cost || out_next == out_end); - + ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); os->bitbuf = bitbuf; os->bitcount = bitcount; os->next = out_next; @@ -2305,6 +2333,13 @@ size_t i; /* + * For very short inputs, the static Huffman code has a good chance of + * being best, in which case there is no reason to avoid short matches. + */ + if (data_len < 512) + return DEFLATE_MIN_MATCH_LEN; + + /* * For an initial approximation, scan the first 4 KiB of data. The * caller may use recalculate_min_match_len() to update min_len later. 
*/ @@ -2483,7 +2518,7 @@ deflate_finish_block(c, os, in_block_begin, in_next - in_block_begin, c->p.f.sequences, in_next == in_end); - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2562,7 +2597,7 @@ deflate_finish_block(c, os, in_block_begin, in_next - in_block_begin, c->p.g.sequences, in_next == in_end); - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } static forceinline void @@ -2768,7 +2803,7 @@ deflate_finish_block(c, os, in_block_begin, in_next - in_block_begin, c->p.g.sequences, in_next == in_end); - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -3389,6 +3424,7 @@ u32 best_true_cost = UINT32_MAX; u32 true_cost; u32 only_lits_cost; + u32 static_cost = UINT32_MAX; struct deflate_sequence seq_; struct deflate_sequence *seq = NULL; u32 i; @@ -3410,6 +3446,24 @@ ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; + /* + * Sometimes a static Huffman block ends up being cheapest, particularly + * if the block is small. So, if the block is sufficiently small, find + * the optimal static block solution and remember its cost. + */ + if (block_length <= c->p.n.max_len_to_optimize_static_block) { + /* Save c->p.n.costs temporarily. */ + c->p.n.costs_saved = c->p.n.costs; + + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; + static_cost += 7; /* for the end-of-block symbol */ + + /* Restore c->p.n.costs. */ + c->p.n.costs = c->p.n.costs_saved; + } + /* Initialize c->p.n.costs with default costs. */ deflate_set_initial_costs(c, block_begin, block_length, is_first_block); @@ -3437,7 +3491,9 @@ break; best_true_cost = true_cost; - c->p.n.costs_producing_best_true_cost = c->p.n.costs; + + /* Save the cost model that gave 'best_true_cost'. */ + c->p.n.costs_saved = c->p.n.costs; /* Update the cost model from the Huffman codes. */ deflate_set_costs_from_codes(c, &c->codes.lens); @@ -3445,20 +3501,26 @@ } while (--num_passes_remaining); *used_only_literals = false; - if (only_lits_cost < best_true_cost) { - /* Using only literals ended up being best! */ - deflate_choose_all_literals(c, block_begin, block_length); - deflate_set_costs_from_codes(c, &c->codes.lens); - seq_.litrunlen_and_length = block_length; - seq = &seq_; - *used_only_literals = true; + if (MIN(only_lits_cost, static_cost) < best_true_cost) { + if (only_lits_cost < static_cost) { + /* Using only literals ended up being best! */ + deflate_choose_all_literals(c, block_begin, block_length); + deflate_set_costs_from_codes(c, &c->codes.lens); + seq_.litrunlen_and_length = block_length; + seq = &seq_; + *used_only_literals = true; + } else { + /* Static block ended up being best! */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + } } else if (true_cost >= best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { /* * The best solution was actually from a non-final optimization * pass, so recover and use the min-cost path from that pass. 
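* (c->p.n.costs_saved still holds the cost model from that pass; it is * restored just below.)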
*/ - c->p.n.costs = c->p.n.costs_producing_best_true_cost; + c->p.n.costs = c->p.n.costs_saved; deflate_find_min_cost_path(c, block_length, cache_ptr); deflate_set_costs_from_codes(c, &c->codes.lens); } @@ -3782,7 +3844,7 @@ deflate_near_optimal_init_stats(c); in_block_begin = in_next; } - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } /* Initialize c->p.n.offset_slot_full. */ @@ -3807,13 +3869,21 @@ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ LIBDEFLATEAPI struct libdeflate_compressor * -libdeflate_alloc_compressor(int compression_level) +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options) { struct libdeflate_compressor *c; size_t size = offsetof(struct libdeflate_compressor, p); check_buildtime_parameters(); + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + if (compression_level < 0 || compression_level > 12) return NULL; @@ -3829,9 +3899,14 @@ size += sizeof(c->p.f); } - c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); + c = libdeflate_aligned_malloc(options->malloc_func ? + options->malloc_func : + libdeflate_default_malloc_func, + MATCHFINDER_MEM_ALIGNMENT, size); if (!c) return NULL; + c->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; c->compression_level = compression_level; @@ -3902,6 +3977,7 @@ c->p.n.max_optim_passes = 2; c->p.n.min_improvement_to_continue = 32; c->p.n.min_bits_to_use_nonfinal_path = 32; + c->p.n.max_len_to_optimize_static_block = 0; deflate_init_offset_slot_full(c); break; case 11: @@ -3911,6 +3987,7 @@ c->p.n.max_optim_passes = 4; c->p.n.min_improvement_to_continue = 16; c->p.n.min_bits_to_use_nonfinal_path = 16; + c->p.n.max_len_to_optimize_static_block = 1000; deflate_init_offset_slot_full(c); break; case 12: @@ -3921,6 +3998,7 @@ c->p.n.max_optim_passes = 10; c->p.n.min_improvement_to_continue = 1; c->p.n.min_bits_to_use_nonfinal_path = 1; + c->p.n.max_len_to_optimize_static_block = 10000; deflate_init_offset_slot_full(c); break; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -3931,6 +4009,16 @@ return c; } + +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_compressor_ex(compression_level, &defaults); +} + LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *c, const void *in, size_t in_nbytes, @@ -3946,38 +4034,40 @@ return deflate_compress_none(in, in_nbytes, out, out_nbytes_avail); - /* - * Initialize the output bitstream structure. - * - * The end is set to OUTPUT_END_PADDING below the true end, so that - * FLUSH_BITS() can be more efficient. - */ - if (unlikely(out_nbytes_avail <= OUTPUT_END_PADDING)) - return 0; + /* Initialize the output bitstream structure. */ os.bitbuf = 0; os.bitcount = 0; os.next = out; - os.end = os.next + out_nbytes_avail - OUTPUT_END_PADDING; + os.end = os.next + out_nbytes_avail; + os.overflow = false; + + /* Call the actual compression function. */ (*c->impl)(c, in, in_nbytes, &os); + + /* Return 0 if the output buffer is too small. 
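+ * (deflate_flush_block() set os.overflow upon detecting that the next + * block couldn't fit in the remaining space, before writing any of it.)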
*/ + if (os.overflow) + return 0; + /* - * If 'os.next' reached 'os.end', then either there was not enough space - * in the output buffer, or the compressed size would have been within - * OUTPUT_END_PADDING of the true end. For performance reasons we don't - * distinguish between these cases; we just make sure to return some - * extra space from libdeflate_deflate_compress_bound(). + * Write the final byte if needed. This can't overflow the output + * buffer because deflate_flush_block() would have set the overflow flag + * if there wasn't enough space remaining for the full final block. */ - if (os.next >= os.end) - return 0; ASSERT(os.bitcount <= 7); - if (os.bitcount) + if (os.bitcount) { + ASSERT(os.next < os.end); *os.next++ = os.bitbuf; + } + + /* Return the compressed size in bytes. */ return os.next - (u8 *)out; } LIBDEFLATEAPI void libdeflate_free_compressor(struct libdeflate_compressor *c) { - libdeflate_aligned_free(c); + if (c) + libdeflate_aligned_free(c->free_func, c); } unsigned int @@ -3990,7 +4080,6 @@ libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, size_t in_nbytes) { - size_t bound = 0; size_t max_blocks; /* @@ -4007,10 +4096,12 @@ */ /* - * The minimum length that is passed to deflate_flush_block() is - * MIN_BLOCK_LENGTH bytes, except for the final block if needed. + * Calculate the maximum number of uncompressed blocks that the + * compressor can use for 'in_nbytes' of data. * - * If deflate_flush_block() decides to use an uncompressed block, it + * The minimum length that is passed to deflate_flush_block() is + * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If + * deflate_flush_block() decides to use an uncompressed block, it * actually will (in general) output a series of uncompressed blocks in * order to stay within the UINT16_MAX limit of DEFLATE. But this can * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', @@ -4029,20 +4120,8 @@ * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the * alignment bits at the very start of the block can be disregarded; * they would otherwise increase the overhead to 6 bytes per block.) + * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. + * To get the final bound, add the number of uncompressed bytes. */ - bound += 5 * max_blocks; - - /* Account for the data itself, stored uncompressed. */ - bound += in_nbytes; - - /* - * Add 1 + OUTPUT_END_PADDING because for performance reasons, the - * compressor doesn't distinguish between cases where there wasn't - * enough space and cases where the compressed size would have been - * 'out_nbytes_avail - OUTPUT_END_PADDING' or greater. Adding - * 1 + OUTPUT_END_PADDING to the bound ensures the needed wiggle room. - */ - bound += 1 + OUTPUT_END_PADDING; - - return bound; + return (5 * max_blocks) + in_nbytes; } diff -Nru libdeflate-1.18/lib/deflate_decompress.c libdeflate-1.19/lib/deflate_decompress.c --- libdeflate-1.18/lib/deflate_decompress.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/deflate_decompress.c 2023-09-16 23:48:03.000000000 +0000 @@ -671,6 +671,9 @@ bool static_codes_loaded; unsigned litlen_tablebits; + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; }; /* @@ -802,38 +805,48 @@ u32 entry; unsigned i; + /* + * The DEFLATE RFC explicitly allows the offset code to be + * incomplete in two cases: a code containing just 1 codeword, + * if that codeword has length 1; and a code containing no + * codewords. 
Note: the list of offset codeword lengths is + * always nonempty, but lengths of 0 don't count as codewords. + * + * The RFC doesn't say whether the same cases are allowed for + * the litlen and pre codes. It's actually impossible for no + * symbols to be used from these codes; however, it's + * technically possible for only one symbol to be used. zlib + * allows 1 codeword for the litlen code, but not the precode. + * The RFC also doesn't say whether, when there is 1 codeword, + * that codeword is '0' or '1'. zlib uses '0'. + * + * We accept what zlib accepts, plus a bit more. First, we + * don't treat the precode more strictly than the litlen and + * offset codes. There's no convincing reason to add a special + * case for the precode here. + * + * Second, we just map each allowed incomplete code to a complete + * code with only real symbols. To do this, we choose a symbol, + * either the used symbol (for codes with 1 codeword) or an + * arbitrary symbol (for empty codes), and give it both + * codewords '0' and '1'. zlib instead uses a special ERROR + * symbol in the part of the codespace the code doesn't use. + * However, having an ERROR symbol reduces the performance of + * the Huffman decoder, for no real benefit. Our approach also + * avoids having to decide whether '0' or '1' is correct. + * + * Like zlib, we still reject all incomplete codes that contain + * more than 1 codeword or a codeword length greater than 1. + */ if (codespace_used == 0) { - /* - * An empty code is allowed. This can happen for the - * offset code in DEFLATE, since a dynamic Huffman block - * need not contain any matches. - */ - - /* sym=0, len=1 (arbitrary) */ - entry = make_decode_table_entry(decode_results, 0, 1); + sym = 0; /* arbitrary */ } else { - /* - * Allow codes with a single used symbol, with codeword - * length 1. The DEFLATE RFC is unclear regarding this - * case. What zlib's decompressor does is permit this - * for the litlen and offset codes and assume the - * codeword is '0' rather than '1'. We do the same - * except we allow this for precodes too, since there's - * no convincing reason to treat the codes differently. - * We also assign both codewords '0' and '1' to the - * symbol to avoid having to handle '1' specially. - */ if (codespace_used != (1U << (max_codeword_len - 1)) || len_counts[1] != 1) return false; - entry = make_decode_table_entry(decode_results, - *sorted_syms, 1); + sym = sorted_syms[0]; } - /* - * Note: the decode table still must be fully initialized, in - * case the stream is malformed and contains bits from the part - * of the codespace the incomplete code doesn't use. - */ + entry = make_decode_table_entry(decode_results, sym, 1); for (i = 0; i < (1U << table_bits); i++) decode_table[i] = entry; return true; @@ -1140,8 +1153,21 @@ } LIBDEFLATEAPI struct libdeflate_decompressor * -libdeflate_alloc_decompressor(void) +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) { + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = (options->malloc_func ?
options->malloc_func : + libdeflate_default_malloc_func)(sizeof(*d)); + if (d == NULL) + return NULL; /* * Note that only certain parts of the decompressor actually must be * initialized here: @@ -1155,18 +1181,28 @@ * valgrind, since build_decode_table() is guaranteed to initialize * all entries eventually anyway.) * + * - 'free_func' must be set. + * * But for simplicity, we currently just zero the whole decompressor. */ - struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d)); - - if (d == NULL) - return NULL; memset(d, 0, sizeof(*d)); + d->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; return d; } +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); +} + LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *d) { - libdeflate_free(d); + if (d) + d->free_func(d); } diff -Nru libdeflate-1.18/lib/lib_common.h libdeflate-1.19/lib/lib_common.h --- libdeflate-1.18/lib/lib_common.h 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/lib_common.h 2023-09-16 23:48:03.000000000 +0000 @@ -39,11 +39,15 @@ #include "../common_defs.h" -void *libdeflate_malloc(size_t size); -void libdeflate_free(void *ptr); +typedef void *(*malloc_func_t)(size_t); +typedef void (*free_func_t)(void *); -void *libdeflate_aligned_malloc(size_t alignment, size_t size); -void libdeflate_aligned_free(void *ptr); +extern malloc_func_t libdeflate_default_malloc_func; +extern free_func_t libdeflate_default_free_func; + +void *libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size); +void libdeflate_aligned_free(free_func_t free_func, void *ptr); #ifdef FREESTANDING /* diff -Nru libdeflate-1.18/lib/utils.c libdeflate-1.19/lib/utils.c --- libdeflate-1.18/lib/utils.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/utils.c 2023-09-16 23:48:03.000000000 +0000 @@ -34,27 +34,18 @@ # include #endif -static void *(*libdeflate_malloc_func)(size_t) = malloc; -static void (*libdeflate_free_func)(void *) = free; +malloc_func_t libdeflate_default_malloc_func = malloc; +free_func_t libdeflate_default_free_func = free; void * -libdeflate_malloc(size_t size) +libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size) { - return (*libdeflate_malloc_func)(size); -} - -void -libdeflate_free(void *ptr) -{ - (*libdeflate_free_func)(ptr); -} + void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); -void * -libdeflate_aligned_malloc(size_t alignment, size_t size) -{ - void *ptr = libdeflate_malloc(sizeof(void *) + alignment - 1 + size); if (ptr) { void *orig_ptr = ptr; + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); ((void **)ptr)[-1] = orig_ptr; } @@ -62,18 +53,17 @@ } void -libdeflate_aligned_free(void *ptr) +libdeflate_aligned_free(free_func_t free_func, void *ptr) { - if (ptr) - libdeflate_free(((void **)ptr)[-1]); + (*free_func)(((void **)ptr)[-1]); } LIBDEFLATEAPI void -libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), - void (*free_func)(void *)) +libdeflate_set_memory_allocator(malloc_func_t malloc_func, + free_func_t free_func) { - libdeflate_malloc_func = malloc_func; - libdeflate_free_func = free_func; + libdeflate_default_malloc_func = malloc_func; + libdeflate_default_free_func = free_func; } /* diff -Nru libdeflate-1.18/lib/x86/cpu_features.c 
libdeflate-1.19/lib/x86/cpu_features.c --- libdeflate-1.18/lib/x86/cpu_features.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/x86/cpu_features.c 2023-09-16 23:48:03.000000000 +0000 @@ -30,15 +30,17 @@ #if HAVE_DYNAMIC_X86_CPU_FEATURES -/* With old GCC versions we have to manually save and restore the x86_32 PIC - * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */ +/* + * With old GCC versions we have to manually save and restore the x86_32 PIC + * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 + */ #if defined(ARCH_X86_32) && defined(__PIC__) # define EBX_CONSTRAINT "=&r" #else # define EBX_CONSTRAINT "=b" #endif -/* Execute the CPUID instruction. */ +/* Execute the CPUID instruction. */ static inline void cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) { @@ -51,40 +53,37 @@ *c = result[2]; *d = result[3]; #else - __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" - "cpuid \n" - ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" - : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) - : "a" (leaf), "c" (subleaf)); + __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); #endif } -/* Read an extended control register. */ +/* Read an extended control register. */ static inline u64 read_xcr(u32 index) { #ifdef _MSC_VER return _xgetbv(index); #else - u32 edx, eax; + u32 d, a; - /* Execute the "xgetbv" instruction. Old versions of binutils do not - * recognize this instruction, so list the raw bytes instead. */ - __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index)); + /* + * Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. + * + * This must be 'volatile' to prevent this code from being moved out + * from under the check for OSXSAVE. + */ + __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : + "=d" (d), "=a" (a) : "c" (index)); - return ((u64)edx << 32) | eax; + return ((u64)d << 32) | a; #endif } -#undef BIT -#define BIT(nr) (1UL << (nr)) - -#define XCR0_BIT_SSE BIT(1) -#define XCR0_BIT_AVX BIT(2) - -#define IS_SET(reg, nr) ((reg) & BIT(nr)) -#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask)) - static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_SSE2, "sse2"}, {X86_CPU_FEATURE_PCLMUL, "pclmul"}, @@ -98,47 +97,34 @@ /* Initialize libdeflate_x86_cpu_features. */ void libdeflate_init_x86_cpu_features(void) { + u32 max_leaf, a, b, c, d; + u64 xcr0 = 0; u32 features = 0; - u32 dummy1, dummy2, dummy3, dummy4; - u32 max_function; - u32 features_1, features_2, features_3, features_4; - bool os_avx_support = false; - - /* Get maximum supported function */ - cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4); - if (max_function < 1) - goto out; - /* Standard feature flags */ - cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1); + /* EAX=0: Highest Function Parameter and Manufacturer ID */ + cpuid(0, 0, &max_leaf, &b, &c, &d); + if (max_leaf < 1) + goto out; - if (IS_SET(features_1, 26)) + /* EAX=1: Processor Info and Feature Bits */ + cpuid(1, 0, &a, &b, &c, &d); + if (d & (1 << 26)) features |= X86_CPU_FEATURE_SSE2; - - if (IS_SET(features_2, 1)) + if (c & (1 << 1)) features |= X86_CPU_FEATURE_PCLMUL; - - if (IS_SET(features_2, 27)) { /* OSXSAVE set? 
*/ - u64 xcr0 = read_xcr(0); - - os_avx_support = IS_ALL_SET(xcr0, - XCR0_BIT_SSE | - XCR0_BIT_AVX); - } - - if (os_avx_support && IS_SET(features_2, 28)) + if (c & (1 << 27)) + xcr0 = read_xcr(0); + if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) features |= X86_CPU_FEATURE_AVX; - if (max_function < 7) + if (max_leaf < 7) goto out; - /* Extended feature flags */ - cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4); - - if (os_avx_support && IS_SET(features_3, 5)) + /* EAX=7, ECX=0: Extended Features */ + cpuid(7, 0, &a, &b, &c, &d); + if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6)) features |= X86_CPU_FEATURE_AVX2; - - if (IS_SET(features_3, 8)) + if (b & (1 << 8)) features |= X86_CPU_FEATURE_BMI2; out: diff -Nru libdeflate-1.18/lib/x86/cpu_features.h libdeflate-1.19/lib/x86/cpu_features.h --- libdeflate-1.18/lib/x86/cpu_features.h 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/x86/cpu_features.h 2023-09-16 23:48:03.000000000 +0000 @@ -145,6 +145,16 @@ #else # define HAVE_BMI2_INTRIN 0 #endif +/* + * MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() + * intrinsics. It seems to be fixed in VS2022. + */ +#if defined(_MSC_VER) && _MSC_VER < 1930 /* older than VS2022 (toolset v143) */ +# undef HAVE_BMI2_NATIVE +# undef HAVE_BMI2_INTRIN +# define HAVE_BMI2_NATIVE 0 +# define HAVE_BMI2_INTRIN 0 +#endif #endif /* ARCH_X86_32 || ARCH_X86_64 */ diff -Nru libdeflate-1.18/libdeflate.h libdeflate-1.19/libdeflate.h --- libdeflate-1.18/libdeflate.h 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/libdeflate.h 2023-09-16 23:48:03.000000000 +0000 @@ -13,8 +13,8 @@ #endif #define LIBDEFLATE_VERSION_MAJOR 1 -#define LIBDEFLATE_VERSION_MINOR 18 -#define LIBDEFLATE_VERSION_STRING "1.18" +#define LIBDEFLATE_VERSION_MINOR 19 +#define LIBDEFLATE_VERSION_STRING "1.19" /* * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause @@ -35,6 +35,7 @@ /* ========================================================================== */ struct libdeflate_compressor; +struct libdeflate_options; /* * libdeflate_alloc_compressor() allocates a new compressor that supports @@ -58,11 +59,18 @@ libdeflate_alloc_compressor(int compression_level); /* + * Like libdeflate_alloc_compressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options); + +/* * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of * data. It attempts to compress 'in_nbytes' bytes of data located at 'in' and * write the result to 'out', which has space for 'out_nbytes_avail' bytes. The * return value is the compressed size in bytes, or 0 if the data could not be - * compressed to 'out_nbytes_avail' bytes or fewer (but see note below). + * compressed to 'out_nbytes_avail' bytes or fewer. * * If compression is successful, then the output data is guaranteed to be a * valid DEFLATE stream that decompresses to the input data. No other @@ -72,22 +80,6 @@ * writing tests that compare compressed data to a golden output, as this can * break when libdeflate is updated. (This property isn't specific to * libdeflate; the same is true for zlib and other compression libraries too.) - * - * Note: due to a performance optimization, libdeflate_deflate_compress() - * currently needs a small amount of slack space at the end of the output - * buffer. As a result, it can't actually report compressed sizes very close to - * 'out_nbytes_avail'. 
This doesn't matter in real-world use cases, and - * libdeflate_deflate_compress_bound() already includes the slack space. - * However, it does mean that testing code that redundantly compresses data - * using an exact-sized output buffer won't work as might be expected: - * - * out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes, out, - * libdeflate_deflate_compress_bound(in_nbytes)); - * // The following assertion will fail. - * assert(libdeflate_deflate_compress(c, in, in_nbytes, out, out_nbytes) != 0); - * - * To avoid this, either don't write tests like the above, or make sure to - * include at least 9 bytes of slack space in 'out_nbytes_avail'. */ LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *compressor, @@ -171,6 +163,7 @@ /* ========================================================================== */ struct libdeflate_decompressor; +struct libdeflate_options; /* * libdeflate_alloc_decompressor() allocates a new decompressor that can be used @@ -188,6 +181,12 @@ libdeflate_alloc_decompressor(void); /* + * Like libdeflate_alloc_decompressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options); + +/* * Result of a call to libdeflate_deflate_decompress(), * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). */ @@ -351,16 +350,60 @@ /* * Install a custom memory allocator which libdeflate will use for all memory - * allocations. 'malloc_func' is a function that must behave like malloc(), and - * 'free_func' is a function that must behave like free(). + * allocations by default. 'malloc_func' is a function that must behave like + * malloc(), and 'free_func' is a function that must behave like free(). + * + * The per-(de)compressor custom memory allocator that can be specified in + * 'struct libdeflate_options' takes priority over this. * - * There must not be any libdeflate_compressor or libdeflate_decompressor - * structures in existence when calling this function. + * This doesn't affect the free() function that will be used to free + * (de)compressors that were already in existence when this is called. */ LIBDEFLATEAPI void libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), void (*free_func)(void *)); +/* + * Advanced options. This is the options structure that + * libdeflate_alloc_compressor_ex() and libdeflate_alloc_decompressor_ex() + * require. Most users won't need this and should just use the non-"_ex" + * functions instead. If you do need this, it should be initialized like this: + * + * struct libdeflate_options options; + * + * memset(&options, 0, sizeof(options)); + * options.sizeof_options = sizeof(options); + * // Then set the fields that you need to override the defaults for. + */ +struct libdeflate_options { + + /* + * This field must be set to the struct size. This field exists for + * extensibility, so that fields can be appended to this struct in + * future versions of libdeflate while still supporting old binaries. + */ + size_t sizeof_options; + + /* + * An optional custom memory allocator to use for this (de)compressor. + * 'malloc_func' must be a function that behaves like malloc(), and + * 'free_func' must be a function that behaves like free(). + * + * This is useful in cases where a process might have multiple users of + * libdeflate who want to use different memory allocators. 
For example, + * a library might want to use libdeflate with a custom memory allocator + * without interfering with user code that might use libdeflate too. + * + * This takes priority over the "global" memory allocator (which by + * default is malloc() and free(), but can be changed by + * libdeflate_set_memory_allocator()). Moreover, libdeflate will never + * call the "global" memory allocator if a per-(de)compressor custom + * allocator is always given. + */ + void *(*malloc_func)(size_t); + void (*free_func)(void *); +}; + #ifdef __cplusplus } #endif diff -Nru libdeflate-1.18/programs/test_custom_malloc.c libdeflate-1.19/programs/test_custom_malloc.c --- libdeflate-1.18/programs/test_custom_malloc.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/programs/test_custom_malloc.c 2023-09-16 23:48:03.000000000 +0000 @@ -1,7 +1,7 @@ /* * test_custom_malloc.c * - * Test libdeflate_set_memory_allocator(). + * Test the support for custom memory allocators. * Also test injecting allocation failures. */ @@ -28,24 +28,34 @@ free(ptr); } -int -tmain(int argc, tchar *argv[]) +static void reset_state(void) +{ + libdeflate_set_memory_allocator(malloc, free); + malloc_count = 0; + free_count = 0; +} + +/* Test that the custom allocator is actually used when requested. */ +static void do_custom_memalloc_test(bool global) { + static const struct libdeflate_options options = { + .sizeof_options = sizeof(options), + .malloc_func = do_malloc, + .free_func = do_free, + }; int level; struct libdeflate_compressor *c; struct libdeflate_decompressor *d; - begin_program(argv); - - /* Test that the custom allocator is actually used when requested. */ - - libdeflate_set_memory_allocator(do_malloc, do_free); - ASSERT(malloc_count == 0); - ASSERT(free_count == 0); + if (global) + libdeflate_set_memory_allocator(do_malloc, do_free); for (level = 0; level <= 12; level++) { malloc_count = free_count = 0; - c = libdeflate_alloc_compressor(level); + if (global) + c = libdeflate_alloc_compressor(level); + else + c = libdeflate_alloc_compressor_ex(level, &options); ASSERT(c != NULL); ASSERT(malloc_count == 1); ASSERT(free_count == 0); @@ -55,7 +65,10 @@ } malloc_count = free_count = 0; - d = libdeflate_alloc_decompressor(); + if (global) + d = libdeflate_alloc_decompressor(); + else + d = libdeflate_alloc_decompressor_ex(&options); ASSERT(d != NULL); ASSERT(malloc_count == 1); ASSERT(free_count == 0); @@ -63,7 +76,52 @@ ASSERT(malloc_count == 1); ASSERT(free_count == 1); - /* As long as we're here, also test injecting allocation failures. */ + reset_state(); +} + +#define offsetofend(type, field) \ + (offsetof(type, field) + sizeof(((type *)NULL)->field)) + +/* Test some edge cases involving libdeflate_options. */ +static void do_options_test(void) +{ + struct libdeflate_options options = { 0 }; + struct libdeflate_compressor *c; + struct libdeflate_decompressor *d; + /* Size in libdeflate v1.19 */ + size_t min_size = offsetofend(struct libdeflate_options, free_func); + + /* sizeof_options must be at least the minimum size. */ + for (; options.sizeof_options < min_size; + options.sizeof_options++) { + c = libdeflate_alloc_compressor_ex(6, &options); + ASSERT(c == NULL); + d = libdeflate_alloc_decompressor_ex(&options); + ASSERT(d == NULL); + } + + /* NULL malloc_func and free_func means "use the global allocator". 
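+ * (That is, whichever allocator pair was last installed with + * libdeflate_set_memory_allocator() -- here, do_malloc() and do_free().)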
*/ + options.sizeof_options = min_size; + malloc_count = free_count = 0; + libdeflate_set_memory_allocator(do_malloc, do_free); + c = libdeflate_alloc_compressor_ex(6, &options); + libdeflate_free_compressor(c); + ASSERT(malloc_count == 1); + ASSERT(free_count == 1); + d = libdeflate_alloc_decompressor_ex(&options); + libdeflate_free_decompressor(d); + ASSERT(malloc_count == 2); + ASSERT(free_count == 2); + + reset_state(); +} + +/* Test injecting memory allocation failures. */ +static void do_fault_injection_test(void) +{ + int level; + struct libdeflate_compressor *c; + struct libdeflate_decompressor *d; libdeflate_set_memory_allocator(do_fail_malloc, do_free); @@ -81,5 +139,17 @@ ASSERT(malloc_count == 1); ASSERT(free_count == 0); + reset_state(); +} + +int +tmain(int argc, tchar *argv[]) +{ + begin_program(argv); + + do_custom_memalloc_test(true); + do_custom_memalloc_test(false); + do_options_test(); + do_fault_injection_test(); return 0; } diff -Nru libdeflate-1.18/scripts/gen_offset_slot_map.py libdeflate-1.19/scripts/gen_offset_slot_map.py --- libdeflate-1.18/scripts/gen_offset_slot_map.py 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/scripts/gen_offset_slot_map.py 2023-09-16 23:48:03.000000000 +0000 @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# This script generates the deflate_offset_slot[] array, which is a condensed -# map from offsets to offset slots. +# This script generates the deflate_offset_slot[] array, which maps +# 'offset - 1 => offset_slot' for offset <= 256. DEFLATE_OFFSET_SLOT_BASE = [ 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , @@ -10,26 +10,14 @@ 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , ] -DEFLATE_EXTRA_OFFSET_BITS = [ - 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , - 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , - 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , - 11 , 11 , 12 , 12 , 13 , 13 , -] - -offset_slot_map = [0] * 512 - -for offset_slot, offset_base in enumerate(DEFLATE_OFFSET_SLOT_BASE): - num_extra_bits = DEFLATE_EXTRA_OFFSET_BITS[offset_slot] - offset_end = offset_base + (1 << num_extra_bits) - if offset_base <= 256: - for offset in range(offset_base, offset_end): - offset_slot_map[offset] = offset_slot - else: - for offset in range(offset_base, offset_end, 128): - offset_slot_map[256 + ((offset - 1) >> 7)] = offset_slot +offset_slot_map = [0] * 256 +offset_slot = -1 +for offset in range(1, len(offset_slot_map) + 1): + if offset >= DEFLATE_OFFSET_SLOT_BASE[offset_slot + 1]: + offset_slot += 1 + offset_slot_map[offset - 1] = offset_slot -print('static const u8 deflate_offset_slot_map[512] = {') +print(f'static const u8 deflate_offset_slot[{len(offset_slot_map)}] = {{') for i in range(0, len(offset_slot_map), 16): print('\t', end='') for j, v in enumerate(offset_slot_map[i:i+16]):
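To illustrate the new allocator API whose declarations appear in the libdeflate.h hunk above, here is a minimal usage sketch; the wrapper functions my_malloc() and my_free() are hypothetical stand-ins for whatever malloc()/free()-like pair a caller supplies:

#include <stdlib.h>
#include <libdeflate.h>

/* Hypothetical allocator wrappers; any malloc()/free()-like pair works. */
static void *my_malloc(size_t size) { return malloc(size); }
static void my_free(void *ptr) { free(ptr); }

int main(void)
{
	/* Per libdeflate.h: set sizeof_options; unset fields stay zero/NULL. */
	struct libdeflate_options options = {
		.sizeof_options = sizeof(options),
		.malloc_func = my_malloc, /* NULL would mean "use the global allocator" */
		.free_func = my_free,
	};
	struct libdeflate_compressor *c =
		libdeflate_alloc_compressor_ex(6, &options);
	struct libdeflate_decompressor *d =
		libdeflate_alloc_decompressor_ex(&options);
	int ok = (c != NULL && d != NULL);

	/* ... compress/decompress as usual ... */

	/* Both free functions accept NULL and release memory via my_free(). */
	libdeflate_free_compressor(c);
	libdeflate_free_decompressor(d);
	return ok ? 0 : 1;
}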