diff -Nru libdeflate-1.18/.cirrus.yml libdeflate-1.19/.cirrus.yml --- libdeflate-1.18/.cirrus.yml 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/.cirrus.yml 2023-09-16 23:48:03.000000000 +0000 @@ -2,7 +2,7 @@ freebsd_instance: matrix: - image_family: freebsd-12-3 - - image_family: freebsd-13-0 + - image_family: freebsd-13-2 install_script: pkg install -y cmake script: - cmake -B build -DLIBDEFLATE_BUILD_TESTS=1 diff -Nru libdeflate-1.18/.github/workflows/ci.yml libdeflate-1.19/.github/workflows/ci.yml --- libdeflate-1.18/.github/workflows/ci.yml 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/.github/workflows/ci.yml 2023-09-16 23:48:03.000000000 +0000 @@ -93,26 +93,30 @@ run: cc -O2 -Wall -Werror -municode lib/*{,/*}.c programs/{gzip,prog_util,tgetopt}.c -o libdeflate-gzip.exe windows-visualstudio-build-and-test: - name: Build and test (Windows, Visual Studio ${{matrix.toolset}}, ${{matrix.platform.vs}}) + name: Build and test (Windows, ${{matrix.gen}}, ${{matrix.toolset}}, ${{matrix.vs}}) strategy: matrix: - platform: [ {vs: x64, vcpkg: x64-windows}, - {vs: Win32, vcpkg: x86-windows} ] - toolset: [v143, ClangCL] - runs-on: windows-latest + include: + - {os: windows-2022, gen: "Visual Studio 17 2022", toolset: v143, vs: x64, vcpkg: x64-windows} + - {os: windows-2022, gen: "Visual Studio 17 2022", toolset: ClangCL, vs: x64, vcpkg: x64-windows} + - {os: windows-2022, gen: "Visual Studio 17 2022", toolset: v143, vs: Win32, vcpkg: x86-windows} + - {os: windows-2022, gen: "Visual Studio 17 2022", toolset: ClangCL, vs: Win32, vcpkg: x86-windows} + - {os: windows-2019, gen: "Visual Studio 16 2019", toolset: v142, vs: x64, vcpkg: x64-windows} + - {os: windows-2019, gen: "Visual Studio 16 2019", toolset: v142, vs: Win32, vcpkg: x86-windows} + runs-on: ${{matrix.os}} steps: - uses: actions/checkout@v3 - uses: microsoft/setup-msbuild@v1.1 - - run: vcpkg install zlib:${{matrix.platform.vcpkg}} + - run: vcpkg install zlib:${{matrix.vcpkg}} - run: > - echo C:\vcpkg\packages\zlib_${{matrix.platform.vcpkg}}\bin + echo C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\bin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append # Note: as per the CMake documentation, DESTDIR is unsupported on Windows. - run: > - cmake -B build -G "Visual Studio 17 2022" -T ${{matrix.toolset}} - -A ${{matrix.platform.vs}} -DLIBDEFLATE_BUILD_TESTS=1 - -DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS /IC:\vcpkg\packages\zlib_${{matrix.platform.vcpkg}}\include" - -DZLIB_LIBRARY=C:\vcpkg\packages\zlib_${{matrix.platform.vcpkg}}\lib\zlib.lib + cmake -B build -G "${{matrix.gen}}" -T ${{matrix.toolset}} + -A ${{matrix.vs}} -DLIBDEFLATE_BUILD_TESTS=1 + -DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS /IC:\vcpkg\packages\zlib_${{matrix.vcpkg}}\include" + -DZLIB_LIBRARY=C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\lib\zlib.lib -DCMAKE_INSTALL_PREFIX=build\install - run: cmake --build build --verbose --config Debug - run: cmake --install build --verbose --config Debug diff -Nru libdeflate-1.18/CMakeLists.txt libdeflate-1.19/CMakeLists.txt --- libdeflate-1.18/CMakeLists.txt 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/CMakeLists.txt 2023-09-16 23:48:03.000000000 +0000 @@ -38,7 +38,7 @@ option(LIBDEFLATE_FREESTANDING "Build a freestanding library, i.e. a library that doesn't link to any libc functions like malloc(), free(), and memcpy(). Library users will - need to call libdeflate_set_memory_allocator()." OFF) + need to provide a custom memory allocator." 
OFF) option(LIBDEFLATE_BUILD_GZIP "Build the libdeflate-gzip program" ON) option(LIBDEFLATE_BUILD_TESTS "Build the test programs" OFF) option(LIBDEFLATE_USE_SHARED_LIB diff -Nru libdeflate-1.18/NEWS.md libdeflate-1.19/NEWS.md --- libdeflate-1.18/NEWS.md 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/NEWS.md 2023-09-16 23:48:03.000000000 +0000 @@ -1,5 +1,26 @@ # libdeflate release notes +## Version 1.19 + +* Added new functions `libdeflate_alloc_compressor_ex()` and + `libdeflate_alloc_decompressor_ex()`. These functions allow specifying a + custom memory allocator on a per-compressor basis. + +* libdeflate now always generates Huffman codes with at least 2 codewords. This + fixes a compatibility issue where Windows Explorer's ZIP unpacker could not + decompress DEFLATE streams created by libdeflate. libdeflate's behavior was + allowed by the DEFLATE RFC, but not all software was okay with it. In rare + cases, compression ratios can be slightly reduced by this change. + +* Disabled the use of some compiler intrinsics on MSVC versions where they don't + work correctly. + +* libdeflate can now compress up to the exact size of the output buffer. + +* Slightly improved compression performance at levels 1-9. + +* Improved the compression ratio of very short inputs. + ## Version 1.18 * Fixed a bug where the build type didn't default to "Release" when using diff -Nru libdeflate-1.18/README.md libdeflate-1.19/README.md --- libdeflate-1.18/README.md 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/README.md 2023-09-16 23:48:03.000000000 +0000 @@ -118,6 +118,7 @@ * Julia: [LibDeflate.jl](https://github.com/jakobnissen/LibDeflate.jl) * Nim: [libdeflate-nim](https://github.com/gemesa/libdeflate-nim) * Perl: [Gzip::Libdeflate](https://github.com/benkasminbullock/gzip-libdeflate) +* PHP: [ext-libdeflate](https://github.com/pmmp/ext-libdeflate) * Python: [deflate](https://github.com/dcwatson/deflate) * Ruby: [libdeflate-ruby](https://github.com/kaorimatz/libdeflate-ruby) * Rust: [libdeflater](https://github.com/adamkewley/libdeflater) @@ -126,6 +127,11 @@ the authors of libdeflate. Please direct all questions, bugs, and improvements for these bindings to their authors. +Also, unfortunately many of these bindings bundle or pin an old version of +libdeflate. To avoid known issues in old versions and to improve performance, +before using any of these bindings please ensure that the bundled or pinned +version of libdeflate has been upgraded to the latest release. + # DEFLATE vs. zlib vs. 
gzip The DEFLATE format ([rfc1951](https://www.ietf.org/rfc/rfc1951.txt)), the zlib diff -Nru libdeflate-1.18/debian/changelog libdeflate-1.19/debian/changelog --- libdeflate-1.18/debian/changelog 2023-09-27 18:26:03.000000000 +0000 +++ libdeflate-1.19/debian/changelog 2024-01-17 20:25:15.000000000 +0000 @@ -1,3 +1,10 @@ +libdeflate (1.19-0ubuntu1~16.04.sav0) xenial; urgency=medium + + * New upstream release + * d/libdeflate0.symbols: Add new libdeflate_alloc_{,de}compressor_ex + + -- Rob Savoury Wed, 17 Jan 2024 12:25:15 -0800 + libdeflate (1.18-1~16.04.sav0) xenial; urgency=medium * Backport to Xenial diff -Nru libdeflate-1.18/debian/libdeflate0.symbols libdeflate-1.19/debian/libdeflate0.symbols --- libdeflate-1.18/debian/libdeflate0.symbols 2020-05-13 10:09:28.000000000 +0000 +++ libdeflate-1.19/debian/libdeflate0.symbols 2024-01-17 20:18:51.000000000 +0000 @@ -2,7 +2,9 @@ * Build-Depends-Package: libdeflate-dev libdeflate_adler32@Base 1.0 libdeflate_alloc_compressor@Base 1.0 + libdeflate_alloc_compressor_ex@Base 1.19 libdeflate_alloc_decompressor@Base 1.0 + libdeflate_alloc_decompressor_ex@Base 1.19 libdeflate_crc32@Base 1.0 libdeflate_deflate_compress@Base 1.0 libdeflate_deflate_compress_bound@Base 1.0 diff -Nru libdeflate-1.18/lib/deflate_compress.c libdeflate-1.19/lib/deflate_compress.c --- libdeflate-1.18/lib/deflate_compress.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/deflate_compress.c 2023-09-16 23:48:03.000000000 +0000 @@ -285,46 +285,26 @@ }; /* - * A condensed table which maps offset => offset slot as follows: - * - * offset <= 256: deflate_offset_slot[offset] - * offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)] - * - * This table was generated by scripts/gen_offset_slot_map.py. + * Table: 'offset - 1 => offset_slot' for offset <= 256. + * This was generated by scripts/gen_offset_slot_map.py. 
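+ * (Offsets greater than 256 are mapped through this same table by + * deflate_get_offset_slot() below.)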
*/ -static const u8 deflate_offset_slot[512] = { - 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, - 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, - 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, +static const u8 deflate_offset_slot[256] = { + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, - 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, - 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, }; /* The order in which precode codeword lengths are stored */ @@ -477,6 +457,9 @@ void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, size_t in_nbytes, struct deflate_output_bitstream *os); + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; + /* The compression level with which this compressor was created */ unsigned compression_level; @@ -603,7 +586,8 @@ /* The current cost model being used */ struct deflate_costs costs; - struct deflate_costs costs_producing_best_true_cost; + /* Saved cost model */ + struct deflate_costs costs_saved; /* * A table that maps match offset to offset slot. This @@ -654,6 +638,23 @@ */ unsigned min_bits_to_use_nonfinal_path; + /* + * The maximum block length, in uncompressed bytes, at + * which to find and consider the optimal match/literal + * list for the static Huffman codes. 
This strategy + * improves the compression ratio produced by static + * Huffman blocks and can discover more cases in which + * static blocks are worthwhile. This helps mostly with + * small blocks, hence why this parameter is a max_len. + * + * Above this block length, static Huffman blocks are + * only used opportunistically. I.e. a static Huffman + * block is only used if a static block using the same + * match/literal list as the optimized dynamic block + * happens to be cheaper than the dynamic block itself. + */ + unsigned max_len_to_optimize_static_block; + } n; /* (n)ear-optimal */ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -701,24 +702,12 @@ */ u8 *next; - /* - * Pointer to near the end of the output buffer. 'next' will never - * exceed this. There are OUTPUT_END_PADDING bytes reserved after this - * to allow branchlessly writing a whole word at this location. - */ + /* Pointer to the end of the output buffer */ u8 *end; -}; -/* - * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be - * present following os->end, in order to not overrun the buffer when generating - * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) - * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, - * to make the compression algorithm produce the same result on all CPU - * architectures (which is sometimes desirable), we have to unconditionally use - * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. - */ -#define OUTPUT_END_PADDING 8 + /* true if the output buffer ran out of space */ + bool overflow; +}; /* * Add some bits to the bitbuffer variable of the output bitstream. The caller @@ -732,21 +721,29 @@ ASSERT(bitcount <= BITBUF_NBITS); \ } while (0) -/* Flush bits from the bitbuffer variable to the output buffer. */ +/* + * Flush bits from the bitbuffer variable to the output buffer. After this, the + * bitbuffer will contain at most 7 bits (a partial byte). + * + * Since deflate_flush_block() verified ahead of time that there is enough space + * remaining before actually writing the block, it's guaranteed that out_next + * won't exceed os->end. However, there might not be enough space remaining to + * flush a whole word, even though that's fastest. Therefore, flush a whole + * word if there is space for it, otherwise flush a byte at a time. + */ #define FLUSH_BITS() \ do { \ - if (UNALIGNED_ACCESS_IS_FAST) { \ + if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ /* Flush a whole word (branchlessly). */ \ put_unaligned_leword(bitbuf, out_next); \ bitbuf >>= bitcount & ~7; \ - out_next += MIN(out_end - out_next, bitcount >> 3); \ + out_next += bitcount >> 3; \ bitcount &= 7; \ } else { \ /* Flush a byte at a time. */ \ while (bitcount >= 8) { \ - *out_next = bitbuf; \ - if (out_next != out_end) \ - out_next++; \ + ASSERT(out_next < os->end); \ + *out_next++ = bitbuf; \ bitcount -= 8; \ bitbuf >>= 8; \ } \ @@ -1335,7 +1332,6 @@ * eventually return the codewords. */ num_used_syms = sort_symbols(num_syms, freqs, lens, A); - /* * 'num_used_syms' is the number of symbols with nonzero frequency. * This may be less than @num_syms. 'num_used_syms' is also the number @@ -1344,30 +1340,34 @@ */ /* - * Handle special cases where only 0 or 1 symbols were used (had nonzero - * frequency). + * A complete Huffman code must contain at least 2 codewords. Yet, it's + * possible that fewer than 2 symbols were used. When this happens, + * it's usually for the offset code (0-1 symbols used). 
But it's also + * theoretically possible for the litlen and pre codes (1 symbol used). + * + * The DEFLATE RFC explicitly allows the offset code to contain just 1 + * codeword, or even be completely empty. But it's silent about the + * other codes. It also doesn't say whether, in the 1-codeword case, + * the codeword (which it says must be 1 bit) is '0' or '1'. + * + * In any case, some DEFLATE decompressors reject these cases. zlib + * generally allows them, but it does reject precodes that have just 1 + * codeword. More problematically, zlib v1.2.1 and earlier rejected + * empty offset codes, and this behavior can also be seen in Windows + * Explorer's ZIP unpacker (supposedly even still in Windows 11). + * + * Other DEFLATE compressors, including zlib, always send at least 2 + * codewords in order to make a complete Huffman code. Therefore, this + * is a case where practice does not entirely match the specification. + * We follow practice by generating 2 codewords of length 1: codeword + * '0' for symbol 0, and codeword '1' for another symbol -- the used + * symbol if it exists and is not symbol 0, otherwise symbol 1. This + * does worsen the compression ratio by having to send an unnecessary + * offset codeword length. But this only affects rare cases such as + * blocks containing all literals, and it only makes a tiny difference. */ - - if (unlikely(num_used_syms == 0)) { - /* - * Code is empty. sort_symbols() already set all lengths to 0, - * so there is nothing more to do. - */ - return; - } - - if (unlikely(num_used_syms == 1)) { - /* - * Only one symbol was used, so we only need one codeword. But - * two codewords are needed to form the smallest complete - * Huffman code, which uses codewords 0 and 1. Therefore, we - * choose another symbol to which to assign a codeword. We use - * 0 (if the used symbol is not 0) or 1 (if the used symbol is - * 0). In either case, the lesser-valued symbol must be - * assigned codeword 0 so that the resulting code is canonical. - */ - - unsigned sym = A[0] & SYMBOL_MASK; + if (unlikely(num_used_syms < 2)) { + unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; unsigned nonzero_idx = sym ? sym : 1; codewords[0] = 0; @@ -1451,20 +1451,30 @@ /* Return the offset slot for the given match offset, using the small map. */ static forceinline unsigned -deflate_get_offset_slot(unsigned offset) +deflate_get_offset_slot(u32 offset) { -#if 1 - if (offset <= 256) - return deflate_offset_slot[offset]; - else - return deflate_offset_slot[256 + ((offset - 1) >> 7)]; -#else /* Branchless version */ - u32 i1 = offset; - u32 i2 = 256 + ((offset - 1) >> 7); - u32 is_small = (s32)(offset - 257) >> 31; + /* + * 1 <= offset <= 32768 here. For 1 <= offset <= 256, + * deflate_offset_slot[offset - 1] gives the slot. + * + * For 257 <= offset <= 32768, we take advantage of the fact that 257 is + * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == + * 128 times larger than each slot [2..16) (since the number of extra + * bits increases by 1 every 2 slots). Thus, the slot is: + * + * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) + * == deflate_offset_slot[((offset - 1) >> 7)] + 14 + * + * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: + * + * deflate_offset_slot[(offset - 1) >> n] + (n << 1) + * + * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with + * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. 
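+ * + * For example, offset == 1000 gives n == (256 - 1000) >> 29 == 7 (the + * subtraction wraps modulo 2^32), so the slot is + * deflate_offset_slot[(1000 - 1) >> 7] + (7 << 1) == 5 + 14 == 19, + * which is indeed the slot whose base offset is 769 (1000 lies in + * [769, 1024]).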
+ */ + unsigned n = (256 - offset) >> 29; - return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)]; -#endif + return deflate_offset_slot[(offset - 1) >> n] + (n << 1); } static unsigned @@ -1711,20 +1721,26 @@ bitbuf_t bitbuf = os->bitbuf; unsigned bitcount = os->bitcount; u8 *out_next = os->next; - u8 * const out_end = os->end; - /* The cost for each block type, in bits */ - u32 dynamic_cost = 0; - u32 static_cost = 0; - u32 uncompressed_cost = 0; + u8 * const out_fast_end = + os->end - MIN(WORDBYTES - 1, os->end - out_next); + /* + * The cost for each block type, in bits. Start with the cost of the + * block header which is 3 bits. + */ + u32 dynamic_cost = 3; + u32 static_cost = 3; + u32 uncompressed_cost = 3; u32 best_cost; struct deflate_codes *codes; unsigned sym; - ASSERT(block_length >= MIN_BLOCK_LENGTH || is_final_block); + ASSERT(block_length >= MIN_BLOCK_LENGTH || + (is_final_block && block_length > 0)); ASSERT(block_length <= MAX_BLOCK_LENGTH); ASSERT(bitcount <= 7); ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); - ASSERT(out_next <= out_end); + ASSERT(out_next <= os->end); + ASSERT(!os->overflow); /* Precompute the precode items and build the precode. */ deflate_precompute_huffman_header(c); @@ -1782,9 +1798,71 @@ UINT16_MAX) - 1)) + (8 * block_length); - /* Choose and output the cheapest type of block. */ - best_cost = MIN(static_cost, uncompressed_cost); - if (dynamic_cost < best_cost) { + /* + * Choose and output the cheapest type of block. If there is a tie, + * prefer uncompressed, then static, then dynamic. + */ + + best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); + + /* If the block isn't going to fit, then stop early. */ + if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { + os->overflow = true; + return; + } + /* + * Else, now we know that the block fits, so no further bounds checks on + * the output buffer are required until the next block. + */ + + if (best_cost == uncompressed_cost) { + /* + * Uncompressed block(s). DEFLATE limits the length of + * uncompressed blocks to UINT16_MAX bytes, so if the length of + * the "block" we're flushing is over UINT16_MAX, we actually + * output multiple blocks. + */ + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = is_final_block; + len = in_end - in_next; + } + /* It was already checked that there is enough space. */ + ASSERT(os->end - out_next >= + DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); + /* + * Output BFINAL (1 bit) and BTYPE (2 bits), then align + * to a byte boundary. + */ + STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); + *out_next++ = (bfinal << bitcount) | bitbuf; + if (bitcount > 5) + *out_next++ = 0; + bitbuf = 0; + bitcount = 0; + /* Output LEN and NLEN, then the data itself. 
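+ * LEN is the stored length and NLEN is its one's complement, as + * required by the uncompressed block format of RFC 1951.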
*/ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + /* Done outputting uncompressed block(s) */ + goto out; + } + + if (best_cost == static_cost) { + /* Static Huffman block */ + codes = &c->static_codes; + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); + FLUSH_BITS(); + } else { const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; const unsigned num_precode_items = c->o.precode.num_items; unsigned precode_sym, precode_item; @@ -1792,7 +1870,6 @@ /* Dynamic Huffman block */ - best_cost = dynamic_cost; codes = &c->codes; STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); ADD_BITS(is_final_block, 1); @@ -1844,54 +1921,6 @@ deflate_extra_precode_bits[precode_sym]); FLUSH_BITS(); } while (++i < num_precode_items); - } else if (static_cost < uncompressed_cost) { - /* Static Huffman block */ - codes = &c->static_codes; - ADD_BITS(is_final_block, 1); - ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); - FLUSH_BITS(); - } else { - /* - * Uncompressed block(s). DEFLATE limits the length of - * uncompressed blocks to UINT16_MAX bytes, so if the length of - * the "block" we're flushing is over UINT16_MAX, we actually - * output multiple blocks. - */ - do { - u8 bfinal = 0; - size_t len = UINT16_MAX; - - if (in_end - in_next <= UINT16_MAX) { - bfinal = is_final_block; - len = in_end - in_next; - } - if (out_end - out_next < - (bitcount + 3 + 7) / 8 + 4 + len) { - /* Not enough output space remaining. */ - out_next = out_end; - goto out; - } - /* - * Output BFINAL (1 bit) and BTYPE (2 bits), then align - * to a byte boundary. - */ - STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); - *out_next++ = (bfinal << bitcount) | bitbuf; - if (bitcount > 5) - *out_next++ = 0; - bitbuf = 0; - bitcount = 0; - /* Output LEN and NLEN, then the data itself. */ - put_unaligned_le16(len, out_next); - out_next += 2; - put_unaligned_le16(~len, out_next); - out_next += 2; - memcpy(out_next, in_next, len); - out_next += len; - in_next += len; - } while (in_next != in_end); - /* Done outputting uncompressed block(s) */ - goto out; } /* Output the literals and matches for a dynamic or static block. */ @@ -1995,13 +2024,12 @@ out: ASSERT(bitcount <= 7); /* - * Assert that the block cost was computed correctly, as + * Assert that the block cost was computed correctly. This is relied on + * above for the bounds check on the output buffer. Also, * libdeflate_deflate_compress_bound() relies on this via the assumption - * that uncompressed blocks will always be used when cheaper. + * that uncompressed blocks will always be used when cheapest. */ - ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == - 3 + best_cost || out_next == out_end); - + ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); os->bitbuf = bitbuf; os->bitcount = bitcount; os->next = out_next; @@ -2305,6 +2333,13 @@ size_t i; /* + * For very short inputs, the static Huffman code has a good chance of + * being best, in which case there is no reason to avoid short matches. + */ + if (data_len < 512) + return DEFLATE_MIN_MATCH_LEN; + + /* * For an initial approximation, scan the first 4 KiB of data. The * caller may use recalculate_min_match_len() to update min_len later. 
*/ @@ -2483,7 +2518,7 @@ deflate_finish_block(c, os, in_block_begin, in_next - in_block_begin, c->p.f.sequences, in_next == in_end); - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2562,7 +2597,7 @@ deflate_finish_block(c, os, in_block_begin, in_next - in_block_begin, c->p.g.sequences, in_next == in_end); - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } static forceinline void @@ -2768,7 +2803,7 @@ deflate_finish_block(c, os, in_block_begin, in_next - in_block_begin, c->p.g.sequences, in_next == in_end); - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -3389,6 +3424,7 @@ u32 best_true_cost = UINT32_MAX; u32 true_cost; u32 only_lits_cost; + u32 static_cost = UINT32_MAX; struct deflate_sequence seq_; struct deflate_sequence *seq = NULL; u32 i; @@ -3410,6 +3446,24 @@ ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; + /* + * Sometimes a static Huffman block ends up being cheapest, particularly + * if the block is small. So, if the block is sufficiently small, find + * the optimal static block solution and remember its cost. + */ + if (block_length <= c->p.n.max_len_to_optimize_static_block) { + /* Save c->p.n.costs temporarily. */ + c->p.n.costs_saved = c->p.n.costs; + + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; + static_cost += 7; /* for the end-of-block symbol */ + + /* Restore c->p.n.costs. */ + c->p.n.costs = c->p.n.costs_saved; + } + /* Initialize c->p.n.costs with default costs. */ deflate_set_initial_costs(c, block_begin, block_length, is_first_block); @@ -3437,7 +3491,9 @@ break; best_true_cost = true_cost; - c->p.n.costs_producing_best_true_cost = c->p.n.costs; + + /* Save the cost model that gave 'best_true_cost'. */ + c->p.n.costs_saved = c->p.n.costs; /* Update the cost model from the Huffman codes. */ deflate_set_costs_from_codes(c, &c->codes.lens); @@ -3445,20 +3501,26 @@ } while (--num_passes_remaining); *used_only_literals = false; - if (only_lits_cost < best_true_cost) { - /* Using only literals ended up being best! */ - deflate_choose_all_literals(c, block_begin, block_length); - deflate_set_costs_from_codes(c, &c->codes.lens); - seq_.litrunlen_and_length = block_length; - seq = &seq_; - *used_only_literals = true; + if (MIN(only_lits_cost, static_cost) < best_true_cost) { + if (only_lits_cost < static_cost) { + /* Using only literals ended up being best! */ + deflate_choose_all_literals(c, block_begin, block_length); + deflate_set_costs_from_codes(c, &c->codes.lens); + seq_.litrunlen_and_length = block_length; + seq = &seq_; + *used_only_literals = true; + } else { + /* Static block ended up being best! */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + } } else if (true_cost >= best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { /* * The best solution was actually from a non-final optimization * pass, so recover and use the min-cost path from that pass. 
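* (c->p.n.costs_saved still holds the cost model from that pass; it is * restored just below.)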
*/ - c->p.n.costs = c->p.n.costs_producing_best_true_cost; + c->p.n.costs = c->p.n.costs_saved; deflate_find_min_cost_path(c, block_length, cache_ptr); deflate_set_costs_from_codes(c, &c->codes.lens); } @@ -3782,7 +3844,7 @@ deflate_near_optimal_init_stats(c); in_block_begin = in_next; } - } while (in_next != in_end); + } while (in_next != in_end && !os->overflow); } /* Initialize c->p.n.offset_slot_full. */ @@ -3807,13 +3869,21 @@ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ LIBDEFLATEAPI struct libdeflate_compressor * -libdeflate_alloc_compressor(int compression_level) +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options) { struct libdeflate_compressor *c; size_t size = offsetof(struct libdeflate_compressor, p); check_buildtime_parameters(); + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + if (compression_level < 0 || compression_level > 12) return NULL; @@ -3829,9 +3899,14 @@ size += sizeof(c->p.f); } - c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); + c = libdeflate_aligned_malloc(options->malloc_func ? + options->malloc_func : + libdeflate_default_malloc_func, + MATCHFINDER_MEM_ALIGNMENT, size); if (!c) return NULL; + c->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; c->compression_level = compression_level; @@ -3902,6 +3977,7 @@ c->p.n.max_optim_passes = 2; c->p.n.min_improvement_to_continue = 32; c->p.n.min_bits_to_use_nonfinal_path = 32; + c->p.n.max_len_to_optimize_static_block = 0; deflate_init_offset_slot_full(c); break; case 11: @@ -3911,6 +3987,7 @@ c->p.n.max_optim_passes = 4; c->p.n.min_improvement_to_continue = 16; c->p.n.min_bits_to_use_nonfinal_path = 16; + c->p.n.max_len_to_optimize_static_block = 1000; deflate_init_offset_slot_full(c); break; case 12: @@ -3921,6 +3998,7 @@ c->p.n.max_optim_passes = 10; c->p.n.min_improvement_to_continue = 1; c->p.n.min_bits_to_use_nonfinal_path = 1; + c->p.n.max_len_to_optimize_static_block = 10000; deflate_init_offset_slot_full(c); break; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -3931,6 +4009,16 @@ return c; } + +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_compressor_ex(compression_level, &defaults); +} + LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *c, const void *in, size_t in_nbytes, @@ -3946,38 +4034,40 @@ return deflate_compress_none(in, in_nbytes, out, out_nbytes_avail); - /* - * Initialize the output bitstream structure. - * - * The end is set to OUTPUT_END_PADDING below the true end, so that - * FLUSH_BITS() can be more efficient. - */ - if (unlikely(out_nbytes_avail <= OUTPUT_END_PADDING)) - return 0; + /* Initialize the output bitstream structure. */ os.bitbuf = 0; os.bitcount = 0; os.next = out; - os.end = os.next + out_nbytes_avail - OUTPUT_END_PADDING; + os.end = os.next + out_nbytes_avail; + os.overflow = false; + + /* Call the actual compression function. */ (*c->impl)(c, in, in_nbytes, &os); + + /* Return 0 if the output buffer is too small. 
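+ * (deflate_flush_block() set os.overflow upon detecting that the next + * block couldn't fit in the remaining space, before writing any of it.)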
*/ + if (os.overflow) + return 0; + /* - * If 'os.next' reached 'os.end', then either there was not enough space - * in the output buffer, or the compressed size would have been within - * OUTPUT_END_PADDING of the true end. For performance reasons we don't - * distinguish between these cases; we just make sure to return some - * extra space from libdeflate_deflate_compress_bound(). + * Write the final byte if needed. This can't overflow the output + * buffer because deflate_flush_block() would have set the overflow flag + * if there wasn't enough space remaining for the full final block. */ - if (os.next >= os.end) - return 0; ASSERT(os.bitcount <= 7); - if (os.bitcount) + if (os.bitcount) { + ASSERT(os.next < os.end); *os.next++ = os.bitbuf; + } + + /* Return the compressed size in bytes. */ return os.next - (u8 *)out; } LIBDEFLATEAPI void libdeflate_free_compressor(struct libdeflate_compressor *c) { - libdeflate_aligned_free(c); + if (c) + libdeflate_aligned_free(c->free_func, c); } unsigned int @@ -3990,7 +4080,6 @@ libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, size_t in_nbytes) { - size_t bound = 0; size_t max_blocks; /* @@ -4007,10 +4096,12 @@ */ /* - * The minimum length that is passed to deflate_flush_block() is - * MIN_BLOCK_LENGTH bytes, except for the final block if needed. + * Calculate the maximum number of uncompressed blocks that the + * compressor can use for 'in_nbytes' of data. * - * If deflate_flush_block() decides to use an uncompressed block, it + * The minimum length that is passed to deflate_flush_block() is + * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If + * deflate_flush_block() decides to use an uncompressed block, it * actually will (in general) output a series of uncompressed blocks in * order to stay within the UINT16_MAX limit of DEFLATE. But this can * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', @@ -4029,20 +4120,8 @@ * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the * alignment bits at the very start of the block can be disregarded; * they would otherwise increase the overhead to 6 bytes per block.) + * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. + * To get the final bound, add the number of uncompressed bytes. */ - bound += 5 * max_blocks; - - /* Account for the data itself, stored uncompressed. */ - bound += in_nbytes; - - /* - * Add 1 + OUTPUT_END_PADDING because for performance reasons, the - * compressor doesn't distinguish between cases where there wasn't - * enough space and cases where the compressed size would have been - * 'out_nbytes_avail - OUTPUT_END_PADDING' or greater. Adding - * 1 + OUTPUT_END_PADDING to the bound ensures the needed wiggle room. - */ - bound += 1 + OUTPUT_END_PADDING; - - return bound; + return (5 * max_blocks) + in_nbytes; } diff -Nru libdeflate-1.18/lib/deflate_decompress.c libdeflate-1.19/lib/deflate_decompress.c --- libdeflate-1.18/lib/deflate_decompress.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/deflate_decompress.c 2023-09-16 23:48:03.000000000 +0000 @@ -671,6 +671,9 @@ bool static_codes_loaded; unsigned litlen_tablebits; + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; }; /* @@ -802,38 +805,48 @@ u32 entry; unsigned i; + /* + * The DEFLATE RFC explicitly allows the offset code to be + * incomplete in two cases: a code containing just 1 codeword, + * if that codeword has length 1; and a code containing no + * codewords. 
Note: the list of offset codeword lengths is + * always nonempty, but lengths of 0 don't count as codewords. + * + * The RFC doesn't say whether the same cases are allowed for + * the litlen and pre codes. It's actually impossible for no + * symbols to be used from these codes; however, it's + * technically possible for only one symbol to be used. zlib + * allows 1 codeword for the litlen code, but not the precode. + * The RFC also doesn't say whether, when there is 1 codeword, + * that codeword is '0' or '1'. zlib uses '0'. + * + * We accept what zlib accepts, plus a bit more. First, we + * don't treat the precode more strictly than the litlen and + * offset codes. There's no convincing reason to add a special + * case for the precode here. + * + * Second, we just map each allowed incomplete code to a complete + * code with only real symbols. To do this, we choose a symbol, + * either the used symbol (for codes with 1 codeword) or an + * arbitrary symbol (for empty codes), and give it both + * codewords '0' and '1'. zlib instead uses a special ERROR + * symbol in the part of the codespace the code doesn't use. + * However, having an ERROR symbol reduces the performance of + * the Huffman decoder, for no real benefit. Our approach also + * avoids having to decide whether '0' or '1' is correct. + * + * Like zlib, we still reject all incomplete codes that contain + * more than 1 codeword or a codeword length greater than 1. + */ if (codespace_used == 0) { - /* - * An empty code is allowed. This can happen for the - * offset code in DEFLATE, since a dynamic Huffman block - * need not contain any matches. - */ - - /* sym=0, len=1 (arbitrary) */ - entry = make_decode_table_entry(decode_results, 0, 1); + sym = 0; /* arbitrary */ } else { - /* - * Allow codes with a single used symbol, with codeword - * length 1. The DEFLATE RFC is unclear regarding this - * case. What zlib's decompressor does is permit this - * for the litlen and offset codes and assume the - * codeword is '0' rather than '1'. We do the same - * except we allow this for precodes too, since there's - * no convincing reason to treat the codes differently. - * We also assign both codewords '0' and '1' to the - * symbol to avoid having to handle '1' specially. - */ if (codespace_used != (1U << (max_codeword_len - 1)) || len_counts[1] != 1) return false; - entry = make_decode_table_entry(decode_results, - *sorted_syms, 1); + sym = sorted_syms[0]; } - /* - * Note: the decode table still must be fully initialized, in - * case the stream is malformed and contains bits from the part - * of the codespace the incomplete code doesn't use. - */ + entry = make_decode_table_entry(decode_results, sym, 1); for (i = 0; i < (1U << table_bits); i++) decode_table[i] = entry; return true; @@ -1140,8 +1153,21 @@ } LIBDEFLATEAPI struct libdeflate_decompressor * -libdeflate_alloc_decompressor(void) +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) { + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = (options->malloc_func ?
options->malloc_func : + libdeflate_default_malloc_func)(sizeof(*d)); + if (d == NULL) + return NULL; /* * Note that only certain parts of the decompressor actually must be * initialized here: @@ -1155,18 +1181,28 @@ * valgrind, since build_decode_table() is guaranteed to initialize * all entries eventually anyway.) * + * - 'free_func' must be set. + * * But for simplicity, we currently just zero the whole decompressor. */ - struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d)); - - if (d == NULL) - return NULL; memset(d, 0, sizeof(*d)); + d->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; return d; } +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); +} + LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *d) { - libdeflate_free(d); + if (d) + d->free_func(d); } diff -Nru libdeflate-1.18/lib/lib_common.h libdeflate-1.19/lib/lib_common.h --- libdeflate-1.18/lib/lib_common.h 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/lib_common.h 2023-09-16 23:48:03.000000000 +0000 @@ -39,11 +39,15 @@ #include "../common_defs.h" -void *libdeflate_malloc(size_t size); -void libdeflate_free(void *ptr); +typedef void *(*malloc_func_t)(size_t); +typedef void (*free_func_t)(void *); -void *libdeflate_aligned_malloc(size_t alignment, size_t size); -void libdeflate_aligned_free(void *ptr); +extern malloc_func_t libdeflate_default_malloc_func; +extern free_func_t libdeflate_default_free_func; + +void *libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size); +void libdeflate_aligned_free(free_func_t free_func, void *ptr); #ifdef FREESTANDING /* diff -Nru libdeflate-1.18/lib/utils.c libdeflate-1.19/lib/utils.c --- libdeflate-1.18/lib/utils.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/utils.c 2023-09-16 23:48:03.000000000 +0000 @@ -34,27 +34,18 @@ # include #endif -static void *(*libdeflate_malloc_func)(size_t) = malloc; -static void (*libdeflate_free_func)(void *) = free; +malloc_func_t libdeflate_default_malloc_func = malloc; +free_func_t libdeflate_default_free_func = free; void * -libdeflate_malloc(size_t size) +libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size) { - return (*libdeflate_malloc_func)(size); -} - -void -libdeflate_free(void *ptr) -{ - (*libdeflate_free_func)(ptr); -} + void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); -void * -libdeflate_aligned_malloc(size_t alignment, size_t size) -{ - void *ptr = libdeflate_malloc(sizeof(void *) + alignment - 1 + size); if (ptr) { void *orig_ptr = ptr; + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); ((void **)ptr)[-1] = orig_ptr; } @@ -62,18 +53,17 @@ } void -libdeflate_aligned_free(void *ptr) +libdeflate_aligned_free(free_func_t free_func, void *ptr) { - if (ptr) - libdeflate_free(((void **)ptr)[-1]); + (*free_func)(((void **)ptr)[-1]); } LIBDEFLATEAPI void -libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), - void (*free_func)(void *)) +libdeflate_set_memory_allocator(malloc_func_t malloc_func, + free_func_t free_func) { - libdeflate_malloc_func = malloc_func; - libdeflate_free_func = free_func; + libdeflate_default_malloc_func = malloc_func; + libdeflate_default_free_func = free_func; } /* diff -Nru libdeflate-1.18/lib/x86/cpu_features.c 
libdeflate-1.19/lib/x86/cpu_features.c --- libdeflate-1.18/lib/x86/cpu_features.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/x86/cpu_features.c 2023-09-16 23:48:03.000000000 +0000 @@ -30,15 +30,17 @@ #if HAVE_DYNAMIC_X86_CPU_FEATURES -/* With old GCC versions we have to manually save and restore the x86_32 PIC - * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */ +/* + * With old GCC versions we have to manually save and restore the x86_32 PIC + * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 + */ #if defined(ARCH_X86_32) && defined(__PIC__) # define EBX_CONSTRAINT "=&r" #else # define EBX_CONSTRAINT "=b" #endif -/* Execute the CPUID instruction. */ +/* Execute the CPUID instruction. */ static inline void cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) { @@ -51,40 +53,37 @@ *c = result[2]; *d = result[3]; #else - __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" - "cpuid \n" - ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" - : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) - : "a" (leaf), "c" (subleaf)); + __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); #endif } -/* Read an extended control register. */ +/* Read an extended control register. */ static inline u64 read_xcr(u32 index) { #ifdef _MSC_VER return _xgetbv(index); #else - u32 edx, eax; + u32 d, a; - /* Execute the "xgetbv" instruction. Old versions of binutils do not - * recognize this instruction, so list the raw bytes instead. */ - __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index)); + /* + * Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. + * + * This must be 'volatile' to prevent this code from being moved out + * from under the check for OSXSAVE. + */ + __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : + "=d" (d), "=a" (a) : "c" (index)); - return ((u64)edx << 32) | eax; + return ((u64)d << 32) | a; #endif } -#undef BIT -#define BIT(nr) (1UL << (nr)) - -#define XCR0_BIT_SSE BIT(1) -#define XCR0_BIT_AVX BIT(2) - -#define IS_SET(reg, nr) ((reg) & BIT(nr)) -#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask)) - static const struct cpu_feature x86_cpu_feature_table[] = { {X86_CPU_FEATURE_SSE2, "sse2"}, {X86_CPU_FEATURE_PCLMUL, "pclmul"}, @@ -98,47 +97,34 @@ /* Initialize libdeflate_x86_cpu_features. */ void libdeflate_init_x86_cpu_features(void) { + u32 max_leaf, a, b, c, d; + u64 xcr0 = 0; u32 features = 0; - u32 dummy1, dummy2, dummy3, dummy4; - u32 max_function; - u32 features_1, features_2, features_3, features_4; - bool os_avx_support = false; - - /* Get maximum supported function */ - cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4); - if (max_function < 1) - goto out; - /* Standard feature flags */ - cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1); + /* EAX=0: Highest Function Parameter and Manufacturer ID */ + cpuid(0, 0, &max_leaf, &b, &c, &d); + if (max_leaf < 1) + goto out; - if (IS_SET(features_1, 26)) + /* EAX=1: Processor Info and Feature Bits */ + cpuid(1, 0, &a, &b, &c, &d); + if (d & (1 << 26)) features |= X86_CPU_FEATURE_SSE2; - - if (IS_SET(features_2, 1)) + if (c & (1 << 1)) features |= X86_CPU_FEATURE_PCLMUL; - - if (IS_SET(features_2, 27)) { /* OSXSAVE set? 
*/ - u64 xcr0 = read_xcr(0); - - os_avx_support = IS_ALL_SET(xcr0, - XCR0_BIT_SSE | - XCR0_BIT_AVX); - } - - if (os_avx_support && IS_SET(features_2, 28)) + if (c & (1 << 27)) + xcr0 = read_xcr(0); + if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) features |= X86_CPU_FEATURE_AVX; - if (max_function < 7) + if (max_leaf < 7) goto out; - /* Extended feature flags */ - cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4); - - if (os_avx_support && IS_SET(features_3, 5)) + /* EAX=7, ECX=0: Extended Features */ + cpuid(7, 0, &a, &b, &c, &d); + if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6)) features |= X86_CPU_FEATURE_AVX2; - - if (IS_SET(features_3, 8)) + if (b & (1 << 8)) features |= X86_CPU_FEATURE_BMI2; out: diff -Nru libdeflate-1.18/lib/x86/cpu_features.h libdeflate-1.19/lib/x86/cpu_features.h --- libdeflate-1.18/lib/x86/cpu_features.h 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/lib/x86/cpu_features.h 2023-09-16 23:48:03.000000000 +0000 @@ -145,6 +145,16 @@ #else # define HAVE_BMI2_INTRIN 0 #endif +/* + * MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() + * intrinsics. It seems to be fixed in VS2022. + */ +#if defined(_MSC_VER) && _MSC_VER < 1930 /* older than VS2022 (toolset v143) */ +# undef HAVE_BMI2_NATIVE +# undef HAVE_BMI2_INTRIN +# define HAVE_BMI2_NATIVE 0 +# define HAVE_BMI2_INTRIN 0 +#endif #endif /* ARCH_X86_32 || ARCH_X86_64 */ diff -Nru libdeflate-1.18/libdeflate.h libdeflate-1.19/libdeflate.h --- libdeflate-1.18/libdeflate.h 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/libdeflate.h 2023-09-16 23:48:03.000000000 +0000 @@ -13,8 +13,8 @@ #endif #define LIBDEFLATE_VERSION_MAJOR 1 -#define LIBDEFLATE_VERSION_MINOR 18 -#define LIBDEFLATE_VERSION_STRING "1.18" +#define LIBDEFLATE_VERSION_MINOR 19 +#define LIBDEFLATE_VERSION_STRING "1.19" /* * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause @@ -35,6 +35,7 @@ /* ========================================================================== */ struct libdeflate_compressor; +struct libdeflate_options; /* * libdeflate_alloc_compressor() allocates a new compressor that supports @@ -58,11 +59,18 @@ libdeflate_alloc_compressor(int compression_level); /* + * Like libdeflate_alloc_compressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options); + +/* * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of * data. It attempts to compress 'in_nbytes' bytes of data located at 'in' and * write the result to 'out', which has space for 'out_nbytes_avail' bytes. The * return value is the compressed size in bytes, or 0 if the data could not be - * compressed to 'out_nbytes_avail' bytes or fewer (but see note below). + * compressed to 'out_nbytes_avail' bytes or fewer. * * If compression is successful, then the output data is guaranteed to be a * valid DEFLATE stream that decompresses to the input data. No other @@ -72,22 +80,6 @@ * writing tests that compare compressed data to a golden output, as this can * break when libdeflate is updated. (This property isn't specific to * libdeflate; the same is true for zlib and other compression libraries too.) - * - * Note: due to a performance optimization, libdeflate_deflate_compress() - * currently needs a small amount of slack space at the end of the output - * buffer. As a result, it can't actually report compressed sizes very close to - * 'out_nbytes_avail'. 
This doesn't matter in real-world use cases, and - * libdeflate_deflate_compress_bound() already includes the slack space. - * However, it does mean that testing code that redundantly compresses data - * using an exact-sized output buffer won't work as might be expected: - * - * out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes, out, - * libdeflate_deflate_compress_bound(in_nbytes)); - * // The following assertion will fail. - * assert(libdeflate_deflate_compress(c, in, in_nbytes, out, out_nbytes) != 0); - * - * To avoid this, either don't write tests like the above, or make sure to - * include at least 9 bytes of slack space in 'out_nbytes_avail'. */ LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *compressor, @@ -171,6 +163,7 @@ /* ========================================================================== */ struct libdeflate_decompressor; +struct libdeflate_options; /* * libdeflate_alloc_decompressor() allocates a new decompressor that can be used @@ -188,6 +181,12 @@ libdeflate_alloc_decompressor(void); /* + * Like libdeflate_alloc_decompressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options); + +/* * Result of a call to libdeflate_deflate_decompress(), * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). */ @@ -351,16 +350,60 @@ /* * Install a custom memory allocator which libdeflate will use for all memory - * allocations. 'malloc_func' is a function that must behave like malloc(), and - * 'free_func' is a function that must behave like free(). + * allocations by default. 'malloc_func' is a function that must behave like + * malloc(), and 'free_func' is a function that must behave like free(). + * + * The per-(de)compressor custom memory allocator that can be specified in + * 'struct libdeflate_options' takes priority over this. * - * There must not be any libdeflate_compressor or libdeflate_decompressor - * structures in existence when calling this function. + * This doesn't affect the free() function that will be used to free + * (de)compressors that were already in existence when this is called. */ LIBDEFLATEAPI void libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), void (*free_func)(void *)); +/* + * Advanced options. This is the options structure that + * libdeflate_alloc_compressor_ex() and libdeflate_alloc_decompressor_ex() + * require. Most users won't need this and should just use the non-"_ex" + * functions instead. If you do need this, it should be initialized like this: + * + * struct libdeflate_options options; + * + * memset(&options, 0, sizeof(options)); + * options.sizeof_options = sizeof(options); + * // Then set the fields that you need to override the defaults for. + */ +struct libdeflate_options { + + /* + * This field must be set to the struct size. This field exists for + * extensibility, so that fields can be appended to this struct in + * future versions of libdeflate while still supporting old binaries. + */ + size_t sizeof_options; + + /* + * An optional custom memory allocator to use for this (de)compressor. + * 'malloc_func' must be a function that behaves like malloc(), and + * 'free_func' must be a function that behaves like free(). + * + * This is useful in cases where a process might have multiple users of + * libdeflate who want to use different memory allocators. 
For example, + * a library might want to use libdeflate with a custom memory allocator + * without interfering with user code that might use libdeflate too. + * + * This takes priority over the "global" memory allocator (which by + * default is malloc() and free(), but can be changed by + * libdeflate_set_memory_allocator()). Moreover, libdeflate will never + * call the "global" memory allocator if a per-(de)compressor custom + * allocator is always given. + */ + void *(*malloc_func)(size_t); + void (*free_func)(void *); +}; + #ifdef __cplusplus } #endif diff -Nru libdeflate-1.18/programs/test_custom_malloc.c libdeflate-1.19/programs/test_custom_malloc.c --- libdeflate-1.18/programs/test_custom_malloc.c 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/programs/test_custom_malloc.c 2023-09-16 23:48:03.000000000 +0000 @@ -1,7 +1,7 @@ /* * test_custom_malloc.c * - * Test libdeflate_set_memory_allocator(). + * Test the support for custom memory allocators. * Also test injecting allocation failures. */ @@ -28,24 +28,34 @@ free(ptr); } -int -tmain(int argc, tchar *argv[]) +static void reset_state(void) +{ + libdeflate_set_memory_allocator(malloc, free); + malloc_count = 0; + free_count = 0; +} + +/* Test that the custom allocator is actually used when requested. */ +static void do_custom_memalloc_test(bool global) { + static const struct libdeflate_options options = { + .sizeof_options = sizeof(options), + .malloc_func = do_malloc, + .free_func = do_free, + }; int level; struct libdeflate_compressor *c; struct libdeflate_decompressor *d; - begin_program(argv); - - /* Test that the custom allocator is actually used when requested. */ - - libdeflate_set_memory_allocator(do_malloc, do_free); - ASSERT(malloc_count == 0); - ASSERT(free_count == 0); + if (global) + libdeflate_set_memory_allocator(do_malloc, do_free); for (level = 0; level <= 12; level++) { malloc_count = free_count = 0; - c = libdeflate_alloc_compressor(level); + if (global) + c = libdeflate_alloc_compressor(level); + else + c = libdeflate_alloc_compressor_ex(level, &options); ASSERT(c != NULL); ASSERT(malloc_count == 1); ASSERT(free_count == 0); @@ -55,7 +65,10 @@ } malloc_count = free_count = 0; - d = libdeflate_alloc_decompressor(); + if (global) + d = libdeflate_alloc_decompressor(); + else + d = libdeflate_alloc_decompressor_ex(&options); ASSERT(d != NULL); ASSERT(malloc_count == 1); ASSERT(free_count == 0); @@ -63,7 +76,52 @@ ASSERT(malloc_count == 1); ASSERT(free_count == 1); - /* As long as we're here, also test injecting allocation failures. */ + reset_state(); +} + +#define offsetofend(type, field) \ + (offsetof(type, field) + sizeof(((type *)NULL)->field)) + +/* Test some edge cases involving libdeflate_options. */ +static void do_options_test(void) +{ + struct libdeflate_options options = { 0 }; + struct libdeflate_compressor *c; + struct libdeflate_decompressor *d; + /* Size in libdeflate v1.19 */ + size_t min_size = offsetofend(struct libdeflate_options, free_func); + + /* sizeof_options must be at least the minimum size. */ + for (; options.sizeof_options < min_size; + options.sizeof_options++) { + c = libdeflate_alloc_compressor_ex(6, &options); + ASSERT(c == NULL); + d = libdeflate_alloc_decompressor_ex(&options); + ASSERT(d == NULL); + } + + /* NULL malloc_func and free_func means "use the global allocator". 
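+ * (That is, whichever allocator pair was last installed with + * libdeflate_set_memory_allocator() -- here, do_malloc() and do_free().)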
*/ + options.sizeof_options = min_size; + malloc_count = free_count = 0; + libdeflate_set_memory_allocator(do_malloc, do_free); + c = libdeflate_alloc_compressor_ex(6, &options); + libdeflate_free_compressor(c); + ASSERT(malloc_count == 1); + ASSERT(free_count == 1); + d = libdeflate_alloc_decompressor_ex(&options); + libdeflate_free_decompressor(d); + ASSERT(malloc_count == 2); + ASSERT(free_count == 2); + + reset_state(); +} + +/* Test injecting memory allocation failures. */ +static void do_fault_injection_test(void) +{ + int level; + struct libdeflate_compressor *c; + struct libdeflate_decompressor *d; libdeflate_set_memory_allocator(do_fail_malloc, do_free); @@ -81,5 +139,17 @@ ASSERT(malloc_count == 1); ASSERT(free_count == 0); + reset_state(); +} + +int +tmain(int argc, tchar *argv[]) +{ + begin_program(argv); + + do_custom_memalloc_test(true); + do_custom_memalloc_test(false); + do_options_test(); + do_fault_injection_test(); return 0; } diff -Nru libdeflate-1.18/scripts/gen_offset_slot_map.py libdeflate-1.19/scripts/gen_offset_slot_map.py --- libdeflate-1.18/scripts/gen_offset_slot_map.py 2023-03-24 02:41:29.000000000 +0000 +++ libdeflate-1.19/scripts/gen_offset_slot_map.py 2023-09-16 23:48:03.000000000 +0000 @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# This script generates the deflate_offset_slot[] array, which is a condensed -# map from offsets to offset slots. +# This script generates the deflate_offset_slot[] array, which maps +# 'offset - 1 => offset_slot' for offset <= 256. DEFLATE_OFFSET_SLOT_BASE = [ 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , @@ -10,26 +10,14 @@ 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , ] -DEFLATE_EXTRA_OFFSET_BITS = [ - 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , - 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , - 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , - 11 , 11 , 12 , 12 , 13 , 13 , -] - -offset_slot_map = [0] * 512 - -for offset_slot, offset_base in enumerate(DEFLATE_OFFSET_SLOT_BASE): - num_extra_bits = DEFLATE_EXTRA_OFFSET_BITS[offset_slot] - offset_end = offset_base + (1 << num_extra_bits) - if offset_base <= 256: - for offset in range(offset_base, offset_end): - offset_slot_map[offset] = offset_slot - else: - for offset in range(offset_base, offset_end, 128): - offset_slot_map[256 + ((offset - 1) >> 7)] = offset_slot +offset_slot_map = [0] * 256 +offset_slot = -1 +for offset in range(1, len(offset_slot_map) + 1): + if offset >= DEFLATE_OFFSET_SLOT_BASE[offset_slot + 1]: + offset_slot += 1 + offset_slot_map[offset - 1] = offset_slot -print('static const u8 deflate_offset_slot_map[512] = {') +print(f'static const u8 deflate_offset_slot[{len(offset_slot_map)}] = {{') for i in range(0, len(offset_slot_map), 16): print('\t', end='') for j, v in enumerate(offset_slot_map[i:i+16]):
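To illustrate the new allocator API whose declarations appear in the libdeflate.h hunk above, here is a minimal usage sketch; the wrapper functions my_malloc() and my_free() are hypothetical stand-ins for whatever malloc()/free()-like pair a caller supplies:

#include <stdlib.h>
#include <libdeflate.h>

/* Hypothetical allocator wrappers; any malloc()/free()-like pair works. */
static void *my_malloc(size_t size) { return malloc(size); }
static void my_free(void *ptr) { free(ptr); }

int main(void)
{
	/* Per libdeflate.h: set sizeof_options; unset fields stay zero/NULL. */
	struct libdeflate_options options = {
		.sizeof_options = sizeof(options),
		.malloc_func = my_malloc, /* NULL would mean "use the global allocator" */
		.free_func = my_free,
	};
	struct libdeflate_compressor *c =
		libdeflate_alloc_compressor_ex(6, &options);
	struct libdeflate_decompressor *d =
		libdeflate_alloc_decompressor_ex(&options);
	int ok = (c != NULL && d != NULL);

	/* ... compress/decompress as usual ... */

	/* Both free functions accept NULL and release memory via my_free(). */
	libdeflate_free_compressor(c);
	libdeflate_free_decompressor(d);
	return ok ? 0 : 1;
}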