Sync decompression code from wimlib
parent 3ddd227ee8
commit 5c337bc502

Makefile.am
@@ -7,15 +7,21 @@ plugindir = $(libdir)/ntfs-3g
 plugin_LTLIBRARIES = ntfs-plugin-80000017.la

 ntfs_plugin_80000017_la_SOURCES = \
+    src/aligned_malloc.c \
+    src/common_defs.h \
     src/decompress_common.c \
     src/decompress_common.h \
+    src/lzx_common.c \
+    src/lzx_common.h \
+    src/lzx_constants.h \
     src/lzx_decompress.c \
     src/plugin.c \
     src/system_compression.c \
     src/system_compression.h \
+    src/xpress_constants.h \
     src/xpress_decompress.c

 ntfs_plugin_80000017_la_LDFLAGS = -module -shared -avoid-version
 ntfs_plugin_80000017_la_CPPFLAGS = -D_FILE_OFFSET_BITS=64
-ntfs_plugin_80000017_la_CFLAGS = $(LIBNTFS_3G_CFLAGS)
+ntfs_plugin_80000017_la_CFLAGS = $(LIBNTFS_3G_CFLAGS) -std=gnu99
 ntfs_plugin_80000017_la_LIBADD = $(LIBNTFS_3G_LIBS)

README.md
@@ -34,7 +34,22 @@ directory (`$libdir`). An example full path to the installed plugin is
 platforms. `make install` will create the plugin directory if it does not
 already exist.

-# License
+# Implementation note
+
+The XPRESS and LZX decompression formats used in system-compressed files are
+identical to the formats used in Windows Imaging (WIM) archives. Therefore, for
+the system compression plugin I borrowed the XPRESS and LZX decompressors I had
+already written for the wimlib project (https://wimlib.net/). I made some
+slight modifications for integration purposes. The code in wimlib is currently
+licensed LGPLv3+, but I have relicensed the version in this plugin to GPLv2+ for
+consistency with NTFS-3G's license. (Public domain portions remain public
+domain.)
+
+# Notices
+
+The NTFS-3G system compression plugin was written by Eric Biggers, with
+contributions from Jean-Pierre André. You can contact the author at
+ebiggers3@gmail.com.

 This software may be redistributed and/or modified under the terms of the GNU
 General Public License as published by the Free Software Foundation, either

configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ntfs-3g-system-compression], [0.1], [ebiggers3@gmail.com])
+AC_INIT([ntfs-3g-system-compression], [0.2], [ebiggers3@gmail.com])

 AC_CONFIG_SRCDIR([src/plugin.c])
 AC_CONFIG_MACRO_DIR([m4])

src/aligned_malloc.c (new file)
@@ -0,0 +1,34 @@
+/*
+ * aligned_malloc.c - aligned memory allocation
+ *
+ * This file provides portable aligned memory allocation functions that only use
+ * malloc() and free(). This avoids portability problems with posix_memalign(),
+ * aligned_alloc(), etc.
+ */
+
+#include <stdlib.h>
+
+#include "common_defs.h"
+
+void *
+aligned_malloc(size_t size, size_t alignment)
+{
+    const uintptr_t mask = alignment - 1;
+    char *ptr = NULL;
+    char *raw_ptr;
+
+    raw_ptr = malloc(mask + sizeof(size_t) + size);
+    if (raw_ptr) {
+        ptr = (char *)raw_ptr + sizeof(size_t);
+        ptr = (void *)(((uintptr_t)ptr + mask) & ~mask);
+        *((size_t *)ptr - 1) = ptr - raw_ptr;
+    }
+    return ptr;
+}
+
+void
+aligned_free(void *ptr)
+{
+    if (ptr)
+        free((char *)ptr - *((size_t *)ptr - 1));
+}
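
Aside: the allocator above works by over-allocating by `alignment - 1 +
sizeof(size_t)` bytes, rounding the pointer up to the requested power-of-two
boundary, and stashing the offset back to the raw malloc() block just before
the returned pointer, where aligned_free() can find it. A minimal usage sketch
(illustrative only, not code from this commit; it assumes the declarations in
src/common_defs.h):

    #include <assert.h>
    #include <stdint.h>
    #include "common_defs.h"

    int main(void)
    {
        /* Request a 16-byte-aligned buffer, e.g. for a decode table. */
        void *p = aligned_malloc(1000, 16);
        assert(p != NULL && ((uintptr_t)p & 15) == 0);
        aligned_free(p);  /* recovers and frees the raw malloc() block */
        return 0;
    }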

src/common_defs.h (new file)
@@ -0,0 +1,290 @@
+#ifndef _COMMON_DEFS_H
+#define _COMMON_DEFS_H
+
+#include <ntfs-3g/endians.h>
+#include <ntfs-3g/types.h>
+
+/* ========================================================================== */
+/*                              Type definitions                              */
+/* ========================================================================== */
+
+/*
+ * Type of a machine word. 'unsigned long' would be logical, but that is only
+ * 32 bits on x86_64 Windows. The same applies to 'uint_fast32_t'. So the best
+ * we can do without a bunch of #ifdefs appears to be 'size_t'.
+ */
+typedef size_t machine_word_t;
+
+#define WORDBYTES sizeof(machine_word_t)
+#define WORDBITS  (8 * WORDBYTES)
+
+/* ========================================================================== */
+/*                        Compiler-specific definitions                       */
+/* ========================================================================== */
+
+#ifdef __GNUC__ /* GCC, or GCC-compatible compiler such as clang */
+# define forceinline            inline __attribute__((always_inline))
+# define likely(expr)           __builtin_expect(!!(expr), 1)
+# define unlikely(expr)         __builtin_expect(!!(expr), 0)
+# define _aligned_attribute(n)  __attribute__((aligned(n)))
+# define bsr32(n)               (31 - __builtin_clz(n))
+# define bsr64(n)               (63 - __builtin_clzll(n))
+# define bsf32(n)               __builtin_ctz(n)
+# define bsf64(n)               __builtin_ctzll(n)
+# ifndef min
+#  define min(a, b)  ({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); \
+                        (_a < _b) ? _a : _b; })
+# endif
+# ifndef max
+#  define max(a, b)  ({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); \
+                        (_a > _b) ? _a : _b; })
+# endif
+
+# define DEFINE_UNALIGNED_TYPE(type)                            \
+    struct type##_unaligned {                                   \
+        type v;                                                 \
+    } __attribute__((packed));                                  \
+                                                                \
+    static inline type                                          \
+    load_##type##_unaligned(const void *p)                      \
+    {                                                           \
+        return ((const struct type##_unaligned *)p)->v;         \
+    }                                                           \
+                                                                \
+    static inline void                                          \
+    store_##type##_unaligned(type val, void *p)                 \
+    {                                                           \
+        ((struct type##_unaligned *)p)->v = val;                \
+    }

+#endif /* __GNUC__ */
+
+/* Declare that the annotated function should always be inlined. This might be
+ * desirable in highly tuned code, e.g. compression codecs */
+#ifndef forceinline
+# define forceinline inline
+#endif
+
+/* Hint that the expression is usually true */
+#ifndef likely
+# define likely(expr) (expr)
+#endif
+
+/* Hint that the expression is usually false */
+#ifndef unlikely
+# define unlikely(expr) (expr)
+#endif
+
+/* Declare that the annotated variable, or variables of the annotated type, are
+ * to be aligned on n-byte boundaries */
+#ifndef _aligned_attribute
+# define _aligned_attribute(n)
+#endif
+
+/* min() and max() macros */
+#ifndef min
+# define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef max
+# define max(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+/* STATIC_ASSERT() - verify the truth of an expression at compilation time */
+#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)]))
+
+/* STATIC_ASSERT_ZERO() - verify the truth of an expression at compilation time
+ * and also produce a result of value '0' to be used in constant expressions */
+#define STATIC_ASSERT_ZERO(expr) ((int)sizeof(char[-!(expr)]))
+
+/* UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses
+ * can be performed efficiently on the target platform. */
+#if defined(__x86_64__) || defined(__i386__) || defined(__ARM_FEATURE_UNALIGNED)
+# define UNALIGNED_ACCESS_IS_FAST 1
+#else
+# define UNALIGNED_ACCESS_IS_FAST 0
+#endif
+
+/*
+ * DEFINE_UNALIGNED_TYPE(type) - a macro that, given an integer type 'type',
+ * defines load_type_unaligned(addr) and store_type_unaligned(v, addr) functions
+ * which load and store variables of type 'type' from/to unaligned memory
+ * addresses.
+ */
+#ifndef DEFINE_UNALIGNED_TYPE
+
+#include <string.h>
+/*
+ * Although memcpy() may seem inefficient, it *usually* gets optimized
+ * appropriately by modern compilers. It's portable and may be the best we can
+ * do for a fallback...
+ */
+#define DEFINE_UNALIGNED_TYPE(type)                             \
+                                                                \
+    static forceinline type                                     \
+    load_##type##_unaligned(const void *p)                      \
+    {                                                           \
+        type v;                                                 \
+        memcpy(&v, p, sizeof(v));                               \
+        return v;                                               \
+    }                                                           \
+                                                                \
+    static forceinline void                                     \
+    store_##type##_unaligned(type v, void *p)                   \
+    {                                                           \
+        memcpy(p, &v, sizeof(v));                               \
+    }
+
+#endif /* !DEFINE_UNALIGNED_TYPE */
+
+/* ========================================================================== */
+/*                          Unaligned memory accesses                         */
+/* ========================================================================== */
+
+DEFINE_UNALIGNED_TYPE(le16);
+DEFINE_UNALIGNED_TYPE(le32);
+DEFINE_UNALIGNED_TYPE(machine_word_t);
+
+#define load_word_unaligned  load_machine_word_t_unaligned
+#define store_word_unaligned store_machine_word_t_unaligned
+
+static inline u16
+get_unaligned_le16(const u8 *p)
+{
+    if (UNALIGNED_ACCESS_IS_FAST)
+        return le16_to_cpu(load_le16_unaligned(p));
+    else
+        return ((u16)p[1] << 8) | p[0];
+}
+
+static inline u32
+get_unaligned_le32(const u8 *p)
+{
+    if (UNALIGNED_ACCESS_IS_FAST)
+        return le32_to_cpu(load_le32_unaligned(p));
+    else
+        return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
+               ((u32)p[1] << 8) | p[0];
+}
+
+static inline void
+put_unaligned_le16(u16 v, u8 *p)
+{
+    if (UNALIGNED_ACCESS_IS_FAST) {
+        store_le16_unaligned(cpu_to_le16(v), p);
+    } else {
+        p[0] = (u8)(v >> 0);
+        p[1] = (u8)(v >> 8);
+    }
+}
+
+static inline void
+put_unaligned_le32(u32 v, u8 *p)
+{
+    if (UNALIGNED_ACCESS_IS_FAST) {
+        store_le32_unaligned(cpu_to_le32(v), p);
+    } else {
+        p[0] = (u8)(v >> 0);
+        p[1] = (u8)(v >> 8);
+        p[2] = (u8)(v >> 16);
+        p[3] = (u8)(v >> 24);
+    }
+}
+
+/* ========================================================================== */
+/*                             Bit scan functions                             */
+/* ========================================================================== */
+
+/*
+ * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
+ * significant end) of the *most* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+#ifndef bsr32
+static forceinline unsigned
+bsr32(u32 v)
+{
+    unsigned bit = 0;
+    while ((v >>= 1) != 0)
+        bit++;
+    return bit;
+}
+#endif
+
+#ifndef bsr64
+static forceinline unsigned
+bsr64(u64 v)
+{
+    unsigned bit = 0;
+    while ((v >>= 1) != 0)
+        bit++;
+    return bit;
+}
+#endif
+
+static forceinline unsigned
+bsrw(machine_word_t v)
+{
+    STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+    if (WORDBITS == 32)
+        return bsr32(v);
+    else
+        return bsr64(v);
+}
+
+/*
+ * Bit Scan Forward (BSF) - find the 0-based index (relative to the least
+ * significant end) of the *least* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+#ifndef bsf32
+static forceinline unsigned
+bsf32(u32 v)
+{
+    unsigned bit;
+    for (bit = 0; !(v & 1); bit++, v >>= 1)
+        ;
+    return bit;
+}
+#endif
+
+#ifndef bsf64
+static forceinline unsigned
+bsf64(u64 v)
+{
+    unsigned bit;
+    for (bit = 0; !(v & 1); bit++, v >>= 1)
+        ;
+    return bit;
+}
+#endif
+
+static forceinline unsigned
+bsfw(machine_word_t v)
+{
+    STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+    if (WORDBITS == 32)
+        return bsf32(v);
+    else
+        return bsf64(v);
+}
+
+/* Return the log base 2 of 'n', rounded up to the nearest integer. */
+static forceinline unsigned
+ilog2_ceil(size_t n)
+{
+    if (n <= 1)
+        return 0;
+    return 1 + bsrw(n - 1);
+}
+
+/* ========================================================================== */
+/*                          Aligned memory allocation                         */
+/* ========================================================================== */
+
+extern void *aligned_malloc(size_t size, size_t alignment);
+extern void aligned_free(void *ptr);
+
+#endif /* _COMMON_DEFS_H */
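
Aside: to make the bit-scan helpers concrete (an illustrative sketch, not code
from this commit): bsr32() returns the index of the highest set bit, bsf32()
the index of the lowest, and ilog2_ceil() rounds the base-2 logarithm up, which
is the right operation for sizing a table to the next power of two:

    #include <assert.h>
    #include "common_defs.h"

    int main(void)
    {
        assert(bsr32(0x90) == 7);    /* highest 1 bit of 1001 0000 */
        assert(bsf32(0x90) == 4);    /* lowest 1 bit */
        assert(ilog2_ceil(4) == 2);  /* exact power of two */
        assert(ilog2_ceil(5) == 3);  /* rounded up: 2^3 = 8 >= 5 */
        return 0;
    }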

src/decompress_common.c
@@ -1,325 +1,335 @@
 /*
- * decompress_common.c - Code shared by the XPRESS and LZX decompressors
+ * decompress_common.c
  *
- * Copyright (C) 2015 Eric Biggers
+ * Code for decompression shared among multiple compression formats.
  *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 2 of the License, or (at your option) any later
- * version.
+ * The following copying information applies to this specific source code file:
  *
- * This program is distributed in the hope that it will be useful, but WITHOUT
+ * Written in 2012-2016 by Eric Biggers <ebiggers3@gmail.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- * details.
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
  */

 #ifdef HAVE_CONFIG_H
-#include "config.h"
+# include "config.h"
 #endif

 #include <string.h>

+#ifdef __SSE2__
+# include <emmintrin.h>
+#endif
+
 #include "decompress_common.h"

 /*
  * make_huffman_decode_table() -
  *
- * Build a decoding table for a canonical prefix code, or "Huffman code".
+ * Given an alphabet of symbols and the length of each symbol's codeword in a
+ * canonical prefix code, build a table for quickly decoding symbols that were
+ * encoded with that code.
  *
- * This is an internal function, not part of the library API!
+ * A _prefix code_ is an assignment of bitstrings called _codewords_ to symbols
+ * such that no whole codeword is a prefix of any other. A prefix code might be
+ * a _Huffman code_, which means that it is an optimum prefix code for a given
+ * list of symbol frequencies and was generated by the Huffman algorithm.
+ * Although the prefix codes processed here will ordinarily be "Huffman codes",
+ * strictly speaking the decoder cannot know whether a given code was actually
+ * generated by the Huffman algorithm or not.
  *
- * This takes as input the length of the codeword for each symbol in the
- * alphabet and produces as output a table that can be used for fast
- * decoding of prefix-encoded symbols using read_huffsym().
+ * A prefix code is _canonical_ if and only if a longer codeword never
+ * lexicographically precedes a shorter codeword, and the lexicographic ordering
+ * of codewords of equal length is the same as the lexicographic ordering of the
+ * corresponding symbols. The advantage of using a canonical prefix code is
+ * that the codewords can be reconstructed from only the symbol => codeword
+ * length mapping. This eliminates the need to transmit the codewords
+ * explicitly. Instead, they can be enumerated in lexicographic order after
+ * sorting the symbols primarily by increasing codeword length and secondarily
+ * by increasing symbol value.
  *
- * Strictly speaking, a canonical prefix code might not be a Huffman
- * code. But this algorithm will work either way; and in fact, since
- * Huffman codes are defined in terms of symbol frequencies, there is no
- * way for the decompressor to know whether the code is a true Huffman
- * code or not until all symbols have been decoded.
+ * However, the decoder's real goal is to decode symbols with the code, not just
+ * generate the list of codewords. Consequently, this function directly builds
+ * a table for efficiently decoding symbols using the code. The basic idea is
+ * that given the next 'max_codeword_len' bits of input, the decoder can look up
+ * the next decoded symbol by indexing a table containing '2^max_codeword_len'
+ * entries. A codeword with length 'max_codeword_len' will have exactly one
+ * entry in this table, whereas a codeword shorter than 'max_codeword_len' will
+ * have multiple entries in this table. Precisely, a codeword of length 'n'
+ * will have '2^(max_codeword_len - n)' entries. The index of each such entry,
+ * considered as a bitstring of length 'max_codeword_len', will contain the
+ * corresponding codeword as a prefix.
  *
- * Because the prefix code is assumed to be "canonical", it can be
- * reconstructed directly from the codeword lengths. A prefix code is
- * canonical if and only if a longer codeword never lexicographically
- * precedes a shorter codeword, and the lexicographic ordering of
- * codewords of the same length is the same as the lexicographic ordering
- * of the corresponding symbols. Consequently, we can sort the symbols
- * primarily by codeword length and secondarily by symbol value, then
- * reconstruct the prefix code by generating codewords lexicographically
- * in that order.
+ * That's the basic idea, but we extend it in two ways:
  *
- * This function does not, however, generate the prefix code explicitly.
- * Instead, it directly builds a table for decoding symbols using the
- * code. The basic idea is this: given the next 'max_codeword_len' bits
- * in the input, we can look up the decoded symbol by indexing a table
- * containing 2**max_codeword_len entries. A codeword with length
- * 'max_codeword_len' will have exactly one entry in this table, whereas
- * a codeword shorter than 'max_codeword_len' will have multiple entries
- * in this table. Precisely, a codeword of length n will be represented
- * by 2**(max_codeword_len - n) entries in this table. The 0-based index
- * of each such entry will contain the corresponding codeword as a prefix
- * when zero-padded on the left to 'max_codeword_len' binary digits.
+ * - Often the maximum codeword length is too long for it to be efficient to
+ *   build the full decode table whenever a new code is used. Instead, we build
+ *   a "root" table using only '2^table_bits' entries, where 'table_bits <=
+ *   max_codeword_len'. Then, a lookup of 'table_bits' bits produces either a
+ *   symbol directly (for codewords not longer than 'table_bits'), or the index
+ *   of a subtable which must be indexed with additional bits of input to fully
+ *   decode the symbol (for codewords longer than 'table_bits').
  *
- * That's the basic idea, but we implement two optimizations regarding
- * the format of the decode table itself:
+ * - Whenever the decoder decodes a symbol, it needs to know the codeword length
+ *   so that it can remove the appropriate number of input bits. The obvious
+ *   solution would be to simply retain the codeword lengths array and use the
+ *   decoded symbol as an index into it. However, that would require two array
+ *   accesses when decoding each symbol. Our strategy is to instead store the
+ *   codeword length directly in the decode table entry along with the symbol.
  *
- * - For many compression formats, the maximum codeword length is too
- *   long for it to be efficient to build the full decoding table
- *   whenever a new prefix code is used. Instead, we can build the table
- *   using only 2**table_bits entries, where 'table_bits' is some number
- *   less than or equal to 'max_codeword_len'. Then, only codewords of
- *   length 'table_bits' and shorter can be directly looked up. For
- *   longer codewords, the direct lookup instead produces the root of a
- *   binary tree. Using this tree, the decoder can do traditional
- *   bit-by-bit decoding of the remainder of the codeword. Child nodes
- *   are allocated in extra entries at the end of the table; leaf nodes
- *   contain symbols. Note that the long-codeword case is, in general,
- *   not performance critical, since in Huffman codes the most frequently
- *   used symbols are assigned the shortest codeword lengths.
- *
- * - When we decode a symbol using a direct lookup of the table, we still
- *   need to know its length so that the bitstream can be advanced by the
- *   appropriate number of bits. The simple solution is to simply retain
- *   the 'lens' array and use the decoded symbol as an index into it.
- *   However, this requires two separate array accesses in the fast path.
- *   The optimization is to store the length directly in the decode
- *   table. We use the bottom 11 bits for the symbol and the top 5 bits
- *   for the length. In addition, to combine this optimization with the
- *   previous one, we introduce a special case where the top 2 bits of
- *   the length are both set if the entry is actually the root of a
- *   binary tree.
+ * See MAKE_DECODE_TABLE_ENTRY() for full details on the format of decode table
+ * entries, and see read_huffsym() for full details on how symbols are decoded.
  *
  * @decode_table:
- *     The array in which to create the decoding table. This must have
- *     a length of at least ((2**table_bits) + 2 * num_syms) entries.
+ *     The array in which to build the decode table. This must have been
+ *     declared by the DECODE_TABLE() macro. This may alias @lens, since all
+ *     @lens are consumed before the decode table is written to.
  *
  * @num_syms:
- *     The number of symbols in the alphabet; also, the length of the
- *     'lens' array. Must be less than or equal to 2048.
+ *     The number of symbols in the alphabet.
  *
  * @table_bits:
- *     The order of the decode table size, as explained above. Must be
- *     less than or equal to 13.
+ *     The log base 2 of the number of entries in the root table.
  *
  * @lens:
- *     An array of length @num_syms, indexable by symbol, that gives the
- *     length of the codeword, in bits, for that symbol. The length can
- *     be 0, which means that the symbol does not have a codeword
- *     assigned.
+ *     An array of length @num_syms, indexed by symbol, that gives the length
+ *     of the codeword, in bits, for each symbol. The length can be 0, which
+ *     means that the symbol does not have a codeword assigned. In addition,
+ *     @lens may alias @decode_table, as noted above.
  *
  * @max_codeword_len:
- *     The longest codeword length allowed in the compression format.
- *     All entries in 'lens' must be less than or equal to this value.
- *     This must be less than or equal to 23.
+ *     The maximum codeword length permitted for this code. All entries in
+ *     'lens' must be less than or equal to this value.
  *
  * @working_space
- *     A temporary array of length '2 * (max_codeword_len + 1) +
- *     num_syms'.
+ *     A temporary array that was declared with DECODE_TABLE_WORKING_SPACE().
  *
- * Returns 0 on success, or -1 if the lengths do not form a valid prefix
- * code.
+ * Returns 0 on success, or -1 if the lengths do not form a valid prefix code.
  */
-int make_huffman_decode_table(u16 decode_table[], const unsigned num_syms,
-                              const unsigned table_bits, const u8 lens[],
-                              const unsigned max_codeword_len,
-                              u16 working_space[])
+int
+make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
+                          unsigned table_bits, const u8 lens[],
+                          unsigned max_codeword_len, u16 working_space[])
 {
-    const unsigned table_num_entries = 1 << table_bits;
     u16 * const len_counts = &working_space[0];
     u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
     u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
-    int left;
-    void *decode_table_ptr;
+    s32 remainder = 1;
+    void *entry_ptr = decode_table;
+    unsigned codeword_len = 1;
     unsigned sym_idx;
-    unsigned codeword_len;
-    unsigned stores_per_loop;
-    unsigned decode_table_pos;
-    unsigned len;
-    unsigned sym;
+    unsigned codeword;
+    unsigned subtable_pos;
+    unsigned subtable_bits;
+    unsigned subtable_prefix;

-    /* Count how many symbols have each possible codeword length.
-     * Note that a length of 0 indicates the corresponding symbol is not
-     * used in the code and therefore does not have a codeword. */
-    for (len = 0; len <= max_codeword_len; len++)
+    /* Count how many codewords have each length, including 0. */
+    for (unsigned len = 0; len <= max_codeword_len; len++)
         len_counts[len] = 0;
-    for (sym = 0; sym < num_syms; sym++)
+    for (unsigned sym = 0; sym < num_syms; sym++)
         len_counts[lens[sym]]++;

-    /* We can assume all lengths are <= max_codeword_len, but we
-     * cannot assume they form a valid prefix code. A codeword of
-     * length n should require a proportion of the codespace equaling
-     * (1/2)^n. The code is valid if and only if the codespace is
-     * exactly filled by the lengths, by this measure. */
-    left = 1;
-    for (len = 1; len <= max_codeword_len; len++) {
-        left <<= 1;
-        left -= len_counts[len];
-        if (left < 0) {
-            /* The lengths overflow the codespace; that is, the code
-             * is over-subscribed. */
+    /* It is already guaranteed that all lengths are <= max_codeword_len,
+     * but it cannot be assumed they form a complete prefix code. A
+     * codeword of length n should require a proportion of the codespace
+     * equaling (1/2)^n. The code is complete if and only if, by this
+     * measure, the codespace is exactly filled by the lengths. */
+    for (unsigned len = 1; len <= max_codeword_len; len++) {
+        remainder = (remainder << 1) - len_counts[len];
+        /* Do the lengths overflow the codespace? */
+        if (unlikely(remainder < 0))
             return -1;
-        }
     }

-    if (left != 0) {
+    if (remainder != 0) {
         /* The lengths do not fill the codespace; that is, they form an
-         * incomplete set. */
-        if (left == (1 << max_codeword_len)) {
-            /* The code is completely empty. This is arguably
-             * invalid, but in fact it is valid in LZX and XPRESS,
-             * so we must allow it. By definition, no symbols can
-             * be decoded with an empty code. Consequently, we
-             * technically don't even need to fill in the decode
-             * table. However, to avoid accessing uninitialized
-             * memory if the algorithm nevertheless attempts to
-             * decode symbols using such a code, we zero out the
-             * decode table. */
-            memset(decode_table, 0,
-                   table_num_entries * sizeof(decode_table[0]));
-            return 0;
-        }
-        return -1;
+         * incomplete code. This is permitted only if the code is empty
+         * (contains no symbols). */
+
+        if (unlikely(remainder != 1U << max_codeword_len))
+            return -1;
+
+        /* The code is empty. When processing a well-formed stream, the
+         * decode table need not be initialized in this case. However,
+         * we cannot assume the stream is well-formed, so we must
+         * initialize the decode table anyway. Setting all entries to 0
+         * makes the decode table always produce symbol '0' without
+         * consuming any bits, which is good enough. */
+        memset(decode_table, 0, sizeof(decode_table[0]) << table_bits);
+        return 0;
     }

-    /* Sort the symbols primarily by length and secondarily by symbol order.
-     */
+    /* Sort the symbols primarily by increasing codeword length and
+     * secondarily by increasing symbol value. */

-    /* Initialize 'offsets' so that offsets[len] for 1 <= len <=
-     * max_codeword_len is the number of codewords shorter than 'len' bits.
-     */
-    offsets[1] = 0;
-    for (len = 1; len < max_codeword_len; len++)
+    /* Initialize 'offsets' so that 'offsets[len]' is the number of
+     * codewords shorter than 'len' bits, including length 0. */
+    offsets[0] = 0;
+    for (unsigned len = 0; len < max_codeword_len; len++)
         offsets[len + 1] = offsets[len] + len_counts[len];

-    /* Use the 'offsets' array to sort the symbols. Note that we do not
-     * include symbols that are not used in the code. Consequently, fewer
-     * than 'num_syms' entries in 'sorted_syms' may be filled. */
-    for (sym = 0; sym < num_syms; sym++)
-        if (lens[sym] != 0)
-            sorted_syms[offsets[lens[sym]]++] = sym;
+    /* Use the 'offsets' array to sort the symbols. */
+    for (unsigned sym = 0; sym < num_syms; sym++)
+        sorted_syms[offsets[lens[sym]]++] = sym;

-    /* Fill entries for codewords with length <= table_bits
-     * --- that is, those short enough for a direct mapping.
+    /*
+     * Fill the root table entries for codewords no longer than table_bits.
      *
      * The table will start with entries for the shortest codeword(s), which
-     * have the most entries. From there, the number of entries per
-     * codeword will decrease. */
-    decode_table_ptr = decode_table;
-    sym_idx = 0;
-    codeword_len = 1;
-    stores_per_loop = (1 << (table_bits - codeword_len));
-    for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
+     * will have the most entries. From there, the number of entries per
+     * codeword will decrease. As an optimization, we may begin filling
+     * entries with SSE2 vector accesses (8 entries/store), then change to
+     * word accesses (2 or 4 entries/store), then change to 16-bit accesses
+     * (1 entry/store).
+     */
+    sym_idx = offsets[0];
+
+#ifdef __SSE2__
+    /* Fill entries one 128-bit vector (8 entries) at a time. */
+    for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
+                    (sizeof(__m128i) / sizeof(decode_table[0]));
+         stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
+    {
         unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
         for (; sym_idx < end_sym_idx; sym_idx++) {
-            u16 entry;
-            u16 *p;
-            unsigned n;
-
-            entry = ((u32)codeword_len << 11) | sorted_syms[sym_idx];
-            p = (u16*)decode_table_ptr;
-            n = stores_per_loop;
-
+            /* Note: unlike in the "word" version below, the __m128i
+             * type already has __attribute__((may_alias)), so using
+             * it to access an array of u16 will not violate strict
+             * aliasing. */
+            __m128i v = _mm_set1_epi16(
+                MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
+                                        codeword_len));
+            unsigned n = stores_per_loop;
             do {
-                *p++ = entry;
+                *(__m128i *)entry_ptr = v;
+                entry_ptr += sizeof(v);
             } while (--n);
-
-            decode_table_ptr = p;
         }
     }
+#endif /* __SSE2__ */
+
+#ifdef __GNUC__
+    /* Fill entries one word (2 or 4 entries) at a time. */
+    for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) /
+                    (WORDBYTES / sizeof(decode_table[0]));
+         stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
+    {
+        unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
+        for (; sym_idx < end_sym_idx; sym_idx++) {
+
+            /* Accessing the array of u16 as u32 or u64 would
+             * violate strict aliasing and would require compiling
+             * the code with -fno-strict-aliasing to guarantee
+             * correctness. To work around this problem, use the
+             * gcc 'may_alias' extension. */
+            typedef machine_word_t
+                __attribute__((may_alias)) aliased_word_t;
+            aliased_word_t v = repeat_u16(
+                MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
+                                        codeword_len));
+            unsigned n = stores_per_loop;
+            do {
+                *(aliased_word_t *)entry_ptr = v;
+                entry_ptr += sizeof(v);
+            } while (--n);
+        }
+    }
+#endif /* __GNUC__ */
+
+    /* Fill entries one at a time. */
+    for (unsigned stores_per_loop = (1U << (table_bits - codeword_len));
+         stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1)
+    {
+        unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
+        for (; sym_idx < end_sym_idx; sym_idx++) {
+            u16 v = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
+                                            codeword_len);
+            unsigned n = stores_per_loop;
+            do {
+                *(u16 *)entry_ptr = v;
+                entry_ptr += sizeof(v);
+            } while (--n);
+        }
+    }

-    /* If we've filled in the entire table, we are done. Otherwise,
-     * there are codewords longer than table_bits for which we must
-     * generate binary trees. */
+    /* If all symbols were processed, then no subtables are required. */
+    if (sym_idx == num_syms)
+        return 0;

-    decode_table_pos = (u16*)decode_table_ptr - decode_table;
-    if (decode_table_pos != table_num_entries) {
-        unsigned j;
-        unsigned next_free_tree_slot;
-        unsigned cur_codeword;
-
-        /* First, zero out the remaining entries. This is
-         * necessary so that these entries appear as
-         * "unallocated" in the next part. Each of these entries
-         * will eventually be filled with the representation of
-         * the root node of a binary tree. */
-        j = decode_table_pos;
-        do {
-            decode_table[j] = 0;
-        } while (++j != table_num_entries);
-
-        /* We allocate child nodes starting at the end of the
-         * direct lookup table. Note that there should be
-         * 2*num_syms extra entries for this purpose, although
-         * fewer than this may actually be needed. */
-        next_free_tree_slot = table_num_entries;
-
-        /* Iterate through each codeword with length greater than
-         * 'table_bits', primarily in order of codeword length
-         * and secondarily in order of symbol. */
-        for (cur_codeword = decode_table_pos << 1;
-             codeword_len <= max_codeword_len;
-             codeword_len++, cur_codeword <<= 1)
-        {
-            unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
-            for (; sym_idx < end_sym_idx; sym_idx++, cur_codeword++)
-            {
-                /* 'sorted_sym' is the symbol represented by the
-                 * codeword. */
-                unsigned sorted_sym = sorted_syms[sym_idx];
-
-                unsigned extra_bits = codeword_len - table_bits;
-
-                unsigned node_idx = cur_codeword >> extra_bits;
-
-                /* Go through each bit of the current codeword
-                 * beyond the prefix of length @table_bits and
-                 * walk the appropriate binary tree, allocating
-                 * any slots that have not yet been allocated.
-                 *
-                 * Note that the 'pointer' entry to the binary
-                 * tree, which is stored in the direct lookup
-                 * portion of the table, is represented
-                 * identically to other internal (non-leaf)
-                 * nodes of the binary tree; it can be thought
-                 * of as simply the root of the tree. The
-                 * representation of these internal nodes is
-                 * simply the index of the left child combined
-                 * with the special bits 0xC000 to distingush
-                 * the entry from direct mapping and leaf node
-                 * entries. */
-                do {
-
-                    /* At least one bit remains in the
-                     * codeword, but the current node is an
-                     * unallocated leaf. Change it to an
-                     * internal node. */
-                    if (decode_table[node_idx] == 0) {
-                        decode_table[node_idx] =
-                            next_free_tree_slot | 0xC000;
-                        decode_table[next_free_tree_slot++] = 0;
-                        decode_table[next_free_tree_slot++] = 0;
-                    }
-
-                    /* Go to the left child if the next bit
-                     * in the codeword is 0; otherwise go to
-                     * the right child. */
-                    node_idx = decode_table[node_idx] & 0x3FFF;
-                    --extra_bits;
-                    node_idx += (cur_codeword >> extra_bits) & 1;
-                } while (extra_bits != 0);
-
-                /* We've traversed the tree using the entire
-                 * codeword, and we're now at the entry where
-                 * the actual symbol will be stored. This is
-                 * distinguished from internal nodes by not
-                 * having its high two bits set. */
-                decode_table[node_idx] = sorted_sym;
-            }
-        }
-    }
+    /* At least one subtable is required. Process the remaining symbols. */
+    codeword = ((u16 *)entry_ptr - decode_table) << 1;
+    subtable_pos = 1U << table_bits;
+    subtable_bits = table_bits;
+    subtable_prefix = -1;
+    do {
+        while (len_counts[codeword_len] == 0) {
+            codeword_len++;
+            codeword <<= 1;
+        }
+
+        unsigned prefix = codeword >> (codeword_len - table_bits);
+
+        /* Start a new subtable if the first 'table_bits' bits of the
+         * codeword don't match the prefix for the previous subtable, or
+         * if this will be the first subtable. */
+        if (prefix != subtable_prefix) {
+
+            subtable_prefix = prefix;
+
+            /*
+             * Calculate the subtable length. If the codeword
+             * length exceeds 'table_bits' by n, then the subtable
+             * needs at least 2^n entries. But it may need more; if
+             * there are fewer than 2^n codewords of length
+             * 'table_bits + n' remaining, then n will need to be
+             * incremented to bring in longer codewords until the
+             * subtable can be filled completely. Note that it
+             * always will, eventually, be possible to fill the
+             * subtable, since it was previously verified that the
+             * code is complete.
+             */
+            subtable_bits = codeword_len - table_bits;
+            remainder = (s32)1 << subtable_bits;
+            for (;;) {
+                remainder -= len_counts[table_bits +
+                                        subtable_bits];
+                if (remainder <= 0)
+                    break;
+                subtable_bits++;
+                remainder <<= 1;
+            }
+
+            /* Create the entry that points from the root table to
+             * the subtable. This entry contains the index of the
+             * start of the subtable and the number of bits with
+             * which the subtable is indexed (the log base 2 of the
+             * number of entries it contains). */
+            decode_table[subtable_prefix] =
+                MAKE_DECODE_TABLE_ENTRY(subtable_pos,
+                                        subtable_bits);
+        }
+
+        /* Fill the subtable entries for this symbol. */
+        u16 entry = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx],
+                                            codeword_len - table_bits);
+        unsigned n = 1U << (subtable_bits - (codeword_len -
+                                             table_bits));
+        do {
+            decode_table[subtable_pos++] = entry;
+        } while (--n);
+
+        len_counts[codeword_len]--;
+        codeword++;
+    } while (++sym_idx < num_syms);

     return 0;
 }
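
Aside: a worked example of the canonical-code property this function relies on
(illustrative only, not code from this commit). For lengths A=2, B=1, C=3, D=3
bits, counting the codewords of each length and handing them out in
lexicographic order yields B=0, A=10, C=110, D=111. The sketch below uses the
classic next_code recurrence (as in DEFLATE) rather than this function's
sort-based bookkeeping, but it recovers the same codewords:

    #include <assert.h>

    int main(void)
    {
        static const unsigned char lens[4] = { 2, 1, 3, 3 }; /* A,B,C,D */
        unsigned len_counts[4] = { 0 }, next_code[4], code = 0;
        unsigned codewords[4];

        for (int sym = 0; sym < 4; sym++)
            len_counts[lens[sym]]++;
        for (unsigned len = 1; len <= 3; len++) {
            code = (code + len_counts[len - 1]) << 1;
            next_code[len] = code;
        }
        for (int sym = 0; sym < 4; sym++)
            codewords[sym] = next_code[lens[sym]]++;

        assert(codewords[1] == 0x0);  /* B = 0   */
        assert(codewords[0] == 0x2);  /* A = 10  */
        assert(codewords[2] == 0x6);  /* C = 110 */
        assert(codewords[3] == 0x7);  /* D = 111 */
        return 0;
    }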

src/decompress_common.h
@@ -1,99 +1,36 @@
 /*
- * decompress_common.h - Code shared by the XPRESS and LZX decompressors
+ * decompress_common.h
  *
- * Copyright (C) 2015 Eric Biggers
+ * Header for decompression code shared by multiple compression formats.
  *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 2 of the License, or (at your option) any later
- * version.
+ * The following copying information applies to this specific source code file:
  *
- * This program is distributed in the hope that it will be useful, but WITHOUT
+ * Written in 2012-2016 by Eric Biggers <ebiggers3@gmail.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- * details.
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
  */

-#include <stddef.h>
+#ifndef _DECOMPRESS_COMMON_H
+#define _DECOMPRESS_COMMON_H
+
+#include <errno.h>
 #include <string.h>

-#include <ntfs-3g/endians.h>
-#include <ntfs-3g/types.h>
+#include "common_defs.h"

-/* "Force inline" macro (not required, but helpful for performance) */
-#ifdef __GNUC__
-# define forceinline inline __attribute__((always_inline))
-#else
-# define forceinline inline
-#endif
-
-/* Enable whole-word match copying on selected architectures */
-#if defined(__i386__) || defined(__x86_64__) || defined(__ARM_FEATURE_UNALIGNED)
-# define FAST_UNALIGNED_ACCESS
-#endif
-
-/* Size of a machine word */
-#define WORDBYTES (sizeof(size_t))
-
-/* Inline functions to read and write unaligned data.
- * We use just memcpy() for this. It is standard and modern compilers will
- * usually replace it with load/store instructions. */
-
-static forceinline u16 get_unaligned_le16(const u8 *p)
-{
-    le16 v_le;
-    memcpy(&v_le, p, 2);
-    return le16_to_cpu(v_le);
-}
-
-static forceinline u32 get_unaligned_le32(const u8 *p)
-{
-    le32 v_le;
-    memcpy(&v_le, p, 4);
-    return le32_to_cpu(v_le);
-}
-
-static forceinline void put_unaligned_le32(u32 v, u8 *p)
-{
-    le32 v_le = cpu_to_le32(v);
-    memcpy(p, &v_le, 4);
-}
-
-/* Load a "word" with platform-dependent size and endianness. */
-static forceinline size_t get_unaligned_word(const u8 *p)
-{
-    size_t v;
-    memcpy(&v, p, WORDBYTES);
-    return v;
-}
-
-/* Store a "word" with platform-dependent size and endianness. */
-static forceinline void put_unaligned_word(size_t v, u8 *p)
-{
-    memcpy(p, &v, WORDBYTES);
-}
-
-/* Copy a "word" with platform-dependent size. */
-static forceinline void copy_unaligned_word(const u8 *src, u8 *dst)
-{
-    put_unaligned_word(get_unaligned_word(src), dst);
-}
-
-/* Generate a "word" with platform-dependent size whose bytes all contain the
- * value 'b'. */
-static forceinline size_t repeat_byte(u8 b)
-{
-    size_t v;
-
-    v = b;
-    v |= v << 8;
-    v |= v << 16;
-    v |= v << ((WORDBYTES == 8) ? 32 : 0);
-    return v;
-}
+/******************************************************************************/
+/*                     Input bitstream for XPRESS and LZX                     */
+/*----------------------------------------------------------------------------*/

 /* Structure that encapsulates a block of in-memory data being interpreted as a
  * stream of bits, optionally with interwoven literal bytes. Bits are assumed
@@ -106,18 +43,18 @@ struct input_bitstream {
     u32 bitbuf;

     /* Number of bits currently held in @bitbuf. */
-    unsigned bitsleft;
+    u32 bitsleft;

     /* Pointer to the next byte to be retrieved from the input buffer. */
     const u8 *next;

-    /* Pointer to just past the end of the input buffer. */
+    /* Pointer past the end of the input buffer. */
     const u8 *end;
 };

 /* Initialize a bitstream to read from the specified input buffer. */
-static forceinline void init_input_bitstream(struct input_bitstream *is,
-                                             const void *buffer, u32 size)
+static forceinline void
+init_input_bitstream(struct input_bitstream *is, const void *buffer, u32 size)
 {
     is->bitbuf = 0;
     is->bitsleft = 0;
@@ -125,39 +62,60 @@ static forceinline void init_input_bitstream(struct input_bitstream *is,
     is->end = is->next + size;
 }

 /* Note: for performance reasons, the following methods don't return error codes
  * to the caller if the input buffer is overrun. Instead, they just assume that
  * all overrun data is zeroes. This has no effect on well-formed compressed
  * data. The only disadvantage is that bad compressed data may go undetected,
  * but even this is irrelevant if higher level code checksums the uncompressed
  * data anyway. */

 /* Ensure the bit buffer variable for the bitstream contains at least @num_bits
  * bits. Following this, bitstream_peek_bits() and/or bitstream_remove_bits()
- * may be called on the bitstream to peek or remove up to @num_bits bits. Note
- * that @num_bits must be <= 16. */
-static forceinline void bitstream_ensure_bits(struct input_bitstream *is,
-                                              unsigned num_bits)
+ * may be called on the bitstream to peek or remove up to @num_bits bits. */
+static forceinline void
+bitstream_ensure_bits(struct input_bitstream *is, const unsigned num_bits)
 {
-    if (is->bitsleft < num_bits) {
-        if (is->end - is->next >= 2) {
-            is->bitbuf |= (u32)get_unaligned_le16(is->next)
-                          << (16 - is->bitsleft);
-            is->next += 2;
-        }
-        is->bitsleft += 16;
+    /* This currently works for at most 17 bits. */
+
+    if (is->bitsleft >= num_bits)
+        return;
+
+    if (unlikely(is->end - is->next < 2))
+        goto overflow;
+
+    is->bitbuf |= (u32)get_unaligned_le16(is->next) << (16 - is->bitsleft);
+    is->next += 2;
+    is->bitsleft += 16;
+
+    if (unlikely(num_bits == 17 && is->bitsleft == 16)) {
+        if (unlikely(is->end - is->next < 2))
+            goto overflow;
+
+        is->bitbuf |= (u32)get_unaligned_le16(is->next);
+        is->next += 2;
+        is->bitsleft = 32;
     }
+
+    return;
+
+overflow:
+    is->bitsleft = 32;
 }

 /* Return the next @num_bits bits from the bitstream, without removing them.
  * There must be at least @num_bits remaining in the buffer variable, from a
  * previous call to bitstream_ensure_bits(). */
-static forceinline u32 bitstream_peek_bits(const struct input_bitstream *is,
-                                           unsigned num_bits)
+static forceinline u32
+bitstream_peek_bits(const struct input_bitstream *is, const unsigned num_bits)
 {
-    if (num_bits == 0)
-        return 0;
-    return is->bitbuf >> (32 - num_bits);
+    return (is->bitbuf >> 1) >> (sizeof(is->bitbuf) * 8 - num_bits - 1);
 }

 /* Remove @num_bits from the bitstream. There must be at least @num_bits
  * remaining in the buffer variable, from a previous call to
  * bitstream_ensure_bits(). */
-static forceinline void bitstream_remove_bits(struct input_bitstream *is,
-                                              unsigned num_bits)
+static forceinline void
+bitstream_remove_bits(struct input_bitstream *is, unsigned num_bits)
 {
     is->bitbuf <<= num_bits;
     is->bitsleft -= num_bits;
@@ -166,8 +124,8 @@ static forceinline void bitstream_remove_bits(struct input_bitstream *is,
 /* Remove and return @num_bits bits from the bitstream. There must be at least
  * @num_bits remaining in the buffer variable, from a previous call to
  * bitstream_ensure_bits(). */
-static forceinline u32 bitstream_pop_bits(struct input_bitstream *is,
-                                          unsigned num_bits)
+static forceinline u32
+bitstream_pop_bits(struct input_bitstream *is, unsigned num_bits)
 {
     u32 bits = bitstream_peek_bits(is, num_bits);
     bitstream_remove_bits(is, num_bits);
@@ -175,27 +133,29 @@ static forceinline u32 bitstream_pop_bits(struct input_bitstream *is,
 }

 /* Read and return the next @num_bits bits from the bitstream. */
-static forceinline u32 bitstream_read_bits(struct input_bitstream *is,
-                                           unsigned num_bits)
+static forceinline u32
+bitstream_read_bits(struct input_bitstream *is, unsigned num_bits)
 {
     bitstream_ensure_bits(is, num_bits);
     return bitstream_pop_bits(is, num_bits);
 }

 /* Read and return the next literal byte embedded in the bitstream. */
-static forceinline u8 bitstream_read_byte(struct input_bitstream *is)
+static forceinline u8
+bitstream_read_byte(struct input_bitstream *is)
 {
-    if (is->end == is->next)
+    if (unlikely(is->end == is->next))
         return 0;
     return *is->next++;
 }

 /* Read and return the next 16-bit integer embedded in the bitstream. */
-static forceinline u16 bitstream_read_u16(struct input_bitstream *is)
+static forceinline u16
+bitstream_read_u16(struct input_bitstream *is)
 {
     u16 v;

-    if (is->end - is->next < 2)
+    if (unlikely(is->end - is->next < 2))
         return 0;
     v = get_unaligned_le16(is->next);
     is->next += 2;
@@ -203,11 +163,12 @@ static forceinline u16 bitstream_read_u16(struct input_bitstream *is)
 }

 /* Read and return the next 32-bit integer embedded in the bitstream. */
-static forceinline u32 bitstream_read_u32(struct input_bitstream *is)
+static forceinline u32
+bitstream_read_u32(struct input_bitstream *is)
 {
     u32 v;

-    if (is->end - is->next < 4)
+    if (unlikely(is->end - is->next < 4))
         return 0;
     v = get_unaligned_le32(is->next);
     is->next += 4;
@@ -215,161 +176,370 @@ static forceinline u32 bitstream_read_u32(struct input_bitstream *is)
}
|
||||
|
||||
/* Read into @dst_buffer an array of literal bytes embedded in the bitstream.
|
||||
* Return either a pointer to the byte past the last written, or NULL if the
|
||||
* read overflows the input buffer. */
|
||||
static forceinline void *bitstream_read_bytes(struct input_bitstream *is,
|
||||
void *dst_buffer, size_t count)
|
||||
* Return 0 if there were enough bytes remaining in the input, otherwise -1. */
|
||||
static forceinline int
|
||||
bitstream_read_bytes(struct input_bitstream *is, void *dst_buffer, size_t count)
|
||||
{
|
||||
if ((size_t)(is->end - is->next) < count)
|
||||
return NULL;
|
||||
if (unlikely(is->end - is->next < count))
|
||||
return -1;
|
||||
memcpy(dst_buffer, is->next, count);
|
||||
is->next += count;
|
||||
return (u8 *)dst_buffer + count;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Align the input bitstream on a coding-unit boundary. */
|
||||
static forceinline void bitstream_align(struct input_bitstream *is)
|
||||
static forceinline void
|
||||
bitstream_align(struct input_bitstream *is)
|
||||
{
|
||||
is->bitsleft = 0;
|
||||
is->bitbuf = 0;
|
||||
}
|
||||
|
||||
extern int make_huffman_decode_table(u16 decode_table[], const unsigned num_syms,
|
||||
const unsigned num_bits, const u8 lens[],
|
||||
const unsigned max_codeword_len,
|
||||
u16 working_space[]);
|
||||
/******************************************************************************/
|
||||
/* Huffman decoding */
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* Required alignment for the Huffman decode tables. We require this alignment
|
||||
* so that we can fill the entries with vector or word instructions and not have
|
||||
* to deal with misaligned buffers.
|
||||
*/
|
||||
#define DECODE_TABLE_ALIGNMENT 16
|
||||
|
||||
/* Reads and returns the next Huffman-encoded symbol from a bitstream. If the
|
||||
* input data is exhausted, the Huffman symbol is decoded as if the missing bits
|
||||
* are all zeroes. */
|
||||
static forceinline unsigned read_huffsym(struct input_bitstream *istream,
|
||||
const u16 decode_table[],
|
||||
unsigned table_bits,
|
||||
unsigned max_codeword_len)
|
||||
/*
|
||||
* Each decode table entry is 16 bits divided into two fields: 'symbol' (high 12
|
||||
* bits) and 'length' (low 4 bits). The precise meaning of these fields depends
|
||||
* on the type of entry:
|
||||
*
|
||||
* Root table entries which are *not* subtable pointers:
|
||||
* symbol: symbol to decode
|
||||
* length: codeword length in bits
|
||||
*
|
||||
* Root table entries which are subtable pointers:
|
||||
* symbol: index of start of subtable
|
||||
* length: number of bits with which the subtable is indexed
|
||||
*
|
||||
* Subtable entries:
|
||||
* symbol: symbol to decode
|
||||
* length: codeword length in bits, minus the number of bits with which the
|
||||
* root table is indexed
|
||||
*/
|
||||
#define DECODE_TABLE_SYMBOL_SHIFT 4
|
||||
#define DECODE_TABLE_MAX_SYMBOL ((1 << (16 - DECODE_TABLE_SYMBOL_SHIFT)) - 1)
|
||||
#define DECODE_TABLE_MAX_LENGTH ((1 << DECODE_TABLE_SYMBOL_SHIFT) - 1)
|
||||
#define DECODE_TABLE_LENGTH_MASK DECODE_TABLE_MAX_LENGTH
|
||||
#define MAKE_DECODE_TABLE_ENTRY(symbol, length) \
|
||||
(((symbol) << DECODE_TABLE_SYMBOL_SHIFT) | (length))

/*
 * Read and return the next Huffman-encoded symbol from the given bitstream
 * using the given decode table.
 *
 * If the input data is exhausted, then the Huffman symbol will be decoded as if
 * the missing bits were all zeroes.
 *
 * XXX: This is mostly duplicated in lzms_decode_huffman_symbol() in
 * lzms_decompress.c; keep them in sync!
 */
static forceinline unsigned
read_huffsym(struct input_bitstream *is, const u16 decode_table[],
	     unsigned table_bits, unsigned max_codeword_len)
{
	unsigned entry;
	unsigned key_bits;
	unsigned symbol;
	unsigned length;

	bitstream_ensure_bits(istream, max_codeword_len);
	/* Preload the bitbuffer with 'max_codeword_len' bits so that we're
	 * guaranteed to be able to fully decode a codeword. */
	bitstream_ensure_bits(is, max_codeword_len);

	/* Index the decode table by the next table_bits bits of the input. */
	key_bits = bitstream_peek_bits(istream, table_bits);
	entry = decode_table[key_bits];
	if (entry < 0xC000) {
		/* Fast case: The decode table directly provided the
		 * symbol and codeword length.  The low 11 bits are the
		 * symbol, and the high 5 bits are the codeword length. */
		bitstream_remove_bits(istream, entry >> 11);
		return entry & 0x7FF;
	} else {
		/* Slow case: The codeword for the symbol is longer than
		 * table_bits, so the symbol does not have an entry
		 * directly in the first (1 << table_bits) entries of the
		 * decode table.  Traverse the appropriate binary tree
		 * bit-by-bit to decode the symbol. */
		bitstream_remove_bits(istream, table_bits);
		do {
			key_bits = (entry & 0x3FFF) + bitstream_pop_bits(istream, 1);
		} while ((entry = decode_table[key_bits]) >= 0xC000);
		return entry;
	/* Index the root table by the next 'table_bits' bits of input. */
	entry = decode_table[bitstream_peek_bits(is, table_bits)];

	/* Extract the "symbol" and "length" from the entry. */
	symbol = entry >> DECODE_TABLE_SYMBOL_SHIFT;
	length = entry & DECODE_TABLE_LENGTH_MASK;

	/* If the root table is indexed by the full 'max_codeword_len' bits,
	 * then there cannot be any subtables, and this will be known at compile
	 * time.  Otherwise, we must check whether the decoded symbol is really
	 * a subtable pointer.  If so, we must discard the bits with which the
	 * root table was indexed, then index the subtable by the next 'length'
	 * bits of input to get the real entry. */
	if (max_codeword_len > table_bits &&
	    entry >= (1U << (table_bits + DECODE_TABLE_SYMBOL_SHIFT)))
	{
		/* Subtable required */
		bitstream_remove_bits(is, table_bits);
		entry = decode_table[symbol + bitstream_peek_bits(is, length)];
		symbol = entry >> DECODE_TABLE_SYMBOL_SHIFT;
		length = entry & DECODE_TABLE_LENGTH_MASK;
	}

	/* Discard the bits (or the remaining bits, if a subtable was required)
	 * of the codeword. */
	bitstream_remove_bits(is, length);

	/* Return the decoded symbol. */
	return symbol;
}

/*
 * Copy an LZ77 match at (dst - offset) to dst.
 * The DECODE_TABLE_ENOUGH() macro evaluates to the maximum number of decode
 * table entries, including all subtable entries, that may be required for
 * decoding a given Huffman code.  This depends on three parameters:
 *
 * The length and offset must be already validated --- that is, (dst - offset)
 * can't underrun the output buffer, and (dst + length) can't overrun the output
 * buffer.  Also, the length cannot be 0.
 *	num_syms: the maximum number of symbols in the code
 *	table_bits: the number of bits with which the root table will be indexed
 *	max_codeword_len: the maximum allowed codeword length in the code
 *
 * @bufend points to the byte past the end of the output buffer.  This function
 * won't write any data beyond this position.
 *
 * Returns dst + length.
 * Given these parameters, the utility program 'enough' from zlib, when passed
 * the three arguments 'num_syms', 'table_bits', and 'max_codeword_len', will
 * compute the maximum number of entries required.  This has already been done
 * for the combinations we need and incorporated into the macro below so that
 * the mapping can be done at compilation time.  If an unknown combination is
 * used, then a compilation error will result.  To fix this, use 'enough' to
 * find the missing value and add it below.  If that still doesn't fix the
 * compilation error, then most likely a constraint would be violated by the
 * requested parameters, so they cannot be used, at least without other changes
 * to the decode table --- see DECODE_TABLE_SIZE().
 */
static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend,
			       u32 min_length)
#define DECODE_TABLE_ENOUGH(num_syms, table_bits, max_codeword_len) ( \
	((num_syms) == 8 && (table_bits) == 7 && (max_codeword_len) == 15) ? 128 : \
	((num_syms) == 8 && (table_bits) == 5 && (max_codeword_len) == 7) ? 36 : \
	((num_syms) == 8 && (table_bits) == 6 && (max_codeword_len) == 7) ? 66 : \
	((num_syms) == 8 && (table_bits) == 7 && (max_codeword_len) == 7) ? 128 : \
	((num_syms) == 20 && (table_bits) == 5 && (max_codeword_len) == 15) ? 1062 : \
	((num_syms) == 20 && (table_bits) == 6 && (max_codeword_len) == 15) ? 582 : \
	((num_syms) == 20 && (table_bits) == 7 && (max_codeword_len) == 15) ? 390 : \
	((num_syms) == 54 && (table_bits) == 9 && (max_codeword_len) == 15) ? 618 : \
	((num_syms) == 54 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1098 : \
	((num_syms) == 249 && (table_bits) == 9 && (max_codeword_len) == 16) ? 878 : \
	((num_syms) == 249 && (table_bits) == 10 && (max_codeword_len) == 16) ? 1326 : \
	((num_syms) == 249 && (table_bits) == 11 && (max_codeword_len) == 16) ? 2318 : \
	((num_syms) == 256 && (table_bits) == 9 && (max_codeword_len) == 15) ? 822 : \
	((num_syms) == 256 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1302 : \
	((num_syms) == 256 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2310 : \
	((num_syms) == 512 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1558 : \
	((num_syms) == 512 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2566 : \
	((num_syms) == 512 && (table_bits) == 12 && (max_codeword_len) == 15) ? 4606 : \
	((num_syms) == 656 && (table_bits) == 10 && (max_codeword_len) == 16) ? 1734 : \
	((num_syms) == 656 && (table_bits) == 11 && (max_codeword_len) == 16) ? 2726 : \
	((num_syms) == 656 && (table_bits) == 12 && (max_codeword_len) == 16) ? 4758 : \
	((num_syms) == 799 && (table_bits) == 9 && (max_codeword_len) == 15) ? 1366 : \
	((num_syms) == 799 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1846 : \
	((num_syms) == 799 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2854 : \
	-1)

/* Wrapper around DECODE_TABLE_ENOUGH() that does additional compile-time
 * validation. */
#define DECODE_TABLE_SIZE(num_syms, table_bits, max_codeword_len) ( \
									\
	/* All values must be positive. */				\
	STATIC_ASSERT_ZERO((num_syms) > 0) +				\
	STATIC_ASSERT_ZERO((table_bits) > 0) +				\
	STATIC_ASSERT_ZERO((max_codeword_len) > 0) +			\
									\
	/* There cannot be more symbols than possible codewords. */	\
	STATIC_ASSERT_ZERO((num_syms) <= 1U << (max_codeword_len)) +	\
									\
	/* There is no reason for the root table to be indexed with	\
	 * more bits than the maximum codeword length. */		\
	STATIC_ASSERT_ZERO((table_bits) <= (max_codeword_len)) +	\
									\
	/* The maximum symbol value must fit in the 'symbol' field. */	\
	STATIC_ASSERT_ZERO((num_syms) - 1 <= DECODE_TABLE_MAX_SYMBOL) +	\
									\
	/* The maximum codeword length in the root table must fit in	\
	 * the 'length' field. */					\
	STATIC_ASSERT_ZERO((table_bits) <= DECODE_TABLE_MAX_LENGTH) +	\
									\
	/* The maximum codeword length in a subtable must fit in the	\
	 * 'length' field. */						\
	STATIC_ASSERT_ZERO((max_codeword_len) - (table_bits) <=	\
			   DECODE_TABLE_MAX_LENGTH) +			\
									\
	/* The minimum subtable index must be greater than the maximum	\
	 * symbol value.  If this were not the case, then there would	\
	 * be no way to tell whether a given root table entry is a	\
	 * "subtable pointer" or not.  (An alternate solution would be	\
	 * to reserve a flag bit specifically for this purpose.) */	\
	STATIC_ASSERT_ZERO((1U << table_bits) > (num_syms) - 1) +	\
									\
	/* The needed 'enough' value must have been defined. */	\
	STATIC_ASSERT_ZERO(DECODE_TABLE_ENOUGH(				\
				(num_syms), (table_bits),		\
				(max_codeword_len)) > 0) +		\
									\
	/* The maximum subtable index must fit in the 'symbol' field. */\
	STATIC_ASSERT_ZERO(DECODE_TABLE_ENOUGH(				\
				(num_syms), (table_bits),		\
				(max_codeword_len)) - 1 <=		\
					DECODE_TABLE_MAX_SYMBOL) +	\
									\
	/* Finally, make the macro evaluate to the needed maximum	\
	 * number of decode table entries. */				\
	DECODE_TABLE_ENOUGH((num_syms), (table_bits),			\
			    (max_codeword_len))				\
)

/*
 * Declare the decode table for a Huffman code, given several compile-time
 * constants that describe the code.  See DECODE_TABLE_ENOUGH() for details.
 *
 * Decode tables must be aligned to a DECODE_TABLE_ALIGNMENT-byte boundary.
 * This implies that if a decode table is nested inside a dynamically allocated
 * structure, then the outer structure must be allocated on a
 * DECODE_TABLE_ALIGNMENT-byte aligned boundary as well.
 */
#define DECODE_TABLE(name, num_syms, table_bits, max_codeword_len) \
	u16 name[DECODE_TABLE_SIZE((num_syms), (table_bits), \
				   (max_codeword_len))] \
		_aligned_attribute(DECODE_TABLE_ALIGNMENT)

/*
 * Declare the temporary "working_space" array needed for building the decode
 * table for a Huffman code.
 */
#define DECODE_TABLE_WORKING_SPACE(name, num_syms, max_codeword_len) \
	u16 name[2 * ((max_codeword_len) + 1) + (num_syms)];

extern int
make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
			  unsigned table_bits, const u8 lens[],
			  unsigned max_codeword_len, u16 working_space[]);
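To see how the pieces fit together, here is a hypothetical caller (a sketch of mine: the names example_table, example_space, example_decode, and the inputs lens and is are assumptions, not part of this code): declare the table and working space with the macros, build the table once per code, then decode symbols with read_huffsym().

	static DECODE_TABLE(example_table, 256, 11, 15);
	static DECODE_TABLE_WORKING_SPACE(example_space, 256, 15);

	static int
	example_decode(struct input_bitstream *is, const u8 lens[256])
	{
		/* Build the table from the 256 codeword lengths in 'lens'. */
		if (make_huffman_decode_table(example_table, 256, 11,
					      lens, 15, example_space))
			return -1;	/* 'lens' did not describe a valid code */

		/* Decode and return one symbol. */
		return read_huffsym(is, example_table, 11, 15);
	}

Note that (256, 11, 15) is one of the combinations already covered by DECODE_TABLE_ENOUGH(), so the table size (2310 entries) is computed entirely at compile time.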

/******************************************************************************/
/* LZ match copying */
/*----------------------------------------------------------------------------*/

static forceinline void
copy_word_unaligned(const void *src, void *dst)
{
	const u8 *src = dst - offset;
	store_word_unaligned(load_word_unaligned(src), dst);
}

static forceinline machine_word_t
repeat_u16(u16 b)
{
	machine_word_t v = b;

	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
	v |= v << 16;
	v |= v << ((WORDBITS == 64) ? 32 : 0);
	return v;
}

static forceinline machine_word_t
repeat_byte(u8 b)
{
	return repeat_u16(((u16)b << 8) | b);
}
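For example (values in the comments assume a 64-bit machine word; this snippet is illustrative only):

	machine_word_t v = repeat_byte(0xAB);	/* 0xABABABABABABABAB */
	machine_word_t w = repeat_u16(0x1234);	/* 0x1234123412341234 */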

/*
 * Copy an LZ77 match of 'length' bytes from the match source at 'out_next -
 * offset' to the match destination at 'out_next'.  The source and destination
 * may overlap.
 *
 * This handles validating the length and offset.  It is validated that the
 * beginning of the match source is '>= out_begin' and that the end of the match
 * destination is '<= out_end'.  The return value is 0 if the match was valid
 * (and was copied), otherwise -1.
 *
 * 'min_length' is a hint which specifies the minimum possible match length.
 * This should be a compile-time constant.
 */
static forceinline int
lz_copy(u32 length, u32 offset, u8 *out_begin, u8 *out_next, u8 *out_end,
	u32 min_length)
{
	const u8 *src;
	u8 *end;

	/* Validate the offset. */
	if (unlikely(offset > out_next - out_begin))
		return -1;

	/*
	 * Try to copy one machine word at a time.  On i386 and x86_64 this is
	 * faster than copying one byte at a time, unless the data is
	 * near-random and all the matches have very short lengths.  Note that
	 * since this requires unaligned memory accesses, it won't necessarily
	 * be faster on every architecture.
	 * Fast path: copy a match which is no longer than a few words, is not
	 * overlapped such that copying a word at a time would produce incorrect
	 * results, and is not too close to the end of the buffer.  Note that
	 * this might copy more than the length of the match, but that's okay in
	 * this scenario.
	 */
	src = out_next - offset;
	if (UNALIGNED_ACCESS_IS_FAST && length <= 3 * WORDBYTES &&
	    offset >= WORDBYTES && out_end - out_next >= 3 * WORDBYTES)
	{
		copy_word_unaligned(src + WORDBYTES*0, out_next + WORDBYTES*0);
		copy_word_unaligned(src + WORDBYTES*1, out_next + WORDBYTES*1);
		copy_word_unaligned(src + WORDBYTES*2, out_next + WORDBYTES*2);
		return 0;
	}

	/* Validate the length.  This isn't needed in the fast path above, due
	 * to the additional conditions tested, but we do need it here. */
	if (unlikely(length > out_end - out_next))
		return -1;
	end = out_next + length;

	/*
	 * Try to copy one word at a time.  On i386 and x86_64 this is faster
	 * than copying one byte at a time, unless the data is near-random and
	 * all the matches have very short lengths.  Note that since this
	 * requires unaligned memory accesses, it won't necessarily be faster on
	 * every architecture.
	 *
	 * Also note that we might copy more than the length of the match.  For
	 * example, if a word is 8 bytes and the match is of length 5, then
	 * we'll simply copy 8 bytes.  This is okay as long as we don't write
	 * beyond the end of the output buffer, hence the check for (bufend -
	 * beyond the end of the output buffer, hence the check for (out_end -
	 * end >= WORDBYTES - 1).
	 */
#ifdef FAST_UNALIGNED_ACCESS
	u8 * const end = dst + length;
	if (bufend - end >= (ptrdiff_t)(WORDBYTES - 1)) {

	if (UNALIGNED_ACCESS_IS_FAST && likely(out_end - end >= WORDBYTES - 1))
	{
		if (offset >= WORDBYTES) {
			/* The source and destination words don't overlap. */

			/* To improve branch prediction, one iteration of this
			 * loop is unrolled.  Most matches are short and will
			 * fail the first check.  But if that check passes, then
			 * it becomes increasingly likely that the match is long
			 * and we'll need to continue copying. */

			copy_unaligned_word(src, dst);
			src += WORDBYTES;
			dst += WORDBYTES;

			if (dst < end) {
				do {
					copy_unaligned_word(src, dst);
					src += WORDBYTES;
					dst += WORDBYTES;
				} while (dst < end);
			}
			return end;
			/* The source and destination words don't overlap. */
			do {
				copy_word_unaligned(src, out_next);
				src += WORDBYTES;
				out_next += WORDBYTES;
			} while (out_next < end);
			return 0;
		} else if (offset == 1) {

			/* Offset 1 matches are equivalent to run-length
			 * encoding of the previous byte.  This case is common
			 * if the data contains many repeated bytes. */

			size_t v = repeat_byte(*(dst - 1));
			machine_word_t v = repeat_byte(*(out_next - 1));
			do {
				put_unaligned_word(v, dst);
				store_word_unaligned(v, out_next);
				src += WORDBYTES;
				dst += WORDBYTES;
			} while (dst < end);
			return end;
				out_next += WORDBYTES;
			} while (out_next < end);
			return 0;
		}
		/*
		 * We don't bother with special cases for other 'offset <
		 * WORDBYTES', which are usually rarer than 'offset == 1'.
		 * Extra checks will just slow things down.  Actually, it's
		 * possible to handle all the 'offset < WORDBYTES' cases using
		 * the same code, but it still becomes more complicated and
		 * doesn't seem any faster overall; it definitely slows down
		 * the more common 'offset == 1' case.
		 */
	}
#endif /* FAST_UNALIGNED_ACCESS */

	/* Fall back to a bytewise copy. */

	if (min_length >= 2) {
		*dst++ = *src++;
		length--;
	}
	if (min_length >= 3) {
		*dst++ = *src++;
		length--;
	}
	if (min_length >= 2)
		*out_next++ = *src++;
	if (min_length >= 3)
		*out_next++ = *src++;
	if (min_length >= 4)
		*out_next++ = *src++;
	do {
		*dst++ = *src++;
	} while (--length);

	return dst;
		*out_next++ = *src++;
	} while (out_next != end);
	return 0;
}
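A typical call site would look something like this sketch, where out, out_next, and out_size are hypothetical names for the decoder's output buffer state:

	if (lz_copy(length, offset, out, out_next, out + out_size,
		    LZX_MIN_MATCH_LEN))
		return -1;	/* match source or destination out of bounds */
	out_next += length;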

#endif /* _DECOMPRESS_COMMON_H */

@@ -0,0 +1,324 @@
/*
 * lzx_common.c - Common code for LZX compression and decompression.
 */

/*
 * Copyright (C) 2012-2016 Eric Biggers
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <string.h>

#ifdef __SSE2__
# include <emmintrin.h>
#endif

#ifdef __AVX2__
# include <immintrin.h>
#endif

#include "common_defs.h"
#include "lzx_common.h"

/* Mapping: offset slot => first match offset that uses that offset slot.
 * The offset slots for repeat offsets map to "fake" offsets < 1. */
const s32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1] = {
	-2      , -1      , 0       , 1       , 2       ,	/* 0  --- 4  */
	4       , 6       , 10      , 14      , 22      ,	/* 5  --- 9  */
	30      , 46      , 62      , 94      , 126     ,	/* 10 --- 14 */
	190     , 254     , 382     , 510     , 766     ,	/* 15 --- 19 */
	1022    , 1534    , 2046    , 3070    , 4094    ,	/* 20 --- 24 */
	6142    , 8190    , 12286   , 16382   , 24574   ,	/* 25 --- 29 */
	32766   , 49150   , 65534   , 98302   , 131070  ,	/* 30 --- 34 */
	196606  , 262142  , 393214  , 524286  , 655358  ,	/* 35 --- 39 */
	786430  , 917502  , 1048574 , 1179646 , 1310718 ,	/* 40 --- 44 */
	1441790 , 1572862 , 1703934 , 1835006 , 1966078 ,	/* 45 --- 49 */
	2097150						/* extra */
};

/* Mapping: offset slot => how many extra bits must be read and added to the
 * corresponding offset slot base to decode the match offset. */
const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS] = {
	0 , 0 , 0 , 0 , 1 ,
	1 , 2 , 2 , 3 , 3 ,
	4 , 4 , 5 , 5 , 6 ,
	6 , 7 , 7 , 8 , 8 ,
	9 , 9 , 10, 10, 11,
	11, 12, 12, 13, 13,
	14, 14, 15, 15, 16,
	16, 17, 17, 17, 17,
	17, 17, 17, 17, 17,
	17, 17, 17, 17, 17,
};
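Read together, the two tables give a simple decoding recipe. A worked example of mine: offset slot 10 has base 30 and 4 extra bits, so it covers the 16 offsets 30 through 45, and slot 11 (base 46) picks up exactly where it leaves off. In sketch form (the names slot and is are assumed):

	/* Decode an explicit match offset from its offset slot (sketch). */
	u32 offset = (u32)lzx_offset_slot_base[slot] +
		     bitstream_read_bits(is, lzx_extra_offset_bits[slot]);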

/* Round the specified buffer size up to the next valid LZX window size, and
 * return its order (log2).  Or, if the buffer size is 0 or greater than the
 * largest valid LZX window size, return 0. */
unsigned
lzx_get_window_order(size_t max_bufsize)
{
	if (max_bufsize == 0 || max_bufsize > LZX_MAX_WINDOW_SIZE)
		return 0;

	return max(ilog2_ceil(max_bufsize), LZX_MIN_WINDOW_ORDER);
}

/* Given a valid LZX window order, return the number of symbols that will exist
 * in the main Huffman code. */
unsigned
lzx_get_num_main_syms(unsigned window_order)
{
	/* Note: one would expect that the maximum match offset would be
	 * 'window_size - LZX_MIN_MATCH_LEN', which would occur if the first two
	 * bytes were to match the last two bytes.  However, the format
	 * disallows this case.  This reduces the number of needed offset slots
	 * by 1. */
	u32 window_size = (u32)1 << window_order;
	u32 max_offset = window_size - LZX_MIN_MATCH_LEN - 1;
	unsigned num_offset_slots = 30;
	while (max_offset >= lzx_offset_slot_base[num_offset_slots])
		num_offset_slots++;

	return LZX_NUM_CHARS + (num_offset_slots * LZX_NUM_LEN_HEADERS);
}

static void
do_translate_target(void *target, s32 input_pos)
{
	s32 abs_offset, rel_offset;

	rel_offset = get_unaligned_le32(target);
	if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) {
		if (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos) {
			/* "good translation" */
			abs_offset = rel_offset + input_pos;
		} else {
			/* "compensating translation" */
			abs_offset = rel_offset - LZX_WIM_MAGIC_FILESIZE;
		}
		put_unaligned_le32(abs_offset, target);
	}
}

static void
undo_translate_target(void *target, s32 input_pos)
{
	s32 abs_offset, rel_offset;

	abs_offset = get_unaligned_le32(target);
	if (abs_offset >= 0) {
		if (abs_offset < LZX_WIM_MAGIC_FILESIZE) {
			/* "good translation" */
			rel_offset = abs_offset - input_pos;
			put_unaligned_le32(rel_offset, target);
		}
	} else {
		if (abs_offset >= -input_pos) {
			/* "compensating translation" */
			rel_offset = abs_offset + LZX_WIM_MAGIC_FILESIZE;
			put_unaligned_le32(rel_offset, target);
		}
	}
}
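The two routines are exact inverses of each other. A round-trip sketch of mine (the buffer contents and position are only illustrative):

	u8 target[4];

	put_unaligned_le32(1000, target);	/* relative call target */
	do_translate_target(target, 100);	/* now absolute: 1100 */
	undo_translate_target(target, 100);	/* relative again: 1000 */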

/*
 * Do or undo the 'E8' preprocessing used in LZX.  Before compression, the
 * uncompressed data is preprocessed by changing the targets of x86 CALL
 * instructions from relative offsets to absolute offsets.  After decompression,
 * the translation is undone by changing the targets of x86 CALL instructions
 * from absolute offsets to relative offsets.
 *
 * Note that despite its intent, E8 preprocessing can be done on any data even
 * if it is not actually x86 machine code.  In fact, E8 preprocessing appears to
 * always be used in LZX-compressed resources in WIM files; there is no bit to
 * indicate whether it is used or not, unlike in the LZX compressed format as
 * used in cabinet files, where a bit is reserved for that purpose.
 *
 * E8 preprocessing is disabled in the last 6 bytes of the uncompressed data,
 * which really means the 5-byte call instruction cannot start in the last 10
 * bytes of the uncompressed data.  This is one of the errors in the LZX
 * documentation.
 *
 * E8 preprocessing does not appear to be disabled after the 32768th chunk of a
 * WIM resource, which apparently is another difference from the LZX compression
 * used in cabinet files.
 *
 * E8 processing is supposed to take the file size as a parameter, as it is used
 * in calculating the translated jump targets.  But in WIM files, this file size
 * is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000).
 */
static void
lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32))
{

#if !defined(__SSE2__) && !defined(__AVX2__)
	/*
	 * A worthwhile optimization is to push the end-of-buffer check into the
	 * relatively rare E8 case.  This is possible if we replace the last six
	 * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte
	 * before reaching end-of-buffer.  In addition, this scheme guarantees
	 * that no translation can begin following an E8 byte in the last 10
	 * bytes because a 4-byte offset containing E8 as its high byte is a
	 * large negative number that is not valid for translation.  That is
	 * exactly what we need.
	 */
	u8 *tail;
	u8 saved_bytes[6];
	u8 *p;

	if (size <= 10)
		return;

	tail = &data[size - 6];
	memcpy(saved_bytes, tail, 6);
	memset(tail, 0xE8, 6);
	p = data;
	for (;;) {
		while (*p != 0xE8)
			p++;
		if (p >= tail)
			break;
		(*process_target)(p + 1, p - data);
		p += 5;
	}
	memcpy(tail, saved_bytes, 6);
#else
	/* SSE2 or AVX2 optimized version for x86_64 */

	u8 *p = data;
	u64 valid_mask = ~0;

	if (size <= 10)
		return;
#ifdef __AVX2__
#  define ALIGNMENT_REQUIRED 32
#else
#  define ALIGNMENT_REQUIRED 16
#endif

	/* Process one byte at a time until the pointer is properly aligned. */
	while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) {
		if (p >= data + size - 10)
			return;
		if (*p == 0xE8 && (valid_mask & 1)) {
			(*process_target)(p + 1, p - data);
			valid_mask &= ~0x1F;
		}
		p++;
		valid_mask >>= 1;
		valid_mask |= (u64)1 << 63;
	}

	if (data + size - p >= 64) {

		/* Vectorized processing */

		/* Note: we use a "trap" E8 byte to eliminate the need to check
		 * for end-of-buffer in the inner loop.  This byte is carefully
		 * positioned so that it will never be changed by a previous
		 * translation before it is detected. */

		u8 *trap = p + ((data + size - p) & ~31) - 32 + 4;
		u8 saved_byte = *trap;
		*trap = 0xE8;

		for (;;) {
			u32 e8_mask;
			u8 *orig_p = p;
#ifdef __AVX2__
			const __m256i e8_bytes = _mm256_set1_epi8(0xE8);
			for (;;) {
				__m256i bytes = *(const __m256i *)p;
				__m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes);
				e8_mask = _mm256_movemask_epi8(cmpresult);
				if (e8_mask)
					break;
				p += 32;
			}
#else
			const __m128i e8_bytes = _mm_set1_epi8(0xE8);
			for (;;) {
				/* Read the next 32 bytes of data and test them
				 * for E8 bytes. */
				__m128i bytes1 = *(const __m128i *)p;
				__m128i bytes2 = *(const __m128i *)(p + 16);
				__m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes);
				__m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes);
				u32 mask1 = _mm_movemask_epi8(cmpresult1);
				u32 mask2 = _mm_movemask_epi8(cmpresult2);
				/* The masks have a bit set for each E8 byte.
				 * We stay in this fast inner loop as long as
				 * there are no E8 bytes. */
				if (mask1 | mask2) {
					e8_mask = mask1 | (mask2 << 16);
					break;
				}
				p += 32;
			}
#endif

			/* Did we pass over data with no E8 bytes? */
			if (p != orig_p)
				valid_mask = ~0;

			/* Are we nearing end-of-buffer? */
			if (p == trap - 4)
				break;

			/* Process the E8 bytes.  However, the AND with
			 * 'valid_mask' ensures we never process an E8 byte that
			 * was itself part of a translation target. */
			while ((e8_mask &= valid_mask)) {
				unsigned bit = bsf32(e8_mask);
				(*process_target)(p + bit + 1, p + bit - data);
				valid_mask &= ~((u64)0x1F << bit);
			}

			valid_mask >>= 32;
			valid_mask |= 0xFFFFFFFF00000000;
			p += 32;
		}

		*trap = saved_byte;
	}

	/* Approaching the end of the buffer; process one byte at a time. */
	while (p < data + size - 10) {
		if (*p == 0xE8 && (valid_mask & 1)) {
			(*process_target)(p + 1, p - data);
			valid_mask &= ~0x1F;
		}
		p++;
		valid_mask >>= 1;
		valid_mask |= (u64)1 << 63;
	}
#endif /* __SSE2__ || __AVX2__ */
}

void
lzx_preprocess(u8 *data, u32 size)
{
	lzx_e8_filter(data, size, do_translate_target);
}

void
lzx_postprocess(u8 *data, u32 size)
{
	lzx_e8_filter(data, size, undo_translate_target);
}
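So a caller decompressing one chunk would, as a sketch (out and out_size are names assumed for the example), run the match/literal decoder into its output buffer and then simply finish with:

	lzx_postprocess(out, out_size);	/* undo E8 translation in place */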

@@ -0,0 +1,29 @@
/*
 * lzx_common.h
 *
 * Declarations shared between LZX compression and decompression.
 */

#ifndef _LZX_COMMON_H
#define _LZX_COMMON_H

#include "lzx_constants.h"
#include "common_defs.h"

extern const s32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1];

extern const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS];

extern unsigned
lzx_get_window_order(size_t max_bufsize);

extern unsigned
lzx_get_num_main_syms(unsigned window_order);

extern void
lzx_preprocess(u8 *data, u32 size);

extern void
lzx_postprocess(u8 *data, u32 size);

#endif /* _LZX_COMMON_H */

@@ -0,0 +1,103 @@
/*
 * lzx_constants.h
 *
 * Constants for the LZX compression format.
 */

#ifndef _LZX_CONSTANTS_H
#define _LZX_CONSTANTS_H

/* Number of literal byte values. */
#define LZX_NUM_CHARS	256

/* The smallest and largest allowed match lengths. */
#define LZX_MIN_MATCH_LEN	2
#define LZX_MAX_MATCH_LEN	257

/* Number of distinct match lengths that can be represented. */
#define LZX_NUM_LENS		(LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)

/* Number of match lengths for which no length symbol is required. */
#define LZX_NUM_PRIMARY_LENS	7
#define LZX_NUM_LEN_HEADERS	(LZX_NUM_PRIMARY_LENS + 1)

/* Valid values of the 3-bit block type field. */
#define LZX_BLOCKTYPE_VERBATIM		1
#define LZX_BLOCKTYPE_ALIGNED		2
#define LZX_BLOCKTYPE_UNCOMPRESSED	3

/* 'LZX_MIN_WINDOW_SIZE' and 'LZX_MAX_WINDOW_SIZE' are the minimum and maximum
 * sizes of the sliding window. */
#define LZX_MIN_WINDOW_ORDER	15
#define LZX_MAX_WINDOW_ORDER	21
#define LZX_MIN_WINDOW_SIZE	(1UL << LZX_MIN_WINDOW_ORDER)	/* 32768 */
#define LZX_MAX_WINDOW_SIZE	(1UL << LZX_MAX_WINDOW_ORDER)	/* 2097152 */

/* Maximum number of offset slots.  (The actual number of offset slots depends
 * on the window size.) */
#define LZX_MAX_OFFSET_SLOTS	50

/* Maximum number of symbols in the main code.  (The actual number of symbols in
 * the main code depends on the window size.) */
#define LZX_MAINCODE_MAX_NUM_SYMBOLS \
	(LZX_NUM_CHARS + (LZX_MAX_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS))

/* Number of symbols in the length code. */
#define LZX_LENCODE_NUM_SYMBOLS	(LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS)

/* Number of symbols in the pre-code. */
#define LZX_PRECODE_NUM_SYMBOLS	20

/* Number of bits in which each pre-code codeword length is represented. */
#define LZX_PRECODE_ELEMENT_SIZE	4

/* Number of low-order bits of each match offset that are entropy-encoded in
 * aligned offset blocks. */
#define LZX_NUM_ALIGNED_OFFSET_BITS	3

/* Number of symbols in the aligned offset code. */
#define LZX_ALIGNEDCODE_NUM_SYMBOLS	(1 << LZX_NUM_ALIGNED_OFFSET_BITS)

/* Mask for the match offset bits that are entropy-encoded in aligned offset
 * blocks. */
#define LZX_ALIGNED_OFFSET_BITMASK	((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1)

/* Number of bits in which each aligned offset codeword length is represented. */
#define LZX_ALIGNEDCODE_ELEMENT_SIZE	3

/* The first offset slot which requires an aligned offset symbol in aligned
 * offset blocks. */
#define LZX_MIN_ALIGNED_OFFSET_SLOT	8

/* The offset slot base for LZX_MIN_ALIGNED_OFFSET_SLOT. */
#define LZX_MIN_ALIGNED_OFFSET	14

/* The maximum number of extra offset bits in verbatim blocks.  (One would need
 * to subtract LZX_NUM_ALIGNED_OFFSET_BITS to get the number of extra offset
 * bits in *aligned* blocks.) */
#define LZX_MAX_NUM_EXTRA_BITS	17

/* Maximum lengths (in bits) for length-limited Huffman code construction. */
#define LZX_MAX_MAIN_CODEWORD_LEN	16
#define LZX_MAX_LEN_CODEWORD_LEN	16
#define LZX_MAX_PRE_CODEWORD_LEN	((1 << LZX_PRECODE_ELEMENT_SIZE) - 1)
#define LZX_MAX_ALIGNED_CODEWORD_LEN	((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1)

/* For LZX-compressed blocks in WIM resources, this value is always used as the
 * filesize parameter for the call instruction (0xe8 byte) preprocessing, even
 * though the blocks themselves are not this size, and the size of the actual
 * file resource in the WIM file is very likely to be something entirely
 * different as well. */
#define LZX_WIM_MAGIC_FILESIZE	12000000

/* Assumed LZX block size when the encoded block size begins with a 0 bit.
 * This is probably WIM-specific. */
#define LZX_DEFAULT_BLOCK_SIZE	32768

/* Number of offsets in the recent (or "repeat") offsets queue. */
#define LZX_NUM_RECENT_OFFSETS	3

/* An offset of n bytes is actually encoded as (n + LZX_OFFSET_ADJUSTMENT). */
#define LZX_OFFSET_ADJUSTMENT	(LZX_NUM_RECENT_OFFSETS - 1)
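Concretely (my example, not from the diff): adjusted values 0, 1, and 2 select the three recent offsets, so the smallest explicit match offset, 1 byte, is encoded as the adjusted value 1 + 2 == 3:

	u32 encoded = real_offset + LZX_OFFSET_ADJUSTMENT;	/* 1 -> 3 */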

#endif /* _LZX_CONSTANTS_H */
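As a compile-time cross-check (an addition of mine, mirroring the STATIC_ASSERT style used elsewhere in this code), the derived symbol counts match the 656- and 249-symbol rows handled by DECODE_TABLE_ENOUGH() above:

	STATIC_ASSERT(LZX_MAINCODE_MAX_NUM_SYMBOLS == 656);	/* 256 + 50 * 8 */
	STATIC_ASSERT(LZX_LENCODE_NUM_SYMBOLS == 249);		/* 256 - 7 */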

@@ -1,10 +1,11 @@
/*
 * lzx_decompress.c - A decompressor for the LZX compression format, which can
 * be used in "System Compressed" files.  This is based on the code from wimlib.
 * This code only supports a window size (dictionary size) of 32768 bytes, since
 * this is the only size used in System Compression.
 * lzx_decompress.c
 *
 * Copyright (C) 2015 Eric Biggers
 * A decompressor for the LZX compression format, as used in WIM files.
 */

/*
 * Copyright (C) 2012-2016 Eric Biggers
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software

@@ -20,276 +21,157 @@
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * LZX is an LZ77 and Huffman-code based compression format that has many
 * similarities to DEFLATE (the format used by zlib/gzip).  The compression
 * ratio is as good or better than DEFLATE.  See lzx_compress.c for a format
 * overview, and see https://en.wikipedia.org/wiki/LZX_(algorithm) for a
 * historical overview.  Here I make some pragmatic notes.
 *
 * The old specification for LZX is the document "Microsoft LZX Data Compression
 * Format" (1997).  It defines the LZX format as used in cabinet files.  Allowed
 * window sizes are 2^n where 15 <= n <= 21.  However, this document contains
 * several errors, so don't read too much into it...
 *
 * The new specification for LZX is the document "[MS-PATCH]: LZX DELTA
 * Compression and Decompression" (2014).  It defines the LZX format as used by
 * Microsoft's binary patcher.  It corrects several errors in the 1997 document
 * and extends the format in several ways --- namely, optional reference data,
 * up to 2^25 byte windows, and longer match lengths.
 *
 * WIM files use a more restricted form of LZX.  No LZX DELTA extensions are
 * present, the window is not "sliding", E8 preprocessing is done
 * unconditionally with a fixed file size, and the maximum window size is always
 * 2^15 bytes (equal to the size of each "chunk" in a compressed WIM resource).
 * This code is primarily intended to implement this form of LZX.  But although
 * not compatible with WIMGAPI, this code also supports maximum window sizes up
 * to 2^21 bytes.
 *
 * TODO: Add support for window sizes up to 2^25 bytes.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
# include "config.h"
#endif

#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include <ntfs-3g/misc.h>

#include "decompress_common.h"
#include "lzx_common.h"
#include "system_compression.h"

/* Number of literal byte values */
#define LZX_NUM_CHARS 256

/* The smallest and largest allowed match lengths */
#define LZX_MIN_MATCH_LEN 2
#define LZX_MAX_MATCH_LEN 257

/* Number of distinct match lengths that can be represented */
#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)

/* Number of match lengths for which no length symbol is required */
#define LZX_NUM_PRIMARY_LENS 7
#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1)

/* Valid values of the 3-bit block type field */
#define LZX_BLOCKTYPE_VERBATIM 1
#define LZX_BLOCKTYPE_ALIGNED 2
#define LZX_BLOCKTYPE_UNCOMPRESSED 3

/* Number of offset slots for a window size of 32768 */
#define LZX_NUM_OFFSET_SLOTS 30

/* Number of symbols in the main code for a window size of 32768 */
#define LZX_MAINCODE_NUM_SYMBOLS \
	(LZX_NUM_CHARS + (LZX_NUM_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS))

/* Number of symbols in the length code */
#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS)

/* Number of symbols in the precode */
#define LZX_PRECODE_NUM_SYMBOLS 20

/* Number of bits in which each precode codeword length is represented */
#define LZX_PRECODE_ELEMENT_SIZE 4

/* Number of low-order bits of each match offset that are entropy-encoded in
 * aligned offset blocks */
#define LZX_NUM_ALIGNED_OFFSET_BITS 3

/* Number of symbols in the aligned offset code */
#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS)

/* Mask for the match offset bits that are entropy-encoded in aligned offset
 * blocks */
#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1)

/* Number of bits in which each aligned offset codeword length is represented */
#define LZX_ALIGNEDCODE_ELEMENT_SIZE 3

/* Maximum lengths (in bits) of the codewords in each Huffman code */
#define LZX_MAX_MAIN_CODEWORD_LEN 16
#define LZX_MAX_LEN_CODEWORD_LEN 16
#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1)
#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1)

/* The default "filesize" value used in pre/post-processing.  In the LZX format
 * used in cabinet files this value must be given to the decompressor, whereas
 * in the LZX format used in WIM files and system-compressed files this value is
 * fixed at 12000000. */
#define LZX_DEFAULT_FILESIZE 12000000

/* Assumed block size when the encoded block size begins with a 0 bit. */
#define LZX_DEFAULT_BLOCK_SIZE 32768

/* Number of offsets in the recent (or "repeat") offsets queue. */
#define LZX_NUM_RECENT_OFFSETS 3

/* These values are chosen for fast decompression. */
#define LZX_MAINCODE_TABLEBITS		11
#define LZX_LENCODE_TABLEBITS		10
#define LZX_LENCODE_TABLEBITS		9
#define LZX_PRECODE_TABLEBITS		6
#define LZX_ALIGNEDCODE_TABLEBITS	7

#define LZX_READ_LENS_MAX_OVERRUN 50

/* Mapping: offset slot => first match offset that uses that offset slot.
 */
static const u32 lzx_offset_slot_base[LZX_NUM_OFFSET_SLOTS + 1] = {
	0     , 1     , 2     , 3     , 4     ,	/* 0  --- 4  */
	6     , 8     , 12    , 16    , 24    ,	/* 5  --- 9  */
	32    , 48    , 64    , 96    , 128   ,	/* 10 --- 14 */
	192   , 256   , 384   , 512   , 768   ,	/* 15 --- 19 */
	1024  , 1536  , 2048  , 3072  , 4096  ,	/* 20 --- 24 */
	6144  , 8192  , 12288 , 16384 , 24576 ,	/* 25 --- 29 */
	32768 ,					/* extra */
};

/* Mapping: offset slot => how many extra bits must be read and added to the
 * corresponding offset slot base to decode the match offset. */
static const u8 lzx_extra_offset_bits[LZX_NUM_OFFSET_SLOTS] = {
	0 , 0 , 0 , 0 , 1 ,
	1 , 2 , 2 , 3 , 3 ,
	4 , 4 , 5 , 5 , 6 ,
	6 , 7 , 7 , 8 , 8 ,
	9 , 9 , 10, 10, 11,
	11, 12, 12, 13, 13,
};

/* Reusable heap-allocated memory for LZX decompression */
struct lzx_decompressor {

	/* Huffman decoding tables, and arrays that map symbols to codeword
	 * lengths */
	DECODE_TABLE(maincode_decode_table, LZX_MAINCODE_MAX_NUM_SYMBOLS,
		     LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN);
	u8 maincode_lens[LZX_MAINCODE_MAX_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN];

	u16 maincode_decode_table[(1 << LZX_MAINCODE_TABLEBITS) +
				  (LZX_MAINCODE_NUM_SYMBOLS * 2)];
	u8 maincode_lens[LZX_MAINCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN];

	u16 lencode_decode_table[(1 << LZX_LENCODE_TABLEBITS) +
				 (LZX_LENCODE_NUM_SYMBOLS * 2)];
	DECODE_TABLE(lencode_decode_table, LZX_LENCODE_NUM_SYMBOLS,
		     LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN);
	u8 lencode_lens[LZX_LENCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN];

	union {
		DECODE_TABLE(alignedcode_decode_table, LZX_ALIGNEDCODE_NUM_SYMBOLS,
			     LZX_ALIGNEDCODE_TABLEBITS, LZX_MAX_ALIGNED_CODEWORD_LEN);
		u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS];
	};

	u16 alignedcode_decode_table[(1 << LZX_ALIGNEDCODE_TABLEBITS) +
				     (LZX_ALIGNEDCODE_NUM_SYMBOLS * 2)];
	u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS];
	union {
		DECODE_TABLE(precode_decode_table, LZX_PRECODE_NUM_SYMBOLS,
			     LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN);
		u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
		u8 extra_offset_bits[LZX_MAX_OFFSET_SLOTS];
	};

	u16 precode_decode_table[(1 << LZX_PRECODE_TABLEBITS) +
				 (LZX_PRECODE_NUM_SYMBOLS * 2)];
	u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
	union {
		DECODE_TABLE_WORKING_SPACE(maincode_working_space,
					   LZX_MAINCODE_MAX_NUM_SYMBOLS,
					   LZX_MAX_MAIN_CODEWORD_LEN);
		DECODE_TABLE_WORKING_SPACE(lencode_working_space,
					   LZX_LENCODE_NUM_SYMBOLS,
					   LZX_MAX_LEN_CODEWORD_LEN);
		DECODE_TABLE_WORKING_SPACE(alignedcode_working_space,
					   LZX_ALIGNEDCODE_NUM_SYMBOLS,
					   LZX_MAX_ALIGNED_CODEWORD_LEN);
		DECODE_TABLE_WORKING_SPACE(precode_working_space,
					   LZX_PRECODE_NUM_SYMBOLS,
					   LZX_MAX_PRE_CODEWORD_LEN);
	};

	/* Temporary space for make_huffman_decode_table() */
	u16 working_space[2 * (1 + LZX_MAX_MAIN_CODEWORD_LEN) +
			  LZX_MAINCODE_NUM_SYMBOLS];
};
	unsigned window_order;
	unsigned num_main_syms;

static void undo_e8_translation(void *target, s32 input_pos)
{
	s32 abs_offset, rel_offset;
	/* Like lzx_extra_offset_bits[], but does not include the entropy-coded
	 * bits of aligned offset blocks */
	u8 extra_offset_bits_minus_aligned[LZX_MAX_OFFSET_SLOTS];

	abs_offset = get_unaligned_le32(target);
	if (abs_offset >= 0) {
		if (abs_offset < LZX_DEFAULT_FILESIZE) {
			/* "good translation" */
			rel_offset = abs_offset - input_pos;
			put_unaligned_le32(rel_offset, target);
		}
	} else {
		if (abs_offset >= -input_pos) {
			/* "compensating translation" */
			rel_offset = abs_offset + LZX_DEFAULT_FILESIZE;
			put_unaligned_le32(rel_offset, target);
		}
	}
}
} _aligned_attribute(DECODE_TABLE_ALIGNMENT);

/*
 * Undo the 'E8' preprocessing used in LZX.  Before compression, the
 * uncompressed data was preprocessed by changing the targets of suspected x86
 * CALL instructions from relative offsets to absolute offsets.  After
 * match/literal decoding, the decompressor must undo the translation.
 */
static void lzx_postprocess(u8 *data, u32 size)
{
	/*
	 * A worthwhile optimization is to push the end-of-buffer check into the
	 * relatively rare E8 case.  This is possible if we replace the last six
	 * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte
	 * before reaching end-of-buffer.  In addition, this scheme guarantees
	 * that no translation can begin following an E8 byte in the last 10
	 * bytes because a 4-byte offset containing E8 as its high byte is a
	 * large negative number that is not valid for translation.  That is
	 * exactly what we need.
	 */
	u8 *tail;
	u8 saved_bytes[6];
	u8 *p;

	if (size <= 10)
		return;

	tail = &data[size - 6];
	memcpy(saved_bytes, tail, 6);
	memset(tail, 0xE8, 6);
	p = data;
	for (;;) {
		while (*p != 0xE8)
			p++;
		if (p >= tail)
			break;
		undo_e8_translation(p + 1, p - data);
		p += 5;
	}
	memcpy(tail, saved_bytes, 6);
}

/* Read a Huffman-encoded symbol using the precode. */
static forceinline unsigned read_presym(const struct lzx_decompressor *d,
					struct input_bitstream *is)
static forceinline unsigned
read_presym(const struct lzx_decompressor *d, struct input_bitstream *is)
{
	return read_huffsym(is, d->precode_decode_table,
			    LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN);
}

/* Read a Huffman-encoded symbol using the main code. */
static forceinline unsigned read_mainsym(const struct lzx_decompressor *d,
					 struct input_bitstream *is)
static forceinline unsigned
read_mainsym(const struct lzx_decompressor *d, struct input_bitstream *is)
{
	return read_huffsym(is, d->maincode_decode_table,
			    LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN);
}

/* Read a Huffman-encoded symbol using the length code. */
static forceinline unsigned read_lensym(const struct lzx_decompressor *d,
					struct input_bitstream *is)
static forceinline unsigned
read_lensym(const struct lzx_decompressor *d, struct input_bitstream *is)
{
	return read_huffsym(is, d->lencode_decode_table,
			    LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN);
}

/* Read a Huffman-encoded symbol using the aligned offset code. */
static forceinline unsigned read_alignedsym(const struct lzx_decompressor *d,
					    struct input_bitstream *is)
static forceinline unsigned
read_alignedsym(const struct lzx_decompressor *d, struct input_bitstream *is)
{
	return read_huffsym(is, d->alignedcode_decode_table,
			    LZX_ALIGNEDCODE_TABLEBITS,
			    LZX_MAX_ALIGNED_CODEWORD_LEN);
			    LZX_ALIGNEDCODE_TABLEBITS, LZX_MAX_ALIGNED_CODEWORD_LEN);
}

/*
 * Read the precode from the compressed input bitstream, then use it to decode
 * @num_lens codeword length values.
 *
 * @is:	The input bitstream.
 *
 * @lens:	An array that contains the length values from the previous time
 *		the codeword lengths for this Huffman code were read, or all 0's
 *		if this is the first time.  This array must have at least
 *		(@num_lens + LZX_READ_LENS_MAX_OVERRUN) entries.
 *
 * @num_lens:	Number of length values to decode.
 *
 * Returns 0 on success, or -1 if the data was invalid.
 * Read a precode from the compressed input bitstream, then use it to decode
 * @num_lens codeword length values and write them to @lens.
 */
static int lzx_read_codeword_lens(struct lzx_decompressor *d,
				  struct input_bitstream *is,
				  u8 *lens, unsigned num_lens)
static int
lzx_read_codeword_lens(struct lzx_decompressor *d, struct input_bitstream *is,
		       u8 *lens, unsigned num_lens)
{
	u8 *len_ptr = lens;
	u8 *lens_end = lens + num_lens;
	int i;

	/* Read the lengths of the precode codewords.  These are given
	 * explicitly. */
	for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) {
	/* Read the lengths of the precode codewords.  These are stored
	 * explicitly. */
	for (int i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) {
		d->precode_lens[i] =
			bitstream_read_bits(is, LZX_PRECODE_ELEMENT_SIZE);
	}

	/* Make the decoding table for the precode. */
	/* Build the decoding table for the precode. */
	if (make_huffman_decode_table(d->precode_decode_table,
				      LZX_PRECODE_NUM_SYMBOLS,
				      LZX_PRECODE_TABLEBITS,
				      d->precode_lens,
				      LZX_MAX_PRE_CODEWORD_LEN,
				      d->working_space))
				      d->precode_working_space))
		return -1;

	/* Decode the codeword lengths. */
@@ -322,7 +204,7 @@ static int lzx_read_codeword_lens(struct lzx_decompressor *d,
			/* Run of identical lengths */
			run_len = 4 + bitstream_read_bits(is, 1);
			presym = read_presym(d, is);
			if (presym > 17)
			if (unlikely(presym > 17))
				return -1;
			len = *len_ptr - presym;
			if ((s8)len < 0)

@@ -332,7 +214,8 @@ static int lzx_read_codeword_lens(struct lzx_decompressor *d,
			do {
				*len_ptr++ = len;
			} while (--run_len);
			/* Worst case overrun is when presym == 18,
			/*
			 * The worst case overrun is when presym == 18,
			 * run_len == 20 + 31, and only 1 length was remaining.
			 * So LZX_READ_LENS_MAX_OVERRUN == 50.
			 *

@@ -340,7 +223,8 @@ static int lzx_read_codeword_lens(struct lzx_decompressor *d,
			 * can corrupt the previous values in the second half.
			 * This doesn't really matter because the resulting
			 * lengths will still be in range, and data that
			 * generates overruns is invalid anyway. */
			 * generates overruns is invalid anyway.
			 */
		}
	} while (len_ptr < lens_end);
@@ -348,115 +232,82 @@ static int lzx_read_codeword_lens(struct lzx_decompressor *d,
}

/*
 * Read the header of an LZX block and save the block type and (uncompressed)
 * size in *block_type_ret and *block_size_ret, respectively.
 *
 * If the block is compressed, also update the Huffman decode @tables with the
 * new Huffman codes.  If the block is uncompressed, also update the match
 * offset @queue with the new match offsets.
 *
 * Return 0 on success, or -1 if the data was invalid.
 * Read the header of an LZX block.  For all block types, the block type and
 * size is saved in *block_type_ret and *block_size_ret, respectively.  For
 * compressed blocks, the codeword lengths are also saved.  For uncompressed
 * blocks, the recent offsets queue is also updated.
 */
static int lzx_read_block_header(struct lzx_decompressor *d,
				 struct input_bitstream *is,
				 int *block_type_ret,
				 u32 *block_size_ret,
				 u32 recent_offsets[])
static int
lzx_read_block_header(struct lzx_decompressor *d, struct input_bitstream *is,
		      u32 recent_offsets[], int *block_type_ret,
		      u32 *block_size_ret)
{
	int block_type;
	u32 block_size;
	int i;

	bitstream_ensure_bits(is, 4);

	/* The first three bits tell us what kind of block it is, and should be
	 * one of the LZX_BLOCKTYPE_* values. */
	/* Read the block type. */
	block_type = bitstream_pop_bits(is, 3);

	/* Read the block size. */
	if (bitstream_pop_bits(is, 1)) {
		block_size = LZX_DEFAULT_BLOCK_SIZE;
	} else {
		block_size = 0;
		block_size |= bitstream_read_bits(is, 8);
		block_size <<= 8;
		block_size |= bitstream_read_bits(is, 8);
		block_size = bitstream_read_bits(is, 16);
		if (d->window_order >= 16) {
			block_size <<= 8;
			block_size |= bitstream_read_bits(is, 8);
		}
	}

	switch (block_type) {

	case LZX_BLOCKTYPE_ALIGNED:

		/* Read the aligned offset code and prepare its decode table.
		 */
		/* Read the aligned offset codeword lengths. */

		for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
		for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
			d->alignedcode_lens[i] =
				bitstream_read_bits(is,
						    LZX_ALIGNEDCODE_ELEMENT_SIZE);
		}

		if (make_huffman_decode_table(d->alignedcode_decode_table,
					      LZX_ALIGNEDCODE_NUM_SYMBOLS,
					      LZX_ALIGNEDCODE_TABLEBITS,
					      d->alignedcode_lens,
					      LZX_MAX_ALIGNED_CODEWORD_LEN,
					      d->working_space))
			return -1;

		/* Fall through, since the rest of the header for aligned offset
		 * blocks is the same as that for verbatim blocks. */

	case LZX_BLOCKTYPE_VERBATIM:

		/* Read the main code and prepare its decode table.
		 *
		 * Note that the codeword lengths in the main code are encoded
		 * in two parts: one part for literal symbols, and one part for
		 * match symbols. */
		/* Read the main codeword lengths, which are divided into two
		 * parts: literal symbols and match headers. */

		if (lzx_read_codeword_lens(d, is, d->maincode_lens,
					   LZX_NUM_CHARS))
			return -1;

		if (lzx_read_codeword_lens(d, is,
					   d->maincode_lens + LZX_NUM_CHARS,
					   LZX_MAINCODE_NUM_SYMBOLS - LZX_NUM_CHARS))
		if (lzx_read_codeword_lens(d, is, d->maincode_lens + LZX_NUM_CHARS,
					   d->num_main_syms - LZX_NUM_CHARS))
			return -1;

		if (make_huffman_decode_table(d->maincode_decode_table,
					      LZX_MAINCODE_NUM_SYMBOLS,
					      LZX_MAINCODE_TABLEBITS,
					      d->maincode_lens,
					      LZX_MAX_MAIN_CODEWORD_LEN,
					      d->working_space))
			return -1;

		/* Read the length code and prepare its decode table. */
		/* Read the length codeword lengths. */

		if (lzx_read_codeword_lens(d, is, d->lencode_lens,
					   LZX_LENCODE_NUM_SYMBOLS))
			return -1;

		if (make_huffman_decode_table(d->lencode_decode_table,
					      LZX_LENCODE_NUM_SYMBOLS,
					      LZX_LENCODE_TABLEBITS,
					      d->lencode_lens,
					      LZX_MAX_LEN_CODEWORD_LEN,
					      d->working_space))
			return -1;

		break;

	case LZX_BLOCKTYPE_UNCOMPRESSED:

		/* Before reading the three recent offsets from the uncompressed
		 * block header, the stream must be aligned on a 16-bit
		 * boundary.  But if the stream is *already* aligned, then the
		 * next 16 bits must be discarded. */
		/*
		 * The header of an uncompressed block contains new values for
		 * the recent offsets queue, starting on the next 16-bit
		 * boundary in the bitstream.  Careful: if the stream is
		 * *already* aligned, the correct thing to do is to throw away
		 * the next 16 bits (this is probably a mistake in the format).
		 */
		bitstream_ensure_bits(is, 1);
		bitstream_align(is);

		recent_offsets[0] = bitstream_read_u32(is);
		recent_offsets[1] = bitstream_read_u32(is);
		recent_offsets[2] = bitstream_read_u32(is);
@ -477,202 +328,218 @@ static int lzx_read_block_header(struct lzx_decompressor *d,
	return 0;
}

/* Decompress a block of LZX-compressed data. */
static int lzx_decompress_block(const struct lzx_decompressor *d,
				struct input_bitstream *is,
				int block_type, u32 block_size,
				u8 * const out_begin, u8 *out_next,
				u32 recent_offsets[])
static int
lzx_decompress_block(struct lzx_decompressor *d, struct input_bitstream *is,
		     int block_type, u32 block_size,
		     u8 * const out_begin, u8 *out_next, u32 recent_offsets[])
{
	u8 * const block_end = out_next + block_size;
	unsigned ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED);
	unsigned min_aligned_offset_slot;

	/*
	 * Build the Huffman decode tables.  We always need to build the main
	 * and length decode tables.  For aligned blocks we additionally need to
	 * build the aligned offset decode table.
	 */

	if (make_huffman_decode_table(d->maincode_decode_table,
				      d->num_main_syms,
				      LZX_MAINCODE_TABLEBITS,
				      d->maincode_lens,
				      LZX_MAX_MAIN_CODEWORD_LEN,
				      d->maincode_working_space))
		return -1;

	if (make_huffman_decode_table(d->lencode_decode_table,
				      LZX_LENCODE_NUM_SYMBOLS,
				      LZX_LENCODE_TABLEBITS,
				      d->lencode_lens,
				      LZX_MAX_LEN_CODEWORD_LEN,
				      d->lencode_working_space))
		return -1;

	if (block_type == LZX_BLOCKTYPE_ALIGNED) {
		if (make_huffman_decode_table(d->alignedcode_decode_table,
					      LZX_ALIGNEDCODE_NUM_SYMBOLS,
					      LZX_ALIGNEDCODE_TABLEBITS,
					      d->alignedcode_lens,
					      LZX_MAX_ALIGNED_CODEWORD_LEN,
					      d->alignedcode_working_space))
			return -1;
		min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
		memcpy(d->extra_offset_bits, d->extra_offset_bits_minus_aligned,
		       sizeof(lzx_extra_offset_bits));
	} else {
		min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS;
		memcpy(d->extra_offset_bits, lzx_extra_offset_bits,
		       sizeof(lzx_extra_offset_bits));
	}

	/* Decode the literals and matches. */

	do {
		unsigned mainsym;
		unsigned match_len;
		u32 match_offset;
		unsigned length;
		u32 offset;
		unsigned offset_slot;
		unsigned num_extra_bits;

		mainsym = read_mainsym(d, is);
		if (mainsym < LZX_NUM_CHARS) {
			/* Literal */
			*out_next++ = mainsym;
			continue;
		}

		/* Match */

		/* Decode the length header and offset slot. */
		mainsym -= LZX_NUM_CHARS;
		match_len = mainsym % LZX_NUM_LEN_HEADERS;
		offset_slot = mainsym / LZX_NUM_LEN_HEADERS;
		STATIC_ASSERT(LZX_NUM_CHARS % LZX_NUM_LEN_HEADERS == 0);
		length = mainsym % LZX_NUM_LEN_HEADERS;
		offset_slot = (mainsym - LZX_NUM_CHARS) / LZX_NUM_LEN_HEADERS;

		/* If needed, read a length symbol to decode the full length. */
		if (match_len == LZX_NUM_PRIMARY_LENS)
			match_len += read_lensym(d, is);
		match_len += LZX_MIN_MATCH_LEN;
		if (length == LZX_NUM_PRIMARY_LENS)
			length += read_lensym(d, is);
		length += LZX_MIN_MATCH_LEN;

		if (offset_slot < LZX_NUM_RECENT_OFFSETS) {
			/* Repeat offset */

			/* Note: This isn't a real LRU queue, since using the R2
			 * offset doesn't bump the R1 offset down to R2.  This
			 * quirk allows all 3 recent offsets to be handled by
			 * the same code.  (For R0, the swap is a no-op.) */
			match_offset = recent_offsets[offset_slot];
			 * offset doesn't bump the R1 offset down to R2. */
			offset = recent_offsets[offset_slot];
			recent_offsets[offset_slot] = recent_offsets[0];
			recent_offsets[0] = match_offset;
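			/*
			 * Editor's note: a worked example of the quirk noted
			 * above (illustrative values).  With recent_offsets =
			 * {R0=100, R1=200, R2=300}, a match selecting slot 2
			 * swaps R2 with R0:
			 *
			 *	offset = 300;            // recent_offsets[2]
			 *	recent_offsets[2] = 100; // old R0
			 *	recent_offsets[0] = 300; // now {300, 200, 100}
			 *
			 * A true LRU update would have produced {300, 100, 200}.
			 */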
		} else {
			/* Explicit offset */

			/* Look up the number of extra bits that need to be read
			 * to decode offsets with this offset slot. */
			num_extra_bits = lzx_extra_offset_bits[offset_slot];

			/* Start with the offset slot base value. */
			match_offset = lzx_offset_slot_base[offset_slot];

			/* In aligned offset blocks, the low-order 3 bits of
			 * each offset are encoded using the aligned offset
			 * code.  Otherwise, all the extra bits are literal. */

			if ((num_extra_bits & ones_if_aligned) >= LZX_NUM_ALIGNED_OFFSET_BITS) {
				match_offset +=
					bitstream_read_bits(is, num_extra_bits -
								LZX_NUM_ALIGNED_OFFSET_BITS)
					<< LZX_NUM_ALIGNED_OFFSET_BITS;
				match_offset += read_alignedsym(d, is);
			} else {
				match_offset += bitstream_read_bits(is, num_extra_bits);
			offset = bitstream_read_bits(is, d->extra_offset_bits[offset_slot]);
			if (offset_slot >= min_aligned_offset_slot) {
				offset = (offset << LZX_NUM_ALIGNED_OFFSET_BITS) |
					 read_alignedsym(d, is);
			}
			offset += lzx_offset_slot_base[offset_slot];

			/* Adjust the offset. */
			match_offset -= (LZX_NUM_RECENT_OFFSETS - 1);

			/* Update the recent offsets. */
			/* Update the match offset LRU queue. */
			STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
			recent_offsets[2] = recent_offsets[1];
			recent_offsets[1] = recent_offsets[0];
			recent_offsets[0] = match_offset;
		}
		recent_offsets[0] = offset;
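		/*
		 * Editor's note: in schematic form, an explicit offset in an
		 * aligned offset block whose slot uses N extra bits (N >= 3)
		 * is decoded as
		 *
		 *	offset  = read_bits(N - 3);           // high bits, verbatim
		 *	offset  = (offset << 3) | alignedsym; // low 3 bits, Huffman-coded
		 *	offset += lzx_offset_slot_base[offset_slot];
		 *
		 * In verbatim blocks, and for slots below
		 * min_aligned_offset_slot, all N extra bits are read verbatim
		 * instead ('d->extra_offset_bits' is set up accordingly before
		 * the loop).
		 */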

		/* Validate the match, then copy it to the current position. */

		if (match_len > (size_t)(block_end - out_next))
		/* Validate the match and copy it to the current position. */
		if (unlikely(lz_copy(length, offset, out_begin,
				     out_next, block_end, LZX_MIN_MATCH_LEN)))
			return -1;

		if (match_offset > (size_t)(out_next - out_begin))
			return -1;

		out_next = lz_copy(out_next, match_len, match_offset,
				   block_end, LZX_MIN_MATCH_LEN);

		out_next += length;
	} while (out_next != block_end);

	return 0;
}

/*
 * lzx_allocate_decompressor - Allocate an LZX decompressor
 *
 * Return the pointer to the decompressor on success, or return NULL and set
 * errno on failure.
 */
struct lzx_decompressor *lzx_allocate_decompressor(void)
int
lzx_decompress(struct lzx_decompressor *restrict d,
	       const void *restrict compressed_data, size_t compressed_size,
	       void *restrict uncompressed_data, size_t uncompressed_size)
{
	return ntfs_malloc(sizeof(struct lzx_decompressor));
}

/*
 * lzx_decompress - Decompress a buffer of LZX-compressed data
 *
 * @decompressor:      A decompressor allocated with lzx_allocate_decompressor()
 * @compressed_data:   The buffer of data to decompress
 * @compressed_size:   Number of bytes of compressed data
 * @uncompressed_data: The buffer in which to store the decompressed data
 * @uncompressed_size: The number of bytes the data decompresses into
 *
 * Return 0 on success, or return -1 and set errno on failure.
 */
int lzx_decompress(struct lzx_decompressor *decompressor,
		   const void *compressed_data, size_t compressed_size,
		   void *uncompressed_data, size_t uncompressed_size)
{
	struct lzx_decompressor *d = decompressor;
	u8 * const out_begin = uncompressed_data;
	u8 *out_next = out_begin;
	u8 * const out_end = out_begin + uncompressed_size;
	struct input_bitstream is;
	STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
	u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
	int e8_status = 0;
	unsigned may_have_e8_byte = 0;

	init_input_bitstream(&is, compressed_data, compressed_size);

	/* Codeword lengths begin as all 0's for delta encoding purposes. */
	memset(d->maincode_lens, 0, LZX_MAINCODE_NUM_SYMBOLS);
	/* Codeword lengths begin as all 0's for delta encoding purposes. */
	memset(d->maincode_lens, 0, d->num_main_syms);
	memset(d->lencode_lens, 0, LZX_LENCODE_NUM_SYMBOLS);

	/* Decompress blocks until we have all the uncompressed data. */

	while (out_next != out_end) {
		int block_type;
		u32 block_size;

		if (lzx_read_block_header(d, &is, &block_type, &block_size,
					  recent_offsets))
			goto invalid;
		if (lzx_read_block_header(d, &is, recent_offsets,
					  &block_type, &block_size))
			return -1;

		if (block_size < 1 || block_size > (size_t)(out_end - out_next))
			goto invalid;
		if (block_size < 1 || block_size > out_end - out_next)
			return -1;

		if (block_type != LZX_BLOCKTYPE_UNCOMPRESSED) {
		if (likely(block_type != LZX_BLOCKTYPE_UNCOMPRESSED)) {

			/* Compressed block */

			if (lzx_decompress_block(d,
						 &is,
						 block_type,
						 block_size,
						 out_begin,
						 out_next,
			/* Compressed block */
			if (lzx_decompress_block(d, &is, block_type, block_size,
						 out_begin, out_next,
						 recent_offsets))
				goto invalid;
				return -1;

			e8_status |= d->maincode_lens[0xe8];
			out_next += block_size;
			/* If the first E8 byte was in this block, then it must
			 * have been encoded as a literal using mainsym E8. */
			may_have_e8_byte |= d->maincode_lens[0xE8];
		} else {
			/* Uncompressed block */

			out_next = bitstream_read_bytes(&is, out_next,
							block_size);
			if (!out_next)
				goto invalid;
			/* Uncompressed block */
			if (bitstream_read_bytes(&is, out_next, block_size))
				return -1;

			/* Re-align the bitstream if needed. */
			if (block_size & 1)
				bitstream_read_byte(&is);

			e8_status = 1;
			/* There may have been an E8 byte in the block. */
			may_have_e8_byte = 1;
		}
		out_next += block_size;
	}

	/* Postprocess the data unless it cannot possibly contain 0xe8 bytes. */
	if (e8_status)
	/* Postprocess the data unless it cannot possibly contain E8 bytes. */
	if (may_have_e8_byte)
		lzx_postprocess(uncompressed_data, uncompressed_size);
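/*
 * Editor's note: lzx_postprocess() lives in lzx_common.c (also part of this
 * sync).  As a rough, hedged sketch of what it undoes, following wimlib's
 * conventions: at compression time, the 32-bit little-endian target after
 * each x86 CALL opcode (0xE8) was converted from relative to absolute form
 * against a fixed "magic" file size of 12000000.  The exact scan bounds and
 * checks are in lzx_common.c; schematically, the undo per 0xE8 byte is:
 */
#include <stdint.h>

static void undo_e8_translation_sketch(uint8_t *p, int32_t pos)
{
	/* 'p' points at the 4-byte LE target following an 0xE8 byte found
	 * at position 'pos' in the decompressed data. */
	int32_t abs = p[0] | p[1] << 8 | p[2] << 16 |
		      (int32_t)((uint32_t)p[3] << 24);
	int32_t rel;

	if (abs >= 0 && abs < 12000000)
		rel = abs - pos;	/* normal translation */
	else if (abs < 0 && abs >= -pos)
		rel = abs + 12000000;	/* compensating translation */
	else
		return;			/* target was not translated */
	p[0] = rel;
	p[1] = rel >> 8;
	p[2] = rel >> 16;
	p[3] = rel >> 24;
}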

	return 0;

invalid:
	errno = EINVAL;
	return -1;
}

/*
 * lzx_free_decompressor - Free an LZX decompressor
 *
 * @decompressor: A decompressor that was allocated with
 *		  lzx_allocate_decompressor(), or NULL.
 */
void lzx_free_decompressor(struct lzx_decompressor *decompressor)
struct lzx_decompressor *
lzx_allocate_decompressor(size_t max_block_size)
{
	free(decompressor);
	unsigned window_order;
	struct lzx_decompressor *d;

	window_order = lzx_get_window_order(max_block_size);
	if (window_order == 0) {
		errno = EINVAL;
		return NULL;
	}

	d = aligned_malloc(sizeof(*d), DECODE_TABLE_ALIGNMENT);
	if (!d)
		return NULL;

	d->window_order = window_order;
	d->num_main_syms = lzx_get_num_main_syms(window_order);

	/* Initialize 'd->extra_offset_bits_minus_aligned'. */
	STATIC_ASSERT(sizeof(d->extra_offset_bits_minus_aligned) ==
		      sizeof(lzx_extra_offset_bits));
	STATIC_ASSERT(sizeof(d->extra_offset_bits) ==
		      sizeof(lzx_extra_offset_bits));
	memcpy(d->extra_offset_bits_minus_aligned, lzx_extra_offset_bits,
	       sizeof(lzx_extra_offset_bits));
	for (unsigned offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
	     offset_slot < LZX_MAX_OFFSET_SLOTS; offset_slot++)
	{
		d->extra_offset_bits_minus_aligned[offset_slot] -=
			LZX_NUM_ALIGNED_OFFSET_BITS;
	}

	return d;
}

void
lzx_free_decompressor(struct lzx_decompressor *d)
{
	aligned_free(d);
}
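/*
 * Editor's note: a minimal usage sketch of the revised LZX API (error
 * handling abbreviated; assumes these functions are declared in
 * system_compression.h, as the header hunk below suggests).
 */
#include <stddef.h>
#include "system_compression.h"

static int decompress_one_lzx_chunk(const void *in, size_t in_size,
				    void *out, size_t out_size)
{
	/* System-compressed files use an LZX block size of 32768 bytes,
	 * which is what the plugin itself passes in the next hunk. */
	struct lzx_decompressor *d = lzx_allocate_decompressor(32768);
	int ret;

	if (!d)
		return -1;	/* errno was set */
	ret = lzx_decompress(d, in, in_size, out, out_size);
	lzx_free_decompressor(d);
	return ret;
}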
@@ -211,7 +211,7 @@ struct ntfs_system_decompression_ctx {
static int allocate_decompressor(struct ntfs_system_decompression_ctx *ctx)
{
	if (ctx->format == FORMAT_LZX)
		ctx->decompressor = lzx_allocate_decompressor();
		ctx->decompressor = lzx_allocate_decompressor(32768);
	else
		ctx->decompressor = xpress_allocate_decompressor();
	if (!ctx->decompressor)
@@ -590,8 +590,13 @@ static int read_and_decompress_chunk(struct ntfs_system_decompression_ctx *ctx,
		return 0;

	/* The chunk was stored compressed.  Decompress its data. */
	return decompress(ctx, read_buffer, stored_size,
			  buffer, uncompressed_size);
	if (decompress(ctx, read_buffer, stored_size,
		       buffer, uncompressed_size)) {
		errno = EINVAL;
		return -1;
	}

	return 0;
}

/* Retrieve a pointer to the uncompressed data of the specified chunk.  On
@@ -60,7 +60,8 @@ extern void xpress_free_decompressor(struct xpress_decompressor *decompressor);

struct lzx_decompressor;

extern struct lzx_decompressor *lzx_allocate_decompressor(void);
extern struct lzx_decompressor *
lzx_allocate_decompressor(size_t max_block_size);

extern int lzx_decompress(struct lzx_decompressor *decompressor,
			  const void *compressed_data, size_t compressed_size,
@@ -0,0 +1,22 @@
/*
 * xpress_constants.h
 *
 * Constants for the XPRESS compression format.
 */

#ifndef _XPRESS_CONSTANTS_H
#define _XPRESS_CONSTANTS_H

#define XPRESS_NUM_CHARS	256
#define XPRESS_NUM_SYMBOLS	512
#define XPRESS_MAX_CODEWORD_LEN	15

#define XPRESS_END_OF_DATA	256

#define XPRESS_MIN_OFFSET	1
#define XPRESS_MAX_OFFSET	65535

#define XPRESS_MIN_MATCH_LEN	3
#define XPRESS_MAX_MATCH_LEN	65538

#endif /* _XPRESS_CONSTANTS_H */
@@ -1,9 +1,12 @@
/*
 * xpress_decompress.c - A decompressor for the XPRESS compression format
 * (Huffman variant), which can be used in "System Compressed" files.  This is
 * based on the code from wimlib.
 * xpress_decompress.c
 *
 * Copyright (C) 2015 Eric Biggers
 * A decompressor for the XPRESS compression format (Huffman variant).
 */

/*
 *
 * Copyright (C) 2012-2016 Eric Biggers
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
@@ -19,80 +22,85 @@
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 */


/*
 * The XPRESS compression format is an LZ77 and Huffman-code based algorithm.
 * That means it is fairly similar to LZX compression, but XPRESS is simpler, so
 * it is a little faster to compress and decompress.
 *
 * The XPRESS compression format is mostly documented in a file called "[MS-XCA]
 * Xpress Compression Algorithm".  In the MSDN library, it can currently be
 * found under Open Specifications => Protocols => Windows Protocols => Windows
 * Server Protocols => [MS-XCA] Xpress Compression Algorithm".  The format in
 * WIMs is specifically the algorithm labeled as the "LZ77+Huffman Algorithm"
 * (there apparently are some other versions of XPRESS as well).
 *
 * If you are already familiar with the LZ77 algorithm and Huffman coding, the
 * XPRESS format is fairly simple.  The compressed data begins with 256 bytes
 * that contain 512 4-bit integers that are the lengths of the symbols in the
 * Huffman code used for match/literal headers.  In contrast with more
 * complicated formats such as DEFLATE and LZX, this is the only Huffman code
 * that is used for the entirety of the XPRESS compressed data, and the codeword
 * lengths are not encoded with a pretree.
 *
 * The rest of the compressed data is Huffman-encoded symbols.  Values 0 through
 * 255 represent the corresponding literal bytes.  Values 256 through 511
 * represent matches and may require extra bits or bytes to be read to get the
 * match offset and match length.
 *
 * The trickiest part is probably the way in which literal bytes for match
 * lengths are interleaved in the bitstream.
 *
 * Also, a caveat --- according to Microsoft's documentation for XPRESS,
 *
 *	"Some implementation of the decompression algorithm expect an extra
 *	symbol to mark the end of the data.  Specifically, some implementations
 *	fail during decompression if the Huffman symbol 256 is not found after
 *	the actual data."
 *
 * This is the case with Microsoft's implementation in WIMGAPI, for example.  So
 * although our implementation doesn't currently check for this extra symbol,
 * compressors would be wise to add it.
 */
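/*
 * Editor's note: a worked example of the codeword-length preamble described
 * above.  Byte i of the 256-byte preamble packs the lengths of symbols 2*i
 * and 2*i + 1, low nibble first; e.g. a preamble byte of 0x53 gives
 * lens[2*i] = 3 and lens[2*i + 1] = 5.  (This matches the nibble-unpacking
 * loop in xpress_decompress() below.)
 */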

#ifdef HAVE_CONFIG_H
#include "config.h"
# include "config.h"
#endif

#include <errno.h>
#include <stdlib.h>

#include <ntfs-3g/misc.h>

#include "decompress_common.h"
#include "system_compression.h"

#define XPRESS_NUM_SYMBOLS	512
#define XPRESS_MAX_CODEWORD_LEN	15
#define XPRESS_MIN_MATCH_LEN	3
#include "xpress_constants.h"

/* This value is chosen for fast decompression. */
#define XPRESS_TABLEBITS	12
#define XPRESS_TABLEBITS	11

/* Reusable heap-allocated memory for XPRESS decompression */
struct xpress_decompressor {
	union {
		DECODE_TABLE(decode_table, XPRESS_NUM_SYMBOLS,
			     XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN);
		u8 lens[XPRESS_NUM_SYMBOLS];
	};
	DECODE_TABLE_WORKING_SPACE(working_space, XPRESS_NUM_SYMBOLS,
				   XPRESS_MAX_CODEWORD_LEN);
} _aligned_attribute(DECODE_TABLE_ALIGNMENT);
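/*
 * Editor's note on the union above: 'lens' is only needed as the input to
 * make_huffman_decode_table(), so wimlib lets it share storage with the much
 * larger decode table.  This presumably relies on wimlib's
 * make_huffman_decode_table() tolerating that aliasing, i.e. consuming the
 * lengths (staging what it needs in 'working_space') before it writes any
 * table entries.
 */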

	/* The Huffman decoding table */
	u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS];

	/* An array that maps symbols to codeword lengths */
	u8 lens[XPRESS_NUM_SYMBOLS];

	/* Temporary space for make_huffman_decode_table() */
	u16 working_space[2 * (1 + XPRESS_MAX_CODEWORD_LEN) +
			  XPRESS_NUM_SYMBOLS];
};

/*
 * xpress_allocate_decompressor - Allocate an XPRESS decompressor
 *
 * Return the pointer to the decompressor on success, or return NULL and set
 * errno on failure.
 */
struct xpress_decompressor *xpress_allocate_decompressor(void)
int
xpress_decompress(struct xpress_decompressor *restrict d,
		  const void *restrict compressed_data, size_t compressed_size,
		  void *restrict uncompressed_data, size_t uncompressed_size)
{
	return ntfs_malloc(sizeof(struct xpress_decompressor));
}

/*
 * xpress_decompress - Decompress a buffer of XPRESS-compressed data
 *
 * @decompressor:      A decompressor that was allocated with
 *		       xpress_allocate_decompressor()
 * @compressed_data:   The buffer of data to decompress
 * @compressed_size:   Number of bytes of compressed data
 * @uncompressed_data: The buffer in which to store the decompressed data
 * @uncompressed_size: The number of bytes the data decompresses into
 *
 * Return 0 on success, or return -1 and set errno on failure.
 */
int xpress_decompress(struct xpress_decompressor *decompressor,
		      const void *compressed_data, size_t compressed_size,
		      void *uncompressed_data, size_t uncompressed_size)
{
	struct xpress_decompressor *d = decompressor;
	const u8 * const in_begin = compressed_data;
	u8 * const out_begin = uncompressed_data;
	u8 *out_next = out_begin;
	u8 * const out_end = out_begin + uncompressed_size;
	struct input_bitstream is;
	unsigned i;

	/* Read the Huffman codeword lengths. */
	if (compressed_size < XPRESS_NUM_SYMBOLS / 2)
		goto invalid;
	for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) {
		d->lens[i*2 + 0] = in_begin[i] & 0xF;
		d->lens[i*2 + 1] = in_begin[i] >> 4;
		return -1;
	for (int i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) {
		d->lens[2 * i + 0] = in_begin[i] & 0xf;
		d->lens[2 * i + 1] = in_begin[i] >> 4;
	}

	/* Build a decoding table for the Huffman code. */
@@ -100,7 +108,7 @@ int xpress_decompress(struct xpress_decompressor *decompressor,
				      XPRESS_TABLEBITS, d->lens,
				      XPRESS_MAX_CODEWORD_LEN,
				      d->working_space))
		goto invalid;
		return -1;

	/* Decode the matches and literals. */

@@ -115,7 +123,7 @@ int xpress_decompress(struct xpress_decompressor *decompressor,

		sym = read_huffsym(&is, d->decode_table,
				   XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN);
		if (sym < 256) {
		if (sym < XPRESS_NUM_CHARS) {
			/* Literal */
			*out_next++ = sym;
		} else {
@@ -135,30 +143,26 @@ int xpress_decompress(struct xpress_decompressor *decompressor,
			}
			length += XPRESS_MIN_MATCH_LEN;

			if (offset > (size_t)(out_next - out_begin))
				goto invalid;
			if (unlikely(lz_copy(length, offset,
					     out_begin, out_next, out_end,
					     XPRESS_MIN_MATCH_LEN)))
				return -1;

			if (length > (size_t)(out_end - out_next))
				goto invalid;

			out_next = lz_copy(out_next, length, offset, out_end,
					   XPRESS_MIN_MATCH_LEN);
			out_next += length;
		}
	}
	return 0;

invalid:
	errno = EINVAL;
	return -1;
}

/*
 * xpress_free_decompressor - Free an XPRESS decompressor
 *
 * @decompressor: A decompressor that was allocated with
 *		  xpress_allocate_decompressor(), or NULL.
 */
void xpress_free_decompressor(struct xpress_decompressor *decompressor)
struct xpress_decompressor *
xpress_allocate_decompressor(void)
{
	free(decompressor);
	return aligned_malloc(sizeof(struct xpress_decompressor),
			      DECODE_TABLE_ALIGNMENT);
}

void
xpress_free_decompressor(struct xpress_decompressor *d)
{
	aligned_free(d);
}