diff --git a/Makefile.am b/Makefile.am index 191ac40..f0dfe3a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -7,15 +7,21 @@ plugindir = $(libdir)/ntfs-3g plugin_LTLIBRARIES = ntfs-plugin-80000017.la ntfs_plugin_80000017_la_SOURCES = \ + src/aligned_malloc.c \ + src/common_defs.h \ src/decompress_common.c \ src/decompress_common.h \ + src/lzx_common.c \ + src/lzx_common.h \ + src/lzx_constants.h \ src/lzx_decompress.c \ src/plugin.c \ src/system_compression.c \ src/system_compression.h \ + src/xpress_constants.h \ src/xpress_decompress.c ntfs_plugin_80000017_la_LDFLAGS = -module -shared -avoid-version ntfs_plugin_80000017_la_CPPFLAGS = -D_FILE_OFFSET_BITS=64 -ntfs_plugin_80000017_la_CFLAGS = $(LIBNTFS_3G_CFLAGS) +ntfs_plugin_80000017_la_CFLAGS = $(LIBNTFS_3G_CFLAGS) -std=gnu99 ntfs_plugin_80000017_la_LIBADD = $(LIBNTFS_3G_LIBS) diff --git a/README.md b/README.md index 45a0072..58b04a2 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,22 @@ directory (`$libdir`). An example full path to the installed plugin is platforms. `make install` will create the plugin directory if it does not already exist. -# License +# Implementation note + +The XPRESS and LZX decompression formats used in system-compressed files are +identical to the formats used in Windows Imaging (WIM) archives. Therefore, for +the system compression plugin I borrowed the XPRESS and LZX decompressors I had +already written for the wimlib project (https://wimlib.net/). I made some +slight modifications for integration purposes. The code in wimlib is currently +licensed LGPLv3+, but I have relicensed the version in this plugin to GPLv2+ for +consistency with NTFS-3G's license. (Public domain portions remain public +domain.) + +# Notices + +The NTFS-3G system compression plugin was written by Eric Biggers, with +contributions from Jean-Pierre André. You can contact the author at +ebiggers3@gmail.com. This software may be redistributed and/or modified under the terms of the GNU General Public License as published by the Free Software Foundation, either diff --git a/configure.ac b/configure.ac index 30f9e58..f315165 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ntfs-3g-system-compression], [0.1], [ebiggers3@gmail.com]) +AC_INIT([ntfs-3g-system-compression], [0.2], [ebiggers3@gmail.com]) AC_CONFIG_SRCDIR([src/plugin.c]) AC_CONFIG_MACRO_DIR([m4]) diff --git a/src/aligned_malloc.c b/src/aligned_malloc.c new file mode 100644 index 0000000..289f160 --- /dev/null +++ b/src/aligned_malloc.c @@ -0,0 +1,34 @@ +/* + * aligned_malloc.c - aligned memory allocation + * + * This file provides portable aligned memory allocation functions that only use + * malloc() and free(). This avoids portability problems with posix_memalign(), + * aligned_alloc(), etc. 
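+ *
+ * The scheme: over-allocate by 'alignment - 1 + sizeof(size_t)' bytes, round
+ * the resulting pointer up to the requested alignment (which must be a power
+ * of 2), and stash the offset back to the raw malloc() pointer in the size_t
+ * word immediately preceding the returned pointer, so that aligned_free()
+ * can recover the original allocation. A typical pairing is
+ * 'p = aligned_malloc(size, DECODE_TABLE_ALIGNMENT)' followed later by
+ * 'aligned_free(p)'.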
+ */

#include <stdlib.h>

#include "common_defs.h"

void *
aligned_malloc(size_t size, size_t alignment)
{
	const uintptr_t mask = alignment - 1;
	char *ptr = NULL;
	char *raw_ptr;

	raw_ptr = malloc(mask + sizeof(size_t) + size);
	if (raw_ptr) {
		ptr = (char *)raw_ptr + sizeof(size_t);
		ptr = (void *)(((uintptr_t)ptr + mask) & ~mask);
		*((size_t *)ptr - 1) = ptr - raw_ptr;
	}
	return ptr;
}

void
aligned_free(void *ptr)
{
	if (ptr)
		free((char *)ptr - *((size_t *)ptr - 1));
}
diff --git a/src/common_defs.h b/src/common_defs.h
new file mode 100644
index 0000000..64024fb
--- /dev/null
+++ b/src/common_defs.h
@@ -0,0 +1,290 @@
+#ifndef _COMMON_DEFS_H
+#define _COMMON_DEFS_H
+
+#include <ntfs-3g/endians.h>
+#include <ntfs-3g/types.h>
+
+/* ========================================================================== */
+/*                              Type definitions                              */
+/* ========================================================================== */
+
+/*
+ * Type of a machine word. 'unsigned long' would be logical, but that is only
+ * 32 bits on x86_64 Windows. The same applies to 'uint_fast32_t'. So the best
+ * we can do without a bunch of #ifdefs appears to be 'size_t'.
+ */
+typedef size_t machine_word_t;
+
+#define WORDBYTES	sizeof(machine_word_t)
+#define WORDBITS	(8 * WORDBYTES)
+
+/* ========================================================================== */
+/*                        Compiler-specific definitions                       */
+/* ========================================================================== */
+
+#ifdef __GNUC__ /* GCC, or GCC-compatible compiler such as clang */
+# define forceinline		inline __attribute__((always_inline))
+# define likely(expr)		__builtin_expect(!!(expr), 1)
+# define unlikely(expr)		__builtin_expect(!!(expr), 0)
+# define _aligned_attribute(n)	__attribute__((aligned(n)))
+# define bsr32(n)		(31 - __builtin_clz(n))
+# define bsr64(n)		(63 - __builtin_clzll(n))
+# define bsf32(n)		__builtin_ctz(n)
+# define bsf64(n)		__builtin_ctzll(n)
+# ifndef min
+#  define min(a, b)	({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); \
+			   (_a < _b) ? _a : _b; })
+# endif
+# ifndef max
+#  define max(a, b)	({ __typeof__(a) _a = (a); __typeof__(b) _b = (b); \
+			   (_a > _b) ? _a : _b; })
+# endif
+
+# define DEFINE_UNALIGNED_TYPE(type)				\
+struct type##_unaligned {					\
+	type v;							\
+} __attribute__((packed));					\
+								\
+static inline type						\
+load_##type##_unaligned(const void *p)				\
+{								\
+	return ((const struct type##_unaligned *)p)->v;	\
+}								\
+								\
+static inline void						\
+store_##type##_unaligned(type val, void *p)			\
+{								\
+	((struct type##_unaligned *)p)->v = val;		\
+}
+
+#endif /* __GNUC__ */
+
+/* Declare that the annotated function should always be inlined. This might be
+ * desirable in highly tuned code, e.g. compression codecs */
+#ifndef forceinline
+# define forceinline	inline
+#endif
+
+/* Hint that the expression is usually true */
+#ifndef likely
+# define likely(expr)	(expr)
+#endif
+
+/* Hint that the expression is usually false */
+#ifndef unlikely
+# define unlikely(expr)	(expr)
+#endif
+
+/* Declare that the annotated variable, or variables of the annotated type, are
+ * to be aligned on n-byte boundaries */
+#ifndef _aligned_attribute
+# define _aligned_attribute(n)
+#endif
+
+/* min() and max() macros */
+#ifndef min
+# define min(a, b)	((a) < (b) ? (a) : (b))
+#endif
+#ifndef max
+# define max(a, b)	((a) > (b) ? (a) : (b))
+#endif
+
+/* STATIC_ASSERT() - verify the truth of an expression at compilation time */
+#define STATIC_ASSERT(expr)	((void)sizeof(char[1 - 2 * !(expr)]))
+
+/* STATIC_ASSERT_ZERO() - verify the truth of an expression at compilation time
+ * and also produce a result of value '0' to be used in constant expressions */
+#define STATIC_ASSERT_ZERO(expr) ((int)sizeof(char[-!(expr)]))
+
+/* UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses
+ * can be performed efficiently on the target platform. */
+#if defined(__x86_64__) || defined(__i386__) || defined(__ARM_FEATURE_UNALIGNED)
+# define UNALIGNED_ACCESS_IS_FAST 1
+#else
+# define UNALIGNED_ACCESS_IS_FAST 0
+#endif
+
+/*
+ * DEFINE_UNALIGNED_TYPE(type) - a macro that, given an integer type 'type',
+ * defines load_type_unaligned(addr) and store_type_unaligned(v, addr) functions
+ * which load and store variables of type 'type' from/to unaligned memory
+ * addresses.
+ */
+#ifndef DEFINE_UNALIGNED_TYPE
+
+#include <string.h>
+/*
+ * Although memcpy() may seem inefficient, it *usually* gets optimized
+ * appropriately by modern compilers. It's portable and may be the best we can
+ * do for a fallback...
+ */
+#define DEFINE_UNALIGNED_TYPE(type)				\
+								\
+static forceinline type						\
+load_##type##_unaligned(const void *p)				\
+{								\
+	type v;							\
+	memcpy(&v, p, sizeof(v));				\
+	return v;						\
+}								\
+								\
+static forceinline void						\
+store_##type##_unaligned(type v, void *p)			\
+{								\
+	memcpy(p, &v, sizeof(v));				\
+}
+
+#endif /* !DEFINE_UNALIGNED_TYPE */
+
+
+/* ========================================================================== */
+/*                          Unaligned memory accesses                         */
+/* ========================================================================== */
+
+DEFINE_UNALIGNED_TYPE(le16);
+DEFINE_UNALIGNED_TYPE(le32);
+DEFINE_UNALIGNED_TYPE(machine_word_t);
+
+#define load_word_unaligned	load_machine_word_t_unaligned
+#define store_word_unaligned	store_machine_word_t_unaligned
+
+static inline u16
+get_unaligned_le16(const u8 *p)
+{
+	if (UNALIGNED_ACCESS_IS_FAST)
+		return le16_to_cpu(load_le16_unaligned(p));
+	else
+		return ((u16)p[1] << 8) | p[0];
+}
+
+static inline u32
+get_unaligned_le32(const u8 *p)
+{
+	if (UNALIGNED_ACCESS_IS_FAST)
+		return le32_to_cpu(load_le32_unaligned(p));
+	else
+		return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
+			((u32)p[1] << 8) | p[0];
+}
+
+static inline void
+put_unaligned_le16(u16 v, u8 *p)
+{
+	if (UNALIGNED_ACCESS_IS_FAST) {
+		store_le16_unaligned(cpu_to_le16(v), p);
+	} else {
+		p[0] = (u8)(v >> 0);
+		p[1] = (u8)(v >> 8);
+	}
+}
+
+static inline void
+put_unaligned_le32(u32 v, u8 *p)
+{
+	if (UNALIGNED_ACCESS_IS_FAST) {
+		store_le32_unaligned(cpu_to_le32(v), p);
+	} else {
+		p[0] = (u8)(v >> 0);
+		p[1] = (u8)(v >> 8);
+		p[2] = (u8)(v >> 16);
+		p[3] = (u8)(v >> 24);
+	}
+}
+
+/* ========================================================================== */
+/*                             Bit scan functions                             */
+/* ========================================================================== */
+
+/*
+ * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
+ * significant end) of the *most* significant 1 bit in the input value. The
+ * input value must be nonzero!
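+ *
+ * For example, bsr32(0x00000500) == 10: the most significant 1 bit of 0x500
+ * (binary 101 0000 0000) is bit 10.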
+ */

+#ifndef bsr32
+static forceinline unsigned
+bsr32(u32 v)
+{
+	unsigned bit = 0;
+	while ((v >>= 1) != 0)
+		bit++;
+	return bit;
+}
+#endif
+
+#ifndef bsr64
+static forceinline unsigned
+bsr64(u64 v)
+{
+	unsigned bit = 0;
+	while ((v >>= 1) != 0)
+		bit++;
+	return bit;
+}
+#endif
+
+static forceinline unsigned
+bsrw(machine_word_t v)
+{
+	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+	if (WORDBITS == 32)
+		return bsr32(v);
+	else
+		return bsr64(v);
+}
+
+/*
+ * Bit Scan Forward (BSF) - find the 0-based index (relative to the least
+ * significant end) of the *least* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+#ifndef bsf32
+static forceinline unsigned
+bsf32(u32 v)
+{
+	unsigned bit;
+	for (bit = 0; !(v & 1); bit++, v >>= 1)
+		;
+	return bit;
+}
+#endif
+
+#ifndef bsf64
+static forceinline unsigned
+bsf64(u64 v)
+{
+	unsigned bit;
+	for (bit = 0; !(v & 1); bit++, v >>= 1)
+		;
+	return bit;
+}
+#endif
+
+static forceinline unsigned
+bsfw(machine_word_t v)
+{
+	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+	if (WORDBITS == 32)
+		return bsf32(v);
+	else
+		return bsf64(v);
+}
+
+/* Return the log base 2 of 'n', rounded up to the nearest integer. */
+static forceinline unsigned
+ilog2_ceil(size_t n)
+{
+	if (n <= 1)
+		return 0;
+	return 1 + bsrw(n - 1);
+}
+
+/* ========================================================================== */
+/*                          Aligned memory allocation                         */
+/* ========================================================================== */
+
+extern void *aligned_malloc(size_t size, size_t alignment);
+extern void aligned_free(void *ptr);
+
+#endif /* _COMMON_DEFS_H */
diff --git a/src/decompress_common.c b/src/decompress_common.c
index a963b54..951d092 100644
--- a/src/decompress_common.c
+++ b/src/decompress_common.c
@@ -1,325 +1,335 @@
 /*
- * decompress_common.c - Code shared by the XPRESS and LZX decompressors
+ * decompress_common.c
  *
- * Copyright (C) 2015 Eric Biggers
+ * Code for decompression shared among multiple compression formats.
  *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 2 of the License, or (at your option) any later
- * version.
+ * The following copying information applies to this specific source code file:
  *
- * This program is distributed in the hope that it will be useful, but WITHOUT
+ * Written in 2012-2016 by Eric Biggers <ebiggers3@gmail.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- * details.
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
  */
 
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+# include "config.h"
 #endif
 
 #include <string.h>
 
+#ifdef __SSE2__
+# include <emmintrin.h>
+#endif
+
 #include "decompress_common.h"
 
 /*
  * make_huffman_decode_table() -
  *
- * Build a decoding table for a canonical prefix code, or "Huffman code".
+ * Given an alphabet of symbols and the length of each symbol's codeword in a + * canonical prefix code, build a table for quickly decoding symbols that were + * encoded with that code. * - * This is an internal function, not part of the library API! + * A _prefix code_ is an assignment of bitstrings called _codewords_ to symbols + * such that no whole codeword is a prefix of any other. A prefix code might be + * a _Huffman code_, which means that it is an optimum prefix code for a given + * list of symbol frequencies and was generated by the Huffman algorithm. + * Although the prefix codes processed here will ordinarily be "Huffman codes", + * strictly speaking the decoder cannot know whether a given code was actually + * generated by the Huffman algorithm or not. * - * This takes as input the length of the codeword for each symbol in the - * alphabet and produces as output a table that can be used for fast - * decoding of prefix-encoded symbols using read_huffsym(). + * A prefix code is _canonical_ if and only if a longer codeword never + * lexicographically precedes a shorter codeword, and the lexicographic ordering + * of codewords of equal length is the same as the lexicographic ordering of the + * corresponding symbols. The advantage of using a canonical prefix code is + * that the codewords can be reconstructed from only the symbol => codeword + * length mapping. This eliminates the need to transmit the codewords + * explicitly. Instead, they can be enumerated in lexicographic order after + * sorting the symbols primarily by increasing codeword length and secondarily + * by increasing symbol value. * - * Strictly speaking, a canonical prefix code might not be a Huffman - * code. But this algorithm will work either way; and in fact, since - * Huffman codes are defined in terms of symbol frequencies, there is no - * way for the decompressor to know whether the code is a true Huffman - * code or not until all symbols have been decoded. + * However, the decoder's real goal is to decode symbols with the code, not just + * generate the list of codewords. Consequently, this function directly builds + * a table for efficiently decoding symbols using the code. The basic idea is + * that given the next 'max_codeword_len' bits of input, the decoder can look up + * the next decoded symbol by indexing a table containing '2^max_codeword_len' + * entries. A codeword with length 'max_codeword_len' will have exactly one + * entry in this table, whereas a codeword shorter than 'max_codeword_len' will + * have multiple entries in this table. Precisely, a codeword of length 'n' + * will have '2^(max_codeword_len - n)' entries. The index of each such entry, + * considered as a bitstring of length 'max_codeword_len', will contain the + * corresponding codeword as a prefix. * - * Because the prefix code is assumed to be "canonical", it can be - * reconstructed directly from the codeword lengths. A prefix code is - * canonical if and only if a longer codeword never lexicographically - * precedes a shorter codeword, and the lexicographic ordering of - * codewords of the same length is the same as the lexicographic ordering - * of the corresponding symbols. Consequently, we can sort the symbols - * primarily by codeword length and secondarily by symbol value, then - * reconstruct the prefix code by generating codewords lexicographically - * in that order. + * That's the basic idea, but we extend it in two ways: * - * This function does not, however, generate the prefix code explicitly. 
- * Instead, it directly builds a table for decoding symbols using the - * code. The basic idea is this: given the next 'max_codeword_len' bits - * in the input, we can look up the decoded symbol by indexing a table - * containing 2**max_codeword_len entries. A codeword with length - * 'max_codeword_len' will have exactly one entry in this table, whereas - * a codeword shorter than 'max_codeword_len' will have multiple entries - * in this table. Precisely, a codeword of length n will be represented - * by 2**(max_codeword_len - n) entries in this table. The 0-based index - * of each such entry will contain the corresponding codeword as a prefix - * when zero-padded on the left to 'max_codeword_len' binary digits. + * - Often the maximum codeword length is too long for it to be efficient to + * build the full decode table whenever a new code is used. Instead, we build + * a "root" table using only '2^table_bits' entries, where 'table_bits <= + * max_codeword_len'. Then, a lookup of 'table_bits' bits produces either a + * symbol directly (for codewords not longer than 'table_bits'), or the index + * of a subtable which must be indexed with additional bits of input to fully + * decode the symbol (for codewords longer than 'table_bits'). * - * That's the basic idea, but we implement two optimizations regarding - * the format of the decode table itself: + * - Whenever the decoder decodes a symbol, it needs to know the codeword length + * so that it can remove the appropriate number of input bits. The obvious + * solution would be to simply retain the codeword lengths array and use the + * decoded symbol as an index into it. However, that would require two array + * accesses when decoding each symbol. Our strategy is to instead store the + * codeword length directly in the decode table entry along with the symbol. * - * - For many compression formats, the maximum codeword length is too - * long for it to be efficient to build the full decoding table - * whenever a new prefix code is used. Instead, we can build the table - * using only 2**table_bits entries, where 'table_bits' is some number - * less than or equal to 'max_codeword_len'. Then, only codewords of - * length 'table_bits' and shorter can be directly looked up. For - * longer codewords, the direct lookup instead produces the root of a - * binary tree. Using this tree, the decoder can do traditional - * bit-by-bit decoding of the remainder of the codeword. Child nodes - * are allocated in extra entries at the end of the table; leaf nodes - * contain symbols. Note that the long-codeword case is, in general, - * not performance critical, since in Huffman codes the most frequently - * used symbols are assigned the shortest codeword lengths. - * - * - When we decode a symbol using a direct lookup of the table, we still - * need to know its length so that the bitstream can be advanced by the - * appropriate number of bits. The simple solution is to simply retain - * the 'lens' array and use the decoded symbol as an index into it. - * However, this requires two separate array accesses in the fast path. - * The optimization is to store the length directly in the decode - * table. We use the bottom 11 bits for the symbol and the top 5 bits - * for the length. In addition, to combine this optimization with the - * previous one, we introduce a special case where the top 2 bits of - * the length are both set if the entry is actually the root of a - * binary tree. 
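+ * For example, with 'table_bits' == 11, a symbol whose codeword is 5 bits
+ * long fills 2^(11 - 5) == 64 consecutive root-table entries, each holding
+ * that symbol along with the length 5.
+ *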
+ * See MAKE_DECODE_TABLE_ENTRY() for full details on the format of decode table + * entries, and see read_huffsym() for full details on how symbols are decoded. * * @decode_table: - * The array in which to create the decoding table. This must have - * a length of at least ((2**table_bits) + 2 * num_syms) entries. + * The array in which to build the decode table. This must have been + * declared by the DECODE_TABLE() macro. This may alias @lens, since all + * @lens are consumed before the decode table is written to. * * @num_syms: - * The number of symbols in the alphabet; also, the length of the - * 'lens' array. Must be less than or equal to 2048. + * The number of symbols in the alphabet. * * @table_bits: - * The order of the decode table size, as explained above. Must be - * less than or equal to 13. + * The log base 2 of the number of entries in the root table. * * @lens: - * An array of length @num_syms, indexable by symbol, that gives the - * length of the codeword, in bits, for that symbol. The length can - * be 0, which means that the symbol does not have a codeword - * assigned. + * An array of length @num_syms, indexed by symbol, that gives the length + * of the codeword, in bits, for each symbol. The length can be 0, which + * means that the symbol does not have a codeword assigned. In addition, + * @lens may alias @decode_table, as noted above. * * @max_codeword_len: - * The longest codeword length allowed in the compression format. - * All entries in 'lens' must be less than or equal to this value. - * This must be less than or equal to 23. + * The maximum codeword length permitted for this code. All entries in + * 'lens' must be less than or equal to this value. * * @working_space - * A temporary array of length '2 * (max_codeword_len + 1) + - * num_syms'. + * A temporary array that was declared with DECODE_TABLE_WORKING_SPACE(). * - * Returns 0 on success, or -1 if the lengths do not form a valid prefix - * code. + * Returns 0 on success, or -1 if the lengths do not form a valid prefix code. */ -int make_huffman_decode_table(u16 decode_table[], const unsigned num_syms, - const unsigned table_bits, const u8 lens[], - const unsigned max_codeword_len, - u16 working_space[]) +int +make_huffman_decode_table(u16 decode_table[], unsigned num_syms, + unsigned table_bits, const u8 lens[], + unsigned max_codeword_len, u16 working_space[]) { - const unsigned table_num_entries = 1 << table_bits; u16 * const len_counts = &working_space[0]; u16 * const offsets = &working_space[1 * (max_codeword_len + 1)]; u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)]; - int left; - void *decode_table_ptr; + s32 remainder = 1; + void *entry_ptr = decode_table; + unsigned codeword_len = 1; unsigned sym_idx; - unsigned codeword_len; - unsigned stores_per_loop; - unsigned decode_table_pos; - unsigned len; - unsigned sym; + unsigned codeword; + unsigned subtable_pos; + unsigned subtable_bits; + unsigned subtable_prefix; - /* Count how many symbols have each possible codeword length. - * Note that a length of 0 indicates the corresponding symbol is not - * used in the code and therefore does not have a codeword. */ - for (len = 0; len <= max_codeword_len; len++) + /* Count how many codewords have each length, including 0. 
*/ + for (unsigned len = 0; len <= max_codeword_len; len++) len_counts[len] = 0; - for (sym = 0; sym < num_syms; sym++) + for (unsigned sym = 0; sym < num_syms; sym++) len_counts[lens[sym]]++; - /* We can assume all lengths are <= max_codeword_len, but we - * cannot assume they form a valid prefix code. A codeword of - * length n should require a proportion of the codespace equaling - * (1/2)^n. The code is valid if and only if the codespace is - * exactly filled by the lengths, by this measure. */ - left = 1; - for (len = 1; len <= max_codeword_len; len++) { - left <<= 1; - left -= len_counts[len]; - if (left < 0) { - /* The lengths overflow the codespace; that is, the code - * is over-subscribed. */ + /* It is already guaranteed that all lengths are <= max_codeword_len, + * but it cannot be assumed they form a complete prefix code. A + * codeword of length n should require a proportion of the codespace + * equaling (1/2)^n. The code is complete if and only if, by this + * measure, the codespace is exactly filled by the lengths. */ + for (unsigned len = 1; len <= max_codeword_len; len++) { + remainder = (remainder << 1) - len_counts[len]; + /* Do the lengths overflow the codespace? */ + if (unlikely(remainder < 0)) return -1; - } } - if (left != 0) { + if (remainder != 0) { /* The lengths do not fill the codespace; that is, they form an - * incomplete set. */ - if (left == (1 << max_codeword_len)) { - /* The code is completely empty. This is arguably - * invalid, but in fact it is valid in LZX and XPRESS, - * so we must allow it. By definition, no symbols can - * be decoded with an empty code. Consequently, we - * technically don't even need to fill in the decode - * table. However, to avoid accessing uninitialized - * memory if the algorithm nevertheless attempts to - * decode symbols using such a code, we zero out the - * decode table. */ - memset(decode_table, 0, - table_num_entries * sizeof(decode_table[0])); - return 0; - } - return -1; + * incomplete code. This is permitted only if the code is empty + * (contains no symbols). */ + + if (unlikely(remainder != 1U << max_codeword_len)) + return -1; + + /* The code is empty. When processing a well-formed stream, the + * decode table need not be initialized in this case. However, + * we cannot assume the stream is well-formed, so we must + * initialize the decode table anyway. Setting all entries to 0 + * makes the decode table always produce symbol '0' without + * consuming any bits, which is good enough. */ + memset(decode_table, 0, sizeof(decode_table[0]) << table_bits); + return 0; } - /* Sort the symbols primarily by length and secondarily by symbol order. - */ + /* Sort the symbols primarily by increasing codeword length and + * secondarily by increasing symbol value. */ - /* Initialize 'offsets' so that offsets[len] for 1 <= len <= - * max_codeword_len is the number of codewords shorter than 'len' bits. - */ - offsets[1] = 0; - for (len = 1; len < max_codeword_len; len++) + /* Initialize 'offsets' so that 'offsets[len]' is the number of + * codewords shorter than 'len' bits, including length 0. */ + offsets[0] = 0; + for (unsigned len = 0; len < max_codeword_len; len++) offsets[len + 1] = offsets[len] + len_counts[len]; - /* Use the 'offsets' array to sort the symbols. Note that we do not - * include symbols that are not used in the code. Consequently, fewer - * than 'num_syms' entries in 'sorted_syms' may be filled. 
*/ - for (sym = 0; sym < num_syms; sym++) - if (lens[sym] != 0) - sorted_syms[offsets[lens[sym]]++] = sym; + /* Use the 'offsets' array to sort the symbols. */ + for (unsigned sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; - /* Fill entries for codewords with length <= table_bits - * --- that is, those short enough for a direct mapping. + /* + * Fill the root table entries for codewords no longer than table_bits. * * The table will start with entries for the shortest codeword(s), which - * have the most entries. From there, the number of entries per - * codeword will decrease. */ - decode_table_ptr = decode_table; - sym_idx = 0; - codeword_len = 1; - stores_per_loop = (1 << (table_bits - codeword_len)); - for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) { + * will have the most entries. From there, the number of entries per + * codeword will decrease. As an optimization, we may begin filling + * entries with SSE2 vector accesses (8 entries/store), then change to + * word accesses (2 or 4 entries/store), then change to 16-bit accesses + * (1 entry/store). + */ + sym_idx = offsets[0]; + +#ifdef __SSE2__ + /* Fill entries one 128-bit vector (8 entries) at a time. */ + for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) / + (sizeof(__m128i) / sizeof(decode_table[0])); + stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) + { unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; for (; sym_idx < end_sym_idx; sym_idx++) { - u16 entry; - u16 *p; - unsigned n; - - entry = ((u32)codeword_len << 11) | sorted_syms[sym_idx]; - p = (u16*)decode_table_ptr; - n = stores_per_loop; - + /* Note: unlike in the "word" version below, the __m128i + * type already has __attribute__((may_alias)), so using + * it to access an array of u16 will not violate strict + * aliasing. */ + __m128i v = _mm_set1_epi16( + MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx], + codeword_len)); + unsigned n = stores_per_loop; do { - *p++ = entry; + *(__m128i *)entry_ptr = v; + entry_ptr += sizeof(v); } while (--n); + } + } +#endif /* __SSE2__ */ - decode_table_ptr = p; +#ifdef __GNUC__ + /* Fill entries one word (2 or 4 entries) at a time. */ + for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)) / + (WORDBYTES / sizeof(decode_table[0])); + stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) + { + unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; + for (; sym_idx < end_sym_idx; sym_idx++) { + + /* Accessing the array of u16 as u32 or u64 would + * violate strict aliasing and would require compiling + * the code with -fno-strict-aliasing to guarantee + * correctness. To work around this problem, use the + * gcc 'may_alias' extension. */ + typedef machine_word_t + __attribute__((may_alias)) aliased_word_t; + aliased_word_t v = repeat_u16( + MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx], + codeword_len)); + unsigned n = stores_per_loop; + do { + *(aliased_word_t *)entry_ptr = v; + entry_ptr += sizeof(v); + } while (--n); + } + } +#endif /* __GNUC__ */ + + /* Fill entries one at a time. 
*/ + for (unsigned stores_per_loop = (1U << (table_bits - codeword_len)); + stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) + { + unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; + for (; sym_idx < end_sym_idx; sym_idx++) { + u16 v = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx], + codeword_len); + unsigned n = stores_per_loop; + do { + *(u16 *)entry_ptr = v; + entry_ptr += sizeof(v); + } while (--n); } } - /* If we've filled in the entire table, we are done. Otherwise, - * there are codewords longer than table_bits for which we must - * generate binary trees. */ + /* If all symbols were processed, then no subtables are required. */ + if (sym_idx == num_syms) + return 0; - decode_table_pos = (u16*)decode_table_ptr - decode_table; - if (decode_table_pos != table_num_entries) { - unsigned j; - unsigned next_free_tree_slot; - unsigned cur_codeword; + /* At least one subtable is required. Process the remaining symbols. */ + codeword = ((u16 *)entry_ptr - decode_table) << 1; + subtable_pos = 1U << table_bits; + subtable_bits = table_bits; + subtable_prefix = -1; + do { + while (len_counts[codeword_len] == 0) { + codeword_len++; + codeword <<= 1; + } - /* First, zero out the remaining entries. This is - * necessary so that these entries appear as - * "unallocated" in the next part. Each of these entries - * will eventually be filled with the representation of - * the root node of a binary tree. */ - j = decode_table_pos; - do { - decode_table[j] = 0; - } while (++j != table_num_entries); + unsigned prefix = codeword >> (codeword_len - table_bits); - /* We allocate child nodes starting at the end of the - * direct lookup table. Note that there should be - * 2*num_syms extra entries for this purpose, although - * fewer than this may actually be needed. */ - next_free_tree_slot = table_num_entries; + /* Start a new subtable if the first 'table_bits' bits of the + * codeword don't match the prefix for the previous subtable, or + * if this will be the first subtable. */ + if (prefix != subtable_prefix) { - /* Iterate through each codeword with length greater than - * 'table_bits', primarily in order of codeword length - * and secondarily in order of symbol. */ - for (cur_codeword = decode_table_pos << 1; - codeword_len <= max_codeword_len; - codeword_len++, cur_codeword <<= 1) - { - unsigned end_sym_idx = sym_idx + len_counts[codeword_len]; - for (; sym_idx < end_sym_idx; sym_idx++, cur_codeword++) - { - /* 'sorted_sym' is the symbol represented by the - * codeword. */ - unsigned sorted_sym = sorted_syms[sym_idx]; + subtable_prefix = prefix; - unsigned extra_bits = codeword_len - table_bits; - - unsigned node_idx = cur_codeword >> extra_bits; - - /* Go through each bit of the current codeword - * beyond the prefix of length @table_bits and - * walk the appropriate binary tree, allocating - * any slots that have not yet been allocated. - * - * Note that the 'pointer' entry to the binary - * tree, which is stored in the direct lookup - * portion of the table, is represented - * identically to other internal (non-leaf) - * nodes of the binary tree; it can be thought - * of as simply the root of the tree. The - * representation of these internal nodes is - * simply the index of the left child combined - * with the special bits 0xC000 to distingush - * the entry from direct mapping and leaf node - * entries. */ - do { - - /* At least one bit remains in the - * codeword, but the current node is an - * unallocated leaf. Change it to an - * internal node. 
*/ - if (decode_table[node_idx] == 0) { - decode_table[node_idx] = - next_free_tree_slot | 0xC000; - decode_table[next_free_tree_slot++] = 0; - decode_table[next_free_tree_slot++] = 0; - } - - /* Go to the left child if the next bit - * in the codeword is 0; otherwise go to - * the right child. */ - node_idx = decode_table[node_idx] & 0x3FFF; - --extra_bits; - node_idx += (cur_codeword >> extra_bits) & 1; - } while (extra_bits != 0); - - /* We've traversed the tree using the entire - * codeword, and we're now at the entry where - * the actual symbol will be stored. This is - * distinguished from internal nodes by not - * having its high two bits set. */ - decode_table[node_idx] = sorted_sym; + /* + * Calculate the subtable length. If the codeword + * length exceeds 'table_bits' by n, then the subtable + * needs at least 2^n entries. But it may need more; if + * there are fewer than 2^n codewords of length + * 'table_bits + n' remaining, then n will need to be + * incremented to bring in longer codewords until the + * subtable can be filled completely. Note that it + * always will, eventually, be possible to fill the + * subtable, since it was previously verified that the + * code is complete. + */ + subtable_bits = codeword_len - table_bits; + remainder = (s32)1 << subtable_bits; + for (;;) { + remainder -= len_counts[table_bits + + subtable_bits]; + if (remainder <= 0) + break; + subtable_bits++; + remainder <<= 1; } + + /* Create the entry that points from the root table to + * the subtable. This entry contains the index of the + * start of the subtable and the number of bits with + * which the subtable is indexed (the log base 2 of the + * number of entries it contains). */ + decode_table[subtable_prefix] = + MAKE_DECODE_TABLE_ENTRY(subtable_pos, + subtable_bits); } - } + + /* Fill the subtable entries for this symbol. */ + u16 entry = MAKE_DECODE_TABLE_ENTRY(sorted_syms[sym_idx], + codeword_len - table_bits); + unsigned n = 1U << (subtable_bits - (codeword_len - + table_bits)); + do { + decode_table[subtable_pos++] = entry; + } while (--n); + + len_counts[codeword_len]--; + codeword++; + } while (++sym_idx < num_syms); + return 0; } diff --git a/src/decompress_common.h b/src/decompress_common.h index cfb035c..acbdb95 100644 --- a/src/decompress_common.h +++ b/src/decompress_common.h @@ -1,99 +1,36 @@ /* - * decompress_common.h - Code shared by the XPRESS and LZX decompressors + * decompress_common.h * - * Copyright (C) 2015 Eric Biggers + * Header for decompression code shared by multiple compression formats. * - * This program is free software: you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free Software - * Foundation, either version 2 of the License, or (at your option) any later - * version. + * The following copying information applies to this specific source code file: * - * This program is distributed in the hope that it will be useful, but WITHOUT + * Written in 2012-2016 by Eric Biggers + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide via the Creative Commons Zero 1.0 Universal Public Domain + * Dedication (the "CC0"). + * + * This software is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. 
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
  */
 
-#include 
+#ifndef _DECOMPRESS_COMMON_H
+#define _DECOMPRESS_COMMON_H
+
+#include 
 #include 
-#include 
-#include 
+#include "common_defs.h"
 
-/* "Force inline" macro (not required, but helpful for performance) */
-#ifdef __GNUC__
-# define forceinline inline __attribute__((always_inline))
-#else
-# define forceinline inline
-#endif
-
-/* Enable whole-word match copying on selected architectures */
-#if defined(__i386__) || defined(__x86_64__) || defined(__ARM_FEATURE_UNALIGNED)
-# define FAST_UNALIGNED_ACCESS
-#endif
-
-/* Size of a machine word */
-#define WORDBYTES (sizeof(size_t))
-
-/* Inline functions to read and write unaligned data.
- * We use just memcpy() for this. It is standard and modern compilers will
- * usually replace it with load/store instructions. */
-
-static forceinline u16 get_unaligned_le16(const u8 *p)
-{
-	le16 v_le;
-	memcpy(&v_le, p, 2);
-	return le16_to_cpu(v_le);
-}
-
-static forceinline u32 get_unaligned_le32(const u8 *p)
-{
-	le32 v_le;
-	memcpy(&v_le, p, 4);
-	return le32_to_cpu(v_le);
-}
-
-static forceinline void put_unaligned_le32(u32 v, u8 *p)
-{
-	le32 v_le = cpu_to_le32(v);
-	memcpy(p, &v_le, 4);
-}
-
-/* Load a "word" with platform-dependent size and endianness. */
-static forceinline size_t get_unaligned_word(const u8 *p)
-{
-	size_t v;
-	memcpy(&v, p, WORDBYTES);
-	return v;
-}
-
-/* Store a "word" with platform-dependent size and endianness. */
-static forceinline void put_unaligned_word(size_t v, u8 *p)
-{
-	memcpy(p, &v, WORDBYTES);
-}
-
-/* Copy a "word" with platform-dependent size. */
-static forceinline void copy_unaligned_word(const u8 *src, u8 *dst)
-{
-	put_unaligned_word(get_unaligned_word(src), dst);
-}
-
-/* Generate a "word" with platform-dependent size whose bytes all contain the
- * value 'b'. */
-static forceinline size_t repeat_byte(u8 b)
-{
-	size_t v;
-
-	v = b;
-	v |= v << 8;
-	v |= v << 16;
-	v |= v << ((WORDBYTES == 8) ? 32 : 0);
-	return v;
-}
+/******************************************************************************/
+/*                     Input bitstream for XPRESS and LZX                     */
+/*----------------------------------------------------------------------------*/
 
 /* Structure that encapsulates a block of in-memory data being interpreted as a
  * stream of bits, optionally with interwoven literal bytes. Bits are assumed
@@ -106,18 +43,18 @@ struct input_bitstream {
 	u32 bitbuf;
 
 	/* Number of bits currently held in @bitbuf. */
-	unsigned bitsleft;
+	u32 bitsleft;
 
 	/* Pointer to the next byte to be retrieved from the input buffer. */
 	const u8 *next;
 
-	/* Pointer to just past the end of the input buffer. */
+	/* Pointer past the end of the input buffer. */
 	const u8 *end;
 };
 
 /* Initialize a bitstream to read from the specified input buffer. */
-static forceinline void init_input_bitstream(struct input_bitstream *is,
-					     const void *buffer, u32 size)
+static forceinline void
+init_input_bitstream(struct input_bitstream *is, const void *buffer, u32 size)
 {
 	is->bitbuf = 0;
 	is->bitsleft = 0;
@@ -125,39 +62,60 @@ static forceinline void init_input_bitstream(struct input_bitstream *is,
 	is->end = is->next + size;
 }
 
+/* Note: for performance reasons, the following methods don't return error codes
+ * to the caller if the input buffer is overrun. 
Instead, they just assume that + * all overrun data is zeroes. This has no effect on well-formed compressed + * data. The only disadvantage is that bad compressed data may go undetected, + * but even this is irrelevant if higher level code checksums the uncompressed + * data anyway. */ + /* Ensure the bit buffer variable for the bitstream contains at least @num_bits * bits. Following this, bitstream_peek_bits() and/or bitstream_remove_bits() - * may be called on the bitstream to peek or remove up to @num_bits bits. Note - * that @num_bits must be <= 16. */ -static forceinline void bitstream_ensure_bits(struct input_bitstream *is, - unsigned num_bits) + * may be called on the bitstream to peek or remove up to @num_bits bits. */ +static forceinline void +bitstream_ensure_bits(struct input_bitstream *is, const unsigned num_bits) { - if (is->bitsleft < num_bits) { - if (is->end - is->next >= 2) { - is->bitbuf |= (u32)get_unaligned_le16(is->next) - << (16 - is->bitsleft); - is->next += 2; - } - is->bitsleft += 16; + /* This currently works for at most 17 bits. */ + + if (is->bitsleft >= num_bits) + return; + + if (unlikely(is->end - is->next < 2)) + goto overflow; + + is->bitbuf |= (u32)get_unaligned_le16(is->next) << (16 - is->bitsleft); + is->next += 2; + is->bitsleft += 16; + + if (unlikely(num_bits == 17 && is->bitsleft == 16)) { + if (unlikely(is->end - is->next < 2)) + goto overflow; + + is->bitbuf |= (u32)get_unaligned_le16(is->next); + is->next += 2; + is->bitsleft = 32; } + + return; + +overflow: + is->bitsleft = 32; } /* Return the next @num_bits bits from the bitstream, without removing them. * There must be at least @num_bits remaining in the buffer variable, from a * previous call to bitstream_ensure_bits(). */ -static forceinline u32 bitstream_peek_bits(const struct input_bitstream *is, - unsigned num_bits) +static forceinline u32 +bitstream_peek_bits(const struct input_bitstream *is, const unsigned num_bits) { - if (num_bits == 0) - return 0; - return is->bitbuf >> (32 - num_bits); + return (is->bitbuf >> 1) >> (sizeof(is->bitbuf) * 8 - num_bits - 1); } /* Remove @num_bits from the bitstream. There must be at least @num_bits * remaining in the buffer variable, from a previous call to * bitstream_ensure_bits(). */ -static forceinline void bitstream_remove_bits(struct input_bitstream *is, - unsigned num_bits) +static forceinline void +bitstream_remove_bits(struct input_bitstream *is, unsigned num_bits) { is->bitbuf <<= num_bits; is->bitsleft -= num_bits; @@ -166,8 +124,8 @@ static forceinline void bitstream_remove_bits(struct input_bitstream *is, /* Remove and return @num_bits bits from the bitstream. There must be at least * @num_bits remaining in the buffer variable, from a previous call to * bitstream_ensure_bits(). */ -static forceinline u32 bitstream_pop_bits(struct input_bitstream *is, - unsigned num_bits) +static forceinline u32 +bitstream_pop_bits(struct input_bitstream *is, unsigned num_bits) { u32 bits = bitstream_peek_bits(is, num_bits); bitstream_remove_bits(is, num_bits); @@ -175,27 +133,29 @@ static forceinline u32 bitstream_pop_bits(struct input_bitstream *is, } /* Read and return the next @num_bits bits from the bitstream. */ -static forceinline u32 bitstream_read_bits(struct input_bitstream *is, - unsigned num_bits) +static forceinline u32 +bitstream_read_bits(struct input_bitstream *is, unsigned num_bits) { bitstream_ensure_bits(is, num_bits); return bitstream_pop_bits(is, num_bits); } /* Read and return the next literal byte embedded in the bitstream. 
*/ -static forceinline u8 bitstream_read_byte(struct input_bitstream *is) +static forceinline u8 +bitstream_read_byte(struct input_bitstream *is) { - if (is->end == is->next) + if (unlikely(is->end == is->next)) return 0; return *is->next++; } /* Read and return the next 16-bit integer embedded in the bitstream. */ -static forceinline u16 bitstream_read_u16(struct input_bitstream *is) +static forceinline u16 +bitstream_read_u16(struct input_bitstream *is) { u16 v; - if (is->end - is->next < 2) + if (unlikely(is->end - is->next < 2)) return 0; v = get_unaligned_le16(is->next); is->next += 2; @@ -203,11 +163,12 @@ static forceinline u16 bitstream_read_u16(struct input_bitstream *is) } /* Read and return the next 32-bit integer embedded in the bitstream. */ -static forceinline u32 bitstream_read_u32(struct input_bitstream *is) +static forceinline u32 +bitstream_read_u32(struct input_bitstream *is) { u32 v; - if (is->end - is->next < 4) + if (unlikely(is->end - is->next < 4)) return 0; v = get_unaligned_le32(is->next); is->next += 4; @@ -215,161 +176,370 @@ static forceinline u32 bitstream_read_u32(struct input_bitstream *is) } /* Read into @dst_buffer an array of literal bytes embedded in the bitstream. - * Return either a pointer to the byte past the last written, or NULL if the - * read overflows the input buffer. */ -static forceinline void *bitstream_read_bytes(struct input_bitstream *is, - void *dst_buffer, size_t count) + * Return 0 if there were enough bytes remaining in the input, otherwise -1. */ +static forceinline int +bitstream_read_bytes(struct input_bitstream *is, void *dst_buffer, size_t count) { - if ((size_t)(is->end - is->next) < count) - return NULL; + if (unlikely(is->end - is->next < count)) + return -1; memcpy(dst_buffer, is->next, count); is->next += count; - return (u8 *)dst_buffer + count; + return 0; } /* Align the input bitstream on a coding-unit boundary. */ -static forceinline void bitstream_align(struct input_bitstream *is) +static forceinline void +bitstream_align(struct input_bitstream *is) { is->bitsleft = 0; is->bitbuf = 0; } -extern int make_huffman_decode_table(u16 decode_table[], const unsigned num_syms, - const unsigned num_bits, const u8 lens[], - const unsigned max_codeword_len, - u16 working_space[]); +/******************************************************************************/ +/* Huffman decoding */ +/*----------------------------------------------------------------------------*/ +/* + * Required alignment for the Huffman decode tables. We require this alignment + * so that we can fill the entries with vector or word instructions and not have + * to deal with misaligned buffers. + */ +#define DECODE_TABLE_ALIGNMENT 16 -/* Reads and returns the next Huffman-encoded symbol from a bitstream. If the - * input data is exhausted, the Huffman symbol is decoded as if the missing bits - * are all zeroes. */ -static forceinline unsigned read_huffsym(struct input_bitstream *istream, - const u16 decode_table[], - unsigned table_bits, - unsigned max_codeword_len) +/* + * Each decode table entry is 16 bits divided into two fields: 'symbol' (high 12 + * bits) and 'length' (low 4 bits). 
The precise meaning of these fields depends + * on the type of entry: + * + * Root table entries which are *not* subtable pointers: + * symbol: symbol to decode + * length: codeword length in bits + * + * Root table entries which are subtable pointers: + * symbol: index of start of subtable + * length: number of bits with which the subtable is indexed + * + * Subtable entries: + * symbol: symbol to decode + * length: codeword length in bits, minus the number of bits with which the + * root table is indexed + */ +#define DECODE_TABLE_SYMBOL_SHIFT 4 +#define DECODE_TABLE_MAX_SYMBOL ((1 << (16 - DECODE_TABLE_SYMBOL_SHIFT)) - 1) +#define DECODE_TABLE_MAX_LENGTH ((1 << DECODE_TABLE_SYMBOL_SHIFT) - 1) +#define DECODE_TABLE_LENGTH_MASK DECODE_TABLE_MAX_LENGTH +#define MAKE_DECODE_TABLE_ENTRY(symbol, length) \ + (((symbol) << DECODE_TABLE_SYMBOL_SHIFT) | (length)) + +/* + * Read and return the next Huffman-encoded symbol from the given bitstream + * using the given decode table. + * + * If the input data is exhausted, then the Huffman symbol will be decoded as if + * the missing bits were all zeroes. + * + * XXX: This is mostly duplicated in lzms_decode_huffman_symbol() in + * lzms_decompress.c; keep them in sync! + */ +static forceinline unsigned +read_huffsym(struct input_bitstream *is, const u16 decode_table[], + unsigned table_bits, unsigned max_codeword_len) { unsigned entry; - unsigned key_bits; + unsigned symbol; + unsigned length; - bitstream_ensure_bits(istream, max_codeword_len); + /* Preload the bitbuffer with 'max_codeword_len' bits so that we're + * guaranteed to be able to fully decode a codeword. */ + bitstream_ensure_bits(is, max_codeword_len); - /* Index the decode table by the next table_bits bits of the input. */ - key_bits = bitstream_peek_bits(istream, table_bits); - entry = decode_table[key_bits]; - if (entry < 0xC000) { - /* Fast case: The decode table directly provided the - * symbol and codeword length. The low 11 bits are the - * symbol, and the high 5 bits are the codeword length. */ - bitstream_remove_bits(istream, entry >> 11); - return entry & 0x7FF; - } else { - /* Slow case: The codeword for the symbol is longer than - * table_bits, so the symbol does not have an entry - * directly in the first (1 << table_bits) entries of the - * decode table. Traverse the appropriate binary tree - * bit-by-bit to decode the symbol. */ - bitstream_remove_bits(istream, table_bits); - do { - key_bits = (entry & 0x3FFF) + bitstream_pop_bits(istream, 1); - } while ((entry = decode_table[key_bits]) >= 0xC000); - return entry; + /* Index the root table by the next 'table_bits' bits of input. */ + entry = decode_table[bitstream_peek_bits(is, table_bits)]; + + /* Extract the "symbol" and "length" from the entry. */ + symbol = entry >> DECODE_TABLE_SYMBOL_SHIFT; + length = entry & DECODE_TABLE_LENGTH_MASK; + + /* If the root table is indexed by the full 'max_codeword_len' bits, + * then there cannot be any subtables, and this will be known at compile + * time. Otherwise, we must check whether the decoded symbol is really + * a subtable pointer. If so, we must discard the bits with which the + * root table was indexed, then index the subtable by the next 'length' + * bits of input to get the real entry. 
*/ + if (max_codeword_len > table_bits && + entry >= (1U << (table_bits + DECODE_TABLE_SYMBOL_SHIFT))) + { + /* Subtable required */ + bitstream_remove_bits(is, table_bits); + entry = decode_table[symbol + bitstream_peek_bits(is, length)]; + symbol = entry >> DECODE_TABLE_SYMBOL_SHIFT; + length = entry & DECODE_TABLE_LENGTH_MASK; } + + /* Discard the bits (or the remaining bits, if a subtable was required) + * of the codeword. */ + bitstream_remove_bits(is, length); + + /* Return the decoded symbol. */ + return symbol; } /* - * Copy an LZ77 match at (dst - offset) to dst. + * The DECODE_TABLE_ENOUGH() macro evaluates to the maximum number of decode + * table entries, including all subtable entries, that may be required for + * decoding a given Huffman code. This depends on three parameters: * - * The length and offset must be already validated --- that is, (dst - offset) - * can't underrun the output buffer, and (dst + length) can't overrun the output - * buffer. Also, the length cannot be 0. + * num_syms: the maximum number of symbols in the code + * table_bits: the number of bits with which the root table will be indexed + * max_codeword_len: the maximum allowed codeword length in the code * - * @bufend points to the byte past the end of the output buffer. This function - * won't write any data beyond this position. - * - * Returns dst + length. + * Given these parameters, the utility program 'enough' from zlib, when passed + * the three arguments 'num_syms', 'table_bits', and 'max_codeword_len', will + * compute the maximum number of entries required. This has already been done + * for the combinations we need and incorporated into the macro below so that + * the mapping can be done at compilation time. If an unknown combination is + * used, then a compilation error will result. To fix this, use 'enough' to + * find the missing value and add it below. If that still doesn't fix the + * compilation error, then most likely a constraint would be violated by the + * requested parameters, so they cannot be used, at least without other changes + * to the decode table --- see DECODE_TABLE_SIZE(). */ -static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend, - u32 min_length) +#define DECODE_TABLE_ENOUGH(num_syms, table_bits, max_codeword_len) ( \ + ((num_syms) == 8 && (table_bits) == 7 && (max_codeword_len) == 15) ? 128 : \ + ((num_syms) == 8 && (table_bits) == 5 && (max_codeword_len) == 7) ? 36 : \ + ((num_syms) == 8 && (table_bits) == 6 && (max_codeword_len) == 7) ? 66 : \ + ((num_syms) == 8 && (table_bits) == 7 && (max_codeword_len) == 7) ? 128 : \ + ((num_syms) == 20 && (table_bits) == 5 && (max_codeword_len) == 15) ? 1062 : \ + ((num_syms) == 20 && (table_bits) == 6 && (max_codeword_len) == 15) ? 582 : \ + ((num_syms) == 20 && (table_bits) == 7 && (max_codeword_len) == 15) ? 390 : \ + ((num_syms) == 54 && (table_bits) == 9 && (max_codeword_len) == 15) ? 618 : \ + ((num_syms) == 54 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1098 : \ + ((num_syms) == 249 && (table_bits) == 9 && (max_codeword_len) == 16) ? 878 : \ + ((num_syms) == 249 && (table_bits) == 10 && (max_codeword_len) == 16) ? 1326 : \ + ((num_syms) == 249 && (table_bits) == 11 && (max_codeword_len) == 16) ? 2318 : \ + ((num_syms) == 256 && (table_bits) == 9 && (max_codeword_len) == 15) ? 822 : \ + ((num_syms) == 256 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1302 : \ + ((num_syms) == 256 && (table_bits) == 11 && (max_codeword_len) == 15) ? 
2310 : \ + ((num_syms) == 512 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1558 : \ + ((num_syms) == 512 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2566 : \ + ((num_syms) == 512 && (table_bits) == 12 && (max_codeword_len) == 15) ? 4606 : \ + ((num_syms) == 656 && (table_bits) == 10 && (max_codeword_len) == 16) ? 1734 : \ + ((num_syms) == 656 && (table_bits) == 11 && (max_codeword_len) == 16) ? 2726 : \ + ((num_syms) == 656 && (table_bits) == 12 && (max_codeword_len) == 16) ? 4758 : \ + ((num_syms) == 799 && (table_bits) == 9 && (max_codeword_len) == 15) ? 1366 : \ + ((num_syms) == 799 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1846 : \ + ((num_syms) == 799 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2854 : \ + -1) + +/* Wrapper around DECODE_TABLE_ENOUGH() that does additional compile-time + * validation. */ +#define DECODE_TABLE_SIZE(num_syms, table_bits, max_codeword_len) ( \ + \ + /* All values must be positive. */ \ + STATIC_ASSERT_ZERO((num_syms) > 0) + \ + STATIC_ASSERT_ZERO((table_bits) > 0) + \ + STATIC_ASSERT_ZERO((max_codeword_len) > 0) + \ + \ + /* There cannot be more symbols than possible codewords. */ \ + STATIC_ASSERT_ZERO((num_syms) <= 1U << (max_codeword_len)) + \ + \ + /* There is no reason for the root table to be indexed with + * more bits than the maximum codeword length. */ \ + STATIC_ASSERT_ZERO((table_bits) <= (max_codeword_len)) + \ + \ + /* The maximum symbol value must fit in the 'symbol' field. */ \ + STATIC_ASSERT_ZERO((num_syms) - 1 <= DECODE_TABLE_MAX_SYMBOL) + \ + \ + /* The maximum codeword length in the root table must fit in + * the 'length' field. */ \ + STATIC_ASSERT_ZERO((table_bits) <= DECODE_TABLE_MAX_LENGTH) + \ + \ + /* The maximum codeword length in a subtable must fit in the + * 'length' field. */ \ + STATIC_ASSERT_ZERO((max_codeword_len) - (table_bits) <= \ + DECODE_TABLE_MAX_LENGTH) + \ + \ + /* The minimum subtable index must be greater than the maximum + * symbol value. If this were not the case, then there would + * be no way to tell whether a given root table entry is a + * "subtable pointer" or not. (An alternate solution would be + * to reserve a flag bit specifically for this purpose.) */ \ + STATIC_ASSERT_ZERO((1U << table_bits) > (num_syms) - 1) + \ + \ + /* The needed 'enough' value must have been defined. */ \ + STATIC_ASSERT_ZERO(DECODE_TABLE_ENOUGH( \ + (num_syms), (table_bits), \ + (max_codeword_len)) > 0) + \ + \ + /* The maximum subtable index must fit in the 'symbol' field. */\ + STATIC_ASSERT_ZERO(DECODE_TABLE_ENOUGH( \ + (num_syms), (table_bits), \ + (max_codeword_len)) - 1 <= \ + DECODE_TABLE_MAX_SYMBOL) + \ + \ + /* Finally, make the macro evaluate to the needed maximum + * number of decode table entries. */ \ + DECODE_TABLE_ENOUGH((num_syms), (table_bits), \ + (max_codeword_len)) \ +) + +/* + * Declare the decode table for a Huffman code, given several compile-time + * constants that describe the code. See DECODE_TABLE_ENOUGH() for details. + * + * Decode tables must be aligned to a DECODE_TABLE_ALIGNMENT-byte boundary. + * This implies that if a decode table is nested inside a dynamically allocated + * structure, then the outer structure must be allocated on a + * DECODE_TABLE_ALIGNMENT-byte aligned boundary as well. 
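+ *
+ * (The aligned_malloc() and aligned_free() helpers declared in common_defs.h
+ * provide a portable way to obtain such aligned heap allocations.)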
+ */ +#define DECODE_TABLE(name, num_syms, table_bits, max_codeword_len) \ + u16 name[DECODE_TABLE_SIZE((num_syms), (table_bits), \ + (max_codeword_len))] \ + _aligned_attribute(DECODE_TABLE_ALIGNMENT) + +/* + * Declare the temporary "working_space" array needed for building the decode + * table for a Huffman code. + */ +#define DECODE_TABLE_WORKING_SPACE(name, num_syms, max_codeword_len) \ + u16 name[2 * ((max_codeword_len) + 1) + (num_syms)]; + +extern int +make_huffman_decode_table(u16 decode_table[], unsigned num_syms, + unsigned table_bits, const u8 lens[], + unsigned max_codeword_len, u16 working_space[]); + +/******************************************************************************/ +/* LZ match copying */ +/*----------------------------------------------------------------------------*/ + +static forceinline void +copy_word_unaligned(const void *src, void *dst) { - const u8 *src = dst - offset; + store_word_unaligned(load_word_unaligned(src), dst); +} + +static forceinline machine_word_t +repeat_u16(u16 b) +{ + machine_word_t v = b; + + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + v |= v << 16; + v |= v << ((WORDBITS == 64) ? 32 : 0); + return v; +} + +static forceinline machine_word_t +repeat_byte(u8 b) +{ + return repeat_u16(((u16)b << 8) | b); +} + +/* + * Copy an LZ77 match of 'length' bytes from the match source at 'out_next - + * offset' to the match destination at 'out_next'. The source and destination + * may overlap. + * + * This handles validating the length and offset. It is validated that the + * beginning of the match source is '>= out_begin' and that the end of the match + * destination is '<= out_end'. The return value is 0 if the match was valid + * (and was copied), otherwise -1. + * + * 'min_length' is a hint which specifies the minimum possible match length. + * This should be a compile-time constant. + */ +static forceinline int +lz_copy(u32 length, u32 offset, u8 *out_begin, u8 *out_next, u8 *out_end, + u32 min_length) +{ + const u8 *src; + u8 *end; + + /* Validate the offset. */ + if (unlikely(offset > out_next - out_begin)) + return -1; /* - * Try to copy one machine word at a time. On i386 and x86_64 this is - * faster than copying one byte at a time, unless the data is - * near-random and all the matches have very short lengths. Note that - * since this requires unaligned memory accesses, it won't necessarily - * be faster on every architecture. + * Fast path: copy a match which is no longer than a few words, is not + * overlapped such that copying a word at a time would produce incorrect + * results, and is not too close to the end of the buffer. Note that + * this might copy more than the length of the match, but that's okay in + * this scenario. + */ + src = out_next - offset; + if (UNALIGNED_ACCESS_IS_FAST && length <= 3 * WORDBYTES && + offset >= WORDBYTES && out_end - out_next >= 3 * WORDBYTES) + { + copy_word_unaligned(src + WORDBYTES*0, out_next + WORDBYTES*0); + copy_word_unaligned(src + WORDBYTES*1, out_next + WORDBYTES*1); + copy_word_unaligned(src + WORDBYTES*2, out_next + WORDBYTES*2); + return 0; + } + + /* Validate the length. This isn't needed in the fast path above, due + * to the additional conditions tested, but we do need it here. */ + if (unlikely(length > out_end - out_next)) + return -1; + end = out_next + length; + + /* + * Try to copy one word at a time. On i386 and x86_64 this is faster + * than copying one byte at a time, unless the data is near-random and + * all the matches have very short lengths.
Note that since this + * requires unaligned memory accesses, it won't necessarily be faster on + * every architecture. * * Also note that we might copy more than the length of the match. For * example, if a word is 8 bytes and the match is of length 5, then * we'll simply copy 8 bytes. This is okay as long as we don't write - * beyond the end of the output buffer, hence the check for (bufend - + * beyond the end of the output buffer, hence the check for (out_end - * end >= WORDBYTES - 1). */ -#ifdef FAST_UNALIGNED_ACCESS - u8 * const end = dst + length; - if (bufend - end >= (ptrdiff_t)(WORDBYTES - 1)) { - + if (UNALIGNED_ACCESS_IS_FAST && likely(out_end - end >= WORDBYTES - 1)) + { if (offset >= WORDBYTES) { - /* The source and destination words don't overlap. */ - - /* To improve branch prediction, one iteration of this - * loop is unrolled. Most matches are short and will - * fail the first check. But if that check passes, then - * it becomes increasing likely that the match is long - * and we'll need to continue copying. */ - - copy_unaligned_word(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - - if (dst < end) { - do { - copy_unaligned_word(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - } while (dst < end); - } - return end; + /* The source and destination words don't overlap. */ + do { + copy_word_unaligned(src, out_next); + src += WORDBYTES; + out_next += WORDBYTES; + } while (out_next < end); + return 0; } else if (offset == 1) { - /* Offset 1 matches are equivalent to run-length * encoding of the previous byte. This case is common - * if the data contains many repeated bytes. */ - - size_t v = repeat_byte(*(dst - 1)); + * if the data contains many repeated bytes. */ + machine_word_t v = repeat_byte(*(out_next - 1)); do { - put_unaligned_word(v, dst); + store_word_unaligned(v, out_next); src += WORDBYTES; - dst += WORDBYTES; - } while (dst < end); - return end; + out_next += WORDBYTES; + } while (out_next < end); + return 0; } /* * We don't bother with special cases for other 'offset < - * WORDBYTES', which are usually rarer than 'offset == 1'. Extra - * checks will just slow things down. Actually, it's possible - * to handle all the 'offset < WORDBYTES' cases using the same - * code, but it still becomes more complicated doesn't seem any - * faster overall; it definitely slows down the more common - * 'offset == 1' case. + * WORDBYTES', which are usually rarer than 'offset == 1'. + * Extra checks will just slow things down. Actually, it's + * possible to handle all the 'offset < WORDBYTES' cases using + * the same code, but it still becomes more complicated and + * doesn't seem any faster overall; it definitely slows down the + * more common 'offset == 1' case. */ } -#endif /* FAST_UNALIGNED_ACCESS */ /* Fall back to a bytewise copy. */ - - if (min_length >= 2) { - *dst++ = *src++; - length--; - } - if (min_length >= 3) { - *dst++ = *src++; - length--; - } + if (min_length >= 2) + *out_next++ = *src++; + if (min_length >= 3) + *out_next++ = *src++; + if (min_length >= 4) + *out_next++ = *src++; do { - *dst++ = *src++; - } while (--length); - - return dst; + *out_next++ = *src++; + } while (out_next != end); + return 0; } + +#endif /* _DECOMPRESS_COMMON_H */ diff --git a/src/lzx_common.c b/src/lzx_common.c new file mode 100644 index 0000000..ed9475a --- /dev/null +++ b/src/lzx_common.c @@ -0,0 +1,324 @@ +/* + * lzx_common.c - Common code for LZX compression and decompression.
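+ *
+ * As an illustrative sketch (decode_offset() is a hypothetical helper, not
+ * part of this file), the two tables defined below recover a match offset
+ * from an offset slot and the value of that slot's extra bits:
+ *
+ *	static s32 decode_offset(unsigned slot, u32 extra)
+ *	{
+ *		return lzx_offset_slot_base[slot] + (s32)extra;
+ *	}
+ *
+ * where 'extra' holds lzx_extra_offset_bits[slot] bits read from the
+ * bitstream.  For example, slot 4 has base 2 and 1 extra bit, so it covers
+ * offsets 2 and 3.  This matches the explicit-offset path in
+ * lzx_decompress.c.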
+ */ + +/* + * Copyright (C) 2012-2016 Eric Biggers + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <string.h> + +#ifdef __SSE2__ +# include <emmintrin.h> +#endif + +#ifdef __AVX2__ +# include <immintrin.h> +#endif + +#include "common_defs.h" +#include "lzx_common.h" + +/* Mapping: offset slot => first match offset that uses that offset slot. + * The offset slots for repeat offsets map to "fake" offsets < 1. */ +const s32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1] = { + -2 , -1 , 0 , 1 , 2 , /* 0 --- 4 */ + 4 , 6 , 10 , 14 , 22 , /* 5 --- 9 */ + 30 , 46 , 62 , 94 , 126 , /* 10 --- 14 */ + 190 , 254 , 382 , 510 , 766 , /* 15 --- 19 */ + 1022 , 1534 , 2046 , 3070 , 4094 , /* 20 --- 24 */ + 6142 , 8190 , 12286 , 16382 , 24574 , /* 25 --- 29 */ + 32766 , 49150 , 65534 , 98302 , 131070 , /* 30 --- 34 */ + 196606 , 262142 , 393214 , 524286 , 655358 , /* 35 --- 39 */ + 786430 , 917502 , 1048574, 1179646, 1310718, /* 40 --- 44 */ + 1441790, 1572862, 1703934, 1835006, 1966078, /* 45 --- 49 */ + 2097150 /* extra */ +}; + +/* Mapping: offset slot => how many extra bits must be read and added to the + * corresponding offset slot base to decode the match offset. */ +const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS] = { + 0 , 0 , 0 , 0 , 1 , + 1 , 2 , 2 , 3 , 3 , + 4 , 4 , 5 , 5 , 6 , + 6 , 7 , 7 , 8 , 8 , + 9 , 9 , 10, 10, 11, + 11, 12, 12, 13, 13, + 14, 14, 15, 15, 16, + 16, 17, 17, 17, 17, + 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, +}; + +/* Round the specified buffer size up to the next valid LZX window size, and + * return its order (log2). Or, if the buffer size is 0 or greater than the + * largest valid LZX window size, return 0. */ +unsigned +lzx_get_window_order(size_t max_bufsize) +{ + if (max_bufsize == 0 || max_bufsize > LZX_MAX_WINDOW_SIZE) + return 0; + + return max(ilog2_ceil(max_bufsize), LZX_MIN_WINDOW_ORDER); +} + +/* Given a valid LZX window order, return the number of symbols that will exist + * in the main Huffman code. */ +unsigned +lzx_get_num_main_syms(unsigned window_order) +{ + /* Note: one would expect that the maximum match offset would be + * 'window_size - LZX_MIN_MATCH_LEN', which would occur if the first two + * bytes were to match the last two bytes. However, the format + * disallows this case. This reduces the number of needed offset slots + * by 1.
*/ + u32 window_size = (u32)1 << window_order; + u32 max_offset = window_size - LZX_MIN_MATCH_LEN - 1; + unsigned num_offset_slots = 30; + while (max_offset >= lzx_offset_slot_base[num_offset_slots]) + num_offset_slots++; + + return LZX_NUM_CHARS + (num_offset_slots * LZX_NUM_LEN_HEADERS); +} + +static void +do_translate_target(void *target, s32 input_pos) +{ + s32 abs_offset, rel_offset; + + rel_offset = get_unaligned_le32(target); + if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) { + if (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos) { + /* "good translation" */ + abs_offset = rel_offset + input_pos; + } else { + /* "compensating translation" */ + abs_offset = rel_offset - LZX_WIM_MAGIC_FILESIZE; + } + put_unaligned_le32(abs_offset, target); + } +} + +static void +undo_translate_target(void *target, s32 input_pos) +{ + s32 abs_offset, rel_offset; + + abs_offset = get_unaligned_le32(target); + if (abs_offset >= 0) { + if (abs_offset < LZX_WIM_MAGIC_FILESIZE) { + /* "good translation" */ + rel_offset = abs_offset - input_pos; + put_unaligned_le32(rel_offset, target); + } + } else { + if (abs_offset >= -input_pos) { + /* "compensating translation" */ + rel_offset = abs_offset + LZX_WIM_MAGIC_FILESIZE; + put_unaligned_le32(rel_offset, target); + } + } +} + +/* + * Do or undo the 'E8' preprocessing used in LZX. Before compression, the + * uncompressed data is preprocessed by changing the targets of x86 CALL + * instructions from relative offsets to absolute offsets. After decompression, + * the translation is undone by changing the targets of x86 CALL instructions + * from absolute offsets to relative offsets. + * + * Note that despite its intent, E8 preprocessing can be done on any data even + * if it is not actually x86 machine code. In fact, E8 preprocessing appears to + * always be used in LZX-compressed resources in WIM files; there is no bit to + * indicate whether it is used or not, unlike in the LZX compressed format as + * used in cabinet files, where a bit is reserved for that purpose. + * + * E8 preprocessing is disabled in the last 6 bytes of the uncompressed data, + * which really means the 5-byte call instruction cannot start in the last 10 + * bytes of the uncompressed data. This is one of the errors in the LZX + * documentation. + * + * E8 preprocessing does not appear to be disabled after the 32768th chunk of a + * WIM resource, which apparently is another difference from the LZX compression + * used in cabinet files. + * + * E8 processing is supposed to take the file size as a parameter, as it is used + * in calculating the translated jump targets. But in WIM files, this file size + * is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000). + */ +static void +lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) +{ + +#if !defined(__SSE2__) && !defined(__AVX2__) + /* + * A worthwhile optimization is to push the end-of-buffer check into the + * relatively rare E8 case. This is possible if we replace the last six + * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte + * before reaching end-of-buffer. In addition, this scheme guarantees + * that no translation can begin following an E8 byte in the last 10 + * bytes because a 4-byte offset containing E8 as its high byte is a + * large negative number that is not valid for translation. That is + * exactly what we need. 
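	 *
	 * (Worked example, for illustration: with size == 16, the six bytes
	 * data[10..15] are saved and temporarily overwritten with 0xE8, so
	 * the scan loop below needs no bounds check until it finds an E8
	 * byte; the saved bytes are restored after the scan reaches the
	 * tail.)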
+ */ + u8 *tail; + u8 saved_bytes[6]; + u8 *p; + + if (size <= 10) + return; + + tail = &data[size - 6]; + memcpy(saved_bytes, tail, 6); + memset(tail, 0xE8, 6); + p = data; + for (;;) { + while (*p != 0xE8) + p++; + if (p >= tail) + break; + (*process_target)(p + 1, p - data); + p += 5; + } + memcpy(tail, saved_bytes, 6); +#else + /* SSE2 or AVX2 optimized version for x86_64 */ + + u8 *p = data; + u64 valid_mask = ~0; + + if (size <= 10) + return; +#ifdef __AVX2__ +# define ALIGNMENT_REQUIRED 32 +#else +# define ALIGNMENT_REQUIRED 16 +#endif + + /* Process one byte at a time until the pointer is properly aligned. */ + while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) { + if (p >= data + size - 10) + return; + if (*p == 0xE8 && (valid_mask & 1)) { + (*process_target)(p + 1, p - data); + valid_mask &= ~0x1F; + } + p++; + valid_mask >>= 1; + valid_mask |= (u64)1 << 63; + } + + if (data + size - p >= 64) { + + /* Vectorized processing */ + + /* Note: we use a "trap" E8 byte to eliminate the need to check + * for end-of-buffer in the inner loop. This byte is carefully + * positioned so that it will never be changed by a previous + * translation before it is detected. */ + + u8 *trap = p + ((data + size - p) & ~31) - 32 + 4; + u8 saved_byte = *trap; + *trap = 0xE8; + + for (;;) { + u32 e8_mask; + u8 *orig_p = p; + #ifdef __AVX2__ + const __m256i e8_bytes = _mm256_set1_epi8(0xE8); + for (;;) { + __m256i bytes = *(const __m256i *)p; + __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); + e8_mask = _mm256_movemask_epi8(cmpresult); + if (e8_mask) + break; + p += 32; + } + #else + const __m128i e8_bytes = _mm_set1_epi8(0xE8); + for (;;) { + /* Read the next 32 bytes of data and test them + * for E8 bytes. */ + __m128i bytes1 = *(const __m128i *)p; + __m128i bytes2 = *(const __m128i *)(p + 16); + __m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes); + __m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes); + u32 mask1 = _mm_movemask_epi8(cmpresult1); + u32 mask2 = _mm_movemask_epi8(cmpresult2); + /* The masks have a bit set for each E8 byte. + * We stay in this fast inner loop as long as + * there are no E8 bytes. */ + if (mask1 | mask2) { + e8_mask = mask1 | (mask2 << 16); + break; + } + p += 32; + } + #endif + + /* Did we pass over data with no E8 bytes? */ + if (p != orig_p) + valid_mask = ~0; + + /* Are we nearing end-of-buffer? */ + if (p == trap - 4) + break; + + /* Process the E8 bytes. However, the AND with + * 'valid_mask' ensures we never process an E8 byte that + * was itself part of a translation target. */ + while ((e8_mask &= valid_mask)) { + unsigned bit = bsf32(e8_mask); + (*process_target)(p + bit + 1, p + bit - data); + valid_mask &= ~((u64)0x1F << bit); + } + + valid_mask >>= 32; + valid_mask |= 0xFFFFFFFF00000000; + p += 32; + } + + *trap = saved_byte; + } + + /* Approaching the end of the buffer; process one byte at a time.
*/ + while (p < data + size - 10) { + if (*p == 0xE8 && (valid_mask & 1)) { + (*process_target)(p + 1, p - data); + valid_mask &= ~0x1F; + } + p++; + valid_mask >>= 1; + valid_mask |= (u64)1 << 63; + } +#endif /* __SSE2__ || __AVX2__ */ +} + +void +lzx_preprocess(u8 *data, u32 size) +{ + lzx_e8_filter(data, size, do_translate_target); +} + +void +lzx_postprocess(u8 *data, u32 size) +{ + lzx_e8_filter(data, size, undo_translate_target); +} diff --git a/src/lzx_common.h b/src/lzx_common.h new file mode 100644 index 0000000..bbdc5a0 --- /dev/null +++ b/src/lzx_common.h @@ -0,0 +1,29 @@ +/* + * lzx_common.h + * + * Declarations shared between LZX compression and decompression. + */ + +#ifndef _LZX_COMMON_H +#define _LZX_COMMON_H + +#include "lzx_constants.h" +#include "common_defs.h" + +extern const s32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1]; + +extern const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS]; + +extern unsigned +lzx_get_window_order(size_t max_bufsize); + +extern unsigned +lzx_get_num_main_syms(unsigned window_order); + +extern void +lzx_preprocess(u8 *data, u32 size); + +extern void +lzx_postprocess(u8 *data, u32 size); + +#endif /* _LZX_COMMON_H */ diff --git a/src/lzx_constants.h b/src/lzx_constants.h new file mode 100644 index 0000000..20c4c62 --- /dev/null +++ b/src/lzx_constants.h @@ -0,0 +1,103 @@ +/* + * lzx_constants.h + * + * Constants for the LZX compression format. + */ + +#ifndef _LZX_CONSTANTS_H +#define _LZX_CONSTANTS_H + +/* Number of literal byte values. */ +#define LZX_NUM_CHARS 256 + +/* The smallest and largest allowed match lengths. */ +#define LZX_MIN_MATCH_LEN 2 +#define LZX_MAX_MATCH_LEN 257 + +/* Number of distinct match lengths that can be represented. */ +#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) + +/* Number of match lengths for which no length symbol is required. */ +#define LZX_NUM_PRIMARY_LENS 7 +#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1) + +/* Valid values of the 3-bit block type field. */ +#define LZX_BLOCKTYPE_VERBATIM 1 +#define LZX_BLOCKTYPE_ALIGNED 2 +#define LZX_BLOCKTYPE_UNCOMPRESSED 3 + +/* 'LZX_MIN_WINDOW_SIZE' and 'LZX_MAX_WINDOW_SIZE' are the minimum and maximum + * sizes of the sliding window. */ +#define LZX_MIN_WINDOW_ORDER 15 +#define LZX_MAX_WINDOW_ORDER 21 +#define LZX_MIN_WINDOW_SIZE (1UL << LZX_MIN_WINDOW_ORDER) /* 32768 */ +#define LZX_MAX_WINDOW_SIZE (1UL << LZX_MAX_WINDOW_ORDER) /* 2097152 */ + +/* Maximum number of offset slots. (The actual number of offset slots depends + * on the window size.) */ +#define LZX_MAX_OFFSET_SLOTS 50 + +/* Maximum number of symbols in the main code. (The actual number of symbols in + * the main code depends on the window size.) */ +#define LZX_MAINCODE_MAX_NUM_SYMBOLS \ + (LZX_NUM_CHARS + (LZX_MAX_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS)) + +/* Number of symbols in the length code. */ +#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS) + +/* Number of symbols in the pre-code. */ +#define LZX_PRECODE_NUM_SYMBOLS 20 + +/* Number of bits in which each pre-code codeword length is represented. */ +#define LZX_PRECODE_ELEMENT_SIZE 4 + +/* Number of low-order bits of each match offset that are entropy-encoded in + * aligned offset blocks. */ +#define LZX_NUM_ALIGNED_OFFSET_BITS 3 + +/* Number of symbols in the aligned offset code. */ +#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS) + +/* Mask for the match offset bits that are entropy-encoded in aligned offset + * blocks. 
*/ +#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1) + +/* Number of bits in which each aligned offset codeword length is represented. */ +#define LZX_ALIGNEDCODE_ELEMENT_SIZE 3 + +/* The first offset slot which requires an aligned offset symbol in aligned + * offset blocks. */ +#define LZX_MIN_ALIGNED_OFFSET_SLOT 8 + +/* The offset slot base for LZX_MIN_ALIGNED_OFFSET_SLOT. */ +#define LZX_MIN_ALIGNED_OFFSET 14 + +/* The maximum number of extra offset bits in verbatim blocks. (One would need + * to subtract LZX_NUM_ALIGNED_OFFSET_BITS to get the number of extra offset + * bits in *aligned* blocks.) */ +#define LZX_MAX_NUM_EXTRA_BITS 17 + +/* Maximum lengths (in bits) for length-limited Huffman code construction. */ +#define LZX_MAX_MAIN_CODEWORD_LEN 16 +#define LZX_MAX_LEN_CODEWORD_LEN 16 +#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1) +#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1) + +/* For LZX-compressed blocks in WIM resources, this value is always used as the + * filesize parameter for the call instruction (0xe8 byte) preprocessing, even + * though the blocks themselves are not this size, and the size of the actual + * file resource in the WIM file is very likely to be something entirely + * different as well. */ +#define LZX_WIM_MAGIC_FILESIZE 12000000 + +/* Assumed LZX block size when the encoded block size begins with a 0 bit. + * This is probably WIM-specific. */ +#define LZX_DEFAULT_BLOCK_SIZE 32768 + +/* Number of offsets in the recent (or "repeat") offsets queue. */ +#define LZX_NUM_RECENT_OFFSETS 3 + +/* An offset of n bytes is actually encoded as (n + LZX_OFFSET_ADJUSTMENT). */ +#define LZX_OFFSET_ADJUSTMENT (LZX_NUM_RECENT_OFFSETS - 1) + +#endif /* _LZX_CONSTANTS_H */ diff --git a/src/lzx_decompress.c b/src/lzx_decompress.c index a6bed16..6c251a4 100644 --- a/src/lzx_decompress.c +++ b/src/lzx_decompress.c @@ -1,10 +1,11 @@ /* - * lzx_decompress.c - A decompressor for the LZX compression format, which can - * be used in "System Compressed" files. This is based on the code from wimlib. - * This code only supports a window size (dictionary size) of 32768 bytes, since - * this is the only size used in System Compression. + * lzx_decompress.c * - * Copyright (C) 2015 Eric Biggers + * A decompressor for the LZX compression format, as used in WIM files. + */ + +/* + * Copyright (C) 2012-2016 Eric Biggers * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software @@ -20,276 +21,157 @@ * this program. If not, see . */ +/* + * LZX is an LZ77 and Huffman-code based compression format that has many + * similarities to DEFLATE (the format used by zlib/gzip). The compression + * ratio is as good or better than DEFLATE. See lzx_compress.c for a format + * overview, and see https://en.wikipedia.org/wiki/LZX_(algorithm) for a + * historical overview. Here I make some pragmatic notes. + * + * The old specification for LZX is the document "Microsoft LZX Data Compression + * Format" (1997). It defines the LZX format as used in cabinet files. Allowed + * window sizes are 2^n where 15 <= n <= 21. However, this document contains + * several errors, so don't read too much into it... + * + * The new specification for LZX is the document "[MS-PATCH]: LZX DELTA + * Compression and Decompression" (2014). It defines the LZX format as used by + * Microsoft's binary patcher. 
It corrects several errors in the 1997 document + * and extends the format in several ways --- namely, optional reference data, + * up to 2^25 byte windows, and longer match lengths. + * + * WIM files use a more restricted form of LZX. No LZX DELTA extensions are + * present, the window is not "sliding", E8 preprocessing is done + * unconditionally with a fixed file size, and the maximum window size is always + * 2^15 bytes (equal to the size of each "chunk" in a compressed WIM resource). + * This code is primarily intended to implement this form of LZX. But although + * not compatible with WIMGAPI, this code also supports maximum window sizes up + * to 2^21 bytes. + * + * TODO: Add support for window sizes up to 2^25 bytes. + */ + #ifdef HAVE_CONFIG_H -#include "config.h" +# include "config.h" #endif -#include -#include #include -#include - #include "decompress_common.h" +#include "lzx_common.h" #include "system_compression.h" -/* Number of literal byte values */ -#define LZX_NUM_CHARS 256 - -/* The smallest and largest allowed match lengths */ -#define LZX_MIN_MATCH_LEN 2 -#define LZX_MAX_MATCH_LEN 257 - -/* Number of distinct match lengths that can be represented */ -#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) - -/* Number of match lengths for which no length symbol is required */ -#define LZX_NUM_PRIMARY_LENS 7 -#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1) - -/* Valid values of the 3-bit block type field */ -#define LZX_BLOCKTYPE_VERBATIM 1 -#define LZX_BLOCKTYPE_ALIGNED 2 -#define LZX_BLOCKTYPE_UNCOMPRESSED 3 - -/* Number of offset slots for a window size of 32768 */ -#define LZX_NUM_OFFSET_SLOTS 30 - -/* Number of symbols in the main code for a window size of 32768 */ -#define LZX_MAINCODE_NUM_SYMBOLS \ - (LZX_NUM_CHARS + (LZX_NUM_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS)) - -/* Number of symbols in the length code */ -#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS) - -/* Number of symbols in the precode */ -#define LZX_PRECODE_NUM_SYMBOLS 20 - -/* Number of bits in which each precode codeword length is represented */ -#define LZX_PRECODE_ELEMENT_SIZE 4 - -/* Number of low-order bits of each match offset that are entropy-encoded in - * aligned offset blocks */ -#define LZX_NUM_ALIGNED_OFFSET_BITS 3 - -/* Number of symbols in the aligned offset code */ -#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS) - -/* Mask for the match offset bits that are entropy-encoded in aligned offset - * blocks */ -#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1) - -/* Number of bits in which each aligned offset codeword length is represented */ -#define LZX_ALIGNEDCODE_ELEMENT_SIZE 3 - -/* Maximum lengths (in bits) of the codewords in each Huffman code */ -#define LZX_MAX_MAIN_CODEWORD_LEN 16 -#define LZX_MAX_LEN_CODEWORD_LEN 16 -#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1) -#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1) - -/* The default "filesize" value used in pre/post-processing. In the LZX format - * used in cabinet files this value must be given to the decompressor, whereas - * in the LZX format used in WIM files and system-compressed files this value is - * fixed at 12000000. */ -#define LZX_DEFAULT_FILESIZE 12000000 - -/* Assumed block size when the encoded block size begins with a 0 bit. */ -#define LZX_DEFAULT_BLOCK_SIZE 32768 - -/* Number of offsets in the recent (or "repeat") offsets queue. 
*/ -#define LZX_NUM_RECENT_OFFSETS 3 - /* These values are chosen for fast decompression. */ #define LZX_MAINCODE_TABLEBITS 11 -#define LZX_LENCODE_TABLEBITS 10 +#define LZX_LENCODE_TABLEBITS 9 #define LZX_PRECODE_TABLEBITS 6 #define LZX_ALIGNEDCODE_TABLEBITS 7 -#define LZX_READ_LENS_MAX_OVERRUN 50 +#define LZX_READ_LENS_MAX_OVERRUN 50 -/* Mapping: offset slot => first match offset that uses that offset slot. - */ -static const u32 lzx_offset_slot_base[LZX_NUM_OFFSET_SLOTS + 1] = { - 0 , 1 , 2 , 3 , 4 , /* 0 --- 4 */ - 6 , 8 , 12 , 16 , 24 , /* 5 --- 9 */ - 32 , 48 , 64 , 96 , 128 , /* 10 --- 14 */ - 192 , 256 , 384 , 512 , 768 , /* 15 --- 19 */ - 1024 , 1536 , 2048 , 3072 , 4096 , /* 20 --- 24 */ - 6144 , 8192 , 12288 , 16384 , 24576 , /* 25 --- 29 */ - 32768 , /* extra */ -}; - -/* Mapping: offset slot => how many extra bits must be read and added to the - * corresponding offset slot base to decode the match offset. */ -static const u8 lzx_extra_offset_bits[LZX_NUM_OFFSET_SLOTS] = { - 0 , 0 , 0 , 0 , 1 , - 1 , 2 , 2 , 3 , 3 , - 4 , 4 , 5 , 5 , 6 , - 6 , 7 , 7 , 8 , 8 , - 9 , 9 , 10, 10, 11, - 11, 12, 12, 13, 13, -}; - -/* Reusable heap-allocated memory for LZX decompression */ struct lzx_decompressor { - /* Huffman decoding tables, and arrays that map symbols to codeword - * lengths */ + DECODE_TABLE(maincode_decode_table, LZX_MAINCODE_MAX_NUM_SYMBOLS, + LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN); + u8 maincode_lens[LZX_MAINCODE_MAX_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN]; - u16 maincode_decode_table[(1 << LZX_MAINCODE_TABLEBITS) + - (LZX_MAINCODE_NUM_SYMBOLS * 2)]; - u8 maincode_lens[LZX_MAINCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN]; - - - u16 lencode_decode_table[(1 << LZX_LENCODE_TABLEBITS) + - (LZX_LENCODE_NUM_SYMBOLS * 2)]; + DECODE_TABLE(lencode_decode_table, LZX_LENCODE_NUM_SYMBOLS, + LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN); u8 lencode_lens[LZX_LENCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN]; + union { + DECODE_TABLE(alignedcode_decode_table, LZX_ALIGNEDCODE_NUM_SYMBOLS, + LZX_ALIGNEDCODE_TABLEBITS, LZX_MAX_ALIGNED_CODEWORD_LEN); + u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS]; + }; - u16 alignedcode_decode_table[(1 << LZX_ALIGNEDCODE_TABLEBITS) + - (LZX_ALIGNEDCODE_NUM_SYMBOLS * 2)]; - u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS]; + union { + DECODE_TABLE(precode_decode_table, LZX_PRECODE_NUM_SYMBOLS, + LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN); + u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS]; + u8 extra_offset_bits[LZX_MAX_OFFSET_SLOTS]; + }; - u16 precode_decode_table[(1 << LZX_PRECODE_TABLEBITS) + - (LZX_PRECODE_NUM_SYMBOLS * 2)]; - u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS]; + union { + DECODE_TABLE_WORKING_SPACE(maincode_working_space, + LZX_MAINCODE_MAX_NUM_SYMBOLS, + LZX_MAX_MAIN_CODEWORD_LEN); + DECODE_TABLE_WORKING_SPACE(lencode_working_space, + LZX_LENCODE_NUM_SYMBOLS, + LZX_MAX_LEN_CODEWORD_LEN); + DECODE_TABLE_WORKING_SPACE(alignedcode_working_space, + LZX_ALIGNEDCODE_NUM_SYMBOLS, + LZX_MAX_ALIGNED_CODEWORD_LEN); + DECODE_TABLE_WORKING_SPACE(precode_working_space, + LZX_PRECODE_NUM_SYMBOLS, + LZX_MAX_PRE_CODEWORD_LEN); + }; - /* Temporary space for make_huffman_decode_table() */ - u16 working_space[2 * (1 + LZX_MAX_MAIN_CODEWORD_LEN) + - LZX_MAINCODE_NUM_SYMBOLS]; -}; + unsigned window_order; + unsigned num_main_syms; -static void undo_e8_translation(void *target, s32 input_pos) -{ - s32 abs_offset, rel_offset; + /* Like lzx_extra_offset_bits[], but does not include the entropy-coded + * bits of aligned offset blocks */ + u8 
extra_offset_bits_minus_aligned[LZX_MAX_OFFSET_SLOTS]; - abs_offset = get_unaligned_le32(target); - if (abs_offset >= 0) { - if (abs_offset < LZX_DEFAULT_FILESIZE) { - /* "good translation" */ - rel_offset = abs_offset - input_pos; - put_unaligned_le32(rel_offset, target); - } - } else { - if (abs_offset >= -input_pos) { - /* "compensating translation" */ - rel_offset = abs_offset + LZX_DEFAULT_FILESIZE; - put_unaligned_le32(rel_offset, target); - } - } -} +} _aligned_attribute(DECODE_TABLE_ALIGNMENT); -/* - * Undo the 'E8' preprocessing used in LZX. Before compression, the - * uncompressed data was preprocessed by changing the targets of suspected x86 - * CALL instructions from relative offsets to absolute offsets. After - * match/literal decoding, the decompressor must undo the translation. - */ -static void lzx_postprocess(u8 *data, u32 size) -{ - /* - * A worthwhile optimization is to push the end-of-buffer check into the - * relatively rare E8 case. This is possible if we replace the last six - * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte - * before reaching end-of-buffer. In addition, this scheme guarantees - * that no translation can begin following an E8 byte in the last 10 - * bytes because a 4-byte offset containing E8 as its high byte is a - * large negative number that is not valid for translation. That is - * exactly what we need. - */ - u8 *tail; - u8 saved_bytes[6]; - u8 *p; - - if (size <= 10) - return; - - tail = &data[size - 6]; - memcpy(saved_bytes, tail, 6); - memset(tail, 0xE8, 6); - p = data; - for (;;) { - while (*p != 0xE8) - p++; - if (p >= tail) - break; - undo_e8_translation(p + 1, p - data); - p += 5; - } - memcpy(tail, saved_bytes, 6); -} - -/* Read a Huffman-encoded symbol using the precode. */ -static forceinline unsigned read_presym(const struct lzx_decompressor *d, - struct input_bitstream *is) +/* Read a Huffman-encoded symbol using the precode. */ +static forceinline unsigned +read_presym(const struct lzx_decompressor *d, struct input_bitstream *is) { return read_huffsym(is, d->precode_decode_table, LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN); } -/* Read a Huffman-encoded symbol using the main code. */ -static forceinline unsigned read_mainsym(const struct lzx_decompressor *d, - struct input_bitstream *is) +/* Read a Huffman-encoded symbol using the main code. */ +static forceinline unsigned +read_mainsym(const struct lzx_decompressor *d, struct input_bitstream *is) { return read_huffsym(is, d->maincode_decode_table, LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN); } -/* Read a Huffman-encoded symbol using the length code. */ -static forceinline unsigned read_lensym(const struct lzx_decompressor *d, - struct input_bitstream *is) +/* Read a Huffman-encoded symbol using the length code. */ +static forceinline unsigned +read_lensym(const struct lzx_decompressor *d, struct input_bitstream *is) { return read_huffsym(is, d->lencode_decode_table, LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN); } -/* Read a Huffman-encoded symbol using the aligned offset code. */ -static forceinline unsigned read_alignedsym(const struct lzx_decompressor *d, - struct input_bitstream *is) +/* Read a Huffman-encoded symbol using the aligned offset code. 
*/ +static forceinline unsigned +read_alignedsym(const struct lzx_decompressor *d, struct input_bitstream *is) { return read_huffsym(is, d->alignedcode_decode_table, - LZX_ALIGNEDCODE_TABLEBITS, - LZX_MAX_ALIGNED_CODEWORD_LEN); + LZX_ALIGNEDCODE_TABLEBITS, LZX_MAX_ALIGNED_CODEWORD_LEN); } /* - * Read the precode from the compressed input bitstream, then use it to decode - * @num_lens codeword length values. - * - * @is: The input bitstream. - * - * @lens: An array that contains the length values from the previous time - * the codeword lengths for this Huffman code were read, or all 0's - * if this is the first time. This array must have at least - * (@num_lens + LZX_READ_LENS_MAX_OVERRUN) entries. - * - * @num_lens: Number of length values to decode. - * - * Returns 0 on success, or -1 if the data was invalid. + * Read a precode from the compressed input bitstream, then use it to decode + * @num_lens codeword length values and write them to @lens. */ -static int lzx_read_codeword_lens(struct lzx_decompressor *d, - struct input_bitstream *is, - u8 *lens, unsigned num_lens) +static int +lzx_read_codeword_lens(struct lzx_decompressor *d, struct input_bitstream *is, + u8 *lens, unsigned num_lens) { u8 *len_ptr = lens; u8 *lens_end = lens + num_lens; - int i; - /* Read the lengths of the precode codewords. These are given - * explicitly. */ - for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) { + /* Read the lengths of the precode codewords. These are stored + * explicitly. */ + for (int i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) { d->precode_lens[i] = bitstream_read_bits(is, LZX_PRECODE_ELEMENT_SIZE); } - /* Make the decoding table for the precode. */ + /* Build the decoding table for the precode. */ if (make_huffman_decode_table(d->precode_decode_table, LZX_PRECODE_NUM_SYMBOLS, LZX_PRECODE_TABLEBITS, d->precode_lens, LZX_MAX_PRE_CODEWORD_LEN, - d->working_space)) + d->precode_working_space)) return -1; /* Decode the codeword lengths. */ @@ -322,7 +204,7 @@ static int lzx_read_codeword_lens(struct lzx_decompressor *d, /* Run of identical lengths */ run_len = 4 + bitstream_read_bits(is, 1); presym = read_presym(d, is); - if (presym > 17) + if (unlikely(presym > 17)) return -1; len = *len_ptr - presym; if ((s8)len < 0) @@ -332,7 +214,8 @@ static int lzx_read_codeword_lens(struct lzx_decompressor *d, do { *len_ptr++ = len; } while (--run_len); - /* Worst case overrun is when presym == 18, + /* + * The worst case overrun is when presym == 18, * run_len == 20 + 31, and only 1 length was remaining. * So LZX_READ_LENS_MAX_OVERRUN == 50. * @@ -340,7 +223,8 @@ static int lzx_read_codeword_lens(struct lzx_decompressor *d, * can corrupt the previous values in the second half. * This doesn't really matter because the resulting * lengths will still be in range, and data that - * generates overruns is invalid anyway. */ + * generates overruns is invalid anyway. + */ } } while (len_ptr < lens_end); @@ -348,115 +232,82 @@ static int lzx_read_codeword_lens(struct lzx_decompressor *d, } /* - * Read the header of an LZX block and save the block type and (uncompressed) - * size in *block_type_ret and *block_size_ret, respectively. - * - * If the block is compressed, also update the Huffman decode @tables with the - * new Huffman codes. If the block is uncompressed, also update the match - * offset @queue with the new match offsets. - * - * Return 0 on success, or -1 if the data was invalid. + * Read the header of an LZX block. 
For all block types, the block type and + * size is saved in *block_type_ret and *block_size_ret, respectively. For + * compressed blocks, the codeword lengths are also saved. For uncompressed + * blocks, the recent offsets queue is also updated. */ -static int lzx_read_block_header(struct lzx_decompressor *d, - struct input_bitstream *is, - int *block_type_ret, - u32 *block_size_ret, - u32 recent_offsets[]) +static int +lzx_read_block_header(struct lzx_decompressor *d, struct input_bitstream *is, + u32 recent_offsets[], int *block_type_ret, + u32 *block_size_ret) { int block_type; u32 block_size; - int i; bitstream_ensure_bits(is, 4); - /* The first three bits tell us what kind of block it is, and should be - * one of the LZX_BLOCKTYPE_* values. */ + /* Read the block type. */ block_type = bitstream_pop_bits(is, 3); - /* Read the block size. */ + /* Read the block size. */ if (bitstream_pop_bits(is, 1)) { block_size = LZX_DEFAULT_BLOCK_SIZE; } else { - block_size = 0; - block_size |= bitstream_read_bits(is, 8); - block_size <<= 8; - block_size |= bitstream_read_bits(is, 8); + block_size = bitstream_read_bits(is, 16); + if (d->window_order >= 16) { + block_size <<= 8; + block_size |= bitstream_read_bits(is, 8); + } } switch (block_type) { case LZX_BLOCKTYPE_ALIGNED: - /* Read the aligned offset code and prepare its decode table. - */ + /* Read the aligned offset codeword lengths. */ - for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { d->alignedcode_lens[i] = bitstream_read_bits(is, LZX_ALIGNEDCODE_ELEMENT_SIZE); } - if (make_huffman_decode_table(d->alignedcode_decode_table, - LZX_ALIGNEDCODE_NUM_SYMBOLS, - LZX_ALIGNEDCODE_TABLEBITS, - d->alignedcode_lens, - LZX_MAX_ALIGNED_CODEWORD_LEN, - d->working_space)) - return -1; - /* Fall though, since the rest of the header for aligned offset * blocks is the same as that for verbatim blocks. */ case LZX_BLOCKTYPE_VERBATIM: - /* Read the main code and prepare its decode table. - * - * Note that the codeword lengths in the main code are encoded - * in two parts: one part for literal symbols, and one part for - * match symbols. */ + /* Read the main codeword lengths, which are divided into two + * parts: literal symbols and match headers. */ if (lzx_read_codeword_lens(d, is, d->maincode_lens, LZX_NUM_CHARS)) return -1; - if (lzx_read_codeword_lens(d, is, - d->maincode_lens + LZX_NUM_CHARS, - LZX_MAINCODE_NUM_SYMBOLS - LZX_NUM_CHARS)) + if (lzx_read_codeword_lens(d, is, d->maincode_lens + LZX_NUM_CHARS, + d->num_main_syms - LZX_NUM_CHARS)) return -1; - if (make_huffman_decode_table(d->maincode_decode_table, - LZX_MAINCODE_NUM_SYMBOLS, - LZX_MAINCODE_TABLEBITS, - d->maincode_lens, - LZX_MAX_MAIN_CODEWORD_LEN, - d->working_space)) - return -1; - /* Read the length code and prepare its decode table. */ + /* Read the length codeword lengths. */ if (lzx_read_codeword_lens(d, is, d->lencode_lens, LZX_LENCODE_NUM_SYMBOLS)) return -1; - if (make_huffman_decode_table(d->lencode_decode_table, - LZX_LENCODE_NUM_SYMBOLS, - LZX_LENCODE_TABLEBITS, - d->lencode_lens, - LZX_MAX_LEN_CODEWORD_LEN, - d->working_space)) - return -1; - break; case LZX_BLOCKTYPE_UNCOMPRESSED: - - /* Before reading the three recent offsets from the uncompressed - * block header, the stream must be aligned on a 16-bit - * boundary. But if the stream is *already* aligned, then the - * next 16 bits must be discarded. 
*/ + /* + * The header of an uncompressed block contains new values for + * the recent offsets queue, starting on the next 16-bit + * boundary in the bitstream. Careful: if the stream is + * *already* aligned, the correct thing to do is to throw away + * the next 16 bits (this is probably a mistake in the format). + */ bitstream_ensure_bits(is, 1); bitstream_align(is); - recent_offsets[0] = bitstream_read_u32(is); recent_offsets[1] = bitstream_read_u32(is); recent_offsets[2] = bitstream_read_u32(is); @@ -477,202 +328,218 @@ static int lzx_read_block_header(struct lzx_decompressor *d, return 0; } -/* Decompress a block of LZX-compressed data. */ -static int lzx_decompress_block(const struct lzx_decompressor *d, - struct input_bitstream *is, - int block_type, u32 block_size, - u8 * const out_begin, u8 *out_next, - u32 recent_offsets[]) +/* Decompress a block of LZX-compressed data. */ +static int +lzx_decompress_block(struct lzx_decompressor *d, struct input_bitstream *is, + int block_type, u32 block_size, + u8 * const out_begin, u8 *out_next, u32 recent_offsets[]) { u8 * const block_end = out_next + block_size; - unsigned ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED); + unsigned min_aligned_offset_slot; + + /* + * Build the Huffman decode tables. We always need to build the main + * and length decode tables. For aligned blocks we additionally need to + * build the aligned offset decode table. + */ + + if (make_huffman_decode_table(d->maincode_decode_table, + d->num_main_syms, + LZX_MAINCODE_TABLEBITS, + d->maincode_lens, + LZX_MAX_MAIN_CODEWORD_LEN, + d->maincode_working_space)) + return -1; + + if (make_huffman_decode_table(d->lencode_decode_table, + LZX_LENCODE_NUM_SYMBOLS, + LZX_LENCODE_TABLEBITS, + d->lencode_lens, + LZX_MAX_LEN_CODEWORD_LEN, + d->lencode_working_space)) + return -1; + + if (block_type == LZX_BLOCKTYPE_ALIGNED) { + if (make_huffman_decode_table(d->alignedcode_decode_table, + LZX_ALIGNEDCODE_NUM_SYMBOLS, + LZX_ALIGNEDCODE_TABLEBITS, + d->alignedcode_lens, + LZX_MAX_ALIGNED_CODEWORD_LEN, + d->alignedcode_working_space)) + return -1; + min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT; + memcpy(d->extra_offset_bits, d->extra_offset_bits_minus_aligned, + sizeof(lzx_extra_offset_bits)); + } else { + min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS; + memcpy(d->extra_offset_bits, lzx_extra_offset_bits, + sizeof(lzx_extra_offset_bits)); + } + + /* Decode the literals and matches. */ do { unsigned mainsym; - unsigned match_len; - u32 match_offset; + unsigned length; + u32 offset; unsigned offset_slot; - unsigned num_extra_bits; mainsym = read_mainsym(d, is); if (mainsym < LZX_NUM_CHARS) { - /* Literal */ + /* Literal */ *out_next++ = mainsym; continue; } - /* Match */ + /* Match */ /* Decode the length header and offset slot. */ - mainsym -= LZX_NUM_CHARS; - match_len = mainsym % LZX_NUM_LEN_HEADERS; - offset_slot = mainsym / LZX_NUM_LEN_HEADERS; + STATIC_ASSERT(LZX_NUM_CHARS % LZX_NUM_LEN_HEADERS == 0); + length = mainsym % LZX_NUM_LEN_HEADERS; + offset_slot = (mainsym - LZX_NUM_CHARS) / LZX_NUM_LEN_HEADERS; /* If needed, read a length symbol to decode the full length. */ - if (match_len == LZX_NUM_PRIMARY_LENS) - match_len += read_lensym(d, is); - match_len += LZX_MIN_MATCH_LEN; + if (length == LZX_NUM_PRIMARY_LENS) + length += read_lensym(d, is); + length += LZX_MIN_MATCH_LEN; if (offset_slot < LZX_NUM_RECENT_OFFSETS) { /* Repeat offset */ /* Note: This isn't a real LRU queue, since using the R2 - * offset doesn't bump the R1 offset down to R2. 
This - * quirk allows all 3 recent offsets to be handled by - * the same code. (For R0, the swap is a no-op.) */ - match_offset = recent_offsets[offset_slot]; + * offset doesn't bump the R1 offset down to R2. */ + offset = recent_offsets[offset_slot]; recent_offsets[offset_slot] = recent_offsets[0]; - recent_offsets[0] = match_offset; } else { /* Explicit offset */ - - /* Look up the number of extra bits that need to be read - * to decode offsets with this offset slot. */ - num_extra_bits = lzx_extra_offset_bits[offset_slot]; - - /* Start with the offset slot base value. */ - match_offset = lzx_offset_slot_base[offset_slot]; - - /* In aligned offset blocks, the low-order 3 bits of - * each offset are encoded using the aligned offset - * code. Otherwise, all the extra bits are literal. */ - - if ((num_extra_bits & ones_if_aligned) >= LZX_NUM_ALIGNED_OFFSET_BITS) { - match_offset += - bitstream_read_bits(is, num_extra_bits - - LZX_NUM_ALIGNED_OFFSET_BITS) - << LZX_NUM_ALIGNED_OFFSET_BITS; - match_offset += read_alignedsym(d, is); - } else { - match_offset += bitstream_read_bits(is, num_extra_bits); + offset = bitstream_read_bits(is, d->extra_offset_bits[offset_slot]); + if (offset_slot >= min_aligned_offset_slot) { + offset = (offset << LZX_NUM_ALIGNED_OFFSET_BITS) | + read_alignedsym(d, is); } + offset += lzx_offset_slot_base[offset_slot]; - /* Adjust the offset. */ - match_offset -= (LZX_NUM_RECENT_OFFSETS - 1); - - /* Update the recent offsets. */ + /* Update the match offset LRU queue. */ + STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); recent_offsets[2] = recent_offsets[1]; recent_offsets[1] = recent_offsets[0]; - recent_offsets[0] = match_offset; } + recent_offsets[0] = offset; - /* Validate the match, then copy it to the current position. */ - - if (match_len > (size_t)(block_end - out_next)) + /* Validate the match and copy it to the current position. */ + if (unlikely(lz_copy(length, offset, out_begin, + out_next, block_end, LZX_MIN_MATCH_LEN))) return -1; - - if (match_offset > (size_t)(out_next - out_begin)) - return -1; - - out_next = lz_copy(out_next, match_len, match_offset, - block_end, LZX_MIN_MATCH_LEN); - + out_next += length; } while (out_next != block_end); return 0; } -/* - * lzx_allocate_decompressor - Allocate an LZX decompressor - * - * Return the pointer to the decompressor on success, or return NULL and set - * errno on failure. - */ -struct lzx_decompressor *lzx_allocate_decompressor(void) +int +lzx_decompress(struct lzx_decompressor *restrict d, + const void *restrict compressed_data, size_t compressed_size, + void *restrict uncompressed_data, size_t uncompressed_size) { - return ntfs_malloc(sizeof(struct lzx_decompressor)); -} - -/* - * lzx_decompress - Decompress a buffer of LZX-compressed data - * - * @decompressor: A decompressor allocated with lzx_allocate_decompressor() - * @compressed_data: The buffer of data to decompress - * @compressed_size: Number of bytes of compressed data - * @uncompressed_data: The buffer in which to store the decompressed data - * @uncompressed_size: The number of bytes the data decompresses into - * - * Return 0 on success, or return -1 and set errno on failure. 
- */ -int lzx_decompress(struct lzx_decompressor *decompressor, - const void *compressed_data, size_t compressed_size, - void *uncompressed_data, size_t uncompressed_size) -{ - struct lzx_decompressor *d = decompressor; u8 * const out_begin = uncompressed_data; u8 *out_next = out_begin; u8 * const out_end = out_begin + uncompressed_size; struct input_bitstream is; + STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1}; - int e8_status = 0; + unsigned may_have_e8_byte = 0; init_input_bitstream(&is, compressed_data, compressed_size); - /* Codeword lengths begin as all 0's for delta encoding purposes. */ - memset(d->maincode_lens, 0, LZX_MAINCODE_NUM_SYMBOLS); + /* Codeword lengths begin as all 0's for delta encoding purposes. */ + memset(d->maincode_lens, 0, d->num_main_syms); memset(d->lencode_lens, 0, LZX_LENCODE_NUM_SYMBOLS); - /* Decompress blocks until we have all the uncompressed data. */ + /* Decompress blocks until we have all the uncompressed data. */ while (out_next != out_end) { int block_type; u32 block_size; - if (lzx_read_block_header(d, &is, &block_type, &block_size, - recent_offsets)) - goto invalid; + if (lzx_read_block_header(d, &is, recent_offsets, + &block_type, &block_size)) + return -1; - if (block_size < 1 || block_size > (size_t)(out_end - out_next)) - goto invalid; + if (block_size < 1 || block_size > out_end - out_next) + return -1; - if (block_type != LZX_BLOCKTYPE_UNCOMPRESSED) { + if (likely(block_type != LZX_BLOCKTYPE_UNCOMPRESSED)) { - /* Compressed block */ - - if (lzx_decompress_block(d, - &is, - block_type, - block_size, - out_begin, - out_next, + /* Compressed block */ + if (lzx_decompress_block(d, &is, block_type, block_size, + out_begin, out_next, recent_offsets)) - goto invalid; + return -1; - e8_status |= d->maincode_lens[0xe8]; - out_next += block_size; + /* If the first E8 byte was in this block, then it must + * have been encoded as a literal using mainsym E8. */ + may_have_e8_byte |= d->maincode_lens[0xE8]; } else { - /* Uncompressed block */ - out_next = bitstream_read_bytes(&is, out_next, - block_size); - if (!out_next) - goto invalid; + /* Uncompressed block */ + if (bitstream_read_bytes(&is, out_next, block_size)) + return -1; + /* Re-align the bitstream if needed. */ if (block_size & 1) bitstream_read_byte(&is); - e8_status = 1; + /* There may have been an E8 byte in the block. */ + may_have_e8_byte = 1; } + out_next += block_size; } - /* Postprocess the data unless it cannot possibly contain 0xe8 bytes. */ - if (e8_status) + /* Postprocess the data unless it cannot possibly contain E8 bytes. */ + if (may_have_e8_byte) lzx_postprocess(uncompressed_data, uncompressed_size); return 0; - -invalid: - errno = EINVAL; - return -1; } -/* - * lzx_free_decompressor - Free an LZX decompressor - * - * @decompressor: A decompressor that was allocated with - * lzx_allocate_decompressor(), or NULL. - */ -void lzx_free_decompressor(struct lzx_decompressor *decompressor) +struct lzx_decompressor * +lzx_allocate_decompressor(size_t max_block_size) { - free(decompressor); + unsigned window_order; + struct lzx_decompressor *d; + + window_order = lzx_get_window_order(max_block_size); + if (window_order == 0) { + errno = EINVAL; + return NULL; + } + + d = aligned_malloc(sizeof(*d), DECODE_TABLE_ALIGNMENT); + if (!d) + return NULL; + + d->window_order = window_order; + d->num_main_syms = lzx_get_num_main_syms(window_order); + + /* Initialize 'd->extra_offset_bits_minus_aligned'. 
*/ + STATIC_ASSERT(sizeof(d->extra_offset_bits_minus_aligned) == + sizeof(lzx_extra_offset_bits)); + STATIC_ASSERT(sizeof(d->extra_offset_bits) == + sizeof(lzx_extra_offset_bits)); + memcpy(d->extra_offset_bits_minus_aligned, lzx_extra_offset_bits, + sizeof(lzx_extra_offset_bits)); + for (unsigned offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT; + offset_slot < LZX_MAX_OFFSET_SLOTS; offset_slot++) + { + d->extra_offset_bits_minus_aligned[offset_slot] -= + LZX_NUM_ALIGNED_OFFSET_BITS; + } + + return d; +} + +void +lzx_free_decompressor(struct lzx_decompressor *d) +{ + aligned_free(d); } diff --git a/src/system_compression.c b/src/system_compression.c index 287e6c3..08fae47 100644 --- a/src/system_compression.c +++ b/src/system_compression.c @@ -211,7 +211,7 @@ struct ntfs_system_decompression_ctx { static int allocate_decompressor(struct ntfs_system_decompression_ctx *ctx) { if (ctx->format == FORMAT_LZX) - ctx->decompressor = lzx_allocate_decompressor(); + ctx->decompressor = lzx_allocate_decompressor(32768); else ctx->decompressor = xpress_allocate_decompressor(); if (!ctx->decompressor) @@ -590,8 +590,13 @@ static int read_and_decompress_chunk(struct ntfs_system_decompression_ctx *ctx, return 0; /* The chunk was stored compressed. Decompress its data. */ - return decompress(ctx, read_buffer, stored_size, - buffer, uncompressed_size); + if (decompress(ctx, read_buffer, stored_size, + buffer, uncompressed_size)) { + errno = EINVAL; + return -1; + } + + return 0; } /* Retrieve a pointer to the uncompressed data of the specified chunk. On diff --git a/src/system_compression.h b/src/system_compression.h index 9abc512..4453f23 100644 --- a/src/system_compression.h +++ b/src/system_compression.h @@ -60,7 +60,8 @@ extern void xpress_free_decompressor(struct xpress_decompressor *decompressor); struct lzx_decompressor; -extern struct lzx_decompressor *lzx_allocate_decompressor(void); +extern struct lzx_decompressor * +lzx_allocate_decompressor(size_t max_block_size); extern int lzx_decompress(struct lzx_decompressor *decompressor, const void *compressed_data, size_t compressed_size, diff --git a/src/xpress_constants.h b/src/xpress_constants.h new file mode 100644 index 0000000..9a8ba2c --- /dev/null +++ b/src/xpress_constants.h @@ -0,0 +1,22 @@ +/* + * xpress_constants.h + * + * Constants for the XPRESS compression format. + */ + +#ifndef _XPRESS_CONSTANTS_H +#define _XPRESS_CONSTANTS_H + +#define XPRESS_NUM_CHARS 256 +#define XPRESS_NUM_SYMBOLS 512 +#define XPRESS_MAX_CODEWORD_LEN 15 + +#define XPRESS_END_OF_DATA 256 + +#define XPRESS_MIN_OFFSET 1 +#define XPRESS_MAX_OFFSET 65535 + +#define XPRESS_MIN_MATCH_LEN 3 +#define XPRESS_MAX_MATCH_LEN 65538 + +#endif /* _XPRESS_CONSTANTS_H */ diff --git a/src/xpress_decompress.c b/src/xpress_decompress.c index 61aa42d..d3ef21e 100644 --- a/src/xpress_decompress.c +++ b/src/xpress_decompress.c @@ -1,9 +1,12 @@ /* - * xpress_decompress.c - A decompressor for the XPRESS compression format - * (Huffman variant), which can be used in "System Compressed" files. This is - * based on the code from wimlib. + * xpress_decompress.c * - * Copyright (C) 2015 Eric Biggers + * A decompressor for the XPRESS compression format (Huffman variant). + */ + +/* + * + * Copyright (C) 2012-2016 Eric Biggers * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software @@ -19,80 +22,85 @@ * this program. If not, see . 
*/ + +/* + * The XPRESS compression format is an LZ77 and Huffman-code based algorithm. + * That means it is fairly similar to LZX compression, but XPRESS is simpler, so + * it is a little faster to compress and decompress. + * + * The XPRESS compression format is mostly documented in a file called "[MS-XCA] + * Xpress Compression Algorithm". In the MSDN library, it can currently be + * found under Open Specifications => Protocols => Windows Protocols => Windows + * Server Protocols => "[MS-XCA] Xpress Compression Algorithm". The format in + * WIMs is specifically the algorithm labeled as the "LZ77+Huffman Algorithm" + * (there apparently are some other versions of XPRESS as well). + * + * If you are already familiar with the LZ77 algorithm and Huffman coding, the + * XPRESS format is fairly simple. The compressed data begins with 256 bytes + * that contain 512 4-bit integers that are the lengths of the symbols in the + * Huffman code used for match/literal headers. In contrast with more + * complicated formats such as DEFLATE and LZX, this is the only Huffman code + * that is used for the entirety of the XPRESS compressed data, and the codeword + * lengths are not encoded with a pretree. + * + * The rest of the compressed data is Huffman-encoded symbols. Values 0 through + * 255 represent the corresponding literal bytes. Values 256 through 511 + * represent matches and may require extra bits or bytes to be read to get the + * match offset and match length. + * + * The trickiest part is probably the way in which literal bytes for match + * lengths are interleaved in the bitstream. + * + * Also, a caveat --- according to Microsoft's documentation for XPRESS, + * + * "Some implementation of the decompression algorithm expect an extra + * symbol to mark the end of the data. Specifically, some implementations + * fail during decompression if the Huffman symbol 256 is not found after + * the actual data." + * + * This is the case with Microsoft's implementation in WIMGAPI, for example. So + * although our implementation doesn't currently check for this extra symbol, + * compressors would be wise to add it. + */ + #ifdef HAVE_CONFIG_H -#include "config.h" +# include "config.h" #endif -#include -#include - -#include - #include "decompress_common.h" #include "system_compression.h" - -#define XPRESS_NUM_SYMBOLS 512 -#define XPRESS_MAX_CODEWORD_LEN 15 -#define XPRESS_MIN_MATCH_LEN 3 +#include "xpress_constants.h" /* This value is chosen for fast decompression. */ -#define XPRESS_TABLEBITS 12 +#define XPRESS_TABLEBITS 11 -/* Reusable heap-allocated memory for XPRESS decompression */ struct xpress_decompressor { + union { + DECODE_TABLE(decode_table, XPRESS_NUM_SYMBOLS, + XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN); + u8 lens[XPRESS_NUM_SYMBOLS]; + }; + DECODE_TABLE_WORKING_SPACE(working_space, XPRESS_NUM_SYMBOLS, + XPRESS_MAX_CODEWORD_LEN); +} _aligned_attribute(DECODE_TABLE_ALIGNMENT); - /* The Huffman decoding table */ - u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]; - - /* An array that maps symbols to codeword lengths */ - u8 lens[XPRESS_NUM_SYMBOLS]; - - /* Temporary space for make_huffman_decode_table() */ - u16 working_space[2 * (1 + XPRESS_MAX_CODEWORD_LEN) + - XPRESS_NUM_SYMBOLS]; -}; - -/* - * xpress_allocate_decompressor - Allocate an XPRESS decompressor - * - * Return the pointer to the decompressor on success, or return NULL and set - * errno on failure.
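+/*
+ * Decompress a buffer of XPRESS-compressed data.  Returns 0 on success or -1
+ * if the compressed data is invalid; as with lzx_decompress(), the caller
+ * must pass the exact uncompressed size.
+ */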
- */ -struct xpress_decompressor *xpress_allocate_decompressor(void) +int +xpress_decompress(struct xpress_decompressor *restrict d, + const void *restrict compressed_data, size_t compressed_size, + void *restrict uncompressed_data, size_t uncompressed_size) { - return ntfs_malloc(sizeof(struct xpress_decompressor)); -} - -/* - * xpress_decompress - Decompress a buffer of XPRESS-compressed data - * - * @decompressor: A decompressor that was allocated with - * xpress_allocate_decompressor() - * @compressed_data: The buffer of data to decompress - * @compressed_size: Number of bytes of compressed data - * @uncompressed_data: The buffer in which to store the decompressed data - * @uncompressed_size: The number of bytes the data decompresses into - * - * Return 0 on success, or return -1 and set errno on failure. - */ -int xpress_decompress(struct xpress_decompressor *decompressor, - const void *compressed_data, size_t compressed_size, - void *uncompressed_data, size_t uncompressed_size) -{ - struct xpress_decompressor *d = decompressor; const u8 * const in_begin = compressed_data; u8 * const out_begin = uncompressed_data; u8 *out_next = out_begin; u8 * const out_end = out_begin + uncompressed_size; struct input_bitstream is; - unsigned i; /* Read the Huffman codeword lengths. */ if (compressed_size < XPRESS_NUM_SYMBOLS / 2) - goto invalid; - for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { - d->lens[i*2 + 0] = in_begin[i] & 0xF; - d->lens[i*2 + 1] = in_begin[i] >> 4; + return -1; + for (int i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { + d->lens[2 * i + 0] = in_begin[i] & 0xf; + d->lens[2 * i + 1] = in_begin[i] >> 4; } /* Build a decoding table for the Huffman code. */ @@ -100,7 +108,7 @@ int xpress_decompress(struct xpress_decompressor *decompressor, XPRESS_TABLEBITS, d->lens, XPRESS_MAX_CODEWORD_LEN, d->working_space)) - goto invalid; + return -1; /* Decode the matches and literals. */ @@ -115,7 +123,7 @@ int xpress_decompress(struct xpress_decompressor *decompressor, sym = read_huffsym(&is, d->decode_table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN); - if (sym < 256) { + if (sym < XPRESS_NUM_CHARS) { /* Literal */ *out_next++ = sym; } else { @@ -135,30 +143,26 @@ int xpress_decompress(struct xpress_decompressor *decompressor, } length += XPRESS_MIN_MATCH_LEN; - if (offset > (size_t)(out_next - out_begin)) - goto invalid; + if (unlikely(lz_copy(length, offset, + out_begin, out_next, out_end, + XPRESS_MIN_MATCH_LEN))) + return -1; - if (length > (size_t)(out_end - out_next)) - goto invalid; - - out_next = lz_copy(out_next, length, offset, out_end, - XPRESS_MIN_MATCH_LEN); + out_next += length; } } return 0; - -invalid: - errno = EINVAL; - return -1; } -/* - * xpress_free_decompressor - Free an XPRESS decompressor - * - * @decompressor: A decompressor that was allocated with - * xpress_allocate_decompressor(), or NULL. - */ -void xpress_free_decompressor(struct xpress_decompressor *decompressor) +struct xpress_decompressor * +xpress_allocate_decompressor(void) { - free(decompressor); + return aligned_malloc(sizeof(struct xpress_decompressor), + DECODE_TABLE_ALIGNMENT); +} + +void +xpress_free_decompressor(struct xpress_decompressor *d) +{ + aligned_free(d); }
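
Usage note (illustrative sketch, not part of the patch; 'cbuf', 'csize',
'ubuf' and 'usize' are hypothetical names for the caller's buffers): with
this change, LZX decompressor allocation takes the maximum block size,
matching the fixed 32768-byte chunk size that system_compression.c passes:

	struct lzx_decompressor *d = lzx_allocate_decompressor(32768);

	if (!d)
		return -1;	/* errno is set, e.g. to EINVAL */
	if (lzx_decompress(d, cbuf, csize, ubuf, usize))
		return -1;	/* invalid compressed data */
	lzx_free_decompressor(d);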