/* * LZMAInputStream * * Authors: Lasse Collin <lasse.collin@tukaani.org> * Igor Pavlov <http://7-zip.org/> * * This file has been put into the public domain. * You can do whatever you want with this file. */ package org.tukaani.xz; import java.io.InputStream; import java.io.DataInputStream; import java.io.IOException; import org.tukaani.xz.lz.LZDecoder; import org.tukaani.xz.rangecoder.RangeDecoderFromStream; import org.tukaani.xz.lzma.LZMADecoder; /** * Decompresses legacy .lzma files and raw LZMA streams (no .lzma header). * <p> * <b>IMPORTANT:</b> In contrast to other classes in this package, this class * reads data from its input stream one byte at a time. If the input stream * is for example {@link java.io.FileInputStream}, wrapping it into * {@link java.io.BufferedInputStream} tends to improve performance a lot. * This is not automatically done by this class because there may be use * cases where it is desired that this class won't read any bytes past * the end of the LZMA stream. * <p> * Even when using <code>BufferedInputStream</code>, the performance tends * to be worse (maybe 10-20 % slower) than with {@link LZMA2InputStream} * or {@link XZInputStream} (when the .xz file contains LZMA2-compressed data). * * @since 1.4 */ public class LZMAInputStream extends InputStream { /** * Largest dictionary size supported by this implementation. * <p> * LZMA allows dictionaries up to one byte less than 4 GiB. This * implementation supports only 16 bytes less than 2 GiB. This * limitation is due to Java using signed 32-bit integers for array * indexing. The limitation shouldn't matter much in practice since so * huge dictionaries are not normally used. */ public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15; private InputStream in; private LZDecoder lz; private RangeDecoderFromStream rc; private LZMADecoder lzma; private boolean endReached = false; private final byte[] tempBuf = new byte[1]; /** * Number of uncompressed bytes left to be decompressed, or -1 if * the end marker is used. */ private long remainingSize; private IOException exception = null; /** * Gets approximate decompressor memory requirements as kibibytes for * the given dictionary size and LZMA properties byte (lc, lp, and pb). * * @param dictSize LZMA dictionary size as bytes, should be * in the range [<code>0</code>, * <code>DICT_SIZE_MAX</code>] * * @param propsByte LZMA properties byte that encodes the values * of lc, lp, and pb * * @return approximate memory requirements as kibibytes (KiB) * * @throws UnsupportedOptionsException * if <code>dictSize</code> is outside * the range [<code>0</code>, * <code>DICT_SIZE_MAX</code>] * * @throws CorruptedInputException * if <code>propsByte</code> is invalid */ public static int getMemoryUsage(int dictSize, byte propsByte) throws UnsupportedOptionsException, CorruptedInputException { if (dictSize < 0 || dictSize > DICT_SIZE_MAX) throw new UnsupportedOptionsException( "LZMA dictionary is too big for this implementation"); int props = propsByte & 0xFF; if (props > (4 * 5 + 4) * 9 + 8) throw new CorruptedInputException("Invalid LZMA properties byte"); props %= 9 * 5; int lp = props / 9; int lc = props - lp * 9; return getMemoryUsage(dictSize, lc, lp); } /** * Gets approximate decompressor memory requirements as kibibytes for * the given dictionary size, lc, and lp. Note that pb isn't needed. * * @param dictSize LZMA dictionary size as bytes, must be * in the range [<code>0</code>, * <code>DICT_SIZE_MAX</code>] * * @param lc number of literal context bits, must be * in the range [0, 8] * * @param lp number of literal position bits, must be * in the range [0, 4] * * @return approximate memory requirements as kibibytes (KiB) */ public static int getMemoryUsage(int dictSize, int lc, int lp) { if (lc < 0 || lc > 8 || lp < 0 || lp > 4) throw new IllegalArgumentException("Invalid lc or lp"); // Probability variables have the type "short". There are // 0x300 (768) probability variables in each literal subcoder. // The number of literal subcoders is 2^(lc + lp). // // Roughly 10 KiB for the base state + LZ decoder's dictionary buffer // + sizeof(short) * number probability variables per literal subcoder // * number of literal subcoders return 10 + getDictSize(dictSize) / 1024 + ((2 * 0x300) << (lc + lp)) / 1024; } private static int getDictSize(int dictSize) { if (dictSize < 0 || dictSize > DICT_SIZE_MAX) throw new IllegalArgumentException( "LZMA dictionary is too big for this implementation"); // For performance reasons, use a 4 KiB dictionary if something // smaller was requested. It's a rare situation and the performance // difference isn't huge, and it starts to matter mostly when the // dictionary is just a few bytes. But we need to handle the special // case of dictSize == 0 anyway, which is an allowed value but in // practice means one-byte dictionary. // // Note that using a dictionary bigger than specified in the headers // can hide errors if there is a reference to data beyond the original // dictionary size but is still within 4 KiB. if (dictSize < 4096) dictSize = 4096; // Round dictionary size upward to a multiple of 16. This way LZMA // can use LZDecoder.getPos() for calculating LZMA's posMask. return (dictSize + 15) & ~15; } /** * Creates a new .lzma file format decompressor without * a memory usage limit. * * @param in input stream from which .lzma data is read; * it might be a good idea to wrap it in * <code>BufferedInputStream</code>, see the * note at the top of this page * * @throws CorruptedInputException * file is corrupt or perhaps not in * the .lzma format at all * * @throws UnsupportedOptionsException * dictionary size or uncompressed size is too * big for this implementation * * @throws EOFException * file is truncated or perhaps not in * the .lzma format at all * * @throws IOException may be thrown by <code>in</code> */ public LZMAInputStream(InputStream in) throws IOException { this(in, -1); } /** * Creates a new .lzma file format decompressor with an optional * memory usage limit. * * @param in input stream from which .lzma data is read; * it might be a good idea to wrap it in * <code>BufferedInputStream</code>, see the * note at the top of this page * * @param memoryLimit memory usage limit in kibibytes (KiB) * or <code>-1</code> to impose no * memory usage limit * * @throws CorruptedInputException * file is corrupt or perhaps not in * the .lzma format at all * * @throws UnsupportedOptionsException * dictionary size or uncompressed size is too * big for this implementation * * @throws MemoryLimitException * memory usage limit was exceeded * * @throws EOFException * file is truncated or perhaps not in * the .lzma format at all * * @throws IOException may be thrown by <code>in</code> */ public LZMAInputStream(InputStream in, int memoryLimit) throws IOException { DataInputStream inData = new DataInputStream(in); // Properties byte (lc, lp, and pb) byte propsByte = inData.readByte(); // Dictionary size is an unsigned 32-bit little endian integer. int dictSize = 0; for (int i = 0; i < 4; ++i) dictSize |= inData.readUnsignedByte() << (8 * i); // Uncompressed size is an unsigned 64-bit little endian integer. // The maximum 64-bit value is a special case (becomes -1 here) // which indicates that the end marker is used instead of knowing // the uncompressed size beforehand. long uncompSize = 0; for (int i = 0; i < 8; ++i) uncompSize |= (long)inData.readUnsignedByte() << (8 * i); // Check the memory usage limit. int memoryNeeded = getMemoryUsage(dictSize, propsByte); if (memoryLimit != -1 && memoryNeeded > memoryLimit) throw new MemoryLimitException(memoryNeeded, memoryLimit); initialize(in, uncompSize, propsByte, dictSize, null); } /** * Creates a new input stream that decompresses raw LZMA data (no .lzma * header) from <code>in</code>. * <p> * The caller needs to know if the "end of payload marker (EOPM)" alias * "end of stream marker (EOS marker)" alias "end marker" present. * If the end marker isn't used, the caller must know the exact * uncompressed size of the stream. * <p> * The caller also needs to provide the LZMA properties byte that encodes * the number of literal context bits (lc), literal position bits (lp), * and position bits (pb). * <p> * The dictionary size used when compressing is also needed. Specifying * a too small dictionary size will prevent decompressing the stream. * Specifying a too big dictionary is waste of memory but decompression * will work. * <p> * There is no need to specify a dictionary bigger than * the uncompressed size of the data even if a bigger dictionary * was used when compressing. If you know the uncompressed size * of the data, this might allow saving some memory. * * @param in input stream from which compressed * data is read * * @param uncompSize uncompressed size of the LZMA stream or -1 * if the end marker is used in the LZMA stream * * @param propsByte LZMA properties byte that has the encoded * values for literal context bits (lc), literal * position bits (lp), and position bits (pb) * * @param dictSize dictionary size as bytes, must be in the range * [<code>0</code>, <code>DICT_SIZE_MAX</code>] * * @throws CorruptedInputException * if <code>propsByte</code> is invalid or * the first input byte is not 0x00 * * @throws UnsupportedOptionsException * dictionary size or uncompressed size is too * big for this implementation * * */ public LZMAInputStream(InputStream in, long uncompSize, byte propsByte, int dictSize) throws IOException { initialize(in, uncompSize, propsByte, dictSize, null); } /** * Creates a new input stream that decompresses raw LZMA data (no .lzma * header) from <code>in</code> optionally with a preset dictionary. * * @param in input stream from which LZMA-compressed * data is read * * @param uncompSize uncompressed size of the LZMA stream or -1 * if the end marker is used in the LZMA stream * * @param propsByte LZMA properties byte that has the encoded * values for literal context bits (lc), literal * position bits (lp), and position bits (pb) * * @param dictSize dictionary size as bytes, must be in the range * [<code>0</code>, <code>DICT_SIZE_MAX</code>] * * @param presetDict preset dictionary or <code>null</code> * to use no preset dictionary * * @throws CorruptedInputException * if <code>propsByte</code> is invalid or * the first input byte is not 0x00 * * @throws UnsupportedOptionsException * dictionary size or uncompressed size is too * big for this implementation * * @throws EOFException file is truncated or corrupt * * @throws IOException may be thrown by <code>in</code> */ public LZMAInputStream(InputStream in, long uncompSize, byte propsByte, int dictSize, byte[] presetDict) throws IOException { initialize(in, uncompSize, propsByte, dictSize, presetDict); } /** * Creates a new input stream that decompresses raw LZMA data (no .lzma * header) from <code>in</code> optionally with a preset dictionary. * * @param in input stream from which LZMA-compressed * data is read * * @param uncompSize uncompressed size of the LZMA stream or -1 * if the end marker is used in the LZMA stream * * @param lc number of literal context bits, must be * in the range [0, 8] * * @param lp number of literal position bits, must be * in the range [0, 4] * * @param pb number position bits, must be * in the range [0, 4] * * @param dictSize dictionary size as bytes, must be in the range * [<code>0</code>, <code>DICT_SIZE_MAX</code>] * * @param presetDict preset dictionary or <code>null</code> * to use no preset dictionary * * @throws CorruptedInputException * if the first input byte is not 0x00 * * @throws EOFException file is truncated or corrupt * * @throws IOException may be thrown by <code>in</code> */ public LZMAInputStream(InputStream in, long uncompSize, int lc, int lp, int pb, int dictSize, byte[] presetDict) throws IOException { initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict); } private void initialize(InputStream in, long uncompSize, byte propsByte, int dictSize, byte[] presetDict) throws IOException { // Validate the uncompressed size since the other "initialize" throws // IllegalArgumentException if uncompSize < -1. if (uncompSize < -1) throw new UnsupportedOptionsException( "Uncompressed size is too big"); // Decode the properties byte. In contrast to LZMA2, there is no // limit of lc + lp <= 4. int props = propsByte & 0xFF; if (props > (4 * 5 + 4) * 9 + 8) throw new CorruptedInputException("Invalid LZMA properties byte"); int pb = props / (9 * 5); props -= pb * 9 * 5; int lp = props / 9; int lc = props - lp * 9; // Validate the dictionary size since the other "initialize" throws // IllegalArgumentException if dictSize is not supported. if (dictSize < 0 || dictSize > DICT_SIZE_MAX) throw new UnsupportedOptionsException( "LZMA dictionary is too big for this implementation"); initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict); } private void initialize(InputStream in, long uncompSize, int lc, int lp, int pb, int dictSize, byte[] presetDict) throws IOException { // getDictSize validates dictSize and gives a message in // the exception too, so skip validating dictSize here. if (uncompSize < -1 || lc < 0 || lc > 8 || lp < 0 || lp > 4 || pb < 0 || pb > 4) throw new IllegalArgumentException(); this.in = in; // If uncompressed size is known, use it to avoid wasting memory for // a uselessly large dictionary buffer. dictSize = getDictSize(dictSize); if (uncompSize >= 0 && dictSize > uncompSize) dictSize = getDictSize((int)uncompSize); lz = new LZDecoder(getDictSize(dictSize), presetDict); rc = new RangeDecoderFromStream(in); lzma = new LZMADecoder(lz, rc, lc, lp, pb); remainingSize = uncompSize; } /** * Decompresses the next byte from this input stream. * <p> * Reading lots of data with <code>read()</code> from this input stream * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code> * if you need to read lots of data one byte at a time. * * @return the next decompressed byte, or <code>-1</code> * to indicate the end of the compressed stream * * @throws CorruptedInputException * * @throws XZIOException if the stream has been closed * * @throws EOFException * compressed input is truncated or corrupt * * @throws IOException may be thrown by <code>in</code> */ public int read() throws IOException { return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF); } /** * Decompresses into an array of bytes. * <p> * If <code>len</code> is zero, no bytes are read and <code>0</code> * is returned. Otherwise this will block until <code>len</code> * bytes have been decompressed, the end of the LZMA stream is reached, * or an exception is thrown. * * @param buf target buffer for uncompressed data * @param off start offset in <code>buf</code> * @param len maximum number of uncompressed bytes to read * * @return number of bytes read, or <code>-1</code> to indicate * the end of the compressed stream * * @throws CorruptedInputException * * @throws XZIOException if the stream has been closed * * @throws EOFException compressed input is truncated or corrupt * * @throws IOException may be thrown by <code>in</code> */ public int read(byte[] buf, int off, int len) throws IOException { if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) throw new IndexOutOfBoundsException(); if (len == 0) return 0; if (in == null) throw new XZIOException("Stream closed"); if (exception != null) throw exception; if (endReached) return -1; try { int size = 0; while (len > 0) { // If uncompressed size is known and thus no end marker will // be present, set the limit so that the uncompressed size // won't be exceeded. int copySizeMax = len; if (remainingSize >= 0 && remainingSize < len) copySizeMax = (int)remainingSize; lz.setLimit(copySizeMax); // Decode into the dictionary buffer. try { lzma.decode(); } catch (CorruptedInputException e) { // The end marker is encoded with a LZMA symbol that // indicates maximum match distance. This is larger // than any supported dictionary and thus causes // CorruptedInputException from LZDecoder.repeat. if (remainingSize != -1 || !lzma.endMarkerDetected()) throw e; endReached = true; // The exception makes lzma.decode() miss the last range // decoder normalization, so do it here. This might // cause an IOException if it needs to read a byte // from the input stream. rc.normalize(); } // Copy from the dictionary to buf. int copiedSize = lz.flush(buf, off); off += copiedSize; len -= copiedSize; size += copiedSize; if (remainingSize >= 0) { // Update the number of bytes left to be decompressed. remainingSize -= copiedSize; assert remainingSize >= 0; if (remainingSize == 0) endReached = true; } if (endReached) { // Checking these helps a lot when catching corrupt // or truncated .lzma files. LZMA Utils doesn't do // the first check and thus it accepts many invalid // files that this implementation and XZ Utils don't. if (!rc.isFinished() || lz.hasPending()) throw new CorruptedInputException(); return size == 0 ? -1 : size; } } return size; } catch (IOException e) { exception = e; throw e; } } /** * Closes the stream and calls <code>in.close()</code>. * If the stream was already closed, this does nothing. * * @throws IOException if thrown by <code>in.close()</code> */ public void close() throws IOException { if (in != null) { try { in.close(); } finally { in = null; } } } }