1 | /*
|
---|
2 | * Copyright 2004-2011 H2 Group. Multiple-Licensed under the H2 License,
|
---|
3 | * Version 1.0, and under the Eclipse Public License, Version 1.0
|
---|
4 | * (http://h2database.com/html/license.html).
|
---|
5 | *
|
---|
6 | * This code is based on the LZF algorithm from Marc Lehmann. It is a
|
---|
7 | * re-implementation of the C code:
|
---|
8 | * http://cvs.schmorp.de/liblzf/lzf_c.c?view=markup
|
---|
9 | * http://cvs.schmorp.de/liblzf/lzf_d.c?view=markup
|
---|
10 | *
|
---|
11 | * According to a mail from Marc Lehmann, it's OK to use his algorithm:
|
---|
12 | * Date: 2010-07-15 15:57
|
---|
13 | * Subject: Re: Question about LZF licensing
|
---|
14 | * ...
|
---|
15 | * The algorithm is not copyrighted (and cannot be copyrighted afaik) - as long
|
---|
16 | * as you wrote everything yourself, without copying my code, that's just fine
|
---|
17 | * (looking is of course fine too).
|
---|
18 | * ...
|
---|
19 | *
|
---|
20 | * Still I would like to keep his copyright info:
|
---|
21 | *
|
---|
22 | * Copyright (c) 2000-2005 Marc Alexander Lehmann <schmorp@schmorp.de>
|
---|
23 | * Copyright (c) 2005 Oren J. Maurice <oymaurice@hazorea.org.il>
|
---|
24 | *
|
---|
25 | * Redistribution and use in source and binary forms, with or without
|
---|
26 | * modification, are permitted provided that the following conditions are met:
|
---|
27 | *
|
---|
28 | * 1. Redistributions of source code must retain the above copyright notice,
|
---|
29 | * this list of conditions and the following disclaimer.
|
---|
30 | *
|
---|
31 | * 2. Redistributions in binary form must reproduce the above copyright
|
---|
32 | * notice, this list of conditions and the following disclaimer in the
|
---|
33 | * documentation and/or other materials provided with the distribution.
|
---|
34 | *
|
---|
35 | * 3. The name of the author may not be used to endorse or promote products
|
---|
36 | * derived from this software without specific prior written permission.
|
---|
37 | *
|
---|
38 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED
|
---|
39 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
---|
40 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
---|
41 | * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
---|
42 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
---|
43 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
---|
44 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
---|
45 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
---|
46 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
---|
47 | * OF THE POSSIBILITY OF SUCH DAMAGE.
|
---|
48 | */
|
---|
49 |
|
---|
50 | package org.mapdb;
|
---|
51 |
|
---|
52 | import java.io.DataInput;
|
---|
53 | import java.io.DataOutput;
|
---|
54 | import java.io.IOException;
|
---|
55 | import java.io.Serializable;
|
---|
56 | import java.util.Arrays;
|
---|
57 |
|
---|
58 | /**
|
---|
59 | * <p>
|
---|
60 | * This class implements the LZF lossless data compression algorithm. LZF is a
|
---|
61 | * Lempel-Ziv variant with byte-aligned output, and optimized for speed.
|
---|
62 | * </p>
|
---|
63 | * <p>
|
---|
64 | * Safety/Use Notes:
|
---|
65 | * </p>
|
---|
66 | * <ul>
|
---|
67 | * <li>Each instance should be used by a single thread only.</li>
|
---|
68 | * <li>The data buffers should be smaller than 1 GB.</li>
|
---|
69 | * <li>For performance reasons, safety checks on expansion are omitted.</li>
|
---|
70 | * <li>Invalid compressed data can cause an ArrayIndexOutOfBoundsException.</li>
|
---|
71 | * </ul>
|
---|
72 | * <p>
|
---|
73 | * The LZF compressed format knows literal runs and back-references:
|
---|
74 | * </p>
|
---|
75 | * <ul>
|
---|
76 | * <li>Literal run: directly copy bytes from input to output.</li>
|
---|
77 | * <li>Back-reference: copy previous data to output stream, with specified
|
---|
78 | * offset from location and length. The length is at least 3 bytes.</li>
|
---|
79 | * </ul>
|
---|
80 | *<p>
|
---|
81 | * The first byte of the compressed stream is the control byte. For literal
|
---|
82 | * runs, the highest three bits of the control byte are not set, the the lower
|
---|
83 | * bits are the literal run length, and the next bytes are data to copy directly
|
---|
84 | * into the output. For back-references, the highest three bits of the control
|
---|
85 | * byte are the back-reference length. If all three bits are set, then the
|
---|
86 | * back-reference length is stored in the next byte. The lower bits of the
|
---|
87 | * control byte combined with the next byte form the offset for the
|
---|
88 | * back-reference.
|
---|
89 | * </p>
|
---|
90 | */
|
---|
91 | public final class CompressLZF{
|
---|
92 |
|
---|
93 | /**
|
---|
94 | * The number of entries in the hash table. The size is a trade-off between
|
---|
95 | * hash collisions (reduced compression) and speed (amount that fits in CPU
|
---|
96 | * cache).
|
---|
97 | */
|
---|
98 | private static final int HASH_SIZE = 1 << 14;
|
---|
99 |
|
---|
100 | /**
|
---|
101 | * The maximum number of literals in a chunk (32).
|
---|
102 | */
|
---|
103 | private static final int MAX_LITERAL = 1 << 5;
|
---|
104 |
|
---|
105 | /**
|
---|
106 | * The maximum offset allowed for a back-reference (8192).
|
---|
107 | */
|
---|
108 | private static final int MAX_OFF = 1 << 13;
|
---|
109 |
|
---|
110 | /**
|
---|
111 | * The maximum back-reference length (264).
|
---|
112 | */
|
---|
113 | private static final int MAX_REF = (1 << 8) + (1 << 3);
|
---|
114 |
|
---|
115 | /**
|
---|
116 | * Hash table for matching byte sequences (reused for performance).
|
---|
117 | */
|
---|
118 | private int[] cachedHashTable;
|
---|
119 |
|
---|
120 | /**
|
---|
121 | * Return byte with lower 2 bytes being byte at index, then index+1.
|
---|
122 | */
|
---|
123 | private static int first(byte[] in, int inPos) {
|
---|
124 | return (in[inPos] << 8) | (in[inPos + 1] & 255);
|
---|
125 | }
|
---|
126 |
|
---|
127 | /**
|
---|
128 | * Shift v 1 byte left, add value at index inPos+2.
|
---|
129 | */
|
---|
130 | private static int next(int v, byte[] in, int inPos) {
|
---|
131 | return (v << 8) | (in[inPos + 2] & 255);
|
---|
132 | }
|
---|
133 |
|
---|
134 | /**
|
---|
135 | * Compute the address in the hash table.
|
---|
136 | */
|
---|
137 | private static int hash(int h) {
|
---|
138 | return ((h * 2777) >> 9) & (HASH_SIZE - 1);
|
---|
139 | }
|
---|
140 |
|
---|
141 | public int compress(byte[] in, int inLen, byte[] out, int outPos) {
|
---|
142 | int inPos = 0;
|
---|
143 | if (cachedHashTable == null) {
|
---|
144 | cachedHashTable = new int[HASH_SIZE];
|
---|
145 | }
|
---|
146 | int[] hashTab = cachedHashTable;
|
---|
147 | int literals = 0;
|
---|
148 | outPos++;
|
---|
149 | int future = first(in, 0);
|
---|
150 | while (inPos < inLen - 4) {
|
---|
151 | byte p2 = in[inPos + 2];
|
---|
152 | // next
|
---|
153 | future = (future << 8) + (p2 & 255);
|
---|
154 | int off = hash(future);
|
---|
155 | int ref = hashTab[off];
|
---|
156 | hashTab[off] = inPos;
|
---|
157 | // if (ref < inPos
|
---|
158 | // && ref > 0
|
---|
159 | // && (off = inPos - ref - 1) < MAX_OFF
|
---|
160 | // && in[ref + 2] == p2
|
---|
161 | // && (((in[ref] & 255) << 8) | (in[ref + 1] & 255)) ==
|
---|
162 | // ((future >> 8) & 0xffff)) {
|
---|
163 | if (ref < inPos
|
---|
164 | && ref > 0
|
---|
165 | && (off = inPos - ref - 1) < MAX_OFF
|
---|
166 | && in[ref + 2] == p2
|
---|
167 | && in[ref + 1] == (byte) (future >> 8)
|
---|
168 | && in[ref] == (byte) (future >> 16)) {
|
---|
169 | // match
|
---|
170 | int maxLen = inLen - inPos - 2;
|
---|
171 | if (maxLen > MAX_REF) {
|
---|
172 | maxLen = MAX_REF;
|
---|
173 | }
|
---|
174 | if (literals == 0) {
|
---|
175 | // multiple back-references,
|
---|
176 | // so there is no literal run control byte
|
---|
177 | outPos--;
|
---|
178 | } else {
|
---|
179 | // set the control byte at the start of the literal run
|
---|
180 | // to store the number of literals
|
---|
181 | out[outPos - literals - 1] = (byte) (literals - 1);
|
---|
182 | literals = 0;
|
---|
183 | }
|
---|
184 | int len = 3;
|
---|
185 | while (len < maxLen && in[ref + len] == in[inPos + len]) {
|
---|
186 | len++;
|
---|
187 | }
|
---|
188 | len -= 2;
|
---|
189 | if (len < 7) {
|
---|
190 | out[outPos++] = (byte) ((off >> 8) + (len << 5));
|
---|
191 | } else {
|
---|
192 | out[outPos++] = (byte) ((off >> 8) + (7 << 5));
|
---|
193 | out[outPos++] = (byte) (len - 7);
|
---|
194 | }
|
---|
195 | out[outPos++] = (byte) off;
|
---|
196 | // move one byte forward to allow for a literal run control byte
|
---|
197 | outPos++;
|
---|
198 | inPos += len;
|
---|
199 | // rebuild the future, and store the last bytes to the hashtable.
|
---|
200 | // Storing hashes of the last bytes in back-reference improves
|
---|
201 | // the compression ratio and only reduces speed slightly.
|
---|
202 | future = first(in, inPos);
|
---|
203 | future = next(future, in, inPos);
|
---|
204 | hashTab[hash(future)] = inPos++;
|
---|
205 | future = next(future, in, inPos);
|
---|
206 | hashTab[hash(future)] = inPos++;
|
---|
207 | } else {
|
---|
208 | // copy one byte from input to output as part of literal
|
---|
209 | out[outPos++] = in[inPos++];
|
---|
210 | literals++;
|
---|
211 | // at the end of this literal chunk, write the length
|
---|
212 | // to the control byte and start a new chunk
|
---|
213 | if (literals == MAX_LITERAL) {
|
---|
214 | out[outPos - literals - 1] = (byte) (literals - 1);
|
---|
215 | literals = 0;
|
---|
216 | // move ahead one byte to allow for the
|
---|
217 | // literal run control byte
|
---|
218 | outPos++;
|
---|
219 | }
|
---|
220 | }
|
---|
221 | }
|
---|
222 | // write the remaining few bytes as literals
|
---|
223 | while (inPos < inLen) {
|
---|
224 | out[outPos++] = in[inPos++];
|
---|
225 | literals++;
|
---|
226 | if (literals == MAX_LITERAL) {
|
---|
227 | out[outPos - literals - 1] = (byte) (literals - 1);
|
---|
228 | literals = 0;
|
---|
229 | outPos++;
|
---|
230 | }
|
---|
231 | }
|
---|
232 | // writes the final literal run length to the control byte
|
---|
233 | out[outPos - literals - 1] = (byte) (literals - 1);
|
---|
234 | if (literals == 0) {
|
---|
235 | outPos--;
|
---|
236 | }
|
---|
237 | return outPos;
|
---|
238 | }
|
---|
239 |
|
---|
240 | public void expand(byte[] in, int inPos, int inLen, byte[] out, int outPos, int outLen) {
|
---|
241 | // if ((inPos | outPos | outLen) < 0) {
|
---|
242 | if (inPos < 0 || outPos < 0 || outLen < 0) {
|
---|
243 | throw new IllegalArgumentException();
|
---|
244 | }
|
---|
245 | do {
|
---|
246 | int ctrl = in[inPos++] & 255;
|
---|
247 | if (ctrl < MAX_LITERAL) {
|
---|
248 | // literal run of length = ctrl + 1,
|
---|
249 | ctrl++;
|
---|
250 | // copy to output and move forward this many bytes
|
---|
251 | System.arraycopy(in, inPos, out, outPos, ctrl);
|
---|
252 | outPos += ctrl;
|
---|
253 | inPos += ctrl;
|
---|
254 | } else {
|
---|
255 | // back reference
|
---|
256 | // the highest 3 bits are the match length
|
---|
257 | int len = ctrl >> 5;
|
---|
258 | // if the length is maxed, add the next byte to the length
|
---|
259 | if (len == 7) {
|
---|
260 | len += in[inPos++] & 255;
|
---|
261 | }
|
---|
262 | // minimum back-reference is 3 bytes,
|
---|
263 | // so 2 was subtracted before storing size
|
---|
264 | len += 2;
|
---|
265 |
|
---|
266 | // ctrl is now the offset for a back-reference...
|
---|
267 | // the logical AND operation removes the length bits
|
---|
268 | ctrl = -((ctrl & 0x1f) << 8) - 1;
|
---|
269 |
|
---|
270 | // the next byte augments/increases the offset
|
---|
271 | ctrl -= in[inPos++] & 255;
|
---|
272 |
|
---|
273 | // copy the back-reference bytes from the given
|
---|
274 | // location in output to current position
|
---|
275 | ctrl += outPos;
|
---|
276 | if (outPos + len >= out.length) {
|
---|
277 | // reduce array bounds checking
|
---|
278 | throw new ArrayIndexOutOfBoundsException();
|
---|
279 | }
|
---|
280 | for (int i = 0; i < len; i++) {
|
---|
281 | out[outPos++] = out[ctrl++];
|
---|
282 | }
|
---|
283 | }
|
---|
284 | } while (outPos < outLen);
|
---|
285 | }
|
---|
286 |
|
---|
287 |
|
---|
288 | public static final Serializer<byte[]> SERIALIZER = new Serializer<byte[]>() {
|
---|
289 |
|
---|
290 | final ThreadLocal<CompressLZF> LZF = new ThreadLocal<CompressLZF>() {
|
---|
291 | @Override
|
---|
292 | protected CompressLZF initialValue() {
|
---|
293 | return new CompressLZF();
|
---|
294 | }
|
---|
295 | };
|
---|
296 |
|
---|
297 | @Override
|
---|
298 | public void serialize(DataOutput out, byte[] value) throws IOException {
|
---|
299 | if (value == null) return;
|
---|
300 |
|
---|
301 | CompressLZF lzf = LZF.get();
|
---|
302 | byte[] outbuf = new byte[value.length + 40];
|
---|
303 | int len = lzf.compress(value, value.length, outbuf, 0);
|
---|
304 | //check if compressed data are larger then original
|
---|
305 | if (value.length <= len) {
|
---|
306 | //in this case do not compress data, write 0 as indicator
|
---|
307 | Utils.packInt(out, 0);
|
---|
308 | out.write(value);
|
---|
309 | } else {
|
---|
310 | Utils.packInt(out, value.length); //write original decompressed size
|
---|
311 | out.write(outbuf, 0, len);
|
---|
312 | }
|
---|
313 | }
|
---|
314 |
|
---|
315 | @Override
|
---|
316 | public byte[] deserialize(DataInput in, int available) throws IOException {
|
---|
317 | if (available == 0) return null;
|
---|
318 | //get original decompressed size
|
---|
319 | DataInput2 in2 = (DataInput2) in;
|
---|
320 | int origPos = in2.pos;
|
---|
321 | int expendedLen = Utils.unpackInt(in);
|
---|
322 | byte[] inbuf = new byte[available - (in2.pos - origPos)];
|
---|
323 | in.readFully(inbuf);
|
---|
324 | if (expendedLen == 0) {
|
---|
325 | //special case, data are not compressed
|
---|
326 | return inbuf;
|
---|
327 | }
|
---|
328 | byte[] outbuf = new byte[expendedLen + 40];
|
---|
329 |
|
---|
330 | CompressLZF lzf = LZF.get();
|
---|
331 | lzf.expand(inbuf, 0, inbuf.length, outbuf, 0, expendedLen);
|
---|
332 | outbuf = Arrays.copyOf(outbuf, expendedLen);
|
---|
333 |
|
---|
334 | return outbuf;
|
---|
335 | }
|
---|
336 |
|
---|
337 |
|
---|
338 | };
|
---|
339 |
|
---|
340 | /**
|
---|
341 | * Wraps existing serializer and compresses its input/output
|
---|
342 | */
|
---|
343 | public static <E> Serializer<E> serializerCompressWrapper(Serializer<E> serializer) {
|
---|
344 | return new SerializerCompressWrapper<E>(serializer);
|
---|
345 | }
|
---|
346 |
|
---|
347 |
|
---|
348 | protected static class SerializerCompressWrapper<E> implements Serializer<E>, Serializable {
|
---|
349 | protected final Serializer<E> serializer;
|
---|
350 | public SerializerCompressWrapper(Serializer<E> serializer) {
|
---|
351 | this.serializer = serializer;
|
---|
352 | }
|
---|
353 |
|
---|
354 | @Override
|
---|
355 | public void serialize(DataOutput out, E value) throws IOException {
|
---|
356 | //serialize to byte[]
|
---|
357 | DataOutput2 out2 = new DataOutput2();
|
---|
358 | serializer.serialize(out2, value);
|
---|
359 | byte[] b = out2.copyBytes();
|
---|
360 | CompressLZF.SERIALIZER.serialize(out, b);
|
---|
361 | }
|
---|
362 |
|
---|
363 | @Override
|
---|
364 | public E deserialize(DataInput in, int available) throws IOException {
|
---|
365 | byte[] b = CompressLZF.SERIALIZER.deserialize(in, available);
|
---|
366 | DataInput2 in2 = new DataInput2(b);
|
---|
367 | return serializer.deserialize(in2, b.length);
|
---|
368 | }
|
---|
369 | }
|
---|
370 |
|
---|
371 | }
|
---|