1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package org.waarp.compress.zstdsafe;
35
36 import static org.waarp.compress.zstdsafe.Constants.*;
37 import static org.waarp.compress.zstdsafe.FiniteStateEntropy.*;
38 import static org.waarp.compress.zstdsafe.UnsafeUtil.*;
39 import static org.waarp.compress.zstdsafe.Util.*;
40
41 class SequenceEncoder {
42 private static final int DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS_LOG = 6;
43 private static final short[] DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS = {
44 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
45 3, 2, 1, 1, 1, 1, 1, -1, -1, -1, -1
46 };
47
48 private static final int DEFAULT_MATCH_LENGTH_NORMALIZED_COUNTS_LOG = 6;
49 private static final short[] DEFAULT_MATCH_LENGTH_NORMALIZED_COUNTS = {
50 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1,
52 -1, -1, -1, -1
53 };
54
55 private static final int DEFAULT_OFFSET_NORMALIZED_COUNTS_LOG = 5;
56 private static final short[] DEFAULT_OFFSET_NORMALIZED_COUNTS = {
57 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58 -1, -1, -1, -1, -1
59 };
60
61 private static final FseCompressionTable DEFAULT_LITERAL_LENGTHS_TABLE =
62 FseCompressionTable.newInstance(DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS,
63 MAX_LITERALS_LENGTH_SYMBOL,
64 DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS_LOG);
65 private static final FseCompressionTable DEFAULT_MATCH_LENGTHS_TABLE =
66 FseCompressionTable.newInstance(DEFAULT_MATCH_LENGTH_NORMALIZED_COUNTS,
67 MAX_MATCH_LENGTH_SYMBOL,
68 DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS_LOG);
69 private static final FseCompressionTable DEFAULT_OFFSETS_TABLE =
70 FseCompressionTable.newInstance(DEFAULT_OFFSET_NORMALIZED_COUNTS,
71 DEFAULT_MAX_OFFSET_CODE_SYMBOL,
72 DEFAULT_OFFSET_NORMALIZED_COUNTS_LOG);
73 public static final String NOT_YET_IMPLEMENTED = "not yet implemented";
74
75 private SequenceEncoder() {
76 }
77
78 public static int compressSequences(final byte[] outputBase,
79 final int outputAddress,
80 final int outputSize,
81 final SequenceStore sequences,
82 final CompressionParameters.Strategy strategy,
83 final SequenceEncodingContext workspace) {
84 int output = outputAddress;
85 final int outputLimit = outputAddress + outputSize;
86
87 checkArgument(outputLimit - output >
88 3 + 1 ,
89 "Output buffer too small");
90
91 final int sequenceCount = sequences.sequenceCount;
92 if (sequenceCount < 0x7F) {
93 outputBase[output] = (byte) sequenceCount;
94 output++;
95 } else if (sequenceCount < LONG_NUMBER_OF_SEQUENCES) {
96 outputBase[output] = (byte) (sequenceCount >>> 8 | 0x80);
97 outputBase[output + 1] = (byte) sequenceCount;
98 output += SIZE_OF_SHORT;
99 } else {
100 outputBase[output] = (byte) 0xFF;
101 output++;
102 putShort(outputBase, output,
103 (short) (sequenceCount - LONG_NUMBER_OF_SEQUENCES));
104 output += SIZE_OF_SHORT;
105 }
106
107 if (sequenceCount == 0) {
108 return output - outputAddress;
109 }
110
111
112 final int headerAddress = output++;
113
114 int maxSymbol;
115 int largestCount;
116
117
118 final int[] counts = workspace.counts;
119 Histogram.count(sequences.literalLengthCodes, sequenceCount,
120 workspace.counts);
121 maxSymbol = Histogram.findMaxSymbol(counts, MAX_LITERALS_LENGTH_SYMBOL);
122 largestCount = Histogram.findLargestCount(counts, maxSymbol);
123
124 final int literalsLengthEncodingType =
125 selectEncodingType(largestCount, sequenceCount,
126 DEFAULT_LITERAL_LENGTH_NORMALIZED_COUNTS_LOG, true,
127 strategy);
128
129 final FseCompressionTable literalLengthTable;
130 switch (literalsLengthEncodingType) {
131 case SEQUENCE_ENCODING_RLE:
132 outputBase[output] = sequences.literalLengthCodes[0];
133 output++;
134 workspace.literalLengthTable.initializeRleTable(maxSymbol);
135 literalLengthTable = workspace.literalLengthTable;
136 break;
137 case SEQUENCE_ENCODING_BASIC:
138 literalLengthTable = DEFAULT_LITERAL_LENGTHS_TABLE;
139 break;
140 case SEQUENCE_ENCODING_COMPRESSED:
141 output +=
142 buildCompressionTable(workspace.literalLengthTable, outputBase,
143 output, outputLimit, sequenceCount,
144 LITERAL_LENGTH_TABLE_LOG,
145 sequences.literalLengthCodes,
146 workspace.counts, maxSymbol,
147 workspace.normalizedCounts);
148 literalLengthTable = workspace.literalLengthTable;
149 break;
150 default:
151 throw new UnsupportedOperationException(NOT_YET_IMPLEMENTED);
152 }
153
154
155 Histogram.count(sequences.offsetCodes, sequenceCount, workspace.counts);
156 maxSymbol = Histogram.findMaxSymbol(counts, MAX_OFFSET_CODE_SYMBOL);
157 largestCount = Histogram.findLargestCount(counts, maxSymbol);
158
159
160 final boolean defaultAllowed = maxSymbol < DEFAULT_MAX_OFFSET_CODE_SYMBOL;
161
162 final int offsetEncodingType =
163 selectEncodingType(largestCount, sequenceCount,
164 DEFAULT_OFFSET_NORMALIZED_COUNTS_LOG, defaultAllowed,
165 strategy);
166
167 final FseCompressionTable offsetCodeTable;
168 switch (offsetEncodingType) {
169 case SEQUENCE_ENCODING_RLE:
170 outputBase[output] = sequences.offsetCodes[0];
171 output++;
172 workspace.offsetCodeTable.initializeRleTable(maxSymbol);
173 offsetCodeTable = workspace.offsetCodeTable;
174 break;
175 case SEQUENCE_ENCODING_BASIC:
176 offsetCodeTable = DEFAULT_OFFSETS_TABLE;
177 break;
178 case SEQUENCE_ENCODING_COMPRESSED:
179 output +=
180 buildCompressionTable(workspace.offsetCodeTable, outputBase, output,
181 output + outputSize, sequenceCount,
182 OFFSET_TABLE_LOG, sequences.offsetCodes,
183 workspace.counts, maxSymbol,
184 workspace.normalizedCounts);
185 offsetCodeTable = workspace.offsetCodeTable;
186 break;
187 default:
188 throw new UnsupportedOperationException(NOT_YET_IMPLEMENTED);
189 }
190
191
192 Histogram.count(sequences.matchLengthCodes, sequenceCount,
193 workspace.counts);
194 maxSymbol = Histogram.findMaxSymbol(counts, MAX_MATCH_LENGTH_SYMBOL);
195 largestCount = Histogram.findLargestCount(counts, maxSymbol);
196
197 final int matchLengthEncodingType =
198 selectEncodingType(largestCount, sequenceCount,
199 DEFAULT_MATCH_LENGTH_NORMALIZED_COUNTS_LOG, true,
200 strategy);
201
202 final FseCompressionTable matchLengthTable;
203 switch (matchLengthEncodingType) {
204 case SEQUENCE_ENCODING_RLE:
205 outputBase[output] = sequences.matchLengthCodes[0];
206 output++;
207 workspace.matchLengthTable.initializeRleTable(maxSymbol);
208 matchLengthTable = workspace.matchLengthTable;
209 break;
210 case SEQUENCE_ENCODING_BASIC:
211 matchLengthTable = DEFAULT_MATCH_LENGTHS_TABLE;
212 break;
213 case SEQUENCE_ENCODING_COMPRESSED:
214 output += buildCompressionTable(workspace.matchLengthTable, outputBase,
215 output, outputLimit, sequenceCount,
216 MATCH_LENGTH_TABLE_LOG,
217 sequences.matchLengthCodes,
218 workspace.counts, maxSymbol,
219 workspace.normalizedCounts);
220 matchLengthTable = workspace.matchLengthTable;
221 break;
222 default:
223 throw new UnsupportedOperationException(NOT_YET_IMPLEMENTED);
224 }
225
226
227 outputBase[headerAddress] =
228 (byte) ((literalsLengthEncodingType << 6) | (offsetEncodingType << 4) |
229 (matchLengthEncodingType << 2));
230
231 output += encodeSequences(outputBase, output, outputLimit, matchLengthTable,
232 offsetCodeTable, literalLengthTable, sequences);
233
234 return output - outputAddress;
235 }
236
237 private static int buildCompressionTable(final FseCompressionTable table,
238 final byte[] outputBase,
239 final int output,
240 final int outputLimit,
241 int sequenceCount,
242 final int maxTableLog,
243 final byte[] codes,
244 final int[] counts,
245 final int maxSymbol,
246 final short[] normalizedCounts) {
247 final int tableLog = optimalTableLog(maxTableLog, sequenceCount, maxSymbol);
248
249
250
251 if (counts[codes[sequenceCount - 1]] > 1) {
252 counts[codes[sequenceCount - 1]]--;
253 sequenceCount--;
254 }
255
256 FiniteStateEntropy.normalizeCounts(normalizedCounts, tableLog, counts,
257 sequenceCount, maxSymbol);
258 table.initialize(normalizedCounts, maxSymbol, tableLog);
259
260 return FiniteStateEntropy.writeNormalizedCounts(outputBase, output,
261 outputLimit - output,
262 normalizedCounts, maxSymbol,
263 tableLog);
264 }
265
266 private static int encodeSequences(final byte[] outputBase, final int output,
267 final int outputLimit,
268 final FseCompressionTable matchLengthTable,
269 final FseCompressionTable offsetsTable,
270 final FseCompressionTable literalLengthTable,
271 final SequenceStore sequences) {
272 final byte[] matchLengthCodes = sequences.matchLengthCodes;
273 final byte[] offsetCodes = sequences.offsetCodes;
274 final byte[] literalLengthCodes = sequences.literalLengthCodes;
275
276 final BitOutputStream blockStream =
277 new BitOutputStream(outputBase, output, outputLimit - output);
278
279 final int sequenceCount = sequences.sequenceCount;
280
281
282 int matchLengthState =
283 matchLengthTable.begin(matchLengthCodes[sequenceCount - 1]);
284 int offsetState = offsetsTable.begin(offsetCodes[sequenceCount - 1]);
285 int literalLengthState =
286 literalLengthTable.begin(literalLengthCodes[sequenceCount - 1]);
287
288 blockStream.addBits(sequences.literalLengths[sequenceCount - 1],
289 LITERALS_LENGTH_BITS[literalLengthCodes[sequenceCount -
290 1]]);
291 blockStream.addBits(sequences.matchLengths[sequenceCount - 1],
292 MATCH_LENGTH_BITS[matchLengthCodes[sequenceCount - 1]]);
293 blockStream.addBits(sequences.offsets[sequenceCount - 1],
294 offsetCodes[sequenceCount - 1]);
295 blockStream.flush();
296
297 if (sequenceCount >= 2) {
298 for (int n = sequenceCount - 2; n >= 0; n--) {
299 final byte literalLengthCode = literalLengthCodes[n];
300 final byte offsetCode = offsetCodes[n];
301 final byte matchLengthCode = matchLengthCodes[n];
302
303 final int literalLengthBits = LITERALS_LENGTH_BITS[literalLengthCode];
304 final int matchLengthBits = MATCH_LENGTH_BITS[matchLengthCode];
305
306
307 offsetState =
308 offsetsTable.encode(blockStream, offsetState, offsetCode);
309 matchLengthState =
310 matchLengthTable.encode(blockStream, matchLengthState,
311 matchLengthCode);
312 literalLengthState =
313 literalLengthTable.encode(blockStream, literalLengthState,
314 literalLengthCode);
315
316 if (((int) offsetCode + matchLengthBits + literalLengthBits >= 64 - 7 -
317 (LITERAL_LENGTH_TABLE_LOG +
318 MATCH_LENGTH_TABLE_LOG +
319 OFFSET_TABLE_LOG))) {
320 blockStream.flush();
321 }
322
323 blockStream.addBits(sequences.literalLengths[n], literalLengthBits);
324 if (((literalLengthBits + matchLengthBits) > 24)) {
325 blockStream.flush();
326 }
327
328 blockStream.addBits(sequences.matchLengths[n], matchLengthBits);
329 if (((int) offsetCode + matchLengthBits + literalLengthBits > 56)) {
330 blockStream.flush();
331 }
332
333 blockStream.addBits(sequences.offsets[n], offsetCode);
334 blockStream.flush();
335 }
336 }
337
338 matchLengthTable.finish(blockStream, matchLengthState);
339 offsetsTable.finish(blockStream, offsetState);
340 literalLengthTable.finish(blockStream, literalLengthState);
341
342 final int streamSize = blockStream.close();
343 checkArgument(streamSize > 0, "Output buffer too small");
344
345 return streamSize;
346 }
347
348 private static int selectEncodingType(final int largestCount,
349 final int sequenceCount,
350 final int defaultNormalizedCountsLog,
351 final boolean isDefaultTableAllowed,
352 final CompressionParameters.Strategy strategy) {
353 if (largestCount == sequenceCount) {
354 if (isDefaultTableAllowed && sequenceCount <= 2) {
355
356
357
358
359 return SEQUENCE_ENCODING_BASIC;
360 }
361
362 return SEQUENCE_ENCODING_RLE;
363 }
364
365 if (strategy.ordinal() < CompressionParameters.Strategy.LAZY.ordinal()) {
366
367 if (isDefaultTableAllowed) {
368 final int factor =
369 10 - strategy.ordinal();
370 final int baseLog = 3;
371 final int minNumberOfSequences =
372 ((1 << defaultNormalizedCountsLog) * factor) >>
373 baseLog;
374
375 if ((sequenceCount < minNumberOfSequences) || (largestCount <
376 (sequenceCount >>
377 (defaultNormalizedCountsLog -
378 1)))) {
379
380
381
382
383
384
385 return SEQUENCE_ENCODING_BASIC;
386 }
387 }
388 } else {
389
390 throw new UnsupportedOperationException(NOT_YET_IMPLEMENTED);
391 }
392
393 return SEQUENCE_ENCODING_COMPRESSED;
394 }
395 }