1 package com.fasterxml.jackson.core.sym;
2
3 import java.util.Arrays;
4 import java.util.concurrent.atomic.AtomicReference;
5
6 import com.fasterxml.jackson.core.JsonFactory;
7 import com.fasterxml.jackson.core.util.InternCache;
8
9 /**
10 * Replacement for <code>BytesToNameCanonicalizer</code> which aims at more localized
11 * memory access due to flattening of name quad data.
12 * Performance improvement modest for simple JSON document data binding (maybe 3%),
13 * but should help more for larger symbol tables, or for binary formats like Smile.
14 *<p>
15 * Hash area is divided into 4 sections:
16 *<ol>
17 * <li>Primary area (1/2 of total size), direct match from hash (LSB)</li>
18 * <li>Secondary area (1/4 of total size), match from {@code hash (LSB) >> 1}</li>
19 * <li>Tertiary area (1/8 of total size), match from {@code hash (LSB) >> 2}</li>
20 * <li>Spill-over area (remaining 1/8) with linear scan, insertion order</li>
 * </ol>
23 * and within every area, entries are 4 {@code int}s, where 1 - 3 ints contain 1 - 12
24 * UTF-8 encoded bytes of name (null-padded), and last int is offset in
25 * {@code _names} that contains actual name Strings.
26 *
27 * @since 2.6
28 */
29 public final class ByteQuadsCanonicalizer
30 {
31 /**
32 * Initial size of the primary hash area. Each entry consumes 4 ints (16 bytes),
33 * and secondary area is same as primary; so default size will use 2kB of memory
34 * (plus 64x4 or 64x8 (256/512 bytes) for references to Strings, and Strings
35 * themselves).
36 */
37 private static final int DEFAULT_T_SIZE = 64;
38 // private static final int DEFAULT_T_SIZE = 256;
39
40 /**
 * Let's not expand symbol tables past some maximum size;
 * this guards against pathological cases with unique (~= random) names.
 * Size is in primary hash table entries.
44 */
45 private static final int MAX_T_SIZE = 0x10000; // 64k entries == 2M mem hash area
46
47 /**
48 * No point in trying to construct tiny tables, just need to resize soon.
49 */
50 private final static int MIN_HASH_SIZE = 16;
51
52 /**
53 * Let's only share reasonably sized symbol tables. Max size set to 3/4 of 8k;
54 * this corresponds to 256k main hash index. This should allow for enough distinct
55 * names for almost any case, while preventing ballooning for cases where names
56 * are unique (or close thereof).
57 */
58 final static int MAX_ENTRIES_FOR_REUSE = 6000;
59
60 /*
61 /**********************************************************
62 /* Linkage, needed for merging symbol tables
63 /**********************************************************
64 */
65
66 /**
67 * Reference to the root symbol table, for child tables, so
68 * that they can merge table information back as necessary.
69 */
70 final private ByteQuadsCanonicalizer _parent;
71
72 /**
73 * Member that is only used by the root table instance: root
74 * passes immutable state info child instances, and children
75 * may return new state if they add entries to the table.
76 * Child tables do NOT use the reference.
77 */
78 final private AtomicReference<TableInfo> _tableInfo;
79
80 /**
81 * Seed value we use as the base to make hash codes non-static between
82 * different runs, but still stable for lifetime of a single symbol table
83 * instance.
84 * This is done for security reasons, to avoid potential DoS attack via
85 * hash collisions.
86 */
87 final private int _seed;
88
89 /*
90 /**********************************************************
91 /* Configuration
92 /**********************************************************
93 */
94
95 /**
96 * Whether canonical symbol Strings are to be intern()ed before added
97 * to the table or not.
98 *<p>
99 * NOTE: non-final to allow disabling intern()ing in case of excessive
100 * collisions.
101 */
102 private boolean _intern;
103
104 /**
105 * Flag that indicates whether we should throw an exception if enough
106 * hash collisions are detected (true); or just worked around (false).
107 *
108 * @since 2.4
109 */
110 private final boolean _failOnDoS;
111
112 /*
113 /**********************************************************
114 /* First, main hash area info
115 /**********************************************************
116 */
117
118 /**
119 * Primary hash information area: consists of <code>2 * _hashSize</code>
120 * entries of 16 bytes (4 ints), arranged in a cascading lookup
121 * structure (details of which may be tweaked depending on expected rates
122 * of collisions).
123 */
124 private int[] _hashArea;
125
126 /**
127 * Number of slots for primary entries within {@link #_hashArea}; which is
128 * at most <code>1/8</code> of actual size of the underlying array (4-int slots,
129 * primary covers only half of the area; plus, additional area for longer
130 * symbols after hash area).
131 */
132 private int _hashSize;
133
134 /**
135 * Offset within {@link #_hashArea} where secondary entries start
136 */
137 private int _secondaryStart;
138
139 /**
140 * Offset within {@link #_hashArea} where tertiary entries start
141 */
142 private int _tertiaryStart;
143
144 /**
145 * Constant that determines size of buckets for tertiary entries:
146 * <code>1 << _tertiaryShift</code> is the size, and shift value
147 * is also used for translating from primary offset into
148 * tertiary bucket (shift right by <code>4 + _tertiaryShift</code>).
149 *<p>
150 * Default value is 2, for buckets of 4 slots; grows bigger with
151 * bigger table sizes.
152 */
153 private int _tertiaryShift;
154
155 /**
156 * Total number of Strings in the symbol table; only used for child tables.
157 */
158 private int _count;
159
160 /**
161 * Array that contains <code>String</code> instances matching
162 * entries in {@link #_hashArea}.
163 * Contains nulls for unused entries. Note that this size is twice
164 * that of {@link #_hashArea}
165 */
166 private String[] _names;
167
168 /*
169 /**********************************************************
170 /* Then information on collisions etc
171 /**********************************************************
172 */
173
174 /**
175 * Pointer to the offset within spill-over area where there is room
176 * for more spilled over entries (if any).
177 * Spill over area is within fixed-size portion of {@link #_hashArea}.
178 */
179 private int _spilloverEnd;
180
181 /**
182 * Offset within {@link #_hashArea} that follows main slots and contains
183 * quads for longer names (13 bytes or longer), and points to the
184 * first available int that may be used for appending quads of the next
185 * long name.
186 * Note that long name area follows immediately after the fixed-size
187 * main hash area ({@link #_hashArea}).
188 */
189 private int _longNameOffset;
190
191 /*
192 /**********************************************************
193 /* Sharing, versioning
194 /**********************************************************
195 */
196
197 // // // Which of the buffers may be shared (and are copy-on-write)?
198
199 /**
200 * Flag that indicates whether underlying data structures for
201 * the main hash area are shared or not. If they are, then they
202 * need to be handled in copy-on-write way, i.e. if they need
203 * to be modified, a copy needs to be made first; at this point
204 * it will not be shared any more, and can be modified.
205 *<p>
206 * This flag needs to be checked both when adding new main entries,
207 * and when adding new collision list queues (i.e. creating a new
208 * collision list head entry)
209 */
210 private boolean _hashShared;
211
212 /*
213 /**********************************************************
214 /* Life-cycle: constructors
215 /**********************************************************
216 */
217
218 /**
219 * Constructor used for creating per-<code>JsonFactory</code> "root"
220 * symbol tables: ones used for merging and sharing common symbols
221 *
222 * @param sz Initial primary hash area size
223 * @param intern Whether Strings contained should be {@link String#intern}ed
224 * @param seed Random seed valued used to make it more difficult to cause
225 * collisions (used for collision-based DoS attacks).
226 */
227 private ByteQuadsCanonicalizer(int sz, boolean intern, int seed, boolean failOnDoS) {
228 _parent = null;
229 _seed = seed;
230 _intern = intern;
231 _failOnDoS = failOnDoS;
232 // Sanity check: let's now allow hash sizes below certain minimum value
233 if (sz < MIN_HASH_SIZE) {
234 sz = MIN_HASH_SIZE;
235 } else {
236 // Also; size must be 2^N; otherwise hash algorithm won't
237 // work... so let's just pad it up, if so
238 if ((sz & (sz - 1)) != 0) { // only true if it's 2^N
239 int curr = MIN_HASH_SIZE;
240 while (curr < sz) {
241 curr += curr;
242 }
243 sz = curr;
244 }
245 }
246 _tableInfo = new AtomicReference<TableInfo>(TableInfo.createInitial(sz));
247 }
248
249 /**
250 * Constructor used when creating a child instance
251 */
252 private ByteQuadsCanonicalizer(ByteQuadsCanonicalizer parent, boolean intern,
253 int seed, boolean failOnDoS, TableInfo state)
254 {
255 _parent = parent;
256 _seed = seed;
257 _intern = intern;
258 _failOnDoS = failOnDoS;
259 _tableInfo = null; // not used by child tables
260
261 // Then copy shared state
262 _count = state.count;
263 _hashSize = state.size;
264 _secondaryStart = _hashSize << 2; // right after primary area
265 _tertiaryStart = _secondaryStart + (_secondaryStart >> 1); // right after secondary
266 _tertiaryShift = state.tertiaryShift;
267
268 _hashArea = state.mainHash;
269 _names = state.names;
270
271 _spilloverEnd = state.spilloverEnd;
272 _longNameOffset = state.longNameOffset;
273
274 // and then set other state to reflect sharing status
275 _hashShared = true;
276 }
277
278 /*
279 /**********************************************************
280 /* Life-cycle: factory methods, merging
281 /**********************************************************
282 */
283
284 /**
285 * Factory method to call to create a symbol table instance with a
286 * randomized seed value.
287 */
288 public static ByteQuadsCanonicalizer createRoot() {
289 // Need to use a variable seed, to thwart hash-collision based attacks.
290 // 14-Feb-2017, tatu: Does this actually help?
291 long now = System.currentTimeMillis();
292 // ensure it's not 0; and might as well require to be odd so:
293 int seed = (((int) now) + ((int) (now >>> 32))) | 1;
294 return createRoot(seed);
295 }
296
297 // Factory method that should only be called from unit tests, where seed
298 // value should remain the same.
299 protected static ByteQuadsCanonicalizer createRoot(int seed) {
300 return new ByteQuadsCanonicalizer(DEFAULT_T_SIZE, true, seed, true);
301 }
302
303 /**
304 * Factory method used to create actual symbol table instance to
305 * use for parsing.
306 */
307 public ByteQuadsCanonicalizer makeChild(int flags) {
308 return new ByteQuadsCanonicalizer(this,
309 JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(flags),
310 _seed,
311 JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(flags),
312 _tableInfo.get());
313 }
314
315 /**
316 * Method called by the using code to indicate it is done with this instance.
317 * This lets instance merge accumulated changes into parent (if need be),
318 * safely and efficiently, and without calling code having to know about parent
319 * information.
320 */
321 public void release()
322 {
323 // we will try to merge if child table has new entries
324 // 28-Jul-2019, tatu: From [core#548]: do not share if immediate rehash needed
325 if ((_parent != null) && maybeDirty()) {
326 _parent.mergeChild(new TableInfo(this));
327 // Let's also mark this instance as dirty, so that just in
328 // case release was too early, there's no corruption of possibly shared data.
329 _hashShared = true;
330 }
331 }
332
333 private void mergeChild(TableInfo childState)
334 {
335 final int childCount = childState.count;
336 TableInfo currState = _tableInfo.get();
337
338 // Should usually grow; but occasionally could also shrink if (but only if)
339 // collision list overflow ends up clearing some collision lists.
340 if (childCount == currState.count) {
341 return;
342 }
343
344 // One caveat: let's try to avoid problems with degenerate cases of documents with
345 // generated "random" names: for these, symbol tables would bloat indefinitely.
346 // One way to do this is to just purge tables if they grow
347 // too large, and that's what we'll do here.
348 if (childCount > MAX_ENTRIES_FOR_REUSE) {
349 // At any rate, need to clean up the tables
350 childState = TableInfo.createInitial(DEFAULT_T_SIZE);
351 }
352 _tableInfo.compareAndSet(currState, childState);
353 }
354
355 /*
356 /**********************************************************
357 /* API, accessors
358 /**********************************************************
359 */
360
361 public int size()
362 {
363 if (_tableInfo != null) { // root table
364 return _tableInfo.get().count;
365 }
366 // nope, child table
367 return _count;
368 }
369
370 /**
371 * Returns number of primary slots table has currently
372 */
373 public int bucketCount() { return _hashSize; }
374
375 /**
376 * Method called to check to quickly see if a child symbol table
377 * may have gotten additional entries. Used for checking to see
378 * if a child table should be merged into shared table.
379 */
380 public boolean maybeDirty() { return !_hashShared; }
381
382 public int hashSeed() { return _seed; }
383
384 /**
385 * Method mostly needed by unit tests; calculates number of
386 * entries that are in the primary slot set. These are
387 * "perfect" entries, accessible with a single lookup
388 */
389 public int primaryCount()
390 {
391 int count = 0;
392 for (int offset = 3, end = _secondaryStart; offset < end; offset += 4) {
393 if (_hashArea[offset] != 0) {
394 ++count;
395 }
396 }
397 return count;
398 }
399
400 /**
401 * Method mostly needed by unit tests; calculates number of entries
402 * in secondary buckets
403 */
404 public int secondaryCount() {
405 int count = 0;
406 int offset = _secondaryStart + 3;
407 for (int end = _tertiaryStart; offset < end; offset += 4) {
408 if (_hashArea[offset] != 0) {
409 ++count;
410 }
411 }
412 return count;
413 }
414
415 /**
416 * Method mostly needed by unit tests; calculates number of entries
417 * in tertiary buckets
418 */
419 public int tertiaryCount() {
420 int count = 0;
421 int offset = _tertiaryStart + 3; // to 1.5x, starting point of tertiary
422 for (int end = offset + _hashSize; offset < end; offset += 4) {
423 if (_hashArea[offset] != 0) {
424 ++count;
425 }
426 }
427 return count;
428 }
429
430 /**
431 * Method mostly needed by unit tests; calculates number of entries
432 * in shared spillover area
433 */
434 public int spilloverCount() {
435 // difference between spillover end, start, divided by 4 (four ints per slot)
436 return (_spilloverEnd - _spilloverStart()) >> 2;
437 }
438
439 public int totalCount()
440 {
441 int count = 0;
442 for (int offset = 3, end = (_hashSize << 3); offset < end; offset += 4) {
443 if (_hashArea[offset] != 0) {
444 ++count;
445 }
446 }
447 return count;
448 }
449
450 @Override
451 public String toString() {
452 int pri = primaryCount();
453 int sec = secondaryCount();
454 int tert = tertiaryCount();
455 int spill = spilloverCount();
456 int total = totalCount();
457 return String.format("[%s: size=%d, hashSize=%d, %d/%d/%d/%d pri/sec/ter/spill (=%s), total:%d]",
458 getClass().getName(), _count, _hashSize,
459 pri, sec, tert, spill, (pri+sec+tert+spill), total);
460 }
461
462 /*
463 /**********************************************************
464 /* Public API, accessing symbols
465 /**********************************************************
466 */
467
468 public String findName(int q1)
469 {
470 int offset = _calcOffset(calcHash(q1));
471 // first: primary match?
472 final int[] hashArea = _hashArea;
473
474 int len = hashArea[offset+3];
475
476 if (len == 1) {
477 if (hashArea[offset] == q1) {
478 return _names[offset >> 2];
479 }
480 } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
481 return null;
482 }
483 // secondary? single slot shared by N/2 primaries
484 int offset2 = _secondaryStart + ((offset >> 3) << 2);
485
486 len = hashArea[offset2+3];
487
488 if (len == 1) {
489 if (hashArea[offset2] == q1) {
490 return _names[offset2 >> 2];
491 }
492 } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
493 return null;
494 }
495
496 // tertiary lookup & spillovers best to offline
497 return _findSecondary(offset, q1);
498 }
499
500 public String findName(int q1, int q2)
501 {
502 int offset = _calcOffset(calcHash(q1, q2));
503
504 final int[] hashArea = _hashArea;
505
506 int len = hashArea[offset+3];
507
508 if (len == 2) {
509 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1])) {
510 return _names[offset >> 2];
511 }
512 } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
513 return null;
514 }
515 // secondary?
516 int offset2 = _secondaryStart + ((offset >> 3) << 2);
517
518 len = hashArea[offset2+3];
519
520 if (len == 2) {
521 if ((q1 == hashArea[offset2]) && (q2 == hashArea[offset2+1])) {
522 return _names[offset2 >> 2];
523 }
524 } else if (len == 0) { // empty slot? Short-circuit if no more spillovers
525 return null;
526 }
527 return _findSecondary(offset, q1, q2);
528 }
529
530 public String findName(int q1, int q2, int q3)
531 {
532 int offset = _calcOffset(calcHash(q1, q2, q3));
533 final int[] hashArea = _hashArea;
534 int len = hashArea[offset+3];
535
536 if (len == 3) {
537 if ((q1 == hashArea[offset]) && (hashArea[offset+1] == q2) && (hashArea[offset+2] == q3)) {
538 return _names[offset >> 2];
539 }
540 } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
541 return null;
542 }
543 // secondary?
544 int offset2 = _secondaryStart + ((offset >> 3) << 2);
545
546 len = hashArea[offset2+3];
547
548 if (len == 3) {
549 if ((q1 == hashArea[offset2]) && (hashArea[offset2+1] == q2) && (hashArea[offset2+2] == q3)) {
550 return _names[offset2 >> 2];
551 }
552 } else if (len == 0) { // empty slot? Short-circuit if no more spillovers
553 return null;
554 }
555 return _findSecondary(offset, q1, q2, q3);
556 }
557
558 public String findName(int[] q, int qlen)
559 {
560 /* This version differs significantly, because longer names do not fit within cell.
561 * Rather, they contain hash in main slot, and offset+length to extension area
562 * that contains actual quads.
563 */
564 if (qlen < 4) { // another sanity check
565 switch (qlen) {
566 case 3:
567 return findName(q[0], q[1], q[2]);
568 case 2:
569 return findName(q[0], q[1]);
570 case 1:
571 return findName(q[0]);
572 default: // if 0 ever passed
573 return "";
574 }
575 }
576 final int hash = calcHash(q, qlen);
577 int offset = _calcOffset(hash);
578
579 final int[] hashArea = _hashArea;
580
581 final int len = hashArea[offset+3];
582
583 if ((hash == hashArea[offset]) && (len == qlen)) {
584 // probable but not guaranteed: verify
585 if (_verifyLongName(q, qlen, hashArea[offset+1])) {
586 return _names[offset >> 2];
587 }
588 }
589 if (len == 0) { // empty slot; unlikely but avoid further lookups if so
590 return null;
591 }
592 // secondary?
593 int offset2 = _secondaryStart + ((offset >> 3) << 2);
594
595 final int len2 = hashArea[offset2+3];
596 if ((hash == hashArea[offset2]) && (len2 == qlen)) {
597 if (_verifyLongName(q, qlen, hashArea[offset2+1])) {
598 return _names[offset2 >> 2];
599 }
600 }
601 return _findSecondary(offset, hash, q, qlen);
602 }
603
604 private final int _calcOffset(int hash)
605 {
606 // NOTE: simple for initial impl, but we may want to interleave it a bit
607 // in near future
608 // So: first, hash into primary hash index
609 int ix = hash & (_hashSize-1);
610 // keeping in mind we have 4 ints per entry
611 return (ix << 2);
612 }
613
614 /*
615 /**********************************************************
616 /* Access from spill-over areas
617 /**********************************************************
618 */
619
620 private String _findSecondary(int origOffset, int q1)
621 {
622 // tertiary area division is dynamic. First; its size is N/4 compared to
623 // primary hash size; and offsets are for 4 int slots. So to get to logical
624 // index would shift by 4. But! Tertiary area is further split into buckets,
625 // determined by shift value. And finally, from bucket back into physical offsets
626 int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
627 final int[] hashArea = _hashArea;
628 final int bucketSize = (1 << _tertiaryShift);
629 for (int end = offset + bucketSize; offset < end; offset += 4) {
630 int len = hashArea[offset+3];
631 if ((q1 == hashArea[offset]) && (1 == len)) {
632 return _names[offset >> 2];
633 }
634 if (len == 0) {
635 return null;
636 }
637 }
638 // but if tertiary full, check out spill-over area as last resort
639 // shared spillover starts at 7/8 of the main hash area
640 // (which is sized at 2 * _hashSize), so:
641 for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
642 if ((q1 == hashArea[offset]) && (1 == hashArea[offset+3])) {
643 return _names[offset >> 2];
644 }
645 }
646 return null;
647 }
648
649 private String _findSecondary(int origOffset, int q1, int q2)
650 {
651 int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
652 final int[] hashArea = _hashArea;
653
654 final int bucketSize = (1 << _tertiaryShift);
655 for (int end = offset + bucketSize; offset < end; offset += 4) {
656 int len = hashArea[offset+3];
657 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (2 == len)) {
658 return _names[offset >> 2];
659 }
660 if (len == 0) {
661 return null;
662 }
663 }
664 for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
665 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (2 == hashArea[offset+3])) {
666 return _names[offset >> 2];
667 }
668 }
669 return null;
670 }
671
672 private String _findSecondary(int origOffset, int q1, int q2, int q3)
673 {
674 int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
675 final int[] hashArea = _hashArea;
676
677 final int bucketSize = (1 << _tertiaryShift);
678 for (int end = offset + bucketSize; offset < end; offset += 4) {
679 int len = hashArea[offset+3];
680 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (q3 == hashArea[offset+2]) && (3 == len)) {
681 return _names[offset >> 2];
682 }
683 if (len == 0) {
684 return null;
685 }
686 }
687 for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
688 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (q3 == hashArea[offset+2])
689 && (3 == hashArea[offset+3])) {
690 return _names[offset >> 2];
691 }
692 }
693 return null;
694 }
695
696 private String _findSecondary(int origOffset, int hash, int[] q, int qlen)
697 {
698 int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
699 final int[] hashArea = _hashArea;
700
701 final int bucketSize = (1 << _tertiaryShift);
702 for (int end = offset + bucketSize; offset < end; offset += 4) {
703 int len = hashArea[offset+3];
704 if ((hash == hashArea[offset]) && (qlen == len)) {
705 if (_verifyLongName(q, qlen, hashArea[offset+1])) {
706 return _names[offset >> 2];
707 }
708 }
709 if (len == 0) {
710 return null;
711 }
712 }
713 for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
714 if ((hash == hashArea[offset]) && (qlen == hashArea[offset+3])) {
715 if (_verifyLongName(q, qlen, hashArea[offset+1])) {
716 return _names[offset >> 2];
717 }
718 }
719 }
720 return null;
721 }
722
723 private boolean _verifyLongName(int[] q, int qlen, int spillOffset)
724 {
725 final int[] hashArea = _hashArea;
726 // spillOffset assumed to be physical index right into quad string
727 int ix = 0;
728
729 switch (qlen) {
730 default:
731 return _verifyLongName2(q, qlen, spillOffset);
732 case 8:
733 if (q[ix++] != hashArea[spillOffset++]) return false;
734 case 7:
735 if (q[ix++] != hashArea[spillOffset++]) return false;
736 case 6:
737 if (q[ix++] != hashArea[spillOffset++]) return false;
738 case 5:
739 if (q[ix++] != hashArea[spillOffset++]) return false;
740 case 4: // always at least 4
741 if (q[ix++] != hashArea[spillOffset++]) return false;
742 if (q[ix++] != hashArea[spillOffset++]) return false;
743 if (q[ix++] != hashArea[spillOffset++]) return false;
744 if (q[ix++] != hashArea[spillOffset++]) return false;
745 }
746 return true;
747 }
748
749 private boolean _verifyLongName2(int[] q, int qlen, int spillOffset)
750 {
751 int ix = 0;
752 do {
753 if (q[ix++] != _hashArea[spillOffset++]) {
754 return false;
755 }
756 } while (ix < qlen);
757 return true;
758 }
759
760 /*
761 /**********************************************************
762 /* API, mutators
763 /**********************************************************
764 */
765
766 public String addName(String name, int q1) {
767 _verifySharing();
768 if (_intern) {
769 name = InternCache.instance.intern(name);
770 }
771 int offset = _findOffsetForAdd(calcHash(q1));
772 _hashArea[offset] = q1;
773 _hashArea[offset+3] = 1;
774 _names[offset >> 2] = name;
775 ++_count;
776 return name;
777 }
778
779 public String addName(String name, int q1, int q2) {
780 _verifySharing();
781 if (_intern) {
782 name = InternCache.instance.intern(name);
783 }
784 int hash = (q2 == 0) ? calcHash(q1) : calcHash(q1, q2);
785 int offset = _findOffsetForAdd(hash);
786 _hashArea[offset] = q1;
787 _hashArea[offset+1] = q2;
788 _hashArea[offset+3] = 2;
789 _names[offset >> 2] = name;
790 ++_count;
791 return name;
792 }
793
794 public String addName(String name, int q1, int q2, int q3) {
795 _verifySharing();
796 if (_intern) {
797 name = InternCache.instance.intern(name);
798 }
799 int offset = _findOffsetForAdd(calcHash(q1, q2, q3));
800 _hashArea[offset] = q1;
801 _hashArea[offset+1] = q2;
802 _hashArea[offset+2] = q3;
803 _hashArea[offset+3] = 3;
804 _names[offset >> 2] = name;
805 ++_count;
806 return name;
807 }
808
809 public String addName(String name, int[] q, int qlen)
810 {
811 _verifySharing();
812 if (_intern) {
813 name = InternCache.instance.intern(name);
814 }
815 int offset;
816
817 switch (qlen) {
818 case 1:
819 {
820 offset = _findOffsetForAdd(calcHash(q[0]));
821 _hashArea[offset] = q[0];
822 _hashArea[offset+3] = 1;
823 }
824 break;
825 case 2:
826 {
827 offset = _findOffsetForAdd(calcHash(q[0], q[1]));
828 _hashArea[offset] = q[0];
829 _hashArea[offset+1] = q[1];
830 _hashArea[offset+3] = 2;
831 }
832 break;
833 case 3:
834 {
835 offset = _findOffsetForAdd(calcHash(q[0], q[1], q[2]));
836 _hashArea[offset] = q[0];
837 _hashArea[offset+1] = q[1];
838 _hashArea[offset+2] = q[2];
839 _hashArea[offset+3] = 3;
840 }
841 break;
842 default:
843 final int hash = calcHash(q, qlen);
844 offset = _findOffsetForAdd(hash);
845
846 _hashArea[offset] = hash;
847 int longStart = _appendLongName(q, qlen);
848 _hashArea[offset+1] = longStart;
849 _hashArea[offset+3] = qlen;
850 }
851 // plus add the actual String
852 _names[offset >> 2] = name;
853
854 // and finally; see if we really should rehash.
855 ++_count;
856 return name;
857 }
858
859 private void _verifySharing()
860 {
861 if (_hashShared) {
862 _hashArea = Arrays.copyOf(_hashArea, _hashArea.length);
863 _names = Arrays.copyOf(_names, _names.length);
864 _hashShared = false;
865 }
866 }
867
868 /**
869 * Method called to find the location within hash table to add a new symbol in.
870 */
871 private int _findOffsetForAdd(int hash)
872 {
873 // first, check the primary: if slot found, no need for resize
874 int offset = _calcOffset(hash);
875 final int[] hashArea = _hashArea;
876 if (hashArea[offset+3] == 0) {
877 //System.err.printf(" PRImary slot #%d, hash %X\n", (offset>>2), hash & 0x7F);
878 return offset;
879 }
880
881 // Otherwise let's see if we are due resize():
882 if (_checkNeedForRehash()) {
883 return _resizeAndFindOffsetForAdd(hash);
884 }
885
886 // If not, proceed with secondary slot
887 int offset2 = _secondaryStart + ((offset >> 3) << 2);
888 if (hashArea[offset2+3] == 0) {
889 //System.err.printf(" SECondary slot #%d (start x%X), hash %X\n",(offset >> 3), _secondaryStart, (hash & 0x7F));
890 return offset2;
891 }
892 // if not, tertiary?
893
894 offset2 = _tertiaryStart + ((offset >> (_tertiaryShift + 2)) << _tertiaryShift);
895 final int bucketSize = (1 << _tertiaryShift);
896 for (int end = offset2 + bucketSize; offset2 < end; offset2 += 4) {
897 if (hashArea[offset2+3] == 0) {
898 //System.err.printf(" TERtiary slot x%X (from x%X, start x%X), hash %X.\n", offset2, ((offset >> (_tertiaryShift + 2)) << _tertiaryShift), _tertiaryStart, (hash & 0x7F));
899 return offset2;
900 }
901 }
902
903 // and if even tertiary full, append at the end of spill area
904 offset = _spilloverEnd;
905 _spilloverEnd += 4;
906
907 //System.err.printf(" SPIll-over at x%X; start x%X; end x%X, hash %X\n", offset, _spilloverStart(), _hashArea.length, (hash & 0x7F));
908
909 // one caveat: in the unlikely event if spill-over filling up,
910 // check if that could be considered a DoS attack; handle appropriately
911 // (NOTE: approximate for now; we could verify details if that becomes necessary)
912 /* 31-Jul-2015, tatu: Note that spillover area does NOT end at end of array,
913 * since "long names" area follows. Instead, need to calculate from hash size.
914 */
915 final int end = (_hashSize << 3);
916 if (_spilloverEnd >= end) {
917 if (_failOnDoS) {
918 _reportTooManyCollisions();
919 }
920 return _resizeAndFindOffsetForAdd(hash);
921 }
922 return offset;
923 }
924
925 // @since 2.10
926 private int _resizeAndFindOffsetForAdd(int hash)
927 {
928 // First things first: we need to resize+rehash (or, if too big, nuke contents)
929 rehash();
930
931 // Copy of main _findOffsetForAdd except for checks to resize: can not be needed
932 int offset = _calcOffset(hash);
933 final int[] hashArea = _hashArea;
934 if (hashArea[offset+3] == 0) {
935 return offset;
936 }
937 int offset2 = _secondaryStart + ((offset >> 3) << 2);
938 if (hashArea[offset2+3] == 0) {
939 return offset2;
940 }
941 offset2 = _tertiaryStart + ((offset >> (_tertiaryShift + 2)) << _tertiaryShift);
942 final int bucketSize = (1 << _tertiaryShift);
943 for (int end = offset2 + bucketSize; offset2 < end; offset2 += 4) {
944 if (hashArea[offset2+3] == 0) {
945 return offset2;
946 }
947 }
948 offset = _spilloverEnd;
949 _spilloverEnd += 4;
950 return offset;
951 }
952
953 // Helper method for checking if we should simply rehash() before add
954 private boolean _checkNeedForRehash() {
955 // Yes if above 80%, or above 50% AND have ~1% spill-overs
956 if (_count > (_hashSize >> 1)) { // over 50%
957 int spillCount = (_spilloverEnd - _spilloverStart()) >> 2;
958 if ((spillCount > (1 + _count >> 7))
959 || (_count > (_hashSize * 0.80))) {
960 return true;
961 }
962 }
963 return false;
964 }
965
966 private int _appendLongName(int[] quads, int qlen)
967 {
968 int start = _longNameOffset;
969
970 // note: at this point we must already be shared. But may not have enough space
971 if ((start + qlen) > _hashArea.length) {
972 // try to increment in reasonable chunks; at least space that we need
973 int toAdd = (start + qlen) - _hashArea.length;
974 // but at least 1/8 of regular hash area size or 16kB (whichever smaller)
975 int minAdd = Math.min(4096, _hashSize);
976
977 int newSize = _hashArea.length + Math.max(toAdd, minAdd);
978 _hashArea = Arrays.copyOf(_hashArea, newSize);
979 }
980 System.arraycopy(quads, 0, _hashArea, start, qlen);
981 _longNameOffset += qlen;
982 return start;
983 }
984
985 /*
986 /**********************************************************
987 /* Hash calculation
988 /**********************************************************
989 */
990
    /* Note on hash calculation: we try to make it more difficult to
     * generate collisions automatically; part of this is to avoid
     * simple "multiply-add" algorithm (like JDK String.hashCode()),
     * and add bit of shifting. And other part is to make this
     * non-linear, at least for shorter symbols.
     */

    // Primary multiplier: JDK's String.hashCode() uses 31; other fine choices
    // are 33 and 65599. 33 seems to give fewest collisions for our key sets
    // (see [http://www.cse.yorku.ca/~oz/hash.html] for details)
    private final static int MULT = 33;
    // Secondary multiplier, used for the final mix of multi-quad names
    private final static int MULT2 = 65599;
    // Tertiary multiplier, used by the three-quad variant
    private final static int MULT3 = 31;
1004
1005 public int calcHash(int q1)
1006 {
1007 int hash = q1 ^ _seed;
1008 /* 29-Mar-2015, tatu: Earlier used 15 + 9 right shifts, which worked ok
1009 * except for one specific problem case: numbers. So needed to make sure
1010 * that all 4 least-significant bits participate in hash. Couple of ways
1011 * to work it out, but this is the simplest, fast and seems to do ok.
1012 */
1013 hash += (hash >>> 16); // to xor hi- and low- 16-bits
1014 hash ^= (hash << 3); // shuffle back a bit
1015 hash += (hash >>> 12); // and bit more
1016 return hash;
1017 }
1018
1019 public int calcHash(int q1, int q2)
1020 {
1021 // For two quads, let's change algorithm a bit, to spice
1022 // things up (can do bit more processing anyway)
1023 int hash = q1;
1024
1025 hash += (hash >>> 15); // try mixing first and second byte pairs first
1026 hash ^= (hash >>> 9); // as well as lowest 2 bytes
1027 hash += (q2 * MULT); // then add second quad
1028 hash ^= _seed;
1029 hash += (hash >>> 16); // and shuffle some more
1030 hash ^= (hash >>> 4);
1031 hash += (hash << 3);
1032
1033 return hash;
1034 }
1035
1036 public int calcHash(int q1, int q2, int q3)
1037 { // use same algorithm as multi-byte, tested to work well
1038 int hash = q1 ^ _seed;
1039 hash += (hash >>> 9);
1040 hash *= MULT3;
1041 hash += q2;
1042 hash *= MULT;
1043 hash += (hash >>> 15);
1044 hash ^= q3;
1045 // 26-Mar-2015, tatu: As per two-quad case, a short shift seems to help more here
1046 hash += (hash >>> 4);
1047
1048 hash += (hash >>> 15);
1049 hash ^= (hash << 9);
1050
1051 return hash;
1052 }
1053
1054 public int calcHash(int[] q, int qlen)
1055 {
1056 if (qlen < 4) {
1057 throw new IllegalArgumentException();
1058 }
1059 /* And then change handling again for "multi-quad" case; mostly
1060 * to make calculation of collisions less fun. For example,
1061 * add seed bit later in the game, and switch plus/xor around,
1062 * use different shift lengths.
1063 */
1064 int hash = q[0] ^ _seed;
1065 hash += (hash >>> 9);
1066 hash += q[1];
1067 hash += (hash >>> 15);
1068 hash *= MULT;
1069 hash ^= q[2];
1070 hash += (hash >>> 4);
1071
1072 for (int i = 3; i < qlen; ++i) {
1073 int next = q[i];
1074 next = next ^ (next >> 21);
1075 hash += next;
1076 }
1077 hash *= MULT2;
1078
1079 // and finally shuffle some more once done
1080 hash += (hash >>> 19);
1081 hash ^= (hash << 5);
1082 return hash;
1083 }
1084
1085 /*
1086 /**********************************************************
1087 /* Rehashing
1088 /**********************************************************
1089 */
1090
    /**
     * Method called when the hash area has become too full: doubles the
     * primary hash area (unless already at {@code MAX_T_SIZE}, in which case
     * contents are simply dropped) and re-adds every existing entry so each
     * lands in its best-possible slot in the enlarged table.
     */
    private void rehash()
    {
        // Note: since we'll make copies, no need to unshare, can just mark as such:
        _hashShared = false;

        // And then we can first deal with the main hash area. Since we are expanding
        // linearly (double up), we know there'll be no collisions during this phase.
        final int[] oldHashArea = _hashArea;
        final String[] oldNames = _names;
        final int oldSize = _hashSize;
        final int oldCount = _count;
        final int newSize = oldSize + oldSize;
        final int oldEnd = _spilloverEnd;

        /* 13-Mar-2010, tatu: Let's guard against OOME that could be caused by
         * large documents with unique (or mostly so) names
         */
        if (newSize > MAX_T_SIZE) {
            // give up on growing: drop all symbols, keep (cleared) arrays
            nukeSymbols(true);
            return;
        }
        // double up main hash area, but do not expand long-name area:
        _hashArea = new int[oldHashArea.length + (oldSize<<3)];
        _hashSize = newSize;
        _secondaryStart = (newSize << 2); // 4 ints per entry
        _tertiaryStart = _secondaryStart + (_secondaryStart >> 1); // right after secondary
        _tertiaryShift = _calcTertiaryShift(newSize);

        // and simply double up name array
        _names = new String[oldNames.length << 1];
        // resets count, spill-over and long-name offsets for the new layout
        nukeSymbols(false);

        // Plus we can scan only through the primary hash area, looking for non-empty
        // slots, without worrying about ordering. This should never reduce priority
        // of existing entries: primaries remain primaries; however, due to increased
        // space, secondaries may become primaries etc

        int copyCount = 0;
        int[] q = new int[16]; // scratch buffer for quads; grown on demand below
        for (int offset = 0, end = oldEnd; offset < end; offset += 4) {
            // 4th int of a slot holds the quad length; 0 marks an empty slot
            int len = oldHashArea[offset+3];
            if (len == 0) { // empty slot, skip
                continue;
            }
            ++copyCount;
            // one name String per slot; slot index is (int offset / 4)
            String name = oldNames[offset>>2];
            switch (len) {
            case 1: // 1 - 3 quads are stored inline in the slot itself
                q[0] = oldHashArea[offset];
                addName(name, q, 1);
                break;
            case 2:
                q[0] = oldHashArea[offset];
                q[1] = oldHashArea[offset+1];
                addName(name, q, 2);
                break;
            case 3:
                q[0] = oldHashArea[offset];
                q[1] = oldHashArea[offset+1];
                q[2] = oldHashArea[offset+2];
                addName(name, q, 3);
                break;
            default: // longer names live in the long-name area
                if (len > q.length) {
                    q = new int[len];
                }
                // #0 is hash, #1 offset (into old long-name area)
                int qoff = oldHashArea[offset+1];
                System.arraycopy(oldHashArea, qoff, q, 0, len);
                addName(name, q, len);
                break;
            }
        }

        // Sanity checks: since corruption difficult to detect, assert explicitly
        // with production code
        if (copyCount != oldCount) {
            throw new IllegalStateException("Failed rehash(): old count="+oldCount+", copyCount="+copyCount);
        }
    }
1171
1172 /**
1173 * Helper method called to empty all shared symbols, but to leave
1174 * arrays allocated
1175 */
1176 private void nukeSymbols(boolean fill) {
1177 _count = 0;
1178 // reset spill-over to empty (starting at 7/8 of hash area)
1179 _spilloverEnd = _spilloverStart();
1180 // and long name area to empty, starting immediately after hash area
1181 _longNameOffset = _hashSize << 3;
1182 if (fill) {
1183 Arrays.fill(_hashArea, 0);
1184 Arrays.fill(_names, null);
1185 }
1186 }
1187
1188 /*
1189 /**********************************************************
1190 /* Helper methods
1191 /**********************************************************
1192 */
1193
1194 /**
1195 * Helper method that calculates start of the spillover area
1196 */
1197 private final int _spilloverStart() {
1198 // we'll need slot at 1.75x of hashSize, but with 4-ints per slot.
1199 // So basically multiply by 7
1200 int offset = _hashSize;
1201 return (offset << 3) - offset;
1202 }
1203
1204 protected void _reportTooManyCollisions()
1205 {
1206 // First: do not fuzz about small symbol tables; may get balanced by doubling up
1207 if (_hashSize <= 1024) { // would have spill-over area of 128 entries
1208 return;
1209 }
1210 throw new IllegalStateException("Spill-over slots in symbol table with "+_count
1211 +" entries, hash area of "+_hashSize+" slots is now full (all "
1212 +(_hashSize >> 3)+" slots -- suspect a DoS attack based on hash collisions."
1213 +" You can disable the check via `JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW`");
1214 }
1215
1216 static int _calcTertiaryShift(int primarySlots)
1217 {
1218 // first: we only get 1/4 of slots of primary, to divide
1219 int tertSlots = (primarySlots) >> 2;
1220
1221 // default is for buckets of 4 slots (each 4 ints, i.e. 1 << 4)
1222 if (tertSlots < 64) {
1223 return 4;
1224 }
1225 if (tertSlots <= 256) { // buckets of 8 slots (up to 256 == 32 x 8)
1226 return 5;
1227 }
1228 if (tertSlots <= 1024) { // buckets of 16 slots (up to 1024 == 64 x 16)
1229 return 6;
1230 }
1231 // and biggest buckets have 32 slots
1232 return 7;
1233 }
1234
1235 /*
1236 /**********************************************************
1237 /* Helper classes
1238 /**********************************************************
1239 */
1240
1241 /**
1242 * Immutable value class used for sharing information as efficiently
1243 * as possible, by only require synchronization of reference manipulation
1244 * but not access to contents.
1245 *
1246 * @since 2.1
1247 */
1248 private final static class TableInfo
1249 {
1250 public final int size;
1251 public final int count;
1252 public final int tertiaryShift;
1253 public final int[] mainHash;
1254 public final String[] names;
1255 public final int spilloverEnd;
1256 public final int longNameOffset;
1257
1258 public TableInfo(int size, int count, int tertiaryShift,
1259 int[] mainHash, String[] names, int spilloverEnd, int longNameOffset)
1260 {
1261 this.size = size;
1262 this.count = count;
1263 this.tertiaryShift = tertiaryShift;
1264 this.mainHash = mainHash;
1265 this.names = names;
1266 this.spilloverEnd = spilloverEnd;
1267 this.longNameOffset = longNameOffset;
1268 }
1269
1270 public TableInfo(ByteQuadsCanonicalizer src)
1271 {
1272 size = src._hashSize;
1273 count = src._count;
1274 tertiaryShift = src._tertiaryShift;
1275 mainHash = src._hashArea;
1276 names = src._names;
1277 spilloverEnd = src._spilloverEnd;
1278 longNameOffset = src._longNameOffset;
1279 }
1280
1281 public static TableInfo createInitial(int sz) {
1282 int hashAreaSize = sz << 3;
1283 int tertShift = _calcTertiaryShift(sz);
1284
1285 return new TableInfo(sz, // hashSize
1286 0, // count
1287 tertShift,
1288 new int[hashAreaSize], // mainHash, 2x slots, 4 ints per slot
1289 new String[sz << 1], // names == 2x slots
1290 hashAreaSize - sz, // at 7/8 of the total area
1291 hashAreaSize // longNameOffset, immediately after main hashes
1292 );
1293 }
1294 }
1295 }
1296