1 package com.fasterxml.jackson.core.sym;
2
3 import java.util.Arrays;
4 import java.util.BitSet;
5 import java.util.concurrent.atomic.AtomicReference;
6
7 import com.fasterxml.jackson.core.JsonFactory;
8 import com.fasterxml.jackson.core.util.InternCache;
9
10 /**
11 * This class is a kind of specialized type-safe Map, from char array to
12 * String value. Specialization means that in addition to type-safety
13 * and specific access patterns (key char array, Value optionally interned
14 * String; values added on access if necessary), and that instances are
15 * meant to be used concurrently, but by using well-defined mechanisms
16 * to obtain such concurrently usable instances. Main use for the class
17 * is to store symbol table information for things like compilers and
18 * parsers; especially when number of symbols (keywords) is limited.
19 *<p>
20 * For optimal performance, usage pattern should be one where matches
21 * should be very common (especially after "warm-up"), and as with most hash-based
22 * maps/sets, that hash codes are uniformly distributed. Also, collisions
23 * are slightly more expensive than with HashMap or HashSet, since hash codes
24 * are not used in resolving collisions; that is, equals() comparison is
25 * done with all symbols in same bucket index.<br>
26 * Finally, rehashing is also more expensive, as hash codes are not
27 * stored; rehashing requires all entries' hash codes to be recalculated.
28 * Reason for not storing hash codes is reduced memory usage, hoping
29 * for better memory locality.
30 *<p>
31 * Usual usage pattern is to create a single "master" instance, and either
32 * use that instance in sequential fashion, or to create derived "child"
33 * instances, which after use, are asked to return possible symbol additions
34 * to master instance. In either case benefit is that symbol table gets
35 * initialized so that further uses are more efficient, as eventually all
36 * symbols needed will already be in symbol table. At that point no more
37 * Symbol String allocations are needed, nor changes to symbol table itself.
38 *<p>
39 * Note that while individual SymbolTable instances are NOT thread-safe
40 * (much like generic collection classes), concurrently used "child"
41 * instances can be freely used without synchronization. However, using
42 * master table concurrently with child instances can only be done if
43 * access to master instance is read-only (i.e. no modifications done).
44 */
45 public final class CharsToNameCanonicalizer
46 {
47 /* If we use "multiply-add" based hash algorithm, this is the multiplier
48 * we use.
49 *<p>
50 * Note that JDK uses 31; but it seems that 33 produces fewer collisions,
51 * at least with tests we have.
52 */
53 public final static int HASH_MULT = 33;
54
55 /**
56 * Default initial table size. Shouldn't be miniscule (as there's
57 * cost to both array realloc and rehashing), but let's keep
58 * it reasonably small. For systems that properly
59 * reuse factories it doesn't matter either way; but when
60 * recreating factories often, initial overhead may dominate.
61 */
62 private static final int DEFAULT_T_SIZE = 64;
63
64 /**
65 * Let's not expand symbol tables past some maximum size;
66 * this should protected against OOMEs caused by large documents
67 * with unique (~= random) names.
68 */
69 private static final int MAX_T_SIZE = 0x10000; // 64k entries == 256k mem
70
71 /**
72 * Let's only share reasonably sized symbol tables. Max size set to 3/4 of 16k;
73 * this corresponds to 64k main hash index. This should allow for enough distinct
74 * names for almost any case.
75 */
76 static final int MAX_ENTRIES_FOR_REUSE = 12000;
77
78 /**
79 * Also: to thwart attacks based on hash collisions (which may or may not
80 * be cheap to calculate), we will need to detect "too long"
81 * collision chains. Let's start with static value of 100 entries
82 * for the longest legal chain.
83 *<p>
84 * Note: longest chain we have been able to produce without malicious
85 * intent has been 38 (with "com.fasterxml.jackson.core.main.TestWithTonsaSymbols");
86 * our setting should be reasonable here.
87 *
88 * @since 2.1
89 */
90 static final int MAX_COLL_CHAIN_LENGTH = 100;
91
92 /*
93 /**********************************************************
94 /* Configuration
95 /**********************************************************
96 */
97
98 /**
99 * Sharing of learnt symbols is done by optional linking of symbol
100 * table instances with their parents. When parent linkage is
101 * defined, and child instance is released (call to <code>release</code>),
102 * parent's shared tables may be updated from the child instance.
103 */
104 final private CharsToNameCanonicalizer _parent;
105
106 /**
107 * Member that is only used by the root table instance: root
108 * passes immutable state info child instances, and children
109 * may return new state if they add entries to the table.
110 * Child tables do NOT use the reference.
111 */
112 final private AtomicReference<TableInfo> _tableInfo;
113
114 /**
115 * Seed value we use as the base to make hash codes non-static between
116 * different runs, but still stable for lifetime of a single symbol table
117 * instance.
118 * This is done for security reasons, to avoid potential DoS attack via
119 * hash collisions.
120 *
121 * @since 2.1
122 */
123 final private int _seed;
124
125 final private int _flags;
126
127 /**
128 * Whether any canonicalization should be attempted (whether using
129 * intern or not.
130 *<p>
131 * NOTE: non-final since we may need to disable this with overflow.
132 */
133 private boolean _canonicalize;
134
135 /*
136 /**********************************************************
137 /* Actual symbol table data
138 /**********************************************************
139 */
140
141 /**
142 * Primary matching symbols; it's expected most match occur from
143 * here.
144 */
145 private String[] _symbols;
146
147 /**
148 * Overflow buckets; if primary doesn't match, lookup is done
149 * from here.
150 *<p>
151 * Note: Number of buckets is half of number of symbol entries, on
152 * assumption there's less need for buckets.
153 */
154 private Bucket[] _buckets;
155
156 /**
157 * Current size (number of entries); needed to know if and when
158 * rehash.
159 */
160 private int _size;
161
162 /**
163 * Limit that indicates maximum size this instance can hold before
164 * it needs to be expanded and rehashed. Calculated using fill
165 * factor passed in to constructor.
166 */
167 private int _sizeThreshold;
168
169 /**
170 * Mask used to get index from hash values; equal to
171 * <code>_buckets.length - 1</code>, when _buckets.length is
172 * a power of two.
173 */
174 private int _indexMask;
175
176 /**
177 * We need to keep track of the longest collision list; this is needed
178 * both to indicate problems with attacks and to allow flushing for
179 * other cases.
180 *
181 * @since 2.1
182 */
183 private int _longestCollisionList;
184
185 /*
186 /**********************************************************
187 /* State regarding shared arrays
188 /**********************************************************
189 */
190
191 /**
192 * Flag that indicates whether underlying data structures for
193 * the main hash area are shared or not. If they are, then they
194 * need to be handled in copy-on-write way, i.e. if they need
195 * to be modified, a copy needs to be made first; at this point
196 * it will not be shared any more, and can be modified.
197 *<p>
198 * This flag needs to be checked both when adding new main entries,
199 * and when adding new collision list queues (i.e. creating a new
200 * collision list head entry)
201 */
202 private boolean _hashShared;
203
204 /*
205 /**********************************************************
206 /* Bit of DoS detection goodness
207 /**********************************************************
208 */
209
210 /**
211 * Lazily constructed structure that is used to keep track of
212 * collision buckets that have overflowed once: this is used
213 * to detect likely attempts at denial-of-service attacks that
214 * uses hash collisions.
215 *
216 * @since 2.4
217 */
218 private BitSet _overflows;
219
220 /*
221 /**********************************************************
222 /* Life-cycle: constructors
223 /**********************************************************
224 */
225
226 /**
227 * Main method for constructing a root symbol table instance.
228 */
229 private CharsToNameCanonicalizer(int seed)
230 {
231 _parent = null;
232 _seed = seed;
233
234 // these settings don't really matter for the bootstrap instance
235 _canonicalize = true;
236 _flags = -1;
237 // And we'll also set flags so no copying of buckets is needed:
238 _hashShared = false; // doesn't really matter for root instance
239 _longestCollisionList = 0;
240
241 _tableInfo = new AtomicReference<TableInfo>(TableInfo.createInitial(DEFAULT_T_SIZE));
242 // and actually do NOT assign buffers so we'll find if anyone tried to
243 // use root instance
244 }
245
246 /**
247 * Internal constructor used when creating child instances.
248 */
249 private CharsToNameCanonicalizer(CharsToNameCanonicalizer parent, int flags, int seed,
250 TableInfo parentState)
251 {
252 _parent = parent;
253 _seed = seed;
254 _tableInfo = null; // not used by child tables
255 _flags = flags;
256 _canonicalize = JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(flags);
257
258 // Then copy shared state
259 _symbols = parentState.symbols;
260 _buckets = parentState.buckets;
261
262 _size = parentState.size;
263 _longestCollisionList = parentState.longestCollisionList;
264
265 // Hard-coded fill factor, 75%
266 int arrayLen = (_symbols.length);
267 _sizeThreshold = _thresholdSize(arrayLen);
268 _indexMask = (arrayLen - 1);
269
270 // Need to make copies of arrays, if/when adding new entries
271 _hashShared = true;
272 }
273
274 private static int _thresholdSize(int hashAreaSize) { return hashAreaSize - (hashAreaSize >> 2); }
275
276 /*
277 /**********************************************************
278 /* Life-cycle: factory methods, merging
279 /**********************************************************
280 */
281
282 /**
283 * Method called to create root canonicalizer for a {@link com.fasterxml.jackson.core.JsonFactory}
284 * instance. Root instance is never used directly; its main use is for
285 * storing and sharing underlying symbol arrays as needed.
286 */
287 public static CharsToNameCanonicalizer createRoot() {
288 // Need to use a variable seed, to thwart hash-collision based attacks.
289 // 14-Feb-2017, tatu: not sure it actually helps, at all, since it won't
290 // change mixing or any of the steps. Should likely just remove in future.
291 long now = System.currentTimeMillis();
292 // ensure it's not 0; and might as well require to be odd so:
293 int seed = (((int) now) + ((int) (now >>> 32))) | 1;
294 return createRoot(seed);
295 }
296
297 protected static CharsToNameCanonicalizer createRoot(int seed) {
298 return new CharsToNameCanonicalizer(seed);
299 }
300 /**
301 * "Factory" method; will create a new child instance of this symbol
302 * table. It will be a copy-on-write instance, ie. it will only use
303 * read-only copy of parent's data, but when changes are needed, a
304 * copy will be created.
305 *<p>
306 * Note: while this method is synchronized, it is generally not
307 * safe to both use makeChild/mergeChild, AND to use instance
308 * actively. Instead, a separate 'root' instance should be used
309 * on which only makeChild/mergeChild are called, but instance itself
310 * is not used as a symbol table.
311 */
312 public CharsToNameCanonicalizer makeChild(int flags) {
313 return new CharsToNameCanonicalizer(this, flags, _seed, _tableInfo.get());
314 }
315
316 /**
317 * Method called by the using code to indicate it is done with this instance.
318 * This lets instance merge accumulated changes into parent (if need be),
319 * safely and efficiently, and without calling code having to know about parent
320 * information.
321 */
322 public void release() {
323 // If nothing has been added, nothing to do
324 if (!maybeDirty()) { return; }
325
326 // we will try to merge if child table has new entries
327 if (_parent != null && _canonicalize) { // canonicalize set to false if max size was reached
328 _parent.mergeChild(new TableInfo(this));
329 // Let's also mark this instance as dirty, so that just in
330 // case release was too early, there's no corruption of possibly shared data.
331 _hashShared = true;
332 }
333 }
334
335 /**
336 * Method that allows contents of child table to potentially be
337 * "merged in" with contents of this symbol table.
338 *<p>
339 * Note that caller has to make sure symbol table passed in is
340 * really a child or sibling of this symbol table.
341 */
342 private void mergeChild(TableInfo childState)
343 {
344 final int childCount = childState.size;
345 TableInfo currState = _tableInfo.get();
346
347 // Should usually grow; but occasionally could also shrink if (but only if)
348 // collision list overflow ends up clearing some collision lists.
349 if (childCount == currState.size) {
350 return;
351 }
352 // One caveat: let's try to avoid problems with degenerate cases of documents with
353 // generated "random" names: for these, symbol tables would bloat indefinitely.
354 // One way to do this is to just purge tables if they grow
355 // too large, and that's what we'll do here.
356 if (childCount > MAX_ENTRIES_FOR_REUSE) {
357 // At any rate, need to clean up the tables
358 childState = TableInfo.createInitial(DEFAULT_T_SIZE);
359 }
360 _tableInfo.compareAndSet(currState, childState);
361 }
362
363 /*
364 /**********************************************************
365 /* Public API, generic accessors:
366 /**********************************************************
367 */
368
369 public int size() {
370 if (_tableInfo != null) { // root table
371 return _tableInfo.get().size;
372 }
373 // nope, child table
374 return _size;
375 }
376
377 /**
378 * Method for checking number of primary hash buckets this symbol
379 * table uses.
380 *
381 * @since 2.1
382 */
383 public int bucketCount() { return _symbols.length; }
384 public boolean maybeDirty() { return !_hashShared; }
385 public int hashSeed() { return _seed; }
386
387 /**
388 * Method mostly needed by unit tests; calculates number of
389 * entries that are in collision list. Value can be at most
390 * ({@link #size} - 1), but should usually be much lower, ideally 0.
391 *
392 * @since 2.1
393 */
394 public int collisionCount() {
395 int count = 0;
396
397 for (Bucket bucket : _buckets) {
398 if (bucket != null) {
399 count += bucket.length;
400 }
401 }
402 return count;
403 }
404
405 /**
406 * Method mostly needed by unit tests; calculates length of the
407 * longest collision chain. This should typically be a low number,
408 * but may be up to {@link #size} - 1 in the pathological case
409 *
410 * @since 2.1
411 */
412 public int maxCollisionLength() { return _longestCollisionList; }
413
414 /*
415 /**********************************************************
416 /* Public API, accessing symbols:
417 /**********************************************************
418 */
419
420 public String findSymbol(char[] buffer, int start, int len, int h)
421 {
422 if (len < 1) { // empty Strings are simplest to handle up front
423 return "";
424 }
425 if (!_canonicalize) { // [JACKSON-259]
426 return new String(buffer, start, len);
427 }
428
429 /* Related to problems with sub-standard hashing (somewhat
430 * relevant for collision attacks too), let's try little
431 * bit of shuffling to improve hash codes.
432 * (note, however, that this can't help with full collisions)
433 */
434 int index = _hashToIndex(h);
435 String sym = _symbols[index];
436
437 // Optimal case; checking existing primary symbol for hash index:
438 if (sym != null) {
439 // Let's inline primary String equality checking:
440 if (sym.length() == len) {
441 int i = 0;
442 while (sym.charAt(i) == buffer[start+i]) {
443 // Optimal case; primary match found
444 if (++i == len) {
445 return sym;
446 }
447 }
448 }
449 Bucket b = _buckets[index>>1];
450 if (b != null) {
451 sym = b.has(buffer, start, len);
452 if (sym != null) {
453 return sym;
454 }
455 sym = _findSymbol2(buffer, start, len, b.next);
456 if (sym != null) {
457 return sym;
458 }
459 }
460 }
461 return _addSymbol(buffer, start, len, h, index);
462 }
463
464 private String _findSymbol2(char[] buffer, int start, int len, Bucket b) {
465 while (b != null) {
466 String sym = b.has(buffer, start, len);
467 if (sym != null) {
468 return sym;
469 }
470 b = b.next;
471 }
472 return null;
473 }
474
475 private String _addSymbol(char[] buffer, int start, int len, int h, int index)
476 {
477 if (_hashShared) { //need to do copy-on-write?
478 copyArrays();
479 _hashShared = false;
480 } else if (_size >= _sizeThreshold) { // Need to expand?
481 rehash();
482 // Need to recalc hash; rare occurrence (index mask has been
483 // recalculated as part of rehash)
484 index = _hashToIndex(calcHash(buffer, start, len));
485 }
486
487 String newSymbol = new String(buffer, start, len);
488 if (JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(_flags)) {
489 newSymbol = InternCache.instance.intern(newSymbol);
490 }
491 ++_size;
492 // Ok; do we need to add primary entry, or a bucket?
493 if (_symbols[index] == null) {
494 _symbols[index] = newSymbol;
495 } else {
496 final int bix = (index >> 1);
497 Bucket newB = new Bucket(newSymbol, _buckets[bix]);
498 int collLen = newB.length;
499 if (collLen > MAX_COLL_CHAIN_LENGTH) {
500 // 23-May-2014, tatu: Instead of throwing an exception right away,
501 // let's handle in bit smarter way.
502 _handleSpillOverflow(bix, newB, index);
503 } else {
504 _buckets[bix] = newB;
505 _longestCollisionList = Math.max(collLen, _longestCollisionList);
506 }
507 }
508 return newSymbol;
509 }
510
511 /**
512 * Method called when an overflow bucket has hit the maximum expected length:
513 * this may be a case of DoS attack. Deal with it based on settings by either
514 * clearing up bucket (to avoid indefinite expansion) or throwing exception.
515 * Currently the first overflow for any single bucket DOES NOT throw an exception,
516 * only second time (per symbol table instance)
517 */
518 private void _handleSpillOverflow(int bucketIndex, Bucket newBucket, int mainIndex)
519 {
520 if (_overflows == null) {
521 _overflows = new BitSet();
522 _overflows.set(bucketIndex);
523 } else {
524 if (_overflows.get(bucketIndex)) {
525 // Has happened once already for this bucket index, so probably not coincidental...
526 if (JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(_flags)) {
527 reportTooManyCollisions(MAX_COLL_CHAIN_LENGTH);
528 }
529 // but even if we don't fail, we will stop canonicalizing as safety measure
530 // (so as not to cause problems with PermGen)
531 _canonicalize = false;
532 } else {
533 _overflows.set(bucketIndex);
534 }
535 }
536
537 // regardless, if we get this far, clear up the bucket, adjust size appropriately.
538 _symbols[mainIndex] = newBucket.symbol;
539 _buckets[bucketIndex] = null;
540 // newBucket contains new symbol; but we will
541 _size -= (newBucket.length);
542 // we could calculate longest; but for now just mark as invalid
543 _longestCollisionList = -1;
544 }
545
546 /**
547 * Helper method that takes in a "raw" hash value, shuffles it as necessary,
548 * and truncates to be used as the index.
549 */
550 public int _hashToIndex(int rawHash) {
551 // doing these seems to help a bit
552 rawHash += (rawHash >>> 15);
553 rawHash ^= (rawHash << 7);
554 rawHash += (rawHash >>> 3);
555 return (rawHash & _indexMask);
556 }
557
558 /**
559 * Implementation of a hashing method for variable length
560 * Strings. Most of the time intention is that this calculation
561 * is done by caller during parsing, not here; however, sometimes
562 * it needs to be done for parsed "String" too.
563 *
564 * @param len Length of String; has to be at least 1 (caller guarantees
565 * this pre-condition)
566 */
567 public int calcHash(char[] buffer, int start, int len) {
568 int hash = _seed;
569 for (int i = start, end = start+len; i < end; ++i) {
570 hash = (hash * HASH_MULT) + (int) buffer[i];
571 }
572 // NOTE: shuffling, if any, is done in 'findSymbol()', not here:
573 return (hash == 0) ? 1 : hash;
574 }
575
576 public int calcHash(String key)
577 {
578 final int len = key.length();
579
580 int hash = _seed;
581 for (int i = 0; i < len; ++i) {
582 hash = (hash * HASH_MULT) + (int) key.charAt(i);
583 }
584 // NOTE: shuffling, if any, is done in 'findSymbol()', not here:
585 return (hash == 0) ? 1 : hash;
586 }
587
588 /*
589 /**********************************************************
590 /* Internal methods
591 /**********************************************************
592 */
593
594 /**
595 * Method called when copy-on-write is needed; generally when first
596 * change is made to a derived symbol table.
597 */
598 private void copyArrays() {
599 final String[] oldSyms = _symbols;
600 _symbols = Arrays.copyOf(oldSyms, oldSyms.length);
601 final Bucket[] oldBuckets = _buckets;
602 _buckets = Arrays.copyOf(oldBuckets, oldBuckets.length);
603 }
604
605 /**
606 * Method called when size (number of entries) of symbol table grows
607 * so big that load factor is exceeded. Since size has to remain
608 * power of two, arrays will then always be doubled. Main work
609 * is really redistributing old entries into new String/Bucket
610 * entries.
611 */
612 private void rehash() {
613 final int size = _symbols.length;
614 int newSize = size + size;
615
616 /* 12-Mar-2010, tatu: Let's actually limit maximum size we are
617 * prepared to use, to guard against OOME in case of unbounded
618 * name sets (unique [non-repeating] names)
619 */
620 if (newSize > MAX_T_SIZE) {
621 // If this happens, there's no point in either growing or shrinking hash areas.
622 // Rather, let's just cut our losses and stop canonicalizing.
623 _size = 0;
624 _canonicalize = false;
625 // in theory, could just leave these as null, but...
626 _symbols = new String[DEFAULT_T_SIZE];
627 _buckets = new Bucket[DEFAULT_T_SIZE>>1];
628 _indexMask = DEFAULT_T_SIZE-1;
629 _hashShared = false;
630 return;
631 }
632
633 final String[] oldSyms = _symbols;
634 final Bucket[] oldBuckets = _buckets;
635 _symbols = new String[newSize];
636 _buckets = new Bucket[newSize >> 1];
637 // Let's update index mask, threshold, now (needed for rehashing)
638 _indexMask = newSize - 1;
639 _sizeThreshold = _thresholdSize(newSize);
640
641 int count = 0; // let's do sanity check
642
643 // Need to do two loops, unfortunately, since spill-over area is
644 // only half the size:
645 int maxColl = 0;
646 for (int i = 0; i < size; ++i) {
647 String symbol = oldSyms[i];
648 if (symbol != null) {
649 ++count;
650 int index = _hashToIndex(calcHash(symbol));
651 if (_symbols[index] == null) {
652 _symbols[index] = symbol;
653 } else {
654 int bix = (index >> 1);
655 Bucket newB = new Bucket(symbol, _buckets[bix]);
656 _buckets[bix] = newB;
657 maxColl = Math.max(maxColl, newB.length);
658 }
659 }
660 }
661
662 final int bucketSize = (size >> 1);
663 for (int i = 0; i < bucketSize; ++i) {
664 Bucket b = oldBuckets[i];
665 while (b != null) {
666 ++count;
667 String symbol = b.symbol;
668 int index = _hashToIndex(calcHash(symbol));
669 if (_symbols[index] == null) {
670 _symbols[index] = symbol;
671 } else {
672 int bix = (index >> 1);
673 Bucket newB = new Bucket(symbol, _buckets[bix]);
674 _buckets[bix] = newB;
675 maxColl = Math.max(maxColl, newB.length);
676 }
677 b = b.next;
678 }
679 }
680 _longestCollisionList = maxColl;
681 _overflows = null;
682
683 if (count != _size) {
684 throw new IllegalStateException(String.format(
685 "Internal error on SymbolTable.rehash(): had %d entries; now have %d",
686 _size, count));
687 }
688 }
689
690 /**
691 * @since 2.1
692 */
693 protected void reportTooManyCollisions(int maxLen) {
694 throw new IllegalStateException("Longest collision chain in symbol table (of size "+_size
695 +") now exceeds maximum, "+maxLen+" -- suspect a DoS attack based on hash collisions");
696 }
697
698 // since 2.10, for tests only
699 /**
700 * Diagnostics method that will verify that internal data structures are consistent;
701 * not meant as user-facing method but only for test suites and possible troubleshooting.
702 *
703 * @since 2.10
704 */
705 protected void verifyInternalConsistency() {
706 int count = 0;
707 final int size = _symbols.length;
708
709 for (int i = 0; i < size; ++i) {
710 String symbol = _symbols[i];
711 if (symbol != null) {
712 ++count;
713 }
714 }
715
716 final int bucketSize = (size >> 1);
717 for (int i = 0; i < bucketSize; ++i) {
718 for (Bucket b = _buckets[i]; b != null; b = b.next) {
719 ++count;
720 }
721 }
722 if (count != _size) {
723 throw new IllegalStateException(String.format("Internal error: expected internal size %d vs calculated count %d",
724 _size, count));
725 }
726 }
727
728 // For debugging, comment out
729 /*
730 @Override
731 public String toString()
732 {
733 StringBuilder sb = new StringBuilder();
734 int primaryCount = 0;
735 for (String s : _symbols) {
736 if (s != null) ++primaryCount;
737 }
738
739 sb.append("[BytesToNameCanonicalizer, size: ");
740 sb.append(_size);
741 sb.append('/');
742 sb.append(_symbols.length);
743 sb.append(", ");
744 sb.append(primaryCount);
745 sb.append('/');
746 sb.append(_size - primaryCount);
747 sb.append(" coll; avg length: ");
748
749 // Average length: minimum of 1 for all (1 == primary hit);
750 // and then 1 per each traversal for collisions/buckets
751 //int maxDist = 1;
752 int pathCount = _size;
753 for (Bucket b : _buckets) {
754 if (b != null) {
755 int spillLen = b.length;
756 for (int j = 1; j <= spillLen; ++j) {
757 pathCount += j;
758 }
759 }
760 }
761 double avgLength;
762
763 if (_size == 0) {
764 avgLength = 0.0;
765 } else {
766 avgLength = (double) pathCount / (double) _size;
767 }
768 // let's round up a bit (two 2 decimal places)
769 //avgLength -= (avgLength % 0.01);
770
771 sb.append(avgLength);
772 sb.append(']');
773 return sb.toString();
774 }
775 */
776
777 /*
778 /**********************************************************
779 /* Helper classes
780 /**********************************************************
781 */
782
783 /**
784 * This class is a symbol table entry. Each entry acts as a node
785 * in a linked list.
786 */
787 static final class Bucket
788 {
789 public final String symbol;
790 public final Bucket next;
791 public final int length;
792
793 public Bucket(String s, Bucket n) {
794 symbol = s;
795 next = n;
796 length = (n == null) ? 1 : n.length+1;
797 }
798
799 public String has(char[] buf, int start, int len) {
800 if (symbol.length() != len) {
801 return null;
802 }
803 int i = 0;
804 do {
805 if (symbol.charAt(i) != buf[start+i]) {
806 return null;
807 }
808 } while (++i < len);
809 return symbol;
810 }
811 }
812
813 /**
814 * Immutable value class used for sharing information as efficiently
815 * as possible, by only require synchronization of reference manipulation
816 * but not access to contents.
817 *
818 * @since 2.8.7
819 */
820 private final static class TableInfo
821 {
822 final int size;
823 final int longestCollisionList;
824 final String[] symbols;
825 final Bucket[] buckets;
826
827 public TableInfo(int size, int longestCollisionList,
828 String[] symbols, Bucket[] buckets)
829 {
830 this.size = size;
831 this.longestCollisionList = longestCollisionList;
832 this.symbols = symbols;
833 this.buckets = buckets;
834 }
835
836 public TableInfo(CharsToNameCanonicalizer src)
837 {
838 this.size = src._size;
839 this.longestCollisionList = src._longestCollisionList;
840 this.symbols = src._symbols;
841 this.buckets = src._buckets;
842 }
843
844 public static TableInfo createInitial(int sz) {
845 return new TableInfo(0,
846 0, // longestCollisionList
847 new String[sz], new Bucket[sz >> 1]);
848 }
849 }
850 }
851