1 /* Woodstox XML processor
2 *
3 * Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi
4 *
5 * Licensed under the License specified in the file LICENSE which is
6 * included with the source code.
7 * You may not use this file except in compliance with the License.
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package com.ctc.wstx.util;
17
18 /**
19 * This class is a kind of specialized type-safe Map, from char array to
20 * String value. Specialization means that in addition to type-safety
21 * and specific access patterns (key char array, Value optionally interned
22 * String; values added on access if necessary), and that instances are
23 * meant to be used concurrently, but by using well-defined mechanisms
24 * to obtain such concurrently usable instances. Main use for the class
25 * is to store symbol table information for things like compilers and
26 * parsers; especially when number of symbols (keywords) is limited.
27 *<p>
28 * For optimal performance, usage pattern should be one where matches
29 * should be very common (esp. after "warm-up"), and as with most hash-based
30 * maps/sets, that hash codes are uniformly distributed. Also, collisions
31 * are slightly more expensive than with HashMap or HashSet, since hash codes
32 * are not used in resolving collisions; that is, equals() comparison is
33 * done with all symbols in same bucket index.<br>
34 * Finally, rehashing is also more expensive, as hash codes are not
35 * stored; rehashing requires all entries' hash codes to be recalculated.
36 * Reason for not storing hash codes is reduced memory usage, hoping
37 * for better memory locality.
38 *<p>
39 * Usual usage pattern is to create a single "master" instance, and either
40 * use that instance in sequential fashion, or to create derived "child"
41 * instances, which after use, are asked to return possible symbol additions
42 * to master instance. In either case benefit is that symbol table gets
43 * initialized so that further uses are more efficient, as eventually all
44 * symbols needed will already be in symbol table. At that point no more
45 * Symbol String allocations are needed, nor changes to symbol table itself.
46 *<p>
47 * Note that while individual SymbolTable instances are NOT thread-safe
48 * (much like generic collection classes), concurrently used "child"
49 * instances can be freely used without synchronization. However, using
50 * master table concurrently with child instances can only be done if
51 * access to master instance is read-only (ie. no modifications done).
52 */
53
54 public class SymbolTable {
55
56 /**
57 * Default initial table size; no need to make it miniscule, due
58 * to couple of things: first, overhead of array reallocation
59 * is significant,
60 * and second, overhead of rehashing is also non-negligible.
61 *<p>
62 * Let's use 128 as the default; it allows for up to 96 symbols,
63 * and uses about 512 bytes on 32-bit machines.
64 */
65 protected static final int DEFAULT_TABLE_SIZE = 128;
66
67 protected static final float DEFAULT_FILL_FACTOR = 0.75f;
68
69 protected static final String EMPTY_STRING = "";
70
71 /*
72 ////////////////////////////////////////
73 // Configuration:
74 ////////////////////////////////////////
75 */
76
77 /**
78 * Flag that determines whether Strings to be added need to be
79 * interned before being added or not. Forcing intern()ing will add
80 * some overhead when adding new Strings, but may be beneficial if such
81 * Strings are generally used by other parts of system. Note that even
82 * without interning, all returned String instances are guaranteed
83 * to be comparable with equality (==) operator; it's just that such
84 * guarantees are not made for Strings other classes return.
85 */
86 protected boolean mInternStrings;
87
88 /*
89 ////////////////////////////////////////
90 // Actual symbol table data:
91 ////////////////////////////////////////
92 */
93
94 /**
95 * Primary matching symbols; it's expected most match occur from
96 * here.
97 */
98 protected String[] mSymbols;
99
100 /**
101 * Overflow buckets; if primary doesn't match, lookup is done
102 * from here.
103 *<p>
104 * Note: Number of buckets is half of number of symbol entries, on
105 * assumption there's less need for buckets.
106 */
107 protected Bucket[] mBuckets;
108
109 /**
110 * Current size (number of entries); needed to know if and when
111 * rehash.
112 */
113 protected int mSize;
114
115 /**
116 * Limit that indicates maximum size this instance can hold before
117 * it needs to be expanded and rehashed. Calculated using fill
118 * factor passed in to constructor.
119 */
120 protected int mSizeThreshold;
121
122 /**
123 * Mask used to get index from hash values; equal to
124 * <code>mBuckets.length - 1</code>, when mBuckets.length is
125 * a power of two.
126 */
127 protected int mIndexMask;
128
129 /*
130 ////////////////////////////////////////
131 // Information about concurrency
132 ////////////////////////////////////////
133 */
134
135 /**
136 * Version of this table instance; used when deriving new concurrently
137 * used versions from existing 'master' instance.
138 */
139 protected int mThisVersion;
140
141 /**
142 * Flag that indicates if any changes have been made to the data;
143 * used to both determine if bucket array needs to be copied when
144 * (first) change is made, and potentially if updated bucket list
145 * is to be resync'ed back to master instance.
146 */
147 protected boolean mDirty;
148
149 /*
150 ////////////////////////////////////////
151 // Life-cycle:
152 ////////////////////////////////////////
153 */
154
155 /**
156 * Method for constructing a master symbol table instance; this one
157 * will create master instance with default size, and with interning
158 * enabled.
159 */
160 public SymbolTable() {
161 this(true);
162 }
163
164 /**
165 * Method for constructing a master symbol table instance.
166 */
167 public SymbolTable(boolean internStrings) {
168 this(internStrings, DEFAULT_TABLE_SIZE);
169 }
170
171 /**
172 * Method for constructing a master symbol table instance.
173 */
174 public SymbolTable(boolean internStrings, int initialSize) {
175 this(internStrings, initialSize, DEFAULT_FILL_FACTOR);
176 }
177
178 /**
179 * Main method for constructing a master symbol table instance; will
180 * be called by other public constructors.
181 *
182 * @param internStrings Whether Strings to add are intern()ed or not
183 * @param initialSize Minimum initial size for bucket array; internally
184 * will always use a power of two equal to or bigger than this value.
185 * @param fillFactor Maximum fill factor allowed for bucket table;
186 * when more entries are added, table will be expanded.
187 */
188 public SymbolTable(boolean internStrings, int initialSize,
189 float fillFactor)
190 {
191 mInternStrings = internStrings;
192 // Let's start versions from 1
193 mThisVersion = 1;
194 // And we'll also set flags so no copying of buckets is needed:
195 mDirty = true;
196
197 // No point in requesting funny initial sizes...
198 if (initialSize < 1) {
199 throw new IllegalArgumentException("Can not use negative/zero initial size: "+initialSize);
200 }
201 /* Initial size has to be a power of two. Also, let's not honour
202 * sizes that are ridiculously small...
203 */
204 {
205 int currSize = 4;
206 while (currSize < initialSize) {
207 currSize += currSize;
208 }
209 initialSize = currSize;
210 }
211
212 mSymbols = new String[initialSize];
213 mBuckets = new Bucket[initialSize >> 1];
214 // Mask is easy to calc for powers of two.
215 mIndexMask = initialSize - 1;
216 mSize = 0;
217
218 // Sanity check for fill factor:
219 if (fillFactor < 0.01f) {
220 throw new IllegalArgumentException("Fill factor can not be lower than 0.01.");
221 }
222 if (fillFactor > 10.0f) { // just to catch stupid values, ie. useless from performance perspective
223 throw new IllegalArgumentException("Fill factor can not be higher than 10.0.");
224 }
225 mSizeThreshold = (int) (initialSize * fillFactor + 0.5);
226 }
227
228 /**
229 * Internal constructor used when creating child instances.
230 */
231 private SymbolTable(boolean internStrings, String[] symbols,
232 Bucket[] buckets, int size, int sizeThreshold,
233 int indexMask, int version)
234 {
235 mInternStrings = internStrings;
236 mSymbols = symbols;
237 mBuckets = buckets;
238 mSize = size;
239 mSizeThreshold = sizeThreshold;
240 mIndexMask = indexMask;
241 mThisVersion = version;
242
243 // Need to make copies of arrays, if/when adding new entries
244 mDirty = false;
245 }
246
247 /**
248 * "Factory" method; will create a new child instance of this symbol
249 * table. It will be a copy-on-write instance, ie. it will only use
250 * read-only copy of parent's data, but when changes are needed, a
251 * copy will be created.
252 *<p>
253 * Note: while data access part of this method is synchronized, it is
254 * generally not safe to both use makeChild/mergeChild, AND to use instance
255 * actively. Instead, a separate 'root' instance should be used
256 * on which only makeChild/mergeChild are called, but instance itself
257 * is not used as a symbol table.
258 */
259 public SymbolTable makeChild()
260 {
261 final boolean internStrings;
262 final String[] symbols;
263 final Bucket[] buckets;
264 final int size;
265 final int sizeThreshold;
266 final int indexMask;
267 final int version;
268
269 synchronized (this) {
270 internStrings = mInternStrings;
271 symbols = mSymbols;
272 buckets = mBuckets;
273 size = mSize;
274 sizeThreshold = mSizeThreshold;
275 indexMask = mIndexMask;
276 version = mThisVersion+1;
277 }
278 return new SymbolTable(internStrings, symbols, buckets,
279 size, sizeThreshold, indexMask, version);
280 }
281
282 /**
283 * Method that allows contents of child table to potentially be
284 * "merged in" with contents of this symbol table.
285 *<p>
286 * Note that caller has to make sure symbol table passed in is
287 * really a child or sibling of this symbol table.
288 */
289 public synchronized void mergeChild(SymbolTable child)
290 {
291 // Let's do a basic sanity check first:
292 if (child.size() <= size()) { // nothing to add
293 return;
294 }
295
296 // Okie dokie, let's get the data in!
297 mSymbols = child.mSymbols;
298 mBuckets = child.mBuckets;
299 mSize = child.mSize;
300 mSizeThreshold = child.mSizeThreshold;
301 mIndexMask = child.mIndexMask;
302 mThisVersion++; // to prevent other children from overriding
303
304 // Dirty flag... well, let's just clear it, to force copying just
305 // in case. Shouldn't really matter, for master tables.
306 mDirty = false;
307
308 /* However, we have to mark child as dirty, so that it will not
309 * be modifying arrays we "took over" (since child may have
310 * returned an updated table before it stopped fully using
311 * the SymbolTable: for example, it may still use it for
312 * parsing PI targets in epilog)
313 */
314 child.mDirty = false;
315 }
316
317 /*
318 ////////////////////////////////////////////////////
319 // Public API, configuration
320 ////////////////////////////////////////////////////
321 */
322
323 public void setInternStrings(boolean state) {
324 mInternStrings = state;
325 }
326
327 /*
328 ////////////////////////////////////////////////////
329 // Public API, generic accessors:
330 ////////////////////////////////////////////////////
331 */
332
333 public int size() { return mSize; }
334
335 public int version() { return mThisVersion; }
336
337 public boolean isDirty() { return mDirty; }
338
339 public boolean isDirectChildOf(SymbolTable t)
340 {
341 /* Actually, this doesn't really prove it is a child (would have to
342 * use sequence number, or identityHash to really prove it), but
343 * it's good enough if relationship is known to exist.
344 */
345 /* (for real check, one would need to child/descendant stuff; or
346 * at least an identity hash... or maybe even just a _static_ global
347 * counter for instances... maybe that would actually be worth
348 * doing?)
349 */
350 if (mThisVersion == (t.mThisVersion + 1)) {
351 return true;
352 }
353 return false;
354 }
355
356 /*
357 ////////////////////////////////////////////////////
358 // Public API, accessing symbols:
359 ////////////////////////////////////////////////////
360 */
361
362 /**
363 * Main access method; will check if actual symbol String exists;
364 * if so, returns it; if not, will create, add and return it.
365 *
366 * @return The symbol matching String in input array
367 */
368 /*
369 public String findSymbol(char[] buffer, int start, int len)
370 {
371 return findSymbol(buffer, start, len, calcHash(buffer, start, len));
372 }
373 */
374
375 public String findSymbol(char[] buffer, int start, int len, int hash)
376 {
377 // Sanity check:
378 if (len < 1) {
379 return EMPTY_STRING;
380 }
381
382 hash &= mIndexMask;
383
384 String sym = mSymbols[hash];
385
386 // Optimal case; checking existing primary symbol for hash index:
387 if (sym != null) {
388 // Let's inline primary String equality checking:
389 if (sym.length() == len) {
390 int i = 0;
391 do {
392 if (sym.charAt(i) != buffer[start+i]) {
393 break;
394 }
395 } while (++i < len);
396 // Optimal case; primary match found
397 if (i == len) {
398 return sym;
399 }
400 }
401 // How about collision bucket?
402 Bucket b = mBuckets[hash >> 1];
403 if (b != null) {
404 sym = b.find(buffer, start, len);
405 if (sym != null) {
406 return sym;
407 }
408 }
409 }
410
411 // Need to expand?
412 if (mSize >= mSizeThreshold) {
413 rehash();
414 /* Need to recalc hash; rare occurence (index mask has been
415 * recalculated as part of rehash)
416 */
417 hash = calcHash(buffer, start, len) & mIndexMask;
418 } else if (!mDirty) {
419 // Or perhaps we need to do copy-on-write?
420 copyArrays();
421 mDirty = true;
422 }
423 ++mSize;
424
425 String newSymbol = new String(buffer, start, len);
426 if (mInternStrings) {
427 newSymbol = newSymbol.intern();
428 }
429 // Ok; do we need to add primary entry, or a bucket?
430 if (mSymbols[hash] == null) {
431 mSymbols[hash] = newSymbol;
432 } else {
433 int bix = hash >> 1;
434 mBuckets[bix] = new Bucket(newSymbol, mBuckets[bix]);
435 }
436
437 return newSymbol;
438 }
439
440 /**
441 * Similar to {link #findSymbol}, but will not add passed in symbol
442 * if it is not in symbol table yet.
443 */
444 public String findSymbolIfExists(char[] buffer, int start, int len, int hash)
445 {
446 // Sanity check:
447 if (len < 1) {
448 return EMPTY_STRING;
449 }
450 hash &= mIndexMask;
451
452 String sym = mSymbols[hash];
453 // Optimal case; checking existing primary symbol for hash index:
454 if (sym != null) {
455 // Let's inline primary String equality checking:
456 if (sym.length() == len) {
457 int i = 0;
458 do {
459 if (sym.charAt(i) != buffer[start+i]) {
460 break;
461 }
462 } while (++i < len);
463 // Optimal case; primary match found
464 if (i == len) {
465 return sym;
466 }
467 }
468 // How about collision bucket?
469 Bucket b = mBuckets[hash >> 1];
470 if (b != null) {
471 sym = b.find(buffer, start, len);
472 if (sym != null) {
473 return sym;
474 }
475 }
476 }
477 return null;
478 }
479
480 /**
481 * Similar to to {@link #findSymbol(char[],int,int,int)}; used to either
482 * do potentially cheap intern() (if table already has intern()ed version),
483 * or to pre-populate symbol table with known values.
484 */
485 public String findSymbol(String str)
486 {
487 int len = str.length();
488 // Sanity check:
489 if (len < 1) {
490 return EMPTY_STRING;
491 }
492
493 int index = calcHash(str) & mIndexMask;
494 String sym = mSymbols[index];
495
496 // Optimal case; checking existing primary symbol for hash index:
497 if (sym != null) {
498 // Let's inline primary String equality checking:
499 if (sym.length() == len) {
500 int i = 0;
501 for (; i < len; ++i) {
502 if (sym.charAt(i) != str.charAt(i)) {
503 break;
504 }
505 }
506 // Optimal case; primary match found
507 if (i == len) {
508 return sym;
509 }
510 }
511 // How about collision bucket?
512 Bucket b = mBuckets[index >> 1];
513 if (b != null) {
514 sym = b.find(str);
515 if (sym != null) {
516 return sym;
517 }
518 }
519 }
520
521 // Need to expand?
522 if (mSize >= mSizeThreshold) {
523 rehash();
524 /* Need to recalc hash; rare occurence (index mask has been
525 * recalculated as part of rehash)
526 */
527 index = calcHash(str) & mIndexMask;
528 } else if (!mDirty) {
529 // Or perhaps we need to do copy-on-write?
530 copyArrays();
531 mDirty = true;
532 }
533 ++mSize;
534
535 if (mInternStrings) {
536 str = str.intern();
537 }
538 // Ok; do we need to add primary entry, or a bucket?
539 if (mSymbols[index] == null) {
540 mSymbols[index] = str;
541 } else {
542 int bix = index >> 1;
543 mBuckets[bix] = new Bucket(str, mBuckets[bix]);
544 }
545
546 return str;
547 }
548
549 /**
550 * Implementation of a hashing method for variable length
551 * Strings. Most of the time intention is that this calculation
552 * is done by caller during parsing, not here; however, sometimes
553 * it needs to be done for parsed "String" too.
554 *
555 * @param len Length of String; has to be at least 1 (caller guarantees
556 * this pre-condition)
557 */
558 @SuppressWarnings("cast")
559 public static int calcHash(char[] buffer, int start, int len) {
560 int hash = (int) buffer[start];
561 for (int i = 1; i < len; ++i) {
562 hash = (hash * 31) + (int) buffer[start+i];
563 }
564 return hash;
565 }
566
567 @SuppressWarnings("cast")
568 public static int calcHash(String key) {
569 int hash = (int) key.charAt(0);
570 for (int i = 1, len = key.length(); i < len; ++i) {
571 hash = (hash * 31) + (int) key.charAt(i);
572
573 }
574 return hash;
575 }
576
577 /*
578 //////////////////////////////////////////////////////////
579 // Internal methods
580 //////////////////////////////////////////////////////////
581 */
582
583 /**
584 * Method called when copy-on-write is needed; generally when first
585 * change is made to a derived symbol table.
586 */
587 private void copyArrays() {
588 String[] oldSyms = mSymbols;
589 int size = oldSyms.length;
590 mSymbols = new String[size];
591 System.arraycopy(oldSyms, 0, mSymbols, 0, size);
592 Bucket[] oldBuckets = mBuckets;
593 size = oldBuckets.length;
594 mBuckets = new Bucket[size];
595 System.arraycopy(oldBuckets, 0, mBuckets, 0, size);
596 }
597
598 /**
599 * Method called when size (number of entries) of symbol table grows
600 * so big that load factor is exceeded. Since size has to remain
601 * power of two, arrays will then always be doubled. Main work
602 * is really redistributing old entries into new String/Bucket
603 * entries.
604 */
605 private void rehash()
606 {
607 int size = mSymbols.length;
608 int newSize = size + size;
609 String[] oldSyms = mSymbols;
610 Bucket[] oldBuckets = mBuckets;
611 mSymbols = new String[newSize];
612 mBuckets = new Bucket[newSize >> 1];
613 // Let's update index mask, threshold, now (needed for rehashing)
614 mIndexMask = newSize - 1;
615 mSizeThreshold += mSizeThreshold;
616
617 int count = 0; // let's do sanity check
618
619 /* Need to do two loops, unfortunately, since spillover area is
620 * only half the size:
621 */
622 for (int i = 0; i < size; ++i) {
623 String symbol = oldSyms[i];
624 if (symbol != null) {
625 ++count;
626 int index = calcHash(symbol) & mIndexMask;
627 if (mSymbols[index] == null) {
628 mSymbols[index] = symbol;
629 } else {
630 int bix = index >> 1;
631 mBuckets[bix] = new Bucket(symbol, mBuckets[bix]);
632 }
633 }
634 }
635
636 size >>= 1;
637 for (int i = 0; i < size; ++i) {
638 Bucket b = oldBuckets[i];
639 while (b != null) {
640 ++count;
641 String symbol = b.getSymbol();
642 int index = calcHash(symbol) & mIndexMask;
643 if (mSymbols[index] == null) {
644 mSymbols[index] = symbol;
645 } else {
646 int bix = index >> 1;
647 mBuckets[bix] = new Bucket(symbol, mBuckets[bix]);
648 }
649 b = b.getNext();
650 }
651 }
652
653 if (count != mSize) {
654 throw new IllegalStateException("Internal error on SymbolTable.rehash(): had "+mSize+" entries; now have "+count+".");
655 }
656 }
657
658 /*
659 //////////////////////////////////////////////////////////
660 // Test/debug support:
661 //////////////////////////////////////////////////////////
662 */
663
664 public double calcAvgSeek() {
665 int count = 0;
666
667 for (int i = 0, len = mSymbols.length; i < len; ++i) {
668 if (mSymbols[i] != null) {
669 ++count;
670 }
671 }
672
673 for (int i = 0, len = mBuckets.length; i < len; ++i) {
674 Bucket b = mBuckets[i];
675 int cost = 2;
676 while (b != null) {
677 count += cost;
678 ++cost;
679 b = b.getNext();
680 }
681 }
682
683 return ((double) count) / ((double) mSize);
684 }
685
686 /*
687 //////////////////////////////////////////////////////////
688 // Bucket class
689 //////////////////////////////////////////////////////////
690 */
691
692 /**
693 * This class is a symbol table entry. Each entry acts as a node
694 * in a linked list.
695 */
696 static final class Bucket {
697 private final String mSymbol;
698 private final Bucket mNext;
699
700 public Bucket(String symbol, Bucket next) {
701 mSymbol = symbol;
702 mNext = next;
703 }
704
705 public String getSymbol() { return mSymbol; }
706 public Bucket getNext() { return mNext; }
707
708 public String find(char[] buf, int start, int len) {
709 String sym = mSymbol;
710 Bucket b = mNext;
711
712 while (true) { // Inlined equality comparison:
713 if (sym.length() == len) {
714 int i = 0;
715 do {
716 if (sym.charAt(i) != buf[start+i]) {
717 break;
718 }
719 } while (++i < len);
720 if (i == len) {
721 return sym;
722 }
723 }
724 if (b == null) {
725 break;
726 }
727 sym = b.getSymbol();
728 b = b.getNext();
729 }
730 return null;
731 }
732
733 public String find(String str) {
734 String sym = mSymbol;
735 Bucket b = mNext;
736
737 while (true) {
738 if (sym.equals(str)) {
739 return sym;
740 }
741 if (b == null) {
742 break;
743 }
744 sym = b.getSymbol();
745 b = b.getNext();
746 }
747 return null;
748 }
749 }
750 }
751