1 package com.fasterxml.jackson.core.sym;
2
3 import java.util.Arrays;
4 import java.util.concurrent.atomic.AtomicReference;
5
6 import com.fasterxml.jackson.core.JsonFactory;
7 import com.fasterxml.jackson.core.util.InternCache;
8
9 /**
10 * Replacement for <code>BytesToNameCanonicalizer</code> which aims at more localized
11 * memory access due to flattening of name quad data.
12 * Performance improvement modest for simple JSON document data binding (maybe 3%),
13 * but should help more for larger symbol tables, or for binary formats like Smile.
14 *<p>
15 * Hash area is divided into 4 sections:
16 *<ol>
17 * <li>Primary area (1/2 of total size), direct match from hash (LSB)</li>
18 * <li>Secondary area (1/4 of total size), match from {@code hash (LSB) >> 1}</li>
19 * <li>Tertiary area (1/8 of total size), match from {@code hash (LSB) >> 2}</li>
20 * <li>Spill-over area (remaining 1/8) with linear scan, insertion order</li>
 * </ol>
23 * and within every area, entries are 4 {@code int}s, where 1 - 3 ints contain 1 - 12
24 * UTF-8 encoded bytes of name (null-padded), and last int is offset in
25 * {@code _names} that contains actual name Strings.
26 *
27 * @since 2.6
28 */
29 public final class ByteQuadsCanonicalizer
30 {
31 /**
32 * Initial size of the primary hash area. Each entry consumes 4 ints (16 bytes),
33 * and secondary area is same as primary; so default size will use 2kB of memory
34 * (plus 64x4 or 64x8 (256/512 bytes) for references to Strings, and Strings
35 * themselves).
36 */
37 private static final int DEFAULT_T_SIZE = 64;
38 // private static final int DEFAULT_T_SIZE = 256;
39
40 /**
 * Let's not expand symbol tables past some maximum size;
 * this guards against pathological cases with unique (~= random) names.
 * Size is in primary hash table entries.
44 */
45 private static final int MAX_T_SIZE = 0x10000; // 64k entries == 2M mem hash area
46
47 /**
48 * No point in trying to construct tiny tables, just need to resize soon.
49 */
50 private final static int MIN_HASH_SIZE = 16;
51
52 /**
53 * Let's only share reasonably sized symbol tables. Max size set to 3/4 of 8k;
54 * this corresponds to 256k main hash index. This should allow for enough distinct
55 * names for almost any case, while preventing ballooning for cases where names
56 * are unique (or close thereof).
57 */
58 final static int MAX_ENTRIES_FOR_REUSE = 6000;
59
60 /*
61 /**********************************************************
62 /* Linkage, needed for merging symbol tables
63 /**********************************************************
64 */
65
66 /**
67 * Reference to the root symbol table, for child tables, so
68 * that they can merge table information back as necessary.
69 */
70 final private ByteQuadsCanonicalizer _parent;
71
72 /**
73 * Member that is only used by the root table instance: root
74 * passes immutable state info child instances, and children
75 * may return new state if they add entries to the table.
76 * Child tables do NOT use the reference.
77 */
78 final private AtomicReference<TableInfo> _tableInfo;
79
80 /**
81 * Seed value we use as the base to make hash codes non-static between
82 * different runs, but still stable for lifetime of a single symbol table
83 * instance.
84 * This is done for security reasons, to avoid potential DoS attack via
85 * hash collisions.
86 */
87 final private int _seed;
88
89 /*
90 /**********************************************************
91 /* Configuration
92 /**********************************************************
93 */
94
95 /**
96 * Whether canonical symbol Strings are to be intern()ed before added
97 * to the table or not.
98 *<p>
99 * NOTE: non-final to allow disabling intern()ing in case of excessive
100 * collisions.
101 */
102 private boolean _intern;
103
104 /**
105 * Flag that indicates whether we should throw an exception if enough
106 * hash collisions are detected (true); or just worked around (false).
107 *
108 * @since 2.4
109 */
110 private final boolean _failOnDoS;
111
112 /*
113 /**********************************************************
114 /* First, main hash area info
115 /**********************************************************
116 */
117
118 /**
119 * Primary hash information area: consists of <code>2 * _hashSize</code>
120 * entries of 16 bytes (4 ints), arranged in a cascading lookup
121 * structure (details of which may be tweaked depending on expected rates
122 * of collisions).
123 */
124 private int[] _hashArea;
125
126 /**
127 * Number of slots for primary entries within {@link #_hashArea}; which is
128 * at most <code>1/8</code> of actual size of the underlying array (4-int slots,
129 * primary covers only half of the area; plus, additional area for longer
130 * symbols after hash area).
131 */
132 private int _hashSize;
133
134 /**
135 * Offset within {@link #_hashArea} where secondary entries start
136 */
137 private int _secondaryStart;
138
139 /**
140 * Offset within {@link #_hashArea} where tertiary entries start
141 */
142 private int _tertiaryStart;
143
144 /**
145 * Constant that determines size of buckets for tertiary entries:
146 * <code>1 << _tertiaryShift</code> is the size, and shift value
147 * is also used for translating from primary offset into
148 * tertiary bucket (shift right by <code>4 + _tertiaryShift</code>).
149 *<p>
150 * Default value is 2, for buckets of 4 slots; grows bigger with
151 * bigger table sizes.
152 */
153 private int _tertiaryShift;
154
155 /**
156 * Total number of Strings in the symbol table; only used for child tables.
157 */
158 private int _count;
159
160 /**
161 * Array that contains <code>String</code> instances matching
162 * entries in {@link #_hashArea}.
163 * Contains nulls for unused entries. Note that this size is twice
164 * that of {@link #_hashArea}
165 */
166 private String[] _names;
167
168 /*
169 /**********************************************************
170 /* Then information on collisions etc
171 /**********************************************************
172 */
173
174 /**
175 * Pointer to the offset within spill-over area where there is room
176 * for more spilled over entries (if any).
177 * Spill over area is within fixed-size portion of {@link #_hashArea}.
178 */
179 private int _spilloverEnd;
180
181 /**
182 * Offset within {@link #_hashArea} that follows main slots and contains
183 * quads for longer names (13 bytes or longer), and points to the
184 * first available int that may be used for appending quads of the next
185 * long name.
186 * Note that long name area follows immediately after the fixed-size
187 * main hash area ({@link #_hashArea}).
188 */
189 private int _longNameOffset;
190
191 /*
192 /**********************************************************
193 /* Sharing, versioning
194 /**********************************************************
195 */
196
197 // // // Which of the buffers may be shared (and are copy-on-write)?
198
199 /**
200 * Flag that indicates whether underlying data structures for
201 * the main hash area are shared or not. If they are, then they
202 * need to be handled in copy-on-write way, i.e. if they need
203 * to be modified, a copy needs to be made first; at this point
204 * it will not be shared any more, and can be modified.
205 *<p>
206 * This flag needs to be checked both when adding new main entries,
207 * and when adding new collision list queues (i.e. creating a new
208 * collision list head entry)
209 */
210 private boolean _hashShared;
211
212 /*
213 /**********************************************************
214 /* Life-cycle: constructors
215 /**********************************************************
216 */
217
218 /**
219 * Constructor used for creating per-<code>JsonFactory</code> "root"
220 * symbol tables: ones used for merging and sharing common symbols
221 *
222 * @param sz Initial primary hash area size
223 * @param intern Whether Strings contained should be {@link String#intern}ed
224 * @param seed Random seed valued used to make it more difficult to cause
225 * collisions (used for collision-based DoS attacks).
226 */
227 private ByteQuadsCanonicalizer(int sz, boolean intern, int seed, boolean failOnDoS) {
228 _parent = null;
229 _seed = seed;
230 _intern = intern;
231 _failOnDoS = failOnDoS;
232 // Sanity check: let's now allow hash sizes below certain minimum value
233 if (sz < MIN_HASH_SIZE) {
234 sz = MIN_HASH_SIZE;
235 } else {
236 // Also; size must be 2^N; otherwise hash algorithm won't
237 // work... so let's just pad it up, if so
238 if ((sz & (sz - 1)) != 0) { // only true if it's 2^N
239 int curr = MIN_HASH_SIZE;
240 while (curr < sz) {
241 curr += curr;
242 }
243 sz = curr;
244 }
245 }
246 _tableInfo = new AtomicReference<TableInfo>(TableInfo.createInitial(sz));
247 }
248
249 /**
250 * Constructor used when creating a child instance
251 */
252 private ByteQuadsCanonicalizer(ByteQuadsCanonicalizer parent, boolean intern,
253 int seed, boolean failOnDoS, TableInfo state)
254 {
255 _parent = parent;
256 _seed = seed;
257 _intern = intern;
258 _failOnDoS = failOnDoS;
259 _tableInfo = null; // not used by child tables
260
261 // Then copy shared state
262 _count = state.count;
263 _hashSize = state.size;
264 _secondaryStart = _hashSize << 2; // right after primary area
265 _tertiaryStart = _secondaryStart + (_secondaryStart >> 1); // right after secondary
266 _tertiaryShift = state.tertiaryShift;
267
268 _hashArea = state.mainHash;
269 _names = state.names;
270
271 _spilloverEnd = state.spilloverEnd;
272 _longNameOffset = state.longNameOffset;
273
274 // and then set other state to reflect sharing status
275 _hashShared = true;
276 }
277
278 /*
279 /**********************************************************
280 /* Life-cycle: factory methods, merging
281 /**********************************************************
282 */
283
284 /**
285 * Factory method to call to create a symbol table instance with a
286 * randomized seed value.
287 */
288 public static ByteQuadsCanonicalizer createRoot() {
289 // Need to use a variable seed, to thwart hash-collision based attacks.
290 // 14-Feb-2017, tatu: Does this actually help?
291 long now = System.currentTimeMillis();
292 // ensure it's not 0; and might as well require to be odd so:
293 int seed = (((int) now) + ((int) (now >>> 32))) | 1;
294 return createRoot(seed);
295 }
296
297 // Factory method that should only be called from unit tests, where seed
298 // value should remain the same.
299 protected static ByteQuadsCanonicalizer createRoot(int seed) {
300 return new ByteQuadsCanonicalizer(DEFAULT_T_SIZE, true, seed, true);
301 }
302
303 /**
304 * Factory method used to create actual symbol table instance to
305 * use for parsing.
306 */
307 public ByteQuadsCanonicalizer makeChild(int flags) {
308 return new ByteQuadsCanonicalizer(this,
309 JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(flags),
310 _seed,
311 JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(flags),
312 _tableInfo.get());
313 }
314
315 /**
316 * Method called by the using code to indicate it is done with this instance.
317 * This lets instance merge accumulated changes into parent (if need be),
318 * safely and efficiently, and without calling code having to know about parent
319 * information.
320 */
321 public void release()
322 {
323 // we will try to merge if child table has new entries
324 // 28-Jul-2019, tatu: From [core#548]: do not share if immediate rehash needed
325 if ((_parent != null) && maybeDirty()) {
326 _parent.mergeChild(new TableInfo(this));
327 // Let's also mark this instance as dirty, so that just in
328 // case release was too early, there's no corruption of possibly shared data.
329 _hashShared = true;
330 }
331 }
332
333 private void mergeChild(TableInfo childState)
334 {
335 final int childCount = childState.count;
336 TableInfo currState = _tableInfo.get();
337
338 // Should usually grow; but occasionally could also shrink if (but only if)
339 // collision list overflow ends up clearing some collision lists.
340 if (childCount == currState.count) {
341 return;
342 }
343
344 // One caveat: let's try to avoid problems with degenerate cases of documents with
345 // generated "random" names: for these, symbol tables would bloat indefinitely.
346 // One way to do this is to just purge tables if they grow
347 // too large, and that's what we'll do here.
348 if (childCount > MAX_ENTRIES_FOR_REUSE) {
349 // At any rate, need to clean up the tables
350 childState = TableInfo.createInitial(DEFAULT_T_SIZE);
351 }
352 _tableInfo.compareAndSet(currState, childState);
353 }
354
355 /*
356 /**********************************************************
357 /* API, accessors
358 /**********************************************************
359 */
360
361 public int size()
362 {
363 if (_tableInfo != null) { // root table
364 return _tableInfo.get().count;
365 }
366 // nope, child table
367 return _count;
368 }
369
370 /**
371 * Returns number of primary slots table has currently
372 */
373 public int bucketCount() { return _hashSize; }
374
375 /**
376 * Method called to check to quickly see if a child symbol table
377 * may have gotten additional entries. Used for checking to see
378 * if a child table should be merged into shared table.
379 */
380 public boolean maybeDirty() { return !_hashShared; }
381
382 public int hashSeed() { return _seed; }
383
384 /**
385 * Method mostly needed by unit tests; calculates number of
386 * entries that are in the primary slot set. These are
387 * "perfect" entries, accessible with a single lookup
388 */
389 public int primaryCount()
390 {
391 int count = 0;
392 for (int offset = 3, end = _secondaryStart; offset < end; offset += 4) {
393 if (_hashArea[offset] != 0) {
394 ++count;
395 }
396 }
397 return count;
398 }
399
400 /**
401 * Method mostly needed by unit tests; calculates number of entries
402 * in secondary buckets
403 */
404 public int secondaryCount() {
405 int count = 0;
406 int offset = _secondaryStart + 3;
407 for (int end = _tertiaryStart; offset < end; offset += 4) {
408 if (_hashArea[offset] != 0) {
409 ++count;
410 }
411 }
412 return count;
413 }
414
415 /**
416 * Method mostly needed by unit tests; calculates number of entries
417 * in tertiary buckets
418 */
419 public int tertiaryCount() {
420 int count = 0;
421 int offset = _tertiaryStart + 3; // to 1.5x, starting point of tertiary
422 for (int end = offset + _hashSize; offset < end; offset += 4) {
423 if (_hashArea[offset] != 0) {
424 ++count;
425 }
426 }
427 return count;
428 }
429
430 /**
431 * Method mostly needed by unit tests; calculates number of entries
432 * in shared spillover area
433 */
434 public int spilloverCount() {
435 // difference between spillover end, start, divided by 4 (four ints per slot)
436 return (_spilloverEnd - _spilloverStart()) >> 2;
437 }
438
439 public int totalCount()
440 {
441 int count = 0;
442 for (int offset = 3, end = (_hashSize << 3); offset < end; offset += 4) {
443 if (_hashArea[offset] != 0) {
444 ++count;
445 }
446 }
447 return count;
448 }
449
450 @Override
451 public String toString() {
452 int pri = primaryCount();
453 int sec = secondaryCount();
454 int tert = tertiaryCount();
455 int spill = spilloverCount();
456 int total = totalCount();
457 return String.format("[%s: size=%d, hashSize=%d, %d/%d/%d/%d pri/sec/ter/spill (=%s), total:%d]",
458 getClass().getName(), _count, _hashSize,
459 pri, sec, tert, spill, (pri+sec+tert+spill), total);
460 }
461
462 /*
463 /**********************************************************
464 /* Public API, accessing symbols
465 /**********************************************************
466 */
467
468 public String findName(int q1)
469 {
470 int offset = _calcOffset(calcHash(q1));
471 // first: primary match?
472 final int[] hashArea = _hashArea;
473
474 int len = hashArea[offset+3];
475
476 if (len == 1) {
477 if (hashArea[offset] == q1) {
478 return _names[offset >> 2];
479 }
480 } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
481 return null;
482 }
483 // secondary? single slot shared by N/2 primaries
484 int offset2 = _secondaryStart + ((offset >> 3) << 2);
485
486 len = hashArea[offset2+3];
487
488 if (len == 1) {
489 if (hashArea[offset2] == q1) {
490 return _names[offset2 >> 2];
491 }
492 } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
493 return null;
494 }
495
496 // tertiary lookup & spillovers best to offline
497 return _findSecondary(offset, q1);
498 }
499
500 public String findName(int q1, int q2)
501 {
502 int offset = _calcOffset(calcHash(q1, q2));
503
504 final int[] hashArea = _hashArea;
505
506 int len = hashArea[offset+3];
507
508 if (len == 2) {
509 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1])) {
510 return _names[offset >> 2];
511 }
512 } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
513 return null;
514 }
515 // secondary?
516 int offset2 = _secondaryStart + ((offset >> 3) << 2);
517
518 len = hashArea[offset2+3];
519
520 if (len == 2) {
521 if ((q1 == hashArea[offset2]) && (q2 == hashArea[offset2+1])) {
522 return _names[offset2 >> 2];
523 }
524 } else if (len == 0) { // empty slot? Short-circuit if no more spillovers
525 return null;
526 }
527 return _findSecondary(offset, q1, q2);
528 }
529
530 public String findName(int q1, int q2, int q3)
531 {
532 int offset = _calcOffset(calcHash(q1, q2, q3));
533 final int[] hashArea = _hashArea;
534 int len = hashArea[offset+3];
535
536 if (len == 3) {
537 if ((q1 == hashArea[offset]) && (hashArea[offset+1] == q2) && (hashArea[offset+2] == q3)) {
538 return _names[offset >> 2];
539 }
540 } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
541 return null;
542 }
543 // secondary?
544 int offset2 = _secondaryStart + ((offset >> 3) << 2);
545
546 len = hashArea[offset2+3];
547
548 if (len == 3) {
549 if ((q1 == hashArea[offset2]) && (hashArea[offset2+1] == q2) && (hashArea[offset2+2] == q3)) {
550 return _names[offset2 >> 2];
551 }
552 } else if (len == 0) { // empty slot? Short-circuit if no more spillovers
553 return null;
554 }
555 return _findSecondary(offset, q1, q2, q3);
556 }
557
558 public String findName(int[] q, int qlen)
559 {
560 /* This version differs significantly, because longer names do not fit within cell.
561 * Rather, they contain hash in main slot, and offset+length to extension area
562 * that contains actual quads.
563 */
564 if (qlen < 4) { // another sanity check
565 switch (qlen) {
566 case 3:
567 return findName(q[0], q[1], q[2]);
568 case 2:
569 return findName(q[0], q[1]);
570 case 1:
571 return findName(q[0]);
572 default: // if 0 ever passed
573 return "";
574 }
575 }
576 final int hash = calcHash(q, qlen);
577 int offset = _calcOffset(hash);
578
579 final int[] hashArea = _hashArea;
580
581 final int len = hashArea[offset+3];
582
583 if ((hash == hashArea[offset]) && (len == qlen)) {
584 // probable but not guaranteed: verify
585 if (_verifyLongName(q, qlen, hashArea[offset+1])) {
586 return _names[offset >> 2];
587 }
588 }
589 if (len == 0) { // empty slot; unlikely but avoid further lookups if so
590 return null;
591 }
592 // secondary?
593 int offset2 = _secondaryStart + ((offset >> 3) << 2);
594
595 final int len2 = hashArea[offset2+3];
596 if ((hash == hashArea[offset2]) && (len2 == qlen)) {
597 if (_verifyLongName(q, qlen, hashArea[offset2+1])) {
598 return _names[offset2 >> 2];
599 }
600 }
601 return _findSecondary(offset, hash, q, qlen);
602 }
603
604 private final int _calcOffset(int hash)
605 {
606 // NOTE: simple for initial impl, but we may want to interleave it a bit
607 // in near future
608 // So: first, hash into primary hash index
609 int ix = hash & (_hashSize-1);
610 // keeping in mind we have 4 ints per entry
611 return (ix << 2);
612 }
613
614 /*
615 /**********************************************************
616 /* Access from spill-over areas
617 /**********************************************************
618 */
619
620 private String _findSecondary(int origOffset, int q1)
621 {
622 // tertiary area division is dynamic. First; its size is N/4 compared to
623 // primary hash size; and offsets are for 4 int slots. So to get to logical
624 // index would shift by 4. But! Tertiary area is further split into buckets,
625 // determined by shift value. And finally, from bucket back into physical offsets
626 int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
627 final int[] hashArea = _hashArea;
628 final int bucketSize = (1 << _tertiaryShift);
629 for (int end = offset + bucketSize; offset < end; offset += 4) {
630 int len = hashArea[offset+3];
631 if ((q1 == hashArea[offset]) && (1 == len)) {
632 return _names[offset >> 2];
633 }
634 if (len == 0) {
635 return null;
636 }
637 }
638 // but if tertiary full, check out spill-over area as last resort
639 // shared spillover starts at 7/8 of the main hash area
640 // (which is sized at 2 * _hashSize), so:
641 for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
642 if ((q1 == hashArea[offset]) && (1 == hashArea[offset+3])) {
643 return _names[offset >> 2];
644 }
645 }
646 return null;
647 }
648
649 private String _findSecondary(int origOffset, int q1, int q2)
650 {
651 int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
652 final int[] hashArea = _hashArea;
653
654 final int bucketSize = (1 << _tertiaryShift);
655 for (int end = offset + bucketSize; offset < end; offset += 4) {
656 int len = hashArea[offset+3];
657 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (2 == len)) {
658 return _names[offset >> 2];
659 }
660 if (len == 0) {
661 return null;
662 }
663 }
664 for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
665 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (2 == hashArea[offset+3])) {
666 return _names[offset >> 2];
667 }
668 }
669 return null;
670 }
671
672 private String _findSecondary(int origOffset, int q1, int q2, int q3)
673 {
674 int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
675 final int[] hashArea = _hashArea;
676
677 final int bucketSize = (1 << _tertiaryShift);
678 for (int end = offset + bucketSize; offset < end; offset += 4) {
679 int len = hashArea[offset+3];
680 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (q3 == hashArea[offset+2]) && (3 == len)) {
681 return _names[offset >> 2];
682 }
683 if (len == 0) {
684 return null;
685 }
686 }
687 for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
688 if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (q3 == hashArea[offset+2])
689 && (3 == hashArea[offset+3])) {
690 return _names[offset >> 2];
691 }
692 }
693 return null;
694 }
695
696 private String _findSecondary(int origOffset, int hash, int[] q, int qlen)
697 {
698 int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
699 final int[] hashArea = _hashArea;
700
701 final int bucketSize = (1 << _tertiaryShift);
702 for (int end = offset + bucketSize; offset < end; offset += 4) {
703 int len = hashArea[offset+3];
704 if ((hash == hashArea[offset]) && (qlen == len)) {
705 if (_verifyLongName(q, qlen, hashArea[offset+1])) {
706 return _names[offset >> 2];
707 }
708 }
709 if (len == 0) {
710 return null;
711 }
712 }
713 for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
714 if ((hash == hashArea[offset]) && (qlen == hashArea[offset+3])) {
715 if (_verifyLongName(q, qlen, hashArea[offset+1])) {
716 return _names[offset >> 2];
717 }
718 }
719 }
720 return null;
721 }
722
723 private boolean _verifyLongName(int[] q, int qlen, int spillOffset)
724 {
725 final int[] hashArea = _hashArea;
726 // spillOffset assumed to be physical index right into quad string
727 int ix = 0;
728
729 switch (qlen) {
730 default:
731 return _verifyLongName2(q, qlen, spillOffset);
732 case 8:
733 if (q[ix++] != hashArea[spillOffset++]) return false;
734 case 7:
735 if (q[ix++] != hashArea[spillOffset++]) return false;
736 case 6:
737 if (q[ix++] != hashArea[spillOffset++]) return false;
738 case 5:
739 if (q[ix++] != hashArea[spillOffset++]) return false;
740 case 4: // always at least 4
741 if (q[ix++] != hashArea[spillOffset++]) return false;
742 if (q[ix++] != hashArea[spillOffset++]) return false;
743 if (q[ix++] != hashArea[spillOffset++]) return false;
744 if (q[ix++] != hashArea[spillOffset++]) return false;
745 }
746 return true;
747 }
748
749 private boolean _verifyLongName2(int[] q, int qlen, int spillOffset)
750 {
751 int ix = 0;
752 do {
753 if (q[ix++] != _hashArea[spillOffset++]) {
754 return false;
755 }
756 } while (ix < qlen);
757 return true;
758 }
759
760 /*
761 /**********************************************************
762 /* API, mutators
763 /**********************************************************
764 */
765
766 public String addName(String name, int q1) {
767 _verifySharing();
768 if (_intern) {
769 name = InternCache.instance.intern(name);
770 }
771 int offset = _findOffsetForAdd(calcHash(q1));
772 _hashArea[offset] = q1;
773 _hashArea[offset+3] = 1;
774 _names[offset >> 2] = name;
775 ++_count;
776 return name;
777 }
778
779 public String addName(String name, int q1, int q2) {
780 _verifySharing();
781 if (_intern) {
782 name = InternCache.instance.intern(name);
783 }
784 int hash = (q2 == 0) ? calcHash(q1) : calcHash(q1, q2);
785 int offset = _findOffsetForAdd(hash);
786 _hashArea[offset] = q1;
787 _hashArea[offset+1] = q2;
788 _hashArea[offset+3] = 2;
789 _names[offset >> 2] = name;
790 ++_count;
791 return name;
792 }
793
794 public String addName(String name, int q1, int q2, int q3) {
795 _verifySharing();
796 if (_intern) {
797 name = InternCache.instance.intern(name);
798 }
799 int offset = _findOffsetForAdd(calcHash(q1, q2, q3));
800 _hashArea[offset] = q1;
801 _hashArea[offset+1] = q2;
802 _hashArea[offset+2] = q3;
803 _hashArea[offset+3] = 3;
804 _names[offset >> 2] = name;
805 ++_count;
806 return name;
807 }
808
809 public String addName(String name, int[] q, int qlen)
810 {
811 _verifySharing();
812 if (_intern) {
813 name = InternCache.instance.intern(name);
814 }
815 int offset;
816
817 switch (qlen) {
818 case 1:
819 {
820 offset = _findOffsetForAdd(calcHash(q[0]));
821 _hashArea[offset] = q[0];
822 _hashArea[offset+3] = 1;
823 }
824 break;
825 case 2:
826 {
827 offset = _findOffsetForAdd(calcHash(q[0], q[1]));
828 _hashArea[offset] = q[0];
829 _hashArea[offset+1] = q[1];
830 _hashArea[offset+3] = 2;
831 }
832 break;
833 case 3:
834 {
835 offset = _findOffsetForAdd(calcHash(q[0], q[1], q[2]));
836 _hashArea[offset] = q[0];
837 _hashArea[offset+1] = q[1];
838 _hashArea[offset+2] = q[2];
839 _hashArea[offset+3] = 3;
840 }
841 break;
842 default:
843 final int hash = calcHash(q, qlen);
844 offset = _findOffsetForAdd(hash);
845
846 _hashArea[offset] = hash;
847 int longStart = _appendLongName(q, qlen);
848 _hashArea[offset+1] = longStart;
849 _hashArea[offset+3] = qlen;
850 }
851 // plus add the actual String
852 _names[offset >> 2] = name;
853
854 // and finally; see if we really should rehash.
855 ++_count;
856 return name;
857 }
858
859 private void _verifySharing()
860 {
861 if (_hashShared) {
862 _hashArea = Arrays.copyOf(_hashArea, _hashArea.length);
863 _names = Arrays.copyOf(_names, _names.length);
864 _hashShared = false;
865 }
866 }
867
868 /**
869 * Method called to find the location within hash table to add a new symbol in.
870 */
871 private int _findOffsetForAdd(int hash)
872 {
873 // first, check the primary: if slot found, no need for resize
874 int offset = _calcOffset(hash);
875 final int[] hashArea = _hashArea;
876 if (hashArea[offset+3] == 0) {
877 //System.err.printf(" PRImary slot #%d, hash %X\n", (offset>>2), hash & 0x7F);
878 return offset;
879 }
880
881 // Otherwise let's see if we are due resize():
882 if (_checkNeedForRehash()) {
883 return _resizeAndFindOffsetForAdd(hash);
884 }
885
886 // If not, proceed with secondary slot
887 int offset2 = _secondaryStart + ((offset >> 3) << 2);
888 if (hashArea[offset2+3] == 0) {
889 //System.err.printf(" SECondary slot #%d (start x%X), hash %X\n",(offset >> 3), _secondaryStart, (hash & 0x7F));
890 return offset2;
891 }
892 // if not, tertiary?
893
894 offset2 = _tertiaryStart + ((offset >> (_tertiaryShift + 2)) << _tertiaryShift);
895 final int bucketSize = (1 << _tertiaryShift);
896 for (int end = offset2 + bucketSize; offset2 < end; offset2 += 4) {
897 if (hashArea[offset2+3] == 0) {
898 //System.err.printf(" TERtiary slot x%X (from x%X, start x%X), hash %X.\n", offset2, ((offset >> (_tertiaryShift + 2)) << _tertiaryShift), _tertiaryStart, (hash & 0x7F));
899 return offset2;
900 }
901 }
902
903 // and if even tertiary full, append at the end of spill area
904 offset = _spilloverEnd;
905 _spilloverEnd += 4;
906
907 //System.err.printf(" SPIll-over at x%X; start x%X; end x%X, hash %X\n", offset, _spilloverStart(), _hashArea.length, (hash & 0x7F));
908
909 // one caveat: in the unlikely event if spill-over filling up,
910 // check if that could be considered a DoS attack; handle appropriately
911 // (NOTE: approximate for now; we could verify details if that becomes necessary)
912 /* 31-Jul-2015, tatu: Note that spillover area does NOT end at end of array,
913 * since "long names" area follows. Instead, need to calculate from hash size.
914 */
915 final int end = (_hashSize << 3);
916 if (_spilloverEnd >= end) {
917 if (_failOnDoS) {
918 _reportTooManyCollisions();
919 }
920 return _resizeAndFindOffsetForAdd(hash);
921 }
922 return offset;
923 }
924
925 // @since 2.10
926 private int _resizeAndFindOffsetForAdd(int hash)
927 {
928 // First things first: we need to resize+rehash (or, if too big, nuke contents)
929 rehash();
930
931 // Copy of main _findOffsetForAdd except for checks to resize: can not be needed
932 int offset = _calcOffset(hash);
933 final int[] hashArea = _hashArea;
934 if (hashArea[offset+3] == 0) {
935 return offset;
936 }
937 int offset2 = _secondaryStart + ((offset >> 3) << 2);
938 if (hashArea[offset2+3] == 0) {
939 return offset2;
940 }
941 offset2 = _tertiaryStart + ((offset >> (_tertiaryShift + 2)) << _tertiaryShift);
942 final int bucketSize = (1 << _tertiaryShift);
943 for (int end = offset2 + bucketSize; offset2 < end; offset2 += 4) {
944 if (hashArea[offset2+3] == 0) {
945 return offset2;
946 }
947 }
948 offset = _spilloverEnd;
949 _spilloverEnd += 4;
950 return offset;
951 }
952
953 // Helper method for checking if we should simply rehash() before add
954 private boolean _checkNeedForRehash() {
955 // Yes if above 80%, or above 50% AND have ~1% spill-overs
956 if (_count > (_hashSize >> 1)) { // over 50%
957 int spillCount = (_spilloverEnd - _spilloverStart()) >> 2;
958 if ((spillCount > (1 + _count >> 7))
959 || (_count > (_hashSize * 0.80))) {
960 return true;
961 }
962 }
963 return false;
964 }
965
966 private int _appendLongName(int[] quads, int qlen)
967 {
968 int start = _longNameOffset;
969
970 // note: at this point we must already be shared. But may not have enough space
971 if ((start + qlen) > _hashArea.length) {
972 // try to increment in reasonable chunks; at least space that we need
973 int toAdd = (start + qlen) - _hashArea.length;
974 // but at least 1/8 of regular hash area size or 16kB (whichever smaller)
975 int minAdd = Math.min(4096, _hashSize);
976
977 int newSize = _hashArea.length + Math.max(toAdd, minAdd);
978 _hashArea = Arrays.copyOf(_hashArea, newSize);
979 }
980 System.arraycopy(quads, 0, _hashArea, start, qlen);
981 _longNameOffset += qlen;
982 return start;
983 }
984
985 /*
986 /**********************************************************
987 /* Hash calculation
988 /**********************************************************
989 */
990
    /* Note on hash calculation: we try to make it more difficult to
     * generate collisions automatically; part of this is to avoid
     * simple "multiply-add" algorithm (like JDK String.hashCode()),
     * and add bit of shifting. And other part is to make this
     * non-linear, at least for shorter symbols.
     */

    // Primary multiplier: JDK's String.hashCode() uses 31; other fine choices
    // are 33 and 65599. 33 seems to give fewest collisions for our key sets
    // (see [http://www.cse.yorku.ca/~oz/hash.html] for details)
    private final static int MULT = 33;
    // Secondary multiplier, used for the final mix of multi-quad names
    private final static int MULT2 = 65599;
    // Tertiary multiplier, used by the three-quad variant
    private final static int MULT3 = 31;
1004
1005 public int calcHash(int q1)
1006 {
1007 int hash = q1 ^ _seed;
1008 /* 29-Mar-2015, tatu: Earlier used 15 + 9 right shifts, which worked ok
1009 * except for one specific problem case: numbers. So needed to make sure
1010 * that all 4 least-significant bits participate in hash. Couple of ways
1011 * to work it out, but this is the simplest, fast and seems to do ok.
1012 */
1013 hash += (hash >>> 16); // to xor hi- and low- 16-bits
1014 hash ^= (hash << 3); // shuffle back a bit
1015 hash += (hash >>> 12); // and bit more
1016 return hash;
1017 }
1018
1019 public int calcHash(int q1, int q2)
1020 {
1021 // For two quads, let's change algorithm a bit, to spice
1022 // things up (can do bit more processing anyway)
1023 int hash = q1;
1024
1025 hash += (hash >>> 15); // try mixing first and second byte pairs first
1026 hash ^= (hash >>> 9); // as well as lowest 2 bytes
1027 hash += (q2 * MULT); // then add second quad
1028 hash ^= _seed;
1029 hash += (hash >>> 16); // and shuffle some more
1030 hash ^= (hash >>> 4);
1031 hash += (hash << 3);
1032
1033 return hash;
1034 }
1035
1036 public int calcHash(int q1, int q2, int q3)
1037 { // use same algorithm as multi-byte, tested to work well
1038 int hash = q1 ^ _seed;
1039 hash += (hash >>> 9);
1040 hash *= MULT3;
1041 hash += q2;
1042 hash *= MULT;
1043 hash += (hash >>> 15);
1044 hash ^= q3;
1045 // 26-Mar-2015, tatu: As per two-quad case, a short shift seems to help more here
1046 hash += (hash >>> 4);
1047
1048 hash += (hash >>> 15);
1049 hash ^= (hash << 9);
1050
1051 return hash;
1052 }
1053
1054 public int calcHash(int[] q, int qlen)
1055 {
1056 if (qlen < 4) {
1057 throw new IllegalArgumentException();
1058 }
1059 /* And then change handling again for "multi-quad" case; mostly
1060 * to make calculation of collisions less fun. For example,
1061 * add seed bit later in the game, and switch plus/xor around,
1062 * use different shift lengths.
1063 */
1064 int hash = q[0] ^ _seed;
1065 hash += (hash >>> 9);
1066 hash += q[1];
1067 hash += (hash >>> 15);
1068 hash *= MULT;
1069 hash ^= q[2];
1070 hash += (hash >>> 4);
1071
1072 for (int i = 3; i < qlen; ++i) {
1073 int next = q[i];
1074 next = next ^ (next >> 21);
1075 hash += next;
1076 }
1077 hash *= MULT2;
1078
1079 // and finally shuffle some more once done
1080 hash += (hash >>> 19);
1081 hash ^= (hash << 5);
1082 return hash;
1083 }
1084
1085 /*
1086 /**********************************************************
1087 /* Rehashing
1088 /**********************************************************
1089 */
1090
    /**
     * Method called when the hash area has become too full: doubles the
     * primary hash area (unless already at {@code MAX_T_SIZE}, in which case
     * contents are simply dropped) and re-adds every existing entry so each
     * lands in its best-possible slot in the enlarged table.
     */
    private void rehash()
    {
        // Note: since we'll make copies, no need to unshare, can just mark as such:
        _hashShared = false;

        // And then we can first deal with the main hash area. Since we are expanding
        // linearly (double up), we know there'll be no collisions during this phase.
        final int[] oldHashArea = _hashArea;
        final String[] oldNames = _names;
        final int oldSize = _hashSize;
        final int oldCount = _count;
        final int newSize = oldSize + oldSize;
        final int oldEnd = _spilloverEnd;

        /* 13-Mar-2010, tatu: Let's guard against OOME that could be caused by
         * large documents with unique (or mostly so) names
         */
        if (newSize > MAX_T_SIZE) {
            // give up on growing: drop all symbols, keep (cleared) arrays
            nukeSymbols(true);
            return;
        }
        // double up main hash area, but do not expand long-name area:
        _hashArea = new int[oldHashArea.length + (oldSize<<3)];
        _hashSize = newSize;
        _secondaryStart = (newSize << 2); // 4 ints per entry
        _tertiaryStart = _secondaryStart + (_secondaryStart >> 1); // right after secondary
        _tertiaryShift = _calcTertiaryShift(newSize);

        // and simply double up name array
        _names = new String[oldNames.length << 1];
        // resets count, spill-over and long-name offsets for the new layout
        nukeSymbols(false);

        // Plus we can scan only through the primary hash area, looking for non-empty
        // slots, without worrying about ordering. This should never reduce priority
        // of existing entries: primaries remain primaries; however, due to increased
        // space, secondaries may become primaries etc

        int copyCount = 0;
        int[] q = new int[16]; // scratch buffer for quads; grown on demand below
        for (int offset = 0, end = oldEnd; offset < end; offset += 4) {
            // 4th int of a slot holds the quad length; 0 marks an empty slot
            int len = oldHashArea[offset+3];
            if (len == 0) { // empty slot, skip
                continue;
            }
            ++copyCount;
            // one name String per slot; slot index is (int offset / 4)
            String name = oldNames[offset>>2];
            switch (len) {
            case 1: // 1 - 3 quads are stored inline in the slot itself
                q[0] = oldHashArea[offset];
                addName(name, q, 1);
                break;
            case 2:
                q[0] = oldHashArea[offset];
                q[1] = oldHashArea[offset+1];
                addName(name, q, 2);
                break;
            case 3:
                q[0] = oldHashArea[offset];
                q[1] = oldHashArea[offset+1];
                q[2] = oldHashArea[offset+2];
                addName(name, q, 3);
                break;
            default: // longer names live in the long-name area
                if (len > q.length) {
                    q = new int[len];
                }
                // #0 is hash, #1 offset (into old long-name area)
                int qoff = oldHashArea[offset+1];
                System.arraycopy(oldHashArea, qoff, q, 0, len);
                addName(name, q, len);
                break;
            }
        }

        // Sanity checks: since corruption difficult to detect, assert explicitly
        // with production code
        if (copyCount != oldCount) {
            throw new IllegalStateException("Failed rehash(): old count="+oldCount+", copyCount="+copyCount);
        }
    }
1171
1172 /**
1173 * Helper method called to empty all shared symbols, but to leave
1174 * arrays allocated
1175 */
1176 private void nukeSymbols(boolean fill) {
1177 _count = 0;
1178 // reset spill-over to empty (starting at 7/8 of hash area)
1179 _spilloverEnd = _spilloverStart();
1180 // and long name area to empty, starting immediately after hash area
1181 _longNameOffset = _hashSize << 3;
1182 if (fill) {
1183 Arrays.fill(_hashArea, 0);
1184 Arrays.fill(_names, null);
1185 }
1186 }
1187
1188 /*
1189 /**********************************************************
1190 /* Helper methods
1191 /**********************************************************
1192 */
1193
1194 /**
1195 * Helper method that calculates start of the spillover area
1196 */
1197 private final int _spilloverStart() {
1198 // we'll need slot at 1.75x of hashSize, but with 4-ints per slot.
1199 // So basically multiply by 7
1200 int offset = _hashSize;
1201 return (offset << 3) - offset;
1202 }
1203
1204 protected void _reportTooManyCollisions()
1205 {
1206 // First: do not fuzz about small symbol tables; may get balanced by doubling up
1207 if (_hashSize <= 1024) { // would have spill-over area of 128 entries
1208 return;
1209 }
1210 throw new IllegalStateException("Spill-over slots in symbol table with "+_count
1211 +" entries, hash area of "+_hashSize+" slots is now full (all "
1212 +(_hashSize >> 3)+" slots -- suspect a DoS attack based on hash collisions."
1213 +" You can disable the check via `JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW`");
1214 }
1215
1216 static int _calcTertiaryShift(int primarySlots)
1217 {
1218 // first: we only get 1/4 of slots of primary, to divide
1219 int tertSlots = (primarySlots) >> 2;
1220
1221 // default is for buckets of 4 slots (each 4 ints, i.e. 1 << 4)
1222 if (tertSlots < 64) {
1223 return 4;
1224 }
1225 if (tertSlots <= 256) { // buckets of 8 slots (up to 256 == 32 x 8)
1226 return 5;
1227 }
1228 if (tertSlots <= 1024) { // buckets of 16 slots (up to 1024 == 64 x 16)
1229 return 6;
1230 }
1231 // and biggest buckets have 32 slots
1232 return 7;
1233 }
1234
1235 /*
1236 /**********************************************************
1237 /* Helper classes
1238 /**********************************************************
1239 */
1240
1241 /**
1242 * Immutable value class used for sharing information as efficiently
1243 * as possible, by only require synchronization of reference manipulation
1244 * but not access to contents.
1245 *
1246 * @since 2.1
1247 */
1248 private final static class TableInfo
1249 {
1250 public final int size;
1251 public final int count;
1252 public final int tertiaryShift;
1253 public final int[] mainHash;
1254 public final String[] names;
1255 public final int spilloverEnd;
1256 public final int longNameOffset;
1257
1258 public TableInfo(int size, int count, int tertiaryShift,
1259 int[] mainHash, String[] names, int spilloverEnd, int longNameOffset)
1260 {
1261 this.size = size;
1262 this.count = count;
1263 this.tertiaryShift = tertiaryShift;
1264 this.mainHash = mainHash;
1265 this.names = names;
1266 this.spilloverEnd = spilloverEnd;
1267 this.longNameOffset = longNameOffset;
1268 }
1269
1270 public TableInfo(ByteQuadsCanonicalizer src)
1271 {
1272 size = src._hashSize;
1273 count = src._count;
1274 tertiaryShift = src._tertiaryShift;
1275 mainHash = src._hashArea;
1276 names = src._names;
1277 spilloverEnd = src._spilloverEnd;
1278 longNameOffset = src._longNameOffset;
1279 }
1280
1281 public static TableInfo createInitial(int sz) {
1282 int hashAreaSize = sz << 3;
1283 int tertShift = _calcTertiaryShift(sz);
1284
1285 return new TableInfo(sz, // hashSize
1286 0, // count
1287 tertShift,
1288 new int[hashAreaSize], // mainHash, 2x slots, 4 ints per slot
1289 new String[sz << 1], // names == 2x slots
1290 hashAreaSize - sz, // at 7/8 of the total area
1291 hashAreaSize // longNameOffset, immediately after main hashes
1292 );
1293 }
1294 }
1295 }
1296