Monitoring JavaMelody on _ip-10-0-13-116.ec2.internal

1 package com.fasterxml.jackson.core.json;

2 

3 import java.io.*;

4 

5 import com.fasterxml.jackson.core.*;

6 import com.fasterxml.jackson.core.format.InputAccessor;

7 import com.fasterxml.jackson.core.format.MatchStrength;

8 import com.fasterxml.jackson.core.io.*;

9 import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;

10 import com.fasterxml.jackson.core.sym.CharsToNameCanonicalizer;

11 

12 /**

13  * This class is used to determine the encoding of byte stream

14  * that is to contain JSON content. Rules are fairly simple, and

15  * defined in JSON specification (RFC-4627 or newer), except

16  * for BOM handling, which is a property of underlying

17  * streams.

18  */

19 public final class ByteSourceJsonBootstrapper

20 {

21     public final static byte UTF8_BOM_1 = (byte) 0xEF;

22     public final static byte UTF8_BOM_2 = (byte) 0xBB;

23     public final static byte UTF8_BOM_3 = (byte) 0xBF;

24     

25     /*

26     /**********************************************************

27     /* Configuration

28     /**********************************************************

29      */

30 

31     private final IOContext _context;

32 

33     private final InputStream _in;

34 

35     /*

36     /**********************************************************

37     /* Input buffering

38     /**********************************************************

39      */

40 

41     private final byte[] _inputBuffer;

42 

43     private int _inputPtr;

44 

45     private int _inputEnd;

46 

47     /**

48      * Flag that indicates whether buffer above is to be recycled

49      * after being used or not.

50      */

51     private final boolean _bufferRecyclable;

52 

53     /*

54     /**********************************************************

55     /* Input location

56     /**********************************************************

57      */

58 

59     /**

60      * Current number of input units (bytes or chars) that were processed in

61      * previous blocks,

62      * before contents of current input buffer.

63      *<p>

64      * Note: includes possible BOMs, if those were part of the input.

65      */

66 //    private int _inputProcessed;

67 

68     /*

69     /**********************************************************

70     /* Data gathered

71     /**********************************************************

72      */

73 

74     /**

75      * Whether input has been detected to be in Big-Endian encoding or not.

76      */

77     private boolean _bigEndian = true;

78 

79     private int _bytesPerChar; // 0 means "dunno yet"

80 

81     /*

82     /**********************************************************

83     /* Life-cycle

84     /**********************************************************

85      */

86 

87     public ByteSourceJsonBootstrapper(IOContext ctxt, InputStream in) {

88         _context = ctxt;

89         _in = in;

90         _inputBuffer = ctxt.allocReadIOBuffer();

91         _inputEnd = _inputPtr = 0;

92 //        _inputProcessed = 0;

93         _bufferRecyclable = true;

94     }

95 

96     public ByteSourceJsonBootstrapper(IOContext ctxt, byte[] inputBuffer, int inputStart, int inputLen) {

97         _context = ctxt;

98         _in = null;

99         _inputBuffer = inputBuffer;

100         _inputPtr = inputStart;

101         _inputEnd = (inputStart + inputLen);

102         // Need to offset this for correct location info

103 //        _inputProcessed = -inputStart;

104         _bufferRecyclable = false;

105     }

106 

107     /*

108     /**********************************************************

109     /*  Encoding detection during bootstrapping

110     /**********************************************************

111      */

112     

113     /**

114      * Method that should be called after constructing an instace.

115      * It will figure out encoding that content uses, to allow

116      * for instantiating a proper scanner object.

117      */

118     public JsonEncoding detectEncoding() throws IOException

119     {

120         boolean foundEncoding = false;

121 

122         // First things first: BOM handling

123         /* Note: we can require 4 bytes to be read, since no

124          * combination of BOM + valid JSON content can have

125          * shorter length (shortest valid JSON content is single

126          * digit char, but BOMs are chosen such that combination

127          * is always at least 4 chars long)

128          */

129         if (ensureLoaded(4)) {

130             int quad =  (_inputBuffer[_inputPtr] << 24)

131                 | ((_inputBuffer[_inputPtr+1] & 0xFF) << 16)

132                 | ((_inputBuffer[_inputPtr+2] & 0xFF) << 8)

133                 | (_inputBuffer[_inputPtr+3] & 0xFF);

134             

135             if (handleBOM(quad)) {

136                 foundEncoding = true;

137             } else {

138                 /* If no BOM, need to auto-detect based on first char;

139                  * this works since it must be 7-bit ascii (wrt. unicode

140                  * compatible encodings, only ones JSON can be transferred

141                  * over)

142                  */

143                 // UTF-32?

144                 if (checkUTF32(quad)) {

145                     foundEncoding = true;

146                 } else if (checkUTF16(quad >>> 16)) {

147                     foundEncoding = true;

148                 }

149             }

150         } else if (ensureLoaded(2)) {

151             int i16 = ((_inputBuffer[_inputPtr] & 0xFF) << 8)

152                 | (_inputBuffer[_inputPtr+1] & 0xFF);

153             if (checkUTF16(i16)) {

154                 foundEncoding = true;

155             }

156         }

157 

158         JsonEncoding enc;

159 

160         /* Not found yet? As per specs, this means it must be UTF-8. */

161         if (!foundEncoding) {

162             enc = JsonEncoding.UTF8;

163         } else {

164             switch (_bytesPerChar) {

165             case 1: enc = JsonEncoding.UTF8;

166                 break;

167             case 2: enc = _bigEndian ? JsonEncoding.UTF16_BE : JsonEncoding.UTF16_LE;

168                 break;

169             case 4: enc = _bigEndian ? JsonEncoding.UTF32_BE : JsonEncoding.UTF32_LE;

170                 break;

171             default: throw new RuntimeException("Internal error"); // should never get here

172             }

173         }

174         _context.setEncoding(enc);

175         return enc;

176     }

177 

178     /**

179      * Helper method that may be called to see if given {@link DataInput}

180      * has BOM marker, and if so, to skip it.

181      * @throws IOException 

182      *

183      * @since 2.8

184      */

185     public static int skipUTF8BOM(DataInput input) throws IOException

186     {

187         int b = input.readUnsignedByte();

188         if (b != 0xEF) {

189             return b;

190         }

191         // since this is not legal byte in JSON otherwise, except

192         // that we do get BOM; if not, report error

193         b = input.readUnsignedByte();

194         if (b != 0xBB) {

195             throw new IOException("Unexpected byte 0x"+Integer.toHexString(b)

196                 +" following 0xEF; should get 0xBB as part of UTF-8 BOM");

197         }

198         b = input.readUnsignedByte();

199         if (b != 0xBF) {

200             throw new IOException("Unexpected byte 0x"+Integer.toHexString(b)

201                 +" following 0xEF 0xBB; should get 0xBF as part of UTF-8 BOM");

202         }

203         return input.readUnsignedByte();

204     }

205 

206     /*

207     /**********************************************************

208     /* Constructing a Reader

209     /**********************************************************

210      */

211     

212     @SuppressWarnings("resource")

213     public Reader constructReader() throws IOException

214     {

215         JsonEncoding enc = _context.getEncoding();

216         switch (enc.bits()) {

217         case 8: // only in non-common case where we don't want to do direct mapping

218         case 16:

219             {

220                 // First: do we have a Stream? If not, need to create one:

221                 InputStream in = _in;

222 

223                 if (in == null) {

224                     in = new ByteArrayInputStream(_inputBuffer, _inputPtr, _inputEnd);

225                 } else {

226                     /* Also, if we have any read but unused input (usually true),

227                      * need to merge that input in:

228                      */

229                     if (_inputPtr < _inputEnd) {

230                         in = new MergedStream(_context, in, _inputBuffer, _inputPtr, _inputEnd);

231                     }

232                 }

233                 return new InputStreamReader(in, enc.getJavaName());

234             }

235         case 32:

236             return new UTF32Reader(_context, _in, _inputBuffer, _inputPtr, _inputEnd,

237                     _context.getEncoding().isBigEndian());

238         }

239         throw new RuntimeException("Internal error"); // should never get here

240     }

241 

242     public JsonParser constructParser(int parserFeatures, ObjectCodec codec,

243             ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols,

244             int factoryFeatures) throws IOException

245     {

246         int prevInputPtr = _inputPtr;

247         JsonEncoding enc = detectEncoding();

248         int bytesProcessed = _inputPtr - prevInputPtr;

249 

250         if (enc == JsonEncoding.UTF8) {

251             /* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader

252              * (which is ok for larger input; not so hot for smaller; but this is not a common case)

253              */

254             if (JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(factoryFeatures)) {

255                 ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);

256                 return new UTF8StreamJsonParser(_context, parserFeatures, _in, codec, can,

257                         _inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);

258             }

259         }

260         return new ReaderBasedJsonParser(_context, parserFeatures, constructReader(), codec,

261                 rootCharSymbols.makeChild(factoryFeatures));

262     }

263 

264     /*

265     /**********************************************************

266     /*  Encoding detection for data format auto-detection

267     /**********************************************************

268      */

269 

270     /**

271      * Current implementation is not as thorough as other functionality

272      * ({@link com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper}); 

273      * supports UTF-8, for example. But it should work, for now, and can

274      * be improved as necessary.

275      */

276     public static MatchStrength hasJSONFormat(InputAccessor acc) throws IOException

277     {

278         // Ideally we should see "[" or "{"; but if not, we'll accept double-quote (String)

279         // in future could also consider accepting non-standard matches?

280         

281         if (!acc.hasMoreBytes()) {

282             return MatchStrength.INCONCLUSIVE;

283         }

284         byte b = acc.nextByte();

285         // Very first thing, a UTF-8 BOM?

286         if (b == UTF8_BOM_1) { // yes, looks like UTF-8 BOM

287             if (!acc.hasMoreBytes()) {

288                 return MatchStrength.INCONCLUSIVE;

289             }

290             if (acc.nextByte() != UTF8_BOM_2) {

291                 return MatchStrength.NO_MATCH;

292             }

293             if (!acc.hasMoreBytes()) {

294                 return MatchStrength.INCONCLUSIVE;

295             }

296             if (acc.nextByte() != UTF8_BOM_3) {

297                 return MatchStrength.NO_MATCH;

298             }

299             if (!acc.hasMoreBytes()) {

300                 return MatchStrength.INCONCLUSIVE;

301             }

302             b = acc.nextByte();

303         }

304         // Then possible leading space

305         int ch = skipSpace(acc, b);

306         if (ch < 0) {

307             return MatchStrength.INCONCLUSIVE;

308         }

309         // First, let's see if it looks like a structured type:

310         if (ch == '{') { // JSON object?

311             // Ideally we need to find either double-quote or closing bracket

312             ch = skipSpace(acc);

313             if (ch < 0) {

314                 return MatchStrength.INCONCLUSIVE;

315             }

316             if (ch == '"' || ch == '}') {

317                 return MatchStrength.SOLID_MATCH;

318             }

319             // ... should we allow non-standard? Let's not yet... can add if need be

320             return MatchStrength.NO_MATCH;

321         }

322         MatchStrength strength;

323         

324         if (ch == '[') {

325             ch = skipSpace(acc);

326             if (ch < 0) {

327                 return MatchStrength.INCONCLUSIVE;

328             }

329             // closing brackets is easy; but for now, let's also accept opening...

330             if (ch == ']' || ch == '[') {

331                 return MatchStrength.SOLID_MATCH;

332             }

333             return MatchStrength.SOLID_MATCH;

334         } else {

335             // plain old value is not very convincing...

336             strength = MatchStrength.WEAK_MATCH;

337         }

338 

339         if (ch == '"') { // string value

340             return strength;

341         }

342         if (ch <= '9' && ch >= '0') { // number

343             return strength;

344         }

345         if (ch == '-') { // negative number

346             ch = skipSpace(acc);

347             if (ch < 0) {

348                 return MatchStrength.INCONCLUSIVE;

349             }

350             return (ch <= '9' && ch >= '0') ? strength : MatchStrength.NO_MATCH;

351         }

352         // or one of literals

353         if (ch == 'n') { // null

354             return tryMatch(acc, "ull", strength);

355         }

356         if (ch == 't') { // true

357             return tryMatch(acc, "rue", strength);

358         }

359         if (ch == 'f') { // false

360             return tryMatch(acc, "alse", strength);

361         }

362         return MatchStrength.NO_MATCH;

363     }

364 

365     private static MatchStrength tryMatch(InputAccessor acc, String matchStr, MatchStrength fullMatchStrength)

366         throws IOException

367     {

368         for (int i = 0, len = matchStr.length(); i < len; ++i) {

369             if (!acc.hasMoreBytes()) {

370                 return MatchStrength.INCONCLUSIVE;

371             }

372             if (acc.nextByte() != matchStr.charAt(i)) {

373                 return MatchStrength.NO_MATCH;

374             }

375         }

376         return fullMatchStrength;

377     }

378 

379     private static int skipSpace(InputAccessor acc) throws IOException

380     {

381         if (!acc.hasMoreBytes()) {

382             return -1;

383         }

384         return skipSpace(acc, acc.nextByte());

385     }

386 

387     private static int skipSpace(InputAccessor acc, byte b) throws IOException

388     {

389         while (true) {

390             int ch = (int) b & 0xFF;

391             if (!(ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t')) {

392                 return ch;

393             }

394             if (!acc.hasMoreBytes()) {

395                 return -1;

396             }

397             b = acc.nextByte();

398         }

399     }

400 

401     /*

402     /**********************************************************

403     /* Internal methods, parsing

404     /**********************************************************

405      */

406 

407     /**

408      * @return True if a BOM was succesfully found, and encoding

409      *   thereby recognized.

410      */

411     private boolean handleBOM(int quad) throws IOException

412     {

413         /* Handling of (usually) optional BOM (required for

414          * multi-byte formats); first 32-bit charsets:

415          */

416         switch (quad) {

417         case 0x0000FEFF:

418             _bigEndian = true;

419             _inputPtr += 4;

420             _bytesPerChar = 4;

421             return true;

422         case 0xFFFE0000: // UCS-4, LE?

423             _inputPtr += 4;

424             _bytesPerChar = 4;

425             _bigEndian = false;

426             return true;

427         case 0x0000FFFE: // UCS-4, in-order...

428             reportWeirdUCS4("2143"); // throws exception

429             break; // never gets here

430         case 0xFEFF0000: // UCS-4, in-order...

431             reportWeirdUCS4("3412"); // throws exception

432             break; // never gets here

433         default:

434         }

435         // Ok, if not, how about 16-bit encoding BOMs?

436         int msw = quad >>> 16;

437         if (msw == 0xFEFF) { // UTF-16, BE

438             _inputPtr += 2;

439             _bytesPerChar = 2;

440             _bigEndian = true;

441             return true;

442         }

443         if (msw == 0xFFFE) { // UTF-16, LE

444             _inputPtr += 2;

445             _bytesPerChar = 2;

446             _bigEndian = false;

447             return true;

448         }

449         // And if not, then UTF-8 BOM?

450         if ((quad >>> 8) == 0xEFBBBF) { // UTF-8

451             _inputPtr += 3;

452             _bytesPerChar = 1;

453             _bigEndian = true; // doesn't really matter

454             return true;

455         }

456         return false;

457     }

458 

459     private boolean checkUTF32(int quad) throws IOException

460     {

461         /* Handling of (usually) optional BOM (required for

462          * multi-byte formats); first 32-bit charsets:

463          */

464         if ((quad >> 8) == 0) { // 0x000000?? -> UTF32-BE

465             _bigEndian = true;

466         } else if ((quad & 0x00FFFFFF) == 0) { // 0x??000000 -> UTF32-LE

467             _bigEndian = false;

468         } else if ((quad & ~0x00FF0000) == 0) { // 0x00??0000 -> UTF32-in-order

469             reportWeirdUCS4("3412");

470         } else if ((quad & ~0x0000FF00) == 0) { // 0x0000??00 -> UTF32-in-order

471             reportWeirdUCS4("2143");

472         } else {

473             // Can not be valid UTF-32 encoded JSON...

474             return false;

475         }

476         // Not BOM (just regular content), nothing to skip past:

477         //_inputPtr += 4;

478         _bytesPerChar = 4;

479         return true;

480     }

481 

482     private boolean checkUTF16(int i16)

483     {

484         if ((i16 & 0xFF00) == 0) { // UTF-16BE

485             _bigEndian = true;

486         } else if ((i16 & 0x00FF) == 0) { // UTF-16LE

487             _bigEndian = false;

488         } else { // nope, not  UTF-16

489             return false;

490         }

491         // Not BOM (just regular content), nothing to skip past:

492         //_inputPtr += 2;

493         _bytesPerChar = 2;

494         return true;

495     }

496 

497     /*

498     /**********************************************************

499     /* Internal methods, problem reporting

500     /**********************************************************

501      */

502 

503     private void reportWeirdUCS4(String type) throws IOException {

504         throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected");

505     }

506 

507     /*

508     /**********************************************************

509     /* Internal methods, raw input access

510     /**********************************************************

511      */

512 

513     protected boolean ensureLoaded(int minimum) throws IOException {

514         /* Let's assume here buffer has enough room -- this will always

515          * be true for the limited used this method gets

516          */

517         int gotten = (_inputEnd - _inputPtr);

518         while (gotten < minimum) {

519             int count;

520 

521             if (_in == null) { // block source

522                 count = -1;

523             } else {

524                 count = _in.read(_inputBuffer, _inputEnd, _inputBuffer.length - _inputEnd);

525             }

526             if (count < 1) {

527                 return false;

528             }

529             _inputEnd += count;

530             gotten += count;

531         }

532         return true;

533     }

534 }

535