1 package com.fasterxml.jackson.core.json;
2
3 import java.io.*;
4
5 import com.fasterxml.jackson.core.*;
6 import com.fasterxml.jackson.core.format.InputAccessor;
7 import com.fasterxml.jackson.core.format.MatchStrength;
8 import com.fasterxml.jackson.core.io.*;
9 import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
10 import com.fasterxml.jackson.core.sym.CharsToNameCanonicalizer;
11
12 /**
13 * This class is used to determine the encoding of byte stream
14 * that is to contain JSON content. Rules are fairly simple, and
15 * defined in JSON specification (RFC-4627 or newer), except
16 * for BOM handling, which is a property of underlying
17 * streams.
18 */
19 public final class ByteSourceJsonBootstrapper
20 {
21 public final static byte UTF8_BOM_1 = (byte) 0xEF;
22 public final static byte UTF8_BOM_2 = (byte) 0xBB;
23 public final static byte UTF8_BOM_3 = (byte) 0xBF;
24
25 /*
26 /**********************************************************
27 /* Configuration
28 /**********************************************************
29 */
30
31 private final IOContext _context;
32
33 private final InputStream _in;
34
35 /*
36 /**********************************************************
37 /* Input buffering
38 /**********************************************************
39 */
40
41 private final byte[] _inputBuffer;
42
43 private int _inputPtr;
44
45 private int _inputEnd;
46
47 /**
48 * Flag that indicates whether buffer above is to be recycled
49 * after being used or not.
50 */
51 private final boolean _bufferRecyclable;
52
53 /*
54 /**********************************************************
55 /* Input location
56 /**********************************************************
57 */
58
59 /**
60 * Current number of input units (bytes or chars) that were processed in
61 * previous blocks,
62 * before contents of current input buffer.
63 *<p>
64 * Note: includes possible BOMs, if those were part of the input.
65 */
66 // private int _inputProcessed;
67
68 /*
69 /**********************************************************
70 /* Data gathered
71 /**********************************************************
72 */
73
74 /**
75 * Whether input has been detected to be in Big-Endian encoding or not.
76 */
77 private boolean _bigEndian = true;
78
79 private int _bytesPerChar; // 0 means "dunno yet"
80
81 /*
82 /**********************************************************
83 /* Life-cycle
84 /**********************************************************
85 */
86
87 public ByteSourceJsonBootstrapper(IOContext ctxt, InputStream in) {
88 _context = ctxt;
89 _in = in;
90 _inputBuffer = ctxt.allocReadIOBuffer();
91 _inputEnd = _inputPtr = 0;
92 // _inputProcessed = 0;
93 _bufferRecyclable = true;
94 }
95
96 public ByteSourceJsonBootstrapper(IOContext ctxt, byte[] inputBuffer, int inputStart, int inputLen) {
97 _context = ctxt;
98 _in = null;
99 _inputBuffer = inputBuffer;
100 _inputPtr = inputStart;
101 _inputEnd = (inputStart + inputLen);
102 // Need to offset this for correct location info
103 // _inputProcessed = -inputStart;
104 _bufferRecyclable = false;
105 }
106
107 /*
108 /**********************************************************
109 /* Encoding detection during bootstrapping
110 /**********************************************************
111 */
112
113 /**
114 * Method that should be called after constructing an instace.
115 * It will figure out encoding that content uses, to allow
116 * for instantiating a proper scanner object.
117 */
118 public JsonEncoding detectEncoding() throws IOException
119 {
120 boolean foundEncoding = false;
121
122 // First things first: BOM handling
123 /* Note: we can require 4 bytes to be read, since no
124 * combination of BOM + valid JSON content can have
125 * shorter length (shortest valid JSON content is single
126 * digit char, but BOMs are chosen such that combination
127 * is always at least 4 chars long)
128 */
129 if (ensureLoaded(4)) {
130 int quad = (_inputBuffer[_inputPtr] << 24)
131 | ((_inputBuffer[_inputPtr+1] & 0xFF) << 16)
132 | ((_inputBuffer[_inputPtr+2] & 0xFF) << 8)
133 | (_inputBuffer[_inputPtr+3] & 0xFF);
134
135 if (handleBOM(quad)) {
136 foundEncoding = true;
137 } else {
138 /* If no BOM, need to auto-detect based on first char;
139 * this works since it must be 7-bit ascii (wrt. unicode
140 * compatible encodings, only ones JSON can be transferred
141 * over)
142 */
143 // UTF-32?
144 if (checkUTF32(quad)) {
145 foundEncoding = true;
146 } else if (checkUTF16(quad >>> 16)) {
147 foundEncoding = true;
148 }
149 }
150 } else if (ensureLoaded(2)) {
151 int i16 = ((_inputBuffer[_inputPtr] & 0xFF) << 8)
152 | (_inputBuffer[_inputPtr+1] & 0xFF);
153 if (checkUTF16(i16)) {
154 foundEncoding = true;
155 }
156 }
157
158 JsonEncoding enc;
159
160 /* Not found yet? As per specs, this means it must be UTF-8. */
161 if (!foundEncoding) {
162 enc = JsonEncoding.UTF8;
163 } else {
164 switch (_bytesPerChar) {
165 case 1: enc = JsonEncoding.UTF8;
166 break;
167 case 2: enc = _bigEndian ? JsonEncoding.UTF16_BE : JsonEncoding.UTF16_LE;
168 break;
169 case 4: enc = _bigEndian ? JsonEncoding.UTF32_BE : JsonEncoding.UTF32_LE;
170 break;
171 default: throw new RuntimeException("Internal error"); // should never get here
172 }
173 }
174 _context.setEncoding(enc);
175 return enc;
176 }
177
178 /**
179 * Helper method that may be called to see if given {@link DataInput}
180 * has BOM marker, and if so, to skip it.
181 * @throws IOException
182 *
183 * @since 2.8
184 */
185 public static int skipUTF8BOM(DataInput input) throws IOException
186 {
187 int b = input.readUnsignedByte();
188 if (b != 0xEF) {
189 return b;
190 }
191 // since this is not legal byte in JSON otherwise, except
192 // that we do get BOM; if not, report error
193 b = input.readUnsignedByte();
194 if (b != 0xBB) {
195 throw new IOException("Unexpected byte 0x"+Integer.toHexString(b)
196 +" following 0xEF; should get 0xBB as part of UTF-8 BOM");
197 }
198 b = input.readUnsignedByte();
199 if (b != 0xBF) {
200 throw new IOException("Unexpected byte 0x"+Integer.toHexString(b)
201 +" following 0xEF 0xBB; should get 0xBF as part of UTF-8 BOM");
202 }
203 return input.readUnsignedByte();
204 }
205
206 /*
207 /**********************************************************
208 /* Constructing a Reader
209 /**********************************************************
210 */
211
212 @SuppressWarnings("resource")
213 public Reader constructReader() throws IOException
214 {
215 JsonEncoding enc = _context.getEncoding();
216 switch (enc.bits()) {
217 case 8: // only in non-common case where we don't want to do direct mapping
218 case 16:
219 {
220 // First: do we have a Stream? If not, need to create one:
221 InputStream in = _in;
222
223 if (in == null) {
224 in = new ByteArrayInputStream(_inputBuffer, _inputPtr, _inputEnd);
225 } else {
226 /* Also, if we have any read but unused input (usually true),
227 * need to merge that input in:
228 */
229 if (_inputPtr < _inputEnd) {
230 in = new MergedStream(_context, in, _inputBuffer, _inputPtr, _inputEnd);
231 }
232 }
233 return new InputStreamReader(in, enc.getJavaName());
234 }
235 case 32:
236 return new UTF32Reader(_context, _in, _inputBuffer, _inputPtr, _inputEnd,
237 _context.getEncoding().isBigEndian());
238 }
239 throw new RuntimeException("Internal error"); // should never get here
240 }
241
242 public JsonParser constructParser(int parserFeatures, ObjectCodec codec,
243 ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols,
244 int factoryFeatures) throws IOException
245 {
246 int prevInputPtr = _inputPtr;
247 JsonEncoding enc = detectEncoding();
248 int bytesProcessed = _inputPtr - prevInputPtr;
249
250 if (enc == JsonEncoding.UTF8) {
251 /* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader
252 * (which is ok for larger input; not so hot for smaller; but this is not a common case)
253 */
254 if (JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(factoryFeatures)) {
255 ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
256 return new UTF8StreamJsonParser(_context, parserFeatures, _in, codec, can,
257 _inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
258 }
259 }
260 return new ReaderBasedJsonParser(_context, parserFeatures, constructReader(), codec,
261 rootCharSymbols.makeChild(factoryFeatures));
262 }
263
264 /*
265 /**********************************************************
266 /* Encoding detection for data format auto-detection
267 /**********************************************************
268 */
269
270 /**
271 * Current implementation is not as thorough as other functionality
272 * ({@link com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper});
273 * supports UTF-8, for example. But it should work, for now, and can
274 * be improved as necessary.
275 */
276 public static MatchStrength hasJSONFormat(InputAccessor acc) throws IOException
277 {
278 // Ideally we should see "[" or "{"; but if not, we'll accept double-quote (String)
279 // in future could also consider accepting non-standard matches?
280
281 if (!acc.hasMoreBytes()) {
282 return MatchStrength.INCONCLUSIVE;
283 }
284 byte b = acc.nextByte();
285 // Very first thing, a UTF-8 BOM?
286 if (b == UTF8_BOM_1) { // yes, looks like UTF-8 BOM
287 if (!acc.hasMoreBytes()) {
288 return MatchStrength.INCONCLUSIVE;
289 }
290 if (acc.nextByte() != UTF8_BOM_2) {
291 return MatchStrength.NO_MATCH;
292 }
293 if (!acc.hasMoreBytes()) {
294 return MatchStrength.INCONCLUSIVE;
295 }
296 if (acc.nextByte() != UTF8_BOM_3) {
297 return MatchStrength.NO_MATCH;
298 }
299 if (!acc.hasMoreBytes()) {
300 return MatchStrength.INCONCLUSIVE;
301 }
302 b = acc.nextByte();
303 }
304 // Then possible leading space
305 int ch = skipSpace(acc, b);
306 if (ch < 0) {
307 return MatchStrength.INCONCLUSIVE;
308 }
309 // First, let's see if it looks like a structured type:
310 if (ch == '{') { // JSON object?
311 // Ideally we need to find either double-quote or closing bracket
312 ch = skipSpace(acc);
313 if (ch < 0) {
314 return MatchStrength.INCONCLUSIVE;
315 }
316 if (ch == '"' || ch == '}') {
317 return MatchStrength.SOLID_MATCH;
318 }
319 // ... should we allow non-standard? Let's not yet... can add if need be
320 return MatchStrength.NO_MATCH;
321 }
322 MatchStrength strength;
323
324 if (ch == '[') {
325 ch = skipSpace(acc);
326 if (ch < 0) {
327 return MatchStrength.INCONCLUSIVE;
328 }
329 // closing brackets is easy; but for now, let's also accept opening...
330 if (ch == ']' || ch == '[') {
331 return MatchStrength.SOLID_MATCH;
332 }
333 return MatchStrength.SOLID_MATCH;
334 } else {
335 // plain old value is not very convincing...
336 strength = MatchStrength.WEAK_MATCH;
337 }
338
339 if (ch == '"') { // string value
340 return strength;
341 }
342 if (ch <= '9' && ch >= '0') { // number
343 return strength;
344 }
345 if (ch == '-') { // negative number
346 ch = skipSpace(acc);
347 if (ch < 0) {
348 return MatchStrength.INCONCLUSIVE;
349 }
350 return (ch <= '9' && ch >= '0') ? strength : MatchStrength.NO_MATCH;
351 }
352 // or one of literals
353 if (ch == 'n') { // null
354 return tryMatch(acc, "ull", strength);
355 }
356 if (ch == 't') { // true
357 return tryMatch(acc, "rue", strength);
358 }
359 if (ch == 'f') { // false
360 return tryMatch(acc, "alse", strength);
361 }
362 return MatchStrength.NO_MATCH;
363 }
364
365 private static MatchStrength tryMatch(InputAccessor acc, String matchStr, MatchStrength fullMatchStrength)
366 throws IOException
367 {
368 for (int i = 0, len = matchStr.length(); i < len; ++i) {
369 if (!acc.hasMoreBytes()) {
370 return MatchStrength.INCONCLUSIVE;
371 }
372 if (acc.nextByte() != matchStr.charAt(i)) {
373 return MatchStrength.NO_MATCH;
374 }
375 }
376 return fullMatchStrength;
377 }
378
379 private static int skipSpace(InputAccessor acc) throws IOException
380 {
381 if (!acc.hasMoreBytes()) {
382 return -1;
383 }
384 return skipSpace(acc, acc.nextByte());
385 }
386
387 private static int skipSpace(InputAccessor acc, byte b) throws IOException
388 {
389 while (true) {
390 int ch = (int) b & 0xFF;
391 if (!(ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t')) {
392 return ch;
393 }
394 if (!acc.hasMoreBytes()) {
395 return -1;
396 }
397 b = acc.nextByte();
398 }
399 }
400
401 /*
402 /**********************************************************
403 /* Internal methods, parsing
404 /**********************************************************
405 */
406
407 /**
408 * @return True if a BOM was succesfully found, and encoding
409 * thereby recognized.
410 */
411 private boolean handleBOM(int quad) throws IOException
412 {
413 /* Handling of (usually) optional BOM (required for
414 * multi-byte formats); first 32-bit charsets:
415 */
416 switch (quad) {
417 case 0x0000FEFF:
418 _bigEndian = true;
419 _inputPtr += 4;
420 _bytesPerChar = 4;
421 return true;
422 case 0xFFFE0000: // UCS-4, LE?
423 _inputPtr += 4;
424 _bytesPerChar = 4;
425 _bigEndian = false;
426 return true;
427 case 0x0000FFFE: // UCS-4, in-order...
428 reportWeirdUCS4("2143"); // throws exception
429 break; // never gets here
430 case 0xFEFF0000: // UCS-4, in-order...
431 reportWeirdUCS4("3412"); // throws exception
432 break; // never gets here
433 default:
434 }
435 // Ok, if not, how about 16-bit encoding BOMs?
436 int msw = quad >>> 16;
437 if (msw == 0xFEFF) { // UTF-16, BE
438 _inputPtr += 2;
439 _bytesPerChar = 2;
440 _bigEndian = true;
441 return true;
442 }
443 if (msw == 0xFFFE) { // UTF-16, LE
444 _inputPtr += 2;
445 _bytesPerChar = 2;
446 _bigEndian = false;
447 return true;
448 }
449 // And if not, then UTF-8 BOM?
450 if ((quad >>> 8) == 0xEFBBBF) { // UTF-8
451 _inputPtr += 3;
452 _bytesPerChar = 1;
453 _bigEndian = true; // doesn't really matter
454 return true;
455 }
456 return false;
457 }
458
459 private boolean checkUTF32(int quad) throws IOException
460 {
461 /* Handling of (usually) optional BOM (required for
462 * multi-byte formats); first 32-bit charsets:
463 */
464 if ((quad >> 8) == 0) { // 0x000000?? -> UTF32-BE
465 _bigEndian = true;
466 } else if ((quad & 0x00FFFFFF) == 0) { // 0x??000000 -> UTF32-LE
467 _bigEndian = false;
468 } else if ((quad & ~0x00FF0000) == 0) { // 0x00??0000 -> UTF32-in-order
469 reportWeirdUCS4("3412");
470 } else if ((quad & ~0x0000FF00) == 0) { // 0x0000??00 -> UTF32-in-order
471 reportWeirdUCS4("2143");
472 } else {
473 // Can not be valid UTF-32 encoded JSON...
474 return false;
475 }
476 // Not BOM (just regular content), nothing to skip past:
477 //_inputPtr += 4;
478 _bytesPerChar = 4;
479 return true;
480 }
481
482 private boolean checkUTF16(int i16)
483 {
484 if ((i16 & 0xFF00) == 0) { // UTF-16BE
485 _bigEndian = true;
486 } else if ((i16 & 0x00FF) == 0) { // UTF-16LE
487 _bigEndian = false;
488 } else { // nope, not UTF-16
489 return false;
490 }
491 // Not BOM (just regular content), nothing to skip past:
492 //_inputPtr += 2;
493 _bytesPerChar = 2;
494 return true;
495 }
496
497 /*
498 /**********************************************************
499 /* Internal methods, problem reporting
500 /**********************************************************
501 */
502
503 private void reportWeirdUCS4(String type) throws IOException {
504 throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected");
505 }
506
507 /*
508 /**********************************************************
509 /* Internal methods, raw input access
510 /**********************************************************
511 */
512
513 protected boolean ensureLoaded(int minimum) throws IOException {
514 /* Let's assume here buffer has enough room -- this will always
515 * be true for the limited used this method gets
516 */
517 int gotten = (_inputEnd - _inputPtr);
518 while (gotten < minimum) {
519 int count;
520
521 if (_in == null) { // block source
522 count = -1;
523 } else {
524 count = _in.read(_inputBuffer, _inputEnd, _inputBuffer.length - _inputEnd);
525 }
526 if (count < 1) {
527 return false;
528 }
529 _inputEnd += count;
530 gotten += count;
531 }
532 return true;
533 }
534 }
535