1 package com.ctc.wstx.io;
2
3 import java.io.*;
4
5 import javax.xml.stream.Location;
6 import javax.xml.stream.XMLStreamException;
7
8 import com.ctc.wstx.api.ReaderConfig;
9 import com.ctc.wstx.cfg.ParsingErrorMsgs;
10 import com.ctc.wstx.cfg.XmlConsts;
11 import com.ctc.wstx.exc.*;
12
13 /**
14 * Input bootstrap class used with streams, when encoding is not known
15 * (when encoding is specified by application, a reader is constructed,
16 * and then reader-based bootstrapper is used).
17 *<p>
18 * Encoding used for an entity (including
19 * main document entity) is determined using algorithms suggested in
20 * XML 1.0#3 spec, appendix F
21 */
22 public final class StreamBootstrapper
23 extends InputBootstrapper
24 {
25 /**
26 * Let's size buffer at least big enough to contain the longest possible
27 * prefix of a document needed to positively identify it starts with
28 * the XML declaration. That means having (optional) BOM, and then first
29 * 6 characters ("<?xml "), in whatever encoding. With 4-byte encodings
30 * (UCS-4), that comes to 28 bytes. And for good measure, let's pad
31 * that a bit as well....
32 */
33 final static int MIN_BUF_SIZE = 128;
34
35 /*
36 ////////////////////////////////////////
37 // Configuration
38 ////////////////////////////////////////
39 */
40
41 /**
42 * Underlying InputStream to use for reading content. May be null
43 * if the actual data source is not stream-based but a block source.
44 */
45 final InputStream mIn;
46
47 /*
48 ///////////////////////////////////////////////////////////////
49 // Input buffering
50 ///////////////////////////////////////////////////////////////
51 */
52
53 private byte[] mByteBuffer;
54
55 /**
56 * Whether byte buffer is recyclable or not
57 */
58 private final boolean mRecycleBuffer;
59
60 private int mInputPtr;
61 private int mInputEnd;
62 /*
63 ///////////////////////////////////////////////////////////////
64 // Physical encoding properties found so far
65 ///////////////////////////////////////////////////////////////
66 */
67
68 boolean mBigEndian = true;
69
70 boolean mHadBOM = false;
71
72 boolean mByteSizeFound = false;
73
74 /**
75 * For most encodings, number of physical characters needed for
76 * decoding xml declaration characters (which for variable length
77 * encodings like UTF-8 will be 1). Exception is EBCDIC, which
78 * while a single-byte encoding, is denoted by -1 since it
79 * needs an additional translation lookup.
80 */
81 int mBytesPerChar; // minimum, ie. 1 for UTF-8
82
83 /**
84 * Special case for 1-byte encodings: EBCDIC is problematic
85 * as it's not 7-bit ascii compatible. We can deal with it,
86 * still, but only with bit of extra state.
87 */
88 boolean mEBCDIC = false;
89
90 String mInputEncoding = null;
91
92 /**
93 * For single-byte non-ascii-compatible encodings (ok ok, really
94 * just EBCDIC), we'll have to use a lookup table.
95 */
96 int[] mSingleByteTranslation = null;
97
98 /*
99 ////////////////////////////////////////
100 // Life-cycle
101 ////////////////////////////////////////
102 */
103
/**
 * Constructs a bootstrapper that reads from a stream. The byte buffer is
 * left unallocated here, and the recycling flag is turned on.
 */
private StreamBootstrapper(String pubId, SystemId sysId, InputStream in)
{
    super(pubId, sysId);
    mRecycleBuffer = true;
    mIn = in;
    mInputPtr = 0;
    mInputEnd = 0;
}
111
/**
 * Constructs a bootstrapper over a caller-supplied byte block; no stream
 * is used, and the recycling flag is turned off.
 *
 * @param data Buffer that contains the content
 * @param start Offset of the first valid byte in the buffer
 * @param end Offset <b>after</b> the last valid byte in the buffer
 */
private StreamBootstrapper(String pubId, SystemId sysId, byte[] data, int start, int end)
{
    super(pubId, sysId);
    mByteBuffer = data;
    mInputPtr = start;
    mInputEnd = end;
    mRecycleBuffer = false;
    mIn = null;
}
125
126 /*
127 ////////////////////////////////////////
128 // Public API
129 ////////////////////////////////////////
130 */
131
132 /**
133 * Factory method used when the underlying data provider is an
134 * actual stream.
135 */
136 public static StreamBootstrapper getInstance(String pubId, SystemId sysId, InputStream in)
137 {
138 return new StreamBootstrapper(pubId, sysId, in);
139 }
140
141 /**
142 * Factory method used when the underlying data provider is a pre-allocated
143 * block source, and no stream is used.
144 * Additionally the buffer passed is not owned by the bootstrapper
145 * or Reader that is created, so it is not to be recycled.
146 */
147 public static StreamBootstrapper getInstance(String pubId, SystemId sysId, byte[] data, int start, int end)
148 {
149 return new StreamBootstrapper(pubId, sysId, data, start, end);
150 }
151
152 @Override
153 public Reader bootstrapInput(ReaderConfig cfg, boolean mainDoc, int xmlVersion)
154 throws IOException, XMLStreamException
155 {
156 String normEnc = null;
157
158 // First, let's get the buffers...
159 int bufSize = cfg.getInputBufferLength();
160 if (bufSize < MIN_BUF_SIZE) {
161 bufSize = MIN_BUF_SIZE;
162 }
163 if (mByteBuffer == null) { // non-null if we were passed a buffer
164 mByteBuffer = cfg.allocFullBBuffer(bufSize);
165 }
166
167 resolveStreamEncoding();
168
169 if (hasXmlDecl()) {
170 // note: readXmlDecl will set mXml11Handling too
171 readXmlDecl(mainDoc, xmlVersion);
172 if (mFoundEncoding != null) {
173 normEnc = verifyXmlEncoding(mFoundEncoding);
174 }
175 } else {
176 /* We'll actually then just inherit whatever main doc had...
177 * (or in case there was no parent, just copy the 'unknown')
178 */
179 mXml11Handling = (XmlConsts.XML_V_11 == xmlVersion);
180 }
181
182 // Now, have we figured out the encoding?
183
184 if (normEnc == null) { // not via xml declaration
185 /* 21-Sep-2007, TSa: As with any non-UTF-8 encoding, declaration
186 * isn't optional any more. Besides, we need that information
187 * anyway to know which variant it is.
188 */
189 if (mEBCDIC) {
190 if (mFoundEncoding == null || mFoundEncoding.length() == 0) {
191 reportXmlProblem("Missing encoding declaration: underlying encoding looks like an EBCDIC variant, but no xml encoding declaration found");
192 }
193 // Hmmh. What should be the canonical name? Let's just use found encoding?
194 normEnc = mFoundEncoding;
195 } else if (mBytesPerChar == 2) { // UTF-16, BE/LE
196 normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
197 } else if (mBytesPerChar == 4) { // UCS-4... ?
198 /* 22-Mar-2005, TSa: JDK apparently has no way of dealing
199 * with these encodings... not sure if and how it should
200 * be dealt with, really. Name could be UCS-4xx... or
201 * perhaps UTF-32xx
202 */
203 normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
204 } else {
205 // Ok, default has to be UTF-8, as per XML specs
206 normEnc = CharsetNames.CS_UTF8;
207 }
208 }
209
210 mInputEncoding = normEnc;
211
212 /* And then the reader. Let's figure out if we can use our own fast
213 * implementations first:
214 */
215 BaseReader r;
216
217 // Normalized, can thus use straight equality checks now
218 if (normEnc == CharsetNames.CS_UTF8) {
219 r = new UTF8Reader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer);
220 } else if (normEnc == CharsetNames.CS_ISO_LATIN1) {
221 r = new ISOLatinReader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer);
222 } else if (normEnc == CharsetNames.CS_US_ASCII) {
223 r = new AsciiReader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer);
224 } else if (normEnc.startsWith(CharsetNames.CS_UTF32)) {
225 // let's augment with actual endianness info
226 if (normEnc == CharsetNames.CS_UTF32) {
227 mInputEncoding = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
228 }
229 r = new UTF32Reader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd,
230 mRecycleBuffer, mBigEndian);
231 } else {
232 // Nah, JDK needs to try it
233 // Ok; first, do we need to merge stuff back?
234 InputStream in = mIn;
235 if (mInputPtr < mInputEnd) {
236 in = new MergedStream(cfg, in, mByteBuffer, mInputPtr, mInputEnd);
237 }
238 /* 20-Jan-2006, TSa: Ok; although it is possible to declare
239 * stream as 'UTF-16', JDK may need help in figuring out
240 * the right order, so let's be explicit:
241 */
242 if (normEnc == CharsetNames.CS_UTF16) {
243 mInputEncoding = normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
244 }
245 try {
246 return new InputStreamReader(in, normEnc);
247 } catch (UnsupportedEncodingException usex) {
248 throw new WstxIOException("Unsupported encoding: "+usex.getMessage());
249 }
250 }
251
252 if (mXml11Handling) {
253 r.setXmlCompliancy(XmlConsts.XML_V_11);
254 }
255
256 return r;
257 }
258
259 /**
260 * Since this class only gets used when encoding is not explicitly
261 * passed, need use the encoding that was auto-detected...
262 */
263 @Override
264 public String getInputEncoding() {
265 return mInputEncoding;
266 }
267
268 @Override
269 public int getInputTotal() {
270 int total = mInputProcessed + mInputPtr;
271 if (mBytesPerChar > 1) {
272 total /= mBytesPerChar;
273 }
274 return total;
275 }
276
277 @Override
278 public int getInputColumn() {
279 int col = mInputPtr - mInputRowStart;
280 if (mBytesPerChar > 1) {
281 col /= mBytesPerChar;
282 }
283 return col;
284 }
285
286 /*
287 ////////////////////////////////////////
288 // Internal methods, parsing
289 ////////////////////////////////////////
290 */
291
292 /**
293 * Method called to try to figure out physical encoding the underlying
294 * input stream uses.
295 */
296 protected void resolveStreamEncoding()
297 throws IOException, WstxException
298 {
299 // Let's first set defaults:
300 mBytesPerChar = 0;
301 mBigEndian = true;
302
303 /* Ok; first just need 4 bytes for determining bytes-per-char from
304 * BOM or first char(s) of likely xml declaration:
305 */
306 if (ensureLoaded(4)) {
307 bomblock:
308 do { // BOM/auto-detection block
309 int quartet = (mByteBuffer[0] << 24)
310 | ((mByteBuffer[1] & 0xFF) << 16)
311 | ((mByteBuffer[2] & 0xFF) << 8)
312 | (mByteBuffer[3] & 0xFF);
313
314 /* Handling of (usually) optional BOM (required for
315 * multi-byte formats); first 32-bit charsets:
316 */
317 switch (quartet) {
318 case 0x0000FEFF:
319 mBigEndian = true;
320 mInputPtr = mBytesPerChar = 4;
321 break bomblock;
322 case 0xFFFE0000: // UCS-4, LE?
323 mInputPtr = mBytesPerChar = 4;
324 mBigEndian = false;
325 break bomblock;
326 case 0x0000FFFE: // UCS-4, in-order...
327 reportWeirdUCS4("2143");
328 break bomblock;
329 case 0x0FEFF0000: // UCS-4, in-order...
330 reportWeirdUCS4("3412");
331 break bomblock;
332 }
333
334 // Ok, if not, how about 16-bit encoding BOMs?
335 int msw = quartet >>> 16;
336 if (msw == 0xFEFF) { // UTF-16, BE
337 mInputPtr = mBytesPerChar = 2;
338 mBigEndian = true;
339 break;
340 }
341 if (msw == 0xFFFE) { // UTF-16, LE
342 mInputPtr = mBytesPerChar = 2;
343 mBigEndian = false;
344 break;
345 }
346
347 // And if not, then UTF-8 BOM?
348 if ((quartet >>> 8) == 0xEFBBBF) { // UTF-8
349 mInputPtr = 3;
350 mBytesPerChar = 1;
351 mBigEndian = true; // doesn't really matter
352 break;
353 }
354
355 /* And if that wasn't succesful, how about auto-detection
356 * for '<?xm' (or subset for multi-byte encodings) marker?
357 */
358 // Note: none of these consume bytes... so ptr remains at 0
359
360 switch (quartet) {
361 case 0x0000003c: // UCS-4, BE?
362 mBigEndian = true;
363 mBytesPerChar = 4;
364 break bomblock;
365 case 0x3c000000: // UCS-4, LE?
366 mBytesPerChar = 4;
367 mBigEndian = false;
368 break bomblock;
369 case 0x00003c00: // UCS-4, in-order...
370 reportWeirdUCS4("2143");
371 break bomblock;
372 case 0x003c0000: // UCS-4, in-order...
373 reportWeirdUCS4("3412");
374 break bomblock;
375 case 0x003c003f: // UTF-16, BE
376 mBytesPerChar = 2;
377 mBigEndian = true;
378 break bomblock;
379 case 0x3c003f00: // UTF-16, LE
380 mBytesPerChar = 2;
381 mBigEndian = false;
382 break bomblock;
383 case 0x3c3f786d: // UTF-8, Ascii, ISO-Latin
384 mBytesPerChar = 1;
385 mBigEndian = true; // doesn't really matter
386 break bomblock;
387
388 case 0x4c6fa794:
389 mBytesPerChar = -1;
390 mEBCDIC = true;
391
392 /* For xml declaration handling we can basically
393 * use any of EBCDIC variants, since declaration
394 * must not contain control or punctuation characters
395 * that would differ
396 */
397 mSingleByteTranslation = EBCDICCodec.getCp037Mapping();
398 break bomblock;
399 }
400
401 /* Otherwise it's either single-byte doc without xml
402 * declaration, or corrupt input...
403 */
404 } while (false); // BOM/auto-detection block
405
406 mHadBOM = (mInputPtr > 0);
407
408 // Let's update location markers to ignore BOM.
409 mInputProcessed = -mInputPtr;
410 mInputRowStart = mInputPtr;
411 }
412
413 /* Hmmh. If we haven't figured it out, let's just assume
414 * UTF-8 as per XML specs:
415 */
416 mByteSizeFound = (mBytesPerChar != 0);
417 if (!mByteSizeFound) {
418 mBytesPerChar = 1;
419 mBigEndian = true; // doesn't matter
420 }
421 }
422
423 /**
424 * @return Normalized encoding name
425 */
426 protected String verifyXmlEncoding(String enc)
427 throws WstxException
428 {
429 enc = CharsetNames.normalize(enc);
430
431 // Let's actually verify we got matching information:
432 if (enc == CharsetNames.CS_UTF8) {
433 verifyEncoding(enc, 1);
434 } else if (enc == CharsetNames.CS_ISO_LATIN1) {
435 verifyEncoding(enc, 1);
436 } else if (enc == CharsetNames.CS_US_ASCII) {
437 verifyEncoding(enc, 1);
438 } else if (enc == CharsetNames.CS_UTF16) {
439 // BOM is obligatory, to know the ordering
440 /* 22-Mar-2005, TSa: Actually, since we don't have a
441 * custom decoder, so the underlying JDK Reader may
442 * have dealt with it transparently... so we can not
443 * really throw an exception here.
444 */
445 //if (!mHadBOM) {
446 //reportMissingBOM(enc);
447 //}
448 verifyEncoding(enc, 2);
449 } else if (enc == CharsetNames.CS_UTF16LE) {
450 verifyEncoding(enc, 2, false);
451 } else if (enc == CharsetNames.CS_UTF16BE) {
452 verifyEncoding(enc, 2, true);
453
454 } else if (enc == CharsetNames.CS_UTF32) {
455 // Do we require a BOM here? we can live without it...
456 //if (!mHadBOM) {
457 // reportMissingBOM(enc);
458 //}
459 verifyEncoding(enc, 4);
460 } else if (enc == CharsetNames.CS_UTF32LE) {
461 verifyEncoding(enc, 4, false);
462 } else if (enc == CharsetNames.CS_UTF32BE) {
463 verifyEncoding(enc, 4, true);
464 }
465 return enc;
466 }
467
468 /*
469 /////////////////////////////////////////////////////
470 // Internal methods, loading input data
471 /////////////////////////////////////////////////////
472 */
473
474 protected boolean ensureLoaded(int minimum)
475 throws IOException
476 {
477 /* Let's assume here buffer has enough room -- this will always
478 * be true for the limited used this method gets
479 */
480 int gotten = (mInputEnd - mInputPtr);
481 while (gotten < minimum) {
482 int count = (mIn == null) ? -1 : mIn.read(mByteBuffer, mInputEnd, mByteBuffer.length - mInputEnd);
483 if (count < 1) {
484 return false;
485 }
486 mInputEnd += count;
487 gotten += count;
488 }
489 return true;
490 }
491
492 protected void loadMore()
493 throws IOException, WstxException
494 {
495 /* Need to make sure offsets are properly updated for error
496 * reporting purposes, and do this now while previous amounts
497 * are still known.
498 */
499 /* Note: at this point these are all in bytes, not chars (for multibyte
500 * encodings)
501 */
502 mInputProcessed += mInputEnd;
503 mInputRowStart -= mInputEnd;
504
505 mInputPtr = 0;
506 mInputEnd = (mIn == null) ? -1 : mIn.read(mByteBuffer, 0, mByteBuffer.length);
507 if (mInputEnd < 1) {
508 throw new WstxEOFException(ParsingErrorMsgs.SUFFIX_IN_XML_DECL,
509 getLocation());
510 }
511 }
512
513 /*
514 /////////////////////////////////////////////////////
515 // Implementations of abstract parsing methods
516 /////////////////////////////////////////////////////
517 */
518
519 @Override
520 protected void pushback() {
521 if (mBytesPerChar < 0) {
522 mInputPtr += mBytesPerChar;
523 } else {
524 mInputPtr -= mBytesPerChar;
525 }
526 }
527
528 @Override
529 protected int getNext()
530 throws IOException, WstxException
531 {
532 if (mBytesPerChar != 1) {
533 if (mBytesPerChar == -1) { // need to translate
534 return nextTranslated();
535 }
536 return nextMultiByte();
537 }
538 byte b = (mInputPtr < mInputEnd) ?
539 mByteBuffer[mInputPtr++] : nextByte();
540 return (b & 0xFF);
541 }
542
543 @Override
544 protected int getNextAfterWs(boolean reqWs)
545 throws IOException, WstxException
546 {
547 int count;
548
549 if (mBytesPerChar == 1) { // single byte
550 count = skipSbWs();
551 } else {
552 if (mBytesPerChar == -1) { // translated
553 count = skipTranslatedWs();
554 } else { // multi byte
555 count = skipMbWs();
556 }
557 }
558
559 if (reqWs && count == 0) {
560 reportUnexpectedChar(getNext(), ERR_XMLDECL_EXP_SPACE);
561 }
562
563 // inlined getNext()
564 if (mBytesPerChar != 1) {
565 if (mBytesPerChar == -1) { // translated
566 return nextTranslated();
567 }
568 return nextMultiByte();
569 }
570 byte b = (mInputPtr < mInputEnd) ?
571 mByteBuffer[mInputPtr++] : nextByte();
572 return (b & 0xFF);
573 }
574
575 /**
576 * @return First character that does not match expected, if any;
577 * CHAR_NULL if match succeeded
578 */
579 @Override
580 protected int checkKeyword(String exp)
581 throws IOException, WstxException
582 {
583 if (mBytesPerChar != 1) {
584 if (mBytesPerChar == -1) {
585 return checkTranslatedKeyword(exp);
586 }
587 return checkMbKeyword(exp);
588 }
589 return checkSbKeyword(exp);
590 }
591
592 @Override
593 protected int readQuotedValue(char[] kw, int quoteChar)
594 throws IOException, WstxException
595 {
596 int i = 0;
597 int len = kw.length;
598 boolean simple = (mBytesPerChar == 1);
599 boolean mb = !simple && (mBytesPerChar > 1);
600
601 while (i < len) {
602 int c;
603
604 if (simple) {
605 byte b = (mInputPtr < mInputEnd) ?
606 mByteBuffer[mInputPtr++] : nextByte();
607 if (b == BYTE_NULL) {
608 reportNull();
609 }
610 if (b == BYTE_CR || b == BYTE_LF) {
611 skipSbLF(b);
612 b = BYTE_LF;
613 }
614 c = (b & 0xFF);
615 } else {
616 if (mb) {
617 c = nextMultiByte();
618 if (c == CHAR_CR || c == CHAR_LF) {
619 skipMbLF(c);
620 c = CHAR_LF;
621 }
622 } else {
623 c = nextTranslated();
624 if (c == CHAR_CR || c == CHAR_LF) {
625 skipTranslatedLF(c);
626 c = CHAR_LF;
627 }
628 }
629 }
630
631 if (c == quoteChar) {
632 return (i < len) ? i : -1;
633 }
634
635 if (i < len) {
636 kw[i++] = (char) c;
637 }
638 }
639
640 /* If we end up this far, we ran out of buffer space... let's let
641 * caller figure that out, though
642 */
643 return -1;
644 }
645
646 protected boolean hasXmlDecl()
647 throws IOException, WstxException
648 {
649 /* Separate handling for common and fast case; 1/variable byte
650 * encodings that have ASCII subset:
651 */
652 if (mBytesPerChar == 1) {
653 /* However... there has to be at least 6 bytes available; and if
654 * so, can check the 'signature' easily:
655 */
656 if (ensureLoaded(6)) {
657 if (mByteBuffer[mInputPtr] == '<'
658 && mByteBuffer[mInputPtr+1] == '?'
659 && mByteBuffer[mInputPtr+2] == 'x'
660 && mByteBuffer[mInputPtr+3] == 'm'
661 && mByteBuffer[mInputPtr+4] == 'l'
662 && ((mByteBuffer[mInputPtr+5] & 0xFF) <= CHAR_SPACE)) {
663
664 // Let's skip stuff so far:
665 mInputPtr += 6;
666 return true;
667 }
668 }
669 } else if (mBytesPerChar == -1) { // translated (EBCDIC)
670 if (ensureLoaded(6)) {
671 int start = mInputPtr; // if we have to 'unread' chars
672 if (nextTranslated() == '<'
673 && nextTranslated() == '?'
674 && nextTranslated() == 'x'
675 && nextTranslated() == 'm'
676 && nextTranslated() == 'l'
677 && nextTranslated() <= CHAR_SPACE) {
678 return true;
679 }
680 mInputPtr = start; // push data back
681 }
682 } else {
683 // ... and then for slower fixed-multibyte encodings:
684
685 // Is there enough data for checks?
686 if (ensureLoaded (6 * mBytesPerChar)) {
687 int start = mInputPtr; // if we have to 'unread' chars
688 if (nextMultiByte() == '<'
689 && nextMultiByte() == '?'
690 && nextMultiByte() == 'x'
691 && nextMultiByte() == 'm'
692 && nextMultiByte() == 'l'
693 && nextMultiByte() <= CHAR_SPACE) {
694 return true;
695 }
696 mInputPtr = start; // push data back
697 }
698 }
699
700 return false;
701 }
702
703 @Override
704 protected Location getLocation()
705 {
706 /* Ok; for fixed-size multi-byte encodings, need to divide numbers
707 * to get character locations. For variable-length encodings the
708 * good thing is that xml declaration only uses shortest codepoints,
709 * ie. char count == byte count.
710 */
711 int total = mInputProcessed + mInputPtr;
712 int col = mInputPtr - mInputRowStart;
713
714 if (mBytesPerChar > 1) {
715 total /= mBytesPerChar;
716 col /= mBytesPerChar;
717 }
718
719 return new WstxInputLocation(null, mPublicId, mSystemId,
720 total - 1, // 0-based
721 mInputRow, col);
722 }
723
724 /*
725 /////////////////////////////////////////////////////
726 // Internal methods, single-byte access methods
727 /////////////////////////////////////////////////////
728 */
729
730 protected byte nextByte()
731 throws IOException, WstxException
732 {
733 if (mInputPtr >= mInputEnd) {
734 loadMore();
735 }
736 return mByteBuffer[mInputPtr++];
737 }
738
739 protected int skipSbWs()
740 throws IOException, WstxException
741 {
742 int count = 0;
743
744 while (true) {
745 byte b = (mInputPtr < mInputEnd) ?
746 mByteBuffer[mInputPtr++] : nextByte();
747
748 if ((b & 0xFF) > CHAR_SPACE) {
749 --mInputPtr;
750 break;
751 }
752 if (b == BYTE_CR || b == BYTE_LF) {
753 skipSbLF(b);
754 } else if (b == BYTE_NULL) {
755 reportNull();
756 }
757 ++count;
758 }
759 return count;
760 }
761
762 protected void skipSbLF(byte lfByte)
763 throws IOException, WstxException
764 {
765 if (lfByte == BYTE_CR) {
766 byte b = (mInputPtr < mInputEnd) ?
767 mByteBuffer[mInputPtr++] : nextByte();
768 if (b != BYTE_LF) {
769 --mInputPtr; // pushback if not 2-char/byte lf
770 }
771 }
772 ++mInputRow;
773 mInputRowStart = mInputPtr;
774 }
775
776 /**
777 * @return First character that does not match expected, if any;
778 * CHAR_NULL if match succeeded
779 */
780 protected int checkSbKeyword(String expected)
781 throws IOException, WstxException
782 {
783 int len = expected.length();
784
785 for (int ptr = 1; ptr < len; ++ptr) {
786 byte b = (mInputPtr < mInputEnd) ?
787 mByteBuffer[mInputPtr++] : nextByte();
788
789 if (b == BYTE_NULL) {
790 reportNull();
791 }
792 if ((b & 0xFF) != expected.charAt(ptr)) {
793 return (b & 0xFF);
794 }
795 }
796
797 return CHAR_NULL;
798 }
799
800 /*
801 /////////////////////////////////////////////////////
802 // Internal methods, multi-byte/translated access/checks
803 /////////////////////////////////////////////////////
804 */
805
806 protected int nextMultiByte()
807 throws IOException, WstxException
808 {
809 byte b = (mInputPtr < mInputEnd) ?
810 mByteBuffer[mInputPtr++] : nextByte();
811 byte b2 = (mInputPtr < mInputEnd) ?
812 mByteBuffer[mInputPtr++] : nextByte();
813 int c;
814
815 if (mBytesPerChar == 2) {
816 if (mBigEndian) {
817 c = ((b & 0xFF) << 8) | (b2 & 0xFF);
818 } else {
819 c = (b & 0xFF) | ((b2 & 0xFF) << 8);
820 }
821 } else {
822 // Has to be 4 bytes
823 byte b3 = (mInputPtr < mInputEnd) ?
824 mByteBuffer[mInputPtr++] : nextByte();
825 byte b4 = (mInputPtr < mInputEnd) ?
826 mByteBuffer[mInputPtr++] : nextByte();
827
828 if (mBigEndian) {
829 c = (b << 24) | ((b2 & 0xFF) << 16)
830 | ((b3 & 0xFF) << 8) | (b4 & 0xFF);
831 } else {
832 c = (b4 << 24) | ((b3 & 0xFF) << 16)
833 | ((b2 & 0xFF) << 8) | (b & 0xFF);
834 }
835 }
836
837 // Let's catch null chars early
838 if (c == 0) {
839 reportNull();
840 }
841 return c;
842 }
843
844 protected int nextTranslated()
845 throws IOException, WstxException
846 {
847 byte b = (mInputPtr < mInputEnd) ?
848 mByteBuffer[mInputPtr++] : nextByte();
849 int ch = mSingleByteTranslation[b & 0xFF];
850 if (ch < 0) { // special char... won't care for now
851 ch = -ch;
852 }
853 return ch;
854 }
855
856 protected int skipMbWs()
857 throws IOException, WstxException
858 {
859 int count = 0;
860
861 while (true) {
862 int c = nextMultiByte();
863
864 if (c > CHAR_SPACE) {
865 mInputPtr -= mBytesPerChar;
866 break;
867 }
868 if (c == CHAR_CR || c == CHAR_LF) {
869 skipMbLF(c);
870 } else if (c == CHAR_NULL) {
871 reportNull();
872 }
873 ++count;
874 }
875 return count;
876 }
877
878 protected int skipTranslatedWs()
879 throws IOException, WstxException
880 {
881 int count = 0;
882
883 while (true) {
884 int c = nextTranslated();
885
886 // Hmmh. Are we to accept NEL (0x85)?
887 if (c > CHAR_SPACE && c != CHAR_NEL) {
888 --mInputPtr;
889 break;
890 }
891 if (c == CHAR_CR || c == CHAR_LF) {
892 skipTranslatedLF(c);
893 } else if (c == CHAR_NULL) {
894 reportNull();
895 }
896 ++count;
897 }
898 return count;
899 }
900
901 protected void skipMbLF(int lf)
902 throws IOException, WstxException
903 {
904 if (lf == CHAR_CR) {
905 int c = nextMultiByte();
906 if (c != CHAR_LF) {
907 mInputPtr -= mBytesPerChar;
908 }
909 }
910 ++mInputRow;
911 mInputRowStart = mInputPtr;
912 }
913
914 protected void skipTranslatedLF(int lf)
915 throws IOException, WstxException
916 {
917 if (lf == CHAR_CR) {
918 int c = nextTranslated();
919 if (c != CHAR_LF) {
920 mInputPtr -= 1;
921 }
922 }
923 ++mInputRow;
924 mInputRowStart = mInputPtr;
925 }
926
927 /**
928 * @return First character that does not match expected, if any;
929 * CHAR_NULL if match succeeded
930 */
931 protected int checkMbKeyword(String expected)
932 throws IOException, WstxException
933 {
934 int len = expected.length();
935
936 for (int ptr = 1; ptr < len; ++ptr) {
937 int c = nextMultiByte();
938 if (c == BYTE_NULL) {
939 reportNull();
940 }
941 if (c != expected.charAt(ptr)) {
942 return c;
943 }
944 }
945
946 return CHAR_NULL;
947 }
948
949 protected int checkTranslatedKeyword(String expected)
950 throws IOException, WstxException
951 {
952 int len = expected.length();
953
954 for (int ptr = 1; ptr < len; ++ptr) {
955 int c = nextTranslated();
956 if (c == BYTE_NULL) {
957 reportNull();
958 }
959 if (c != expected.charAt(ptr)) {
960 return c;
961 }
962 }
963
964 return CHAR_NULL;
965 }
966
967 /*
968 ////////////////////////////////////////
969 // Other private methods:
970 ////////////////////////////////////////
971 */
972
973 private void verifyEncoding(String id, int bpc)
974 throws WstxException
975 {
976 if (mByteSizeFound) {
977 /* Let's verify that if we matched an encoding, it's the same
978 * as what was declared...
979 */
980 if (bpc != mBytesPerChar) {
981 // [WSTX-138]: Needs to detect EBCDIC discrepancy
982 if (mEBCDIC) {
983 reportXmlProblem("Declared encoding '"+id+"' incompatible with auto-detected physical encoding (EBCDIC variant), can not decode input since actual code page not known");
984 }
985 reportXmlProblem("Declared encoding '"+id+"' uses "+bpc
986 +" bytes per character; but physical encoding appeared to use "+mBytesPerChar+"; cannot decode");
987 }
988 }
989 }
990
991 private void verifyEncoding(String id, int bpc, boolean bigEndian)
992 throws WstxException
993 {
994 if (mByteSizeFound) {
995 verifyEncoding(id, bpc);
996
997 if (bigEndian != mBigEndian) {
998 String bigStr = bigEndian ? "big" : "little";
999 reportXmlProblem
1000 ("Declared encoding '"+id+"' has different endianness ("
1001 +bigStr+" endian) than what physical ordering appeared to be; cannot decode");
1002 }
1003 }
1004 }
1005
1006 private void reportWeirdUCS4(String type)
1007 throws IOException
1008 {
1009 throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected");
1010 }
1011
1012 /*
1013 private void reportMissingBOM(String enc)
1014 throws WstxException
1015 {
1016 throw new WstxException("Missing BOM for encoding '"+enc+"'; can not be omitted",
1017 getLocation());
1018 }
1019 */
1020 }
1021