Monitoring JavaMelody on _ip-10-0-15-189.ec2.internal

1 package com.ctc.wstx.io;

2 

3 import java.io.*;

4 

5 import javax.xml.stream.Location;

6 import javax.xml.stream.XMLStreamException;

7 

8 import com.ctc.wstx.api.ReaderConfig;

9 import com.ctc.wstx.cfg.ParsingErrorMsgs;

10 import com.ctc.wstx.cfg.XmlConsts;

11 import com.ctc.wstx.exc.*;

12 

13 /**

14  * Input bootstrap class used with streams, when encoding is not known

15  * (when encoding is specified by application, a reader is constructed,

16  * and then reader-based bootstrapper is used).

17  *<p>

18  * Encoding used for an entity (including

19  * main document entity) is determined using algorithms suggested in

20  * XML 1.0#3 spec, appendix F

21  */

22 public final class StreamBootstrapper

23     extends InputBootstrapper

24 {

25     /**

26      * Let's size buffer at least big enough to contain the longest possible

27      * prefix of a document needed to positively identify it starts with

28      * the XML declaration. That means having (optional) BOM, and then first

29      * 6 characters ("<?xml "), in whatever encoding. With 4-byte encodings

30      * (UCS-4), that comes to 28 bytes. And for good measure, let's pad

31      * that a bit as well....

32      */

33     final static int MIN_BUF_SIZE = 128;

34 

35     /*

36     ////////////////////////////////////////

37     // Configuration

38     ////////////////////////////////////////

39     */

40 

41     /**

42      * Underlying InputStream to use for reading content. May be null

43      * if the actual data source is not stream-based but a block source.

44      */

45     final InputStream mIn;

46 

47     /*

48     ///////////////////////////////////////////////////////////////

49     // Input buffering

50     ///////////////////////////////////////////////////////////////

51     */

52 

53     private byte[] mByteBuffer;

54 

55     /**

56      * Whether byte buffer is recyclable or not

57      */

58     private final boolean mRecycleBuffer;

59 

60     private int mInputPtr;

61     private int mInputEnd;

62     /*

63     ///////////////////////////////////////////////////////////////

64     // Physical encoding properties found so far

65     ///////////////////////////////////////////////////////////////

66     */

67 

68     boolean mBigEndian = true;

69 

70     boolean mHadBOM = false;

71 

72     boolean mByteSizeFound = false;

73 

74     /**

75      * For most encodings, number of physical bytes needed for

76      * decoding xml declaration characters (which for variable length

77      * encodings like UTF-8 will be 1). Exception is EBCDIC, which

78      * while a single-byte encoding, is denoted by -1 since it

79      * needs an additional translation lookup.

80      */

81     int mBytesPerChar; // minimum, ie. 1 for UTF-8

82 

83     /**

84      * Special case for 1-byte encodings: EBCDIC is problematic

85      * as it's not 7-bit ascii compatible. We can deal with it,

86      * still, but only with bit of extra state.

87      */

88     boolean mEBCDIC = false;

89 

90     String mInputEncoding = null;

91 

92     /**

93      * For single-byte non-ascii-compatible encodings (ok ok, really

94      * just EBCDIC), we'll have to use a lookup table.

95      */

96     int[] mSingleByteTranslation = null;

97 

98     /*

99     ////////////////////////////////////////

100     // Life-cycle

101     ////////////////////////////////////////

102     */

103 

/**
 * Creates a bootstrapper that reads from the given stream. No byte
 * buffer is assigned here; input pointers start out at zero and the
 * buffer is flagged as recyclable.
 */
private StreamBootstrapper(String pubId, SystemId sysId, InputStream in)
{
    super(pubId, sysId);
    mRecycleBuffer = true;
    mIn = in;
    mInputEnd = 0;
    mInputPtr = 0;
}

111 

112     /**

113      * @param start Pointer to the first valid byte in the buffer

114      * @param end Pointer to the offset <b>after</b> last valid byte in the buffer

115      */

private StreamBootstrapper(String pubId, SystemId sysId, byte[] data, int start, int end)
{
    super(pubId, sysId);
    // Block source: there is no stream to read more data from
    mIn = null;
    // Work directly on the caller's array, within [start, end)
    mByteBuffer = data;
    mInputPtr = start;
    mInputEnd = end;
    // Buffer was handed in from outside, so it is flagged as not recyclable
    mRecycleBuffer = false;
}

125 

126     /*

127     ////////////////////////////////////////

128     // Public API

129     ////////////////////////////////////////

130     */

131 

132     /**

133      * Factory method used when the underlying data provider is an 

134      * actual stream.

135      */

136     public static StreamBootstrapper getInstance(String pubId, SystemId sysId, InputStream in)

137     {

138         return new StreamBootstrapper(pubId, sysId, in);

139     }

140 

141     /**

142      * Factory method used when the underlying data provider is a pre-allocated

143      * block source, and no stream is used.

144      * Additionally the buffer passed is not owned by the bootstrapper

145      * or Reader that is created, so it is not to be recycled.

146      */

147     public static StreamBootstrapper getInstance(String pubId, SystemId sysId, byte[] data, int start, int end)

148     {

149         return new StreamBootstrapper(pubId, sysId, data, start, end);

150     }

151 

152     @Override

153     public Reader bootstrapInput(ReaderConfig cfg, boolean mainDoc, int xmlVersion)

154         throws IOException, XMLStreamException

155     {

156         String normEnc = null;

157 

158         // First, let's get the buffers...

159         int bufSize = cfg.getInputBufferLength();

160         if (bufSize < MIN_BUF_SIZE) {

161             bufSize = MIN_BUF_SIZE;

162         }

163     if (mByteBuffer == null) { // non-null if we were passed a buffer

164         mByteBuffer = cfg.allocFullBBuffer(bufSize);

165     }

166 

167         resolveStreamEncoding();

168 

169         if (hasXmlDecl()) {

170             // note: readXmlDecl will set mXml11Handling too

171             readXmlDecl(mainDoc, xmlVersion);

172             if (mFoundEncoding != null) {

173                 normEnc = verifyXmlEncoding(mFoundEncoding);

174             }

175         } else {

176             /* We'll actually then just inherit whatever main doc had...

177              * (or in case there was no parent, just copy the 'unknown')

178              */

179             mXml11Handling = (XmlConsts.XML_V_11 == xmlVersion);

180         }

181 

182         // Now, have we figured out the encoding?

183 

184         if (normEnc == null) { // not via xml declaration

185             /* 21-Sep-2007, TSa: As with any non-UTF-8 encoding, declaration

186              * isn't optional any more. Besides, we need that information

187              * anyway to know which variant it is.

188              */

189             if (mEBCDIC) {

190                 if (mFoundEncoding == null || mFoundEncoding.length() == 0) {

191                     reportXmlProblem("Missing encoding declaration: underlying encoding looks like an EBCDIC variant, but no xml encoding declaration found");

192                 }

193                 // Hmmh. What should be the canonical name? Let's just use found encoding?

194                 normEnc = mFoundEncoding;

195             } else if (mBytesPerChar == 2) { // UTF-16, BE/LE

196                 normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;

197             } else if (mBytesPerChar == 4) { // UCS-4... ?

198                 /* 22-Mar-2005, TSa: JDK apparently has no way of dealing

199                  *   with these encodings... not sure if and how it should

200                  *   be dealt with, really. Name could be UCS-4xx... or

201                  *   perhaps UTF-32xx

202                  */

203                 normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;

204             } else {

205                 // Ok, default has to be UTF-8, as per XML specs

206                 normEnc = CharsetNames.CS_UTF8;

207             }

208         }

209 

210         mInputEncoding = normEnc;

211 

212         /* And then the reader. Let's figure out if we can use our own fast

213          * implementations first:

214          */

215         BaseReader r;

216 

217         // Normalized, can thus use straight equality checks now

218         if (normEnc == CharsetNames.CS_UTF8) {

219             r = new UTF8Reader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer);

220         } else if (normEnc == CharsetNames.CS_ISO_LATIN1) {

221             r = new ISOLatinReader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer);

222         } else if (normEnc == CharsetNames.CS_US_ASCII) {

223             r = new AsciiReader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer);

224         } else if (normEnc.startsWith(CharsetNames.CS_UTF32)) {

225             // let's augment with actual endianness info

226             if (normEnc == CharsetNames.CS_UTF32) {

227                 mInputEncoding = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;

228             }

229             r = new UTF32Reader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd,

230                 mRecycleBuffer, mBigEndian);

231         } else {

232             // Nah, JDK needs to try it

233             // Ok; first, do we need to merge stuff back?

234             InputStream in = mIn;

235             if (mInputPtr < mInputEnd) {

236                 in = new MergedStream(cfg, in, mByteBuffer, mInputPtr, mInputEnd);

237             }

238             /* 20-Jan-2006, TSa: Ok; although it is possible to declare

239              *   stream as 'UTF-16', JDK may need help in figuring out

240              *   the right order, so let's be explicit:

241              */

242             if (normEnc == CharsetNames.CS_UTF16) {

243                 mInputEncoding = normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;

244             }

245             try {

246                 return new InputStreamReader(in, normEnc);

247             } catch (UnsupportedEncodingException usex) {

248                 throw new WstxIOException("Unsupported encoding: "+usex.getMessage());

249             }

250         }

251 

252         if (mXml11Handling) {

253             r.setXmlCompliancy(XmlConsts.XML_V_11);

254         }

255 

256         return r;

257     }

258     

259     /**

260      * Since this class only gets used when encoding is not explicitly

261      * passed, need use the encoding that was auto-detected...

262      */

263     @Override

264     public String getInputEncoding() {

265         return mInputEncoding;

266     }

267 

268     @Override

269     public int getInputTotal() {

270         int total = mInputProcessed + mInputPtr;

271         if (mBytesPerChar > 1) {

272             total /= mBytesPerChar;

273         }

274         return total;

275     }

276 

277     @Override

278     public int getInputColumn() {

279         int col = mInputPtr - mInputRowStart;

280         if (mBytesPerChar > 1) {

281             col /= mBytesPerChar;

282         }

283         return col;

284     }

285 

286     /*

287     ////////////////////////////////////////

288     // Internal methods, parsing

289     ////////////////////////////////////////

290     */

291 

292     /**

293      * Method called to try to figure out physical encoding the underlying

294      * input stream uses.

295      */

296     protected void resolveStreamEncoding()

297         throws IOException, WstxException

298     {

299         // Let's first set defaults:

300         mBytesPerChar = 0;

301         mBigEndian = true;

302 

303         /* Ok; first just need 4 bytes for determining bytes-per-char from

304          * BOM or first char(s) of likely xml declaration:

305          */

306         if (ensureLoaded(4)) {

307             bomblock:

308             do { // BOM/auto-detection block

309                 int quartet = (mByteBuffer[0] << 24)

310                     | ((mByteBuffer[1] & 0xFF) << 16)

311                     | ((mByteBuffer[2] & 0xFF) << 8)

312                     | (mByteBuffer[3] & 0xFF);

313 

314                 /* Handling of (usually) optional BOM (required for

315                  * multi-byte formats); first 32-bit charsets:

316                  */

317                 switch (quartet) {

318                 case 0x0000FEFF:

319                     mBigEndian = true;

320                     mInputPtr = mBytesPerChar = 4;

321                     break bomblock;

322                 case 0xFFFE0000: // UCS-4, LE?

323                     mInputPtr = mBytesPerChar = 4;

324                     mBigEndian = false;

325                     break bomblock;

326                 case 0x0000FFFE: // UCS-4, in-order...

327                     reportWeirdUCS4("2143");

328                     break bomblock;

329                 case 0x0FEFF0000: // UCS-4, in-order...

330                     reportWeirdUCS4("3412");

331                     break bomblock;

332                 }

333 

334                 // Ok, if not, how about 16-bit encoding BOMs?

335                 int msw = quartet >>> 16;

336                 if (msw == 0xFEFF) { // UTF-16, BE

337                     mInputPtr = mBytesPerChar = 2;

338                     mBigEndian = true;

339                     break;

340                 }

341                 if (msw == 0xFFFE) { // UTF-16, LE

342                     mInputPtr = mBytesPerChar = 2;

343                     mBigEndian = false;

344                     break;

345                 }

346 

347                 // And if not, then UTF-8 BOM?

348                 if ((quartet >>> 8) == 0xEFBBBF) { // UTF-8

349                     mInputPtr = 3;

350                     mBytesPerChar = 1;

351                     mBigEndian = true; // doesn't really matter

352                     break;

353                 }

354 

355                 /* And if that wasn't succesful, how about auto-detection

356                  * for '<?xm' (or subset for multi-byte encodings) marker?

357                  */

358                 // Note: none of these consume bytes... so ptr remains at 0

359 

360                 switch (quartet) {

361                 case 0x0000003c: // UCS-4, BE?

362                     mBigEndian = true;

363                     mBytesPerChar = 4;

364                     break bomblock;

365                 case 0x3c000000: // UCS-4, LE?

366                     mBytesPerChar = 4;

367                     mBigEndian = false;

368                     break bomblock;

369                 case 0x00003c00: // UCS-4, in-order...

370                     reportWeirdUCS4("2143");

371                     break bomblock;

372                 case 0x003c0000: // UCS-4, in-order...

373                     reportWeirdUCS4("3412");

374                     break bomblock;

375                 case 0x003c003f: // UTF-16, BE

376                     mBytesPerChar = 2;

377                     mBigEndian = true;

378                     break bomblock;

379                 case 0x3c003f00: // UTF-16, LE

380                     mBytesPerChar = 2;

381                     mBigEndian = false;

382                     break bomblock;

383                 case 0x3c3f786d: // UTF-8, Ascii, ISO-Latin

384                     mBytesPerChar = 1;

385                     mBigEndian = true; // doesn't really matter

386                     break bomblock;

387 

388                 case 0x4c6fa794:

389                     mBytesPerChar = -1;

390                     mEBCDIC = true;

391 

392                     /* For xml declaration handling we can basically

393                      * use any of EBCDIC variants, since declaration

394                      * must not contain control or punctuation characters

395                      * that would differ

396                      */

397                     mSingleByteTranslation = EBCDICCodec.getCp037Mapping();

398                     break bomblock;

399                 }

400                 

401                 /* Otherwise it's either single-byte doc without xml

402                  * declaration, or corrupt input...

403                  */

404             } while (false); // BOM/auto-detection block

405             

406             mHadBOM = (mInputPtr > 0);

407 

408             // Let's update location markers to ignore BOM.

409             mInputProcessed = -mInputPtr;

410             mInputRowStart = mInputPtr;

411         }

412 

413         /* Hmmh. If we haven't figured it out, let's just assume

414          * UTF-8 as per XML specs:

415          */

416         mByteSizeFound = (mBytesPerChar != 0);

417         if (!mByteSizeFound) {

418             mBytesPerChar = 1;

419             mBigEndian = true; // doesn't matter

420         }

421     }

422 

423     /**

424      * @return Normalized encoding name

425      */

426     protected String verifyXmlEncoding(String enc)

427         throws WstxException

428     {

429         enc = CharsetNames.normalize(enc);

430 

431         // Let's actually verify we got matching information:

432         if (enc == CharsetNames.CS_UTF8) {

433             verifyEncoding(enc, 1);

434         } else if (enc == CharsetNames.CS_ISO_LATIN1) {

435             verifyEncoding(enc, 1);

436         } else if (enc == CharsetNames.CS_US_ASCII) {

437             verifyEncoding(enc, 1);

438         } else if (enc == CharsetNames.CS_UTF16) {

439             // BOM is obligatory, to know the ordering

440             /* 22-Mar-2005, TSa: Actually, since we don't have a

441              *   custom decoder, so the underlying JDK Reader may

442              *   have dealt with it transparently... so we can not

443              *   really throw an exception here.

444              */

445             //if (!mHadBOM) {

446             //reportMissingBOM(enc);

447             //}

448             verifyEncoding(enc, 2);

449         } else if (enc == CharsetNames.CS_UTF16LE) {

450             verifyEncoding(enc, 2, false);

451         } else if (enc == CharsetNames.CS_UTF16BE) {

452             verifyEncoding(enc, 2, true);

453 

454         } else if (enc == CharsetNames.CS_UTF32) {

455             // Do we require a BOM here? we can live without it...

456             //if (!mHadBOM) {

457             //    reportMissingBOM(enc);

458             //}

459             verifyEncoding(enc, 4);

460         } else if (enc == CharsetNames.CS_UTF32LE) {

461             verifyEncoding(enc, 4, false);

462         } else if (enc == CharsetNames.CS_UTF32BE) {

463             verifyEncoding(enc, 4, true);

464         }

465         return enc;

466     }

467 

468     /*

469     /////////////////////////////////////////////////////

470     // Internal methods, loading input data

471     /////////////////////////////////////////////////////

472     */

473 

474     protected boolean ensureLoaded(int minimum)

475         throws IOException

476     {

477         /* Let's assume here buffer has enough room -- this will always

478          * be true for the limited used this method gets

479          */

480         int gotten = (mInputEnd - mInputPtr);

481         while (gotten < minimum) {

482             int count = (mIn == null) ? -1 : mIn.read(mByteBuffer, mInputEnd, mByteBuffer.length - mInputEnd);

483             if (count < 1) {

484                 return false;

485             }

486             mInputEnd += count;

487             gotten += count;

488         }

489         return true;

490     }

491 

492     protected void loadMore()

493         throws IOException, WstxException

494     {

495         /* Need to make sure offsets are properly updated for error

496          * reporting purposes, and do this now while previous amounts

497          * are still known.

498          */

499         /* Note: at this point these are all in bytes, not chars (for multibyte

500          * encodings)

501          */

502         mInputProcessed += mInputEnd;

503         mInputRowStart -= mInputEnd;

504 

505         mInputPtr = 0;

506         mInputEnd = (mIn == null) ? -1 : mIn.read(mByteBuffer, 0, mByteBuffer.length);

507         if (mInputEnd < 1) {

508             throw new WstxEOFException(ParsingErrorMsgs.SUFFIX_IN_XML_DECL,

509                                        getLocation());

510         }

511     }

512 

513     /*

514     /////////////////////////////////////////////////////

515     // Implementations of abstract parsing methods

516     /////////////////////////////////////////////////////

517     */

518 

519     @Override

520     protected void pushback() {

521         if (mBytesPerChar < 0) {

522             mInputPtr += mBytesPerChar;

523         } else {

524             mInputPtr -= mBytesPerChar;

525         }

526     }

527 

528     @Override

529     protected int getNext()

530         throws IOException, WstxException

531     {

532         if (mBytesPerChar != 1) {

533             if (mBytesPerChar == -1) { // need to translate

534                 return nextTranslated();

535             }

536             return nextMultiByte();

537         }

538         byte b = (mInputPtr < mInputEnd) ?

539             mByteBuffer[mInputPtr++] : nextByte();

540         return (b & 0xFF);

541     }

542 

543     @Override

544     protected int getNextAfterWs(boolean reqWs)

545         throws IOException, WstxException

546     {

547         int count;

548 

549         if (mBytesPerChar == 1) { // single byte

550             count = skipSbWs();

551         } else {

552             if (mBytesPerChar == -1) { // translated

553                 count = skipTranslatedWs();

554             } else { // multi byte

555                 count = skipMbWs();

556             }

557         }

558 

559         if (reqWs && count == 0) {

560             reportUnexpectedChar(getNext(), ERR_XMLDECL_EXP_SPACE);

561         }

562 

563         // inlined getNext()

564         if (mBytesPerChar != 1) {

565             if (mBytesPerChar == -1) { // translated

566                 return nextTranslated();

567             }

568             return nextMultiByte();

569         }

570         byte b = (mInputPtr < mInputEnd) ?

571             mByteBuffer[mInputPtr++] : nextByte();

572         return (b & 0xFF);

573     }

574 

575     /**

576      * @return First character that does not match expected, if any;

577      *    CHAR_NULL if match succeeded

578      */

579     @Override

580     protected int checkKeyword(String exp)

581         throws IOException, WstxException

582     {

583         if (mBytesPerChar != 1) {

584             if (mBytesPerChar == -1) {

585                 return checkTranslatedKeyword(exp);

586             }

587             return checkMbKeyword(exp);

588         }

589         return checkSbKeyword(exp);

590     }

591 

592     @Override

593     protected int readQuotedValue(char[] kw, int quoteChar)

594         throws IOException, WstxException

595     {

596         int i = 0;

597         int len = kw.length;

598         boolean simple = (mBytesPerChar == 1);

599         boolean mb = !simple && (mBytesPerChar > 1);

600 

601         while (i < len) {

602             int c;

603 

604             if (simple) {

605                 byte b = (mInputPtr < mInputEnd) ?

606                     mByteBuffer[mInputPtr++] : nextByte();

607                 if (b == BYTE_NULL) {

608                     reportNull();

609                 }

610                 if (b == BYTE_CR || b == BYTE_LF) {

611                     skipSbLF(b);

612                     b = BYTE_LF;

613                 }

614                 c = (b & 0xFF);

615             } else {

616                 if (mb) {

617                     c = nextMultiByte();

618                     if (c ==  CHAR_CR || c == CHAR_LF) {

619                         skipMbLF(c);

620                         c = CHAR_LF;

621                     }

622                 } else {

623                     c = nextTranslated();

624                     if (c ==  CHAR_CR || c == CHAR_LF) {

625                         skipTranslatedLF(c);

626                         c = CHAR_LF;

627                     }

628                 }

629             }

630 

631             if (c == quoteChar) {

632                 return (i < len) ? i : -1;

633             }

634 

635             if (i < len) {

636                 kw[i++] = (char) c;

637             }

638         }

639         

640         /* If we end up this far, we ran out of buffer space... let's let

641          * caller figure that out, though

642          */

643         return -1;

644     }

645 

646     protected boolean hasXmlDecl()

647         throws IOException, WstxException

648     {

649         /* Separate handling for common and fast case; 1/variable byte

650          * encodings that have ASCII subset:

651          */

652         if (mBytesPerChar == 1) {

653             /* However... there has to be at least 6 bytes available; and if

654              * so, can check the 'signature' easily:

655              */

656             if (ensureLoaded(6)) {

657                 if (mByteBuffer[mInputPtr] == '<'

658                     && mByteBuffer[mInputPtr+1] == '?'

659                     && mByteBuffer[mInputPtr+2] == 'x'

660                     && mByteBuffer[mInputPtr+3] == 'm'

661                     && mByteBuffer[mInputPtr+4] == 'l'

662                     && ((mByteBuffer[mInputPtr+5] & 0xFF) <= CHAR_SPACE)) {

663 

664                     // Let's skip stuff so far:

665                     mInputPtr += 6;

666                     return true;

667                 }

668             }

669         } else if (mBytesPerChar == -1) { // translated (EBCDIC)

670             if (ensureLoaded(6)) {

671                 int start = mInputPtr; // if we have to 'unread' chars

672                 if (nextTranslated() == '<'

673                     && nextTranslated() == '?'

674                     && nextTranslated() == 'x'

675                     && nextTranslated() == 'm'

676                     && nextTranslated() == 'l'

677                     && nextTranslated() <= CHAR_SPACE) {

678                     return true;

679                 }

680                 mInputPtr = start; // push data back

681             }

682         } else {

683             // ... and then for slower fixed-multibyte encodings:

684 

685             // Is there enough data for checks?

686             if (ensureLoaded (6 * mBytesPerChar)) {

687                 int start = mInputPtr; // if we have to 'unread' chars

688                 if (nextMultiByte() == '<'

689                     && nextMultiByte() == '?'

690                     && nextMultiByte() == 'x'

691                     && nextMultiByte() == 'm'

692                     && nextMultiByte() == 'l'

693                     && nextMultiByte() <= CHAR_SPACE) {

694                     return true;

695                 }

696                 mInputPtr = start; // push data back

697             }

698         }

699 

700         return false;

701     }

702 

703     @Override

704     protected Location getLocation()

705     {

706         /* Ok; for fixed-size multi-byte encodings, need to divide numbers

707          * to get character locations. For variable-length encodings the

708          * good thing is that xml declaration only uses shortest codepoints,

709          * ie. char count == byte count.

710          */

711         int total = mInputProcessed + mInputPtr;

712         int col = mInputPtr - mInputRowStart;

713 

714         if (mBytesPerChar > 1) {

715             total /= mBytesPerChar;

716             col /= mBytesPerChar;

717         }

718 

719         return new WstxInputLocation(null, mPublicId, mSystemId,

720                                      total - 1, // 0-based

721                                      mInputRow, col);

722     }

723 

724     /*

725     /////////////////////////////////////////////////////

726     // Internal methods, single-byte access methods

727     /////////////////////////////////////////////////////

728     */

729 

730     protected byte nextByte()

731         throws IOException, WstxException

732     {

733         if (mInputPtr >= mInputEnd) {

734             loadMore();

735         }

736         return mByteBuffer[mInputPtr++];

737     }

738 

739     protected int skipSbWs()

740         throws IOException, WstxException

741     {

742         int count = 0;

743 

744         while (true) {

745             byte b = (mInputPtr < mInputEnd) ?

746                 mByteBuffer[mInputPtr++] : nextByte();

747 

748             if ((b & 0xFF) > CHAR_SPACE) {

749                 --mInputPtr;

750                 break;

751             }

752             if (b == BYTE_CR || b == BYTE_LF) {

753                 skipSbLF(b);

754             } else if (b == BYTE_NULL) {

755                 reportNull();

756             }

757             ++count;

758         }

759         return count;

760     }

761 

762     protected void skipSbLF(byte lfByte)

763         throws IOException, WstxException

764     {

765         if (lfByte == BYTE_CR) {

766             byte b = (mInputPtr < mInputEnd) ?

767                 mByteBuffer[mInputPtr++] : nextByte();

768             if (b != BYTE_LF) {

769                 --mInputPtr; // pushback if not 2-char/byte lf

770             }

771         }

772         ++mInputRow;

773         mInputRowStart = mInputPtr;

774     }

775 

776     /**

777      * @return First character that does not match expected, if any;

778      *    CHAR_NULL if match succeeded

779      */

780     protected int checkSbKeyword(String expected)

781         throws IOException, WstxException

782     {

783         int len = expected.length();

784         

785         for (int ptr = 1; ptr < len; ++ptr) {

786             byte b = (mInputPtr < mInputEnd) ?

787                 mByteBuffer[mInputPtr++] : nextByte();

788             

789             if (b == BYTE_NULL) {

790                 reportNull();

791             }

792             if ((b & 0xFF) != expected.charAt(ptr)) {

793                 return (b & 0xFF);

794             }

795         }

796 

797         return CHAR_NULL;

798     }

799 

800     /*

801     /////////////////////////////////////////////////////

802     // Internal methods, multi-byte/translated access/checks

803     /////////////////////////////////////////////////////

804     */

805 

806     protected int nextMultiByte()

807         throws IOException, WstxException

808     {

809         byte b = (mInputPtr < mInputEnd) ?

810             mByteBuffer[mInputPtr++] : nextByte();

811         byte b2 = (mInputPtr < mInputEnd) ?

812             mByteBuffer[mInputPtr++] : nextByte();

813         int c;

814 

815         if (mBytesPerChar == 2) {

816             if (mBigEndian) {

817                 c = ((b & 0xFF) << 8) | (b2 & 0xFF);

818             } else {

819                 c = (b & 0xFF) | ((b2 & 0xFF) << 8);

820             }

821         } else {

822             // Has to be 4 bytes

823             byte b3 = (mInputPtr < mInputEnd) ?

824                 mByteBuffer[mInputPtr++] : nextByte();

825             byte b4 = (mInputPtr < mInputEnd) ?

826                 mByteBuffer[mInputPtr++] : nextByte();

827             

828             if (mBigEndian) {

829                 c = (b  << 24) | ((b2 & 0xFF) << 16)

830                     | ((b3 & 0xFF) << 8) | (b4 & 0xFF);

831             } else {

832                 c = (b4  << 24) | ((b3 & 0xFF) << 16)

833                     | ((b2 & 0xFF) << 8) | (b & 0xFF);

834             }

835         }

836 

837         // Let's catch null chars early

838         if (c == 0) {

839             reportNull();

840         }

841         return c;

842     }

843 

844     protected int nextTranslated()

845         throws IOException, WstxException

846     {

847         byte b = (mInputPtr < mInputEnd) ?

848             mByteBuffer[mInputPtr++] : nextByte();

849         int ch = mSingleByteTranslation[b & 0xFF];

850         if (ch < 0) { // special char... won't care for now

851             ch = -ch;

852         }

853         return ch;

854     }

855 

856     protected int skipMbWs()

857         throws IOException, WstxException

858     {

859         int count = 0;

860 

861         while (true) {

862             int c = nextMultiByte();

863 

864             if (c > CHAR_SPACE) {

865                 mInputPtr -= mBytesPerChar;

866                 break;

867             }

868             if (c == CHAR_CR || c == CHAR_LF) {

869                 skipMbLF(c);

870             } else if (c == CHAR_NULL) {

871                 reportNull();

872             }

873             ++count;

874         }

875         return count;

876     }

877 

878     protected int skipTranslatedWs()

879         throws IOException, WstxException

880     {

881         int count = 0;

882 

883         while (true) {

884             int c = nextTranslated();

885 

886             // Hmmh. Are we to accept NEL (0x85)?

887             if (c > CHAR_SPACE && c != CHAR_NEL) {

888                 --mInputPtr;

889                 break;

890             }

891             if (c == CHAR_CR || c == CHAR_LF) {

892                 skipTranslatedLF(c);

893             } else if (c == CHAR_NULL) {

894                 reportNull();

895             }

896             ++count;

897         }

898         return count;

899     }

900 

901     protected void skipMbLF(int lf)

902         throws IOException, WstxException

903     {

904         if (lf == CHAR_CR) {

905             int c = nextMultiByte();

906             if (c != CHAR_LF) {

907                 mInputPtr -= mBytesPerChar;

908             }

909         }

910         ++mInputRow;

911         mInputRowStart = mInputPtr;

912     }

913 

914     protected void skipTranslatedLF(int lf)

915         throws IOException, WstxException

916     {

917         if (lf == CHAR_CR) {

918             int c = nextTranslated();

919             if (c != CHAR_LF) {

920                 mInputPtr -= 1;

921             }

922         }

923         ++mInputRow;

924         mInputRowStart = mInputPtr;

925     }

926 

927     /**

928      * @return First character that does not match expected, if any;

929      *    CHAR_NULL if match succeeded

930      */

931     protected int checkMbKeyword(String expected)

932         throws IOException, WstxException

933     {

934         int len = expected.length();

935         

936         for (int ptr = 1; ptr < len; ++ptr) {

937             int c = nextMultiByte();

938             if (c == BYTE_NULL) {

939                 reportNull();

940             }

941             if (c != expected.charAt(ptr)) {

942               return c;

943             }

944         }

945 

946         return CHAR_NULL;

947     }

948 

949     protected int checkTranslatedKeyword(String expected)

950         throws IOException, WstxException

951     {

952         int len = expected.length();

953         

954         for (int ptr = 1; ptr < len; ++ptr) {

955             int c = nextTranslated();

956             if (c == BYTE_NULL) {

957                 reportNull();

958             }

959             if (c != expected.charAt(ptr)) {

960               return c;

961             }

962         }

963 

964         return CHAR_NULL;

965     }

966 

967     /*

968     ////////////////////////////////////////

969     // Other private methods:

970     ////////////////////////////////////////

971     */

972 

973     private void verifyEncoding(String id, int bpc)

974         throws WstxException

975     {

976         if (mByteSizeFound) {

977             /* Let's verify that if we matched an encoding, it's the same

978              * as what was declared...

979              */

980             if (bpc != mBytesPerChar) {

981                 // [WSTX-138]: Needs to detect EBCDIC discrepancy

982                 if (mEBCDIC) {

983                     reportXmlProblem("Declared encoding '"+id+"' incompatible with auto-detected physical encoding (EBCDIC variant), can not decode input since actual code page not known");

984                 }

985                 reportXmlProblem("Declared encoding '"+id+"' uses "+bpc

986                                  +" bytes per character; but physical encoding appeared to use "+mBytesPerChar+"; cannot decode");

987             }

988         }

989     }

990 

991     private void verifyEncoding(String id, int bpc, boolean bigEndian)

992         throws WstxException

993     {

994         if (mByteSizeFound) {

995             verifyEncoding(id, bpc);

996 

997             if (bigEndian != mBigEndian) {

998                 String bigStr = bigEndian ? "big" : "little";

999                 reportXmlProblem

1000                     ("Declared encoding '"+id+"' has different endianness ("

1001                      +bigStr+" endian) than what physical ordering appeared to be; cannot decode");

1002             }

1003         }

1004     }

1005 

1006     private void reportWeirdUCS4(String type)

1007         throws IOException

1008     {

1009         throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected");

1010     }

1011 

1012     /*

1013     private void reportMissingBOM(String enc)

1014         throws WstxException

1015     {

1016         throw new WstxException("Missing BOM for encoding '"+enc+"'; can not be omitted",

1017                                 getLocation());

1018     }

1019     */

1020 }

1021