1 /* Woodstox XML processor
2 *
3 * Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi
4 *
5 * Licensed under the License specified in file LICENSE, included with
6 * the source code.
7 * You may not use this file except in compliance with the License.
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package com.ctc.wstx.sr;
17
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.net.URL;
21 import java.text.MessageFormat;
22 import java.util.Collections;
23 import java.util.HashMap;
24 import java.util.Map;
25
26 import javax.xml.stream.Location;
27 import javax.xml.stream.XMLInputFactory;
28 import javax.xml.stream.XMLReporter;
29 import javax.xml.stream.XMLResolver;
30 import javax.xml.stream.XMLStreamException;
31
32 import org.codehaus.stax2.XMLReporter2;
33 import org.codehaus.stax2.XMLStreamLocation2;
34 import org.codehaus.stax2.validation.XMLValidationProblem;
35
36 import com.ctc.wstx.api.ReaderConfig;
37 import com.ctc.wstx.cfg.ErrorConsts;
38 import com.ctc.wstx.cfg.InputConfigFlags;
39 import com.ctc.wstx.cfg.ParsingErrorMsgs;
40 import com.ctc.wstx.cfg.XmlConsts;
41 import com.ctc.wstx.dtd.MinimalDTDReader;
42 import com.ctc.wstx.ent.EntityDecl;
43 import com.ctc.wstx.ent.IntEntity;
44 import com.ctc.wstx.exc.*;
45 import com.ctc.wstx.io.DefaultInputResolver;
46 import com.ctc.wstx.io.WstxInputData;
47 import com.ctc.wstx.io.WstxInputLocation;
48 import com.ctc.wstx.io.WstxInputSource;
49 import com.ctc.wstx.util.ExceptionUtil;
50 import com.ctc.wstx.util.SymbolTable;
51 import com.ctc.wstx.util.TextBuffer;
52
53 /**
54 * Abstract base class that defines some basic functionality that all
55 * Woodstox reader classes (main XML reader, DTD reader) extend from.
56 */
57 public abstract class StreamScanner
58 extends WstxInputData
59 implements InputProblemReporter,
60 InputConfigFlags, ParsingErrorMsgs
61 {
62
63 // // // Some well-known chars:
64
65 /**
66 * Last (highest) char code of the three, LF, CR and NULL
67 */
68 public final static char CHAR_CR_LF_OR_NULL = (char) 13;
69
70 public final static int INT_CR_LF_OR_NULL = 13;
71
72 /**
73 * Character that allows quick check of whether a char can potentially
74 * be some kind of markup, WRT input stream processing;
75 * has to contain linefeeds, {@code &}, {@code <} and {@code >} (note: {@code >} only matters when
76 * quoting text, as part of {@code ]]>})
77 */
78 protected final static char CHAR_FIRST_PURE_TEXT = (char) ('>' + 1);
79
80
81 /**
82 * First character in Unicode (ie one with lowest id) that is legal
83 * as part of a local name (all valid name chars minus ':'). Used
84 * for doing quick check for local name end; usually name ends in
85 * a whitespace or equals sign.
86 */
87 protected final static char CHAR_LOWEST_LEGAL_LOCALNAME_CHAR = '-';
88
89 /*
90 ///////////////////////////////////////////////////////////////////////
91 // Character validity constants, structs
92 ///////////////////////////////////////////////////////////////////////
93 */
94
95 /**
96 * We will only use validity array for first 256 characters, mostly
97 * because after those characters it's easier to do fairly simple
98 * block checks.
99 */
100 private final static int VALID_CHAR_COUNT = 0x100;
101
102 private final static byte NAME_CHAR_INVALID_B = (byte) 0;
103 private final static byte NAME_CHAR_ALL_VALID_B = (byte) 1;
104 private final static byte NAME_CHAR_VALID_NONFIRST_B = (byte) -1;
105
106 private final static byte[] sCharValidity = new byte[VALID_CHAR_COUNT];
107
// Populates the name-character validity table for char codes 0x00 - 0xFF.
static {
    // All valid-as-first chars are also valid-as-other chars, so start
    // with the set that is valid in any position:
    sCharValidity['_'] = NAME_CHAR_ALL_VALID_B;
    for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
        sCharValidity['A' + i] = NAME_CHAR_ALL_VALID_B;
        sCharValidity['a' + i] = NAME_CHAR_ALL_VALID_B;
    }
    // Latin-1 letters occupy 0xC0 - 0xFF. The range has to run to the
    // very end of the table (inclusive): stopping earlier would leave
    // 0xF6 and 0xF8 - 0xFF marked invalid. Not every code in the range
    // is a letter, so the exceptions are reverted right below.
    for (int i = 0xC0; i <= 0xFF; ++i) {
        sCharValidity[i] = NAME_CHAR_ALL_VALID_B;
    }
    // ... now 'revert' the ones that are not name chars at all:
    sCharValidity[0xD7] = NAME_CHAR_INVALID_B; // multiplication sign
    sCharValidity[0xF7] = NAME_CHAR_INVALID_B; // division sign

    // And then the ones that are only valid after the first char:
    sCharValidity['-'] = NAME_CHAR_VALID_NONFIRST_B;
    sCharValidity['.'] = NAME_CHAR_VALID_NONFIRST_B;
    sCharValidity[0xB7] = NAME_CHAR_VALID_NONFIRST_B; // middle dot
    for (int i = '0'; i <= '9'; ++i) {
        sCharValidity[i] = NAME_CHAR_VALID_NONFIRST_B;
    }
}
131
132 /**
133 * Public identifiers only use 7-bit ascii range.
134 */
135 private final static int VALID_PUBID_CHAR_COUNT = 0x80;
136 private final static byte[] sPubidValidity = new byte[VALID_PUBID_CHAR_COUNT];
137 // private final static byte PUBID_CHAR_INVALID_B = (byte) 0;
138 private final static byte PUBID_CHAR_VALID_B = (byte) 1;
139 static {
140 for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
141 sPubidValidity['A' + i] = PUBID_CHAR_VALID_B;
142 sPubidValidity['a' + i] = PUBID_CHAR_VALID_B;
143 }
144 for (int i = '0'; i <= '9'; ++i) {
145 sPubidValidity[i] = PUBID_CHAR_VALID_B;
146 }
147
148 // 3 main white space types are valid
149 sPubidValidity[0x0A] = PUBID_CHAR_VALID_B;
150 sPubidValidity[0x0D] = PUBID_CHAR_VALID_B;
151 sPubidValidity[0x20] = PUBID_CHAR_VALID_B;
152
153 // And many of punctuation/separator ascii chars too:
154 sPubidValidity['-'] = PUBID_CHAR_VALID_B;
155 sPubidValidity['\''] = PUBID_CHAR_VALID_B;
156 sPubidValidity['('] = PUBID_CHAR_VALID_B;
157 sPubidValidity[')'] = PUBID_CHAR_VALID_B;
158 sPubidValidity['+'] = PUBID_CHAR_VALID_B;
159 sPubidValidity[','] = PUBID_CHAR_VALID_B;
160 sPubidValidity['.'] = PUBID_CHAR_VALID_B;
161 sPubidValidity['/'] = PUBID_CHAR_VALID_B;
162 sPubidValidity[':'] = PUBID_CHAR_VALID_B;
163 sPubidValidity['='] = PUBID_CHAR_VALID_B;
164 sPubidValidity['?'] = PUBID_CHAR_VALID_B;
165 sPubidValidity[';'] = PUBID_CHAR_VALID_B;
166 sPubidValidity['!'] = PUBID_CHAR_VALID_B;
167 sPubidValidity['*'] = PUBID_CHAR_VALID_B;
168 sPubidValidity['#'] = PUBID_CHAR_VALID_B;
169 sPubidValidity['@'] = PUBID_CHAR_VALID_B;
170 sPubidValidity['$'] = PUBID_CHAR_VALID_B;
171 sPubidValidity['_'] = PUBID_CHAR_VALID_B;
172 sPubidValidity['%'] = PUBID_CHAR_VALID_B;
173 }
174
175 /*
176 ///////////////////////////////////////////////////////////////////////
177 // Basic configuration
178 ///////////////////////////////////////////////////////////////////////
179 */
180
181 /**
182 * Copy of the configuration object passed by the factory.
183 * Contains immutable settings for this reader (or in case
184 * of DTD parsers, reader that uses it)
185 */
186 protected final ReaderConfig mConfig;
187
188 // // // Various extracted settings:
189
190 /**
191 * If true, Reader is namespace aware, and should do basic checks
192 * (usually enforcing limitations on having colons in names)
193 */
194 protected final boolean mCfgNsEnabled;
195
196 // Extracted standard on/off settings:
197
198 /**
199 * note: left non-final on purpose: sub-class may need to modify
200 * the default value after construction.
201 */
202 protected boolean mCfgReplaceEntities;
203
204 /*
205 ///////////////////////////////////////////////////////////////////////
206 // Symbol handling, if applicable
207 ///////////////////////////////////////////////////////////////////////
208 */
209
210 final SymbolTable mSymbols;
211
212 /**
213 * Local full name for the event, if it has one (note: element events
214 * do NOT use this variable; those names are stored in element stack):
215 * target for processing instructions.
216 *<p>
217 * Currently used for proc. instr. target, and entity name (at least
218 * when current entity reference is null).
219 *<p>
220 * Note: this variable is generally not cleared, since it comes from
221 * a symbol table, ie. this won't be the only reference.
222 */
223 protected String mCurrName;
224
225 /*
226 ///////////////////////////////////////////////////////////////////////
227 // Input handling
228 ///////////////////////////////////////////////////////////////////////
229 */
230
231 /**
232 * Currently active input source; contains link to parent (nesting) input
233 * sources, if any.
234 */
235 protected WstxInputSource mInput;
236
237 /**
238 * Top-most input source this reader can use; due to input source
239 * chaining, this is not necessarily the root of all input; for example,
240 * external DTD subset reader's root input still has original document
241 * input as its parent.
242 */
243 protected final WstxInputSource mRootInput;
244
245 /**
246 * Custom resolver used to handle external entities that are to be expanded
247 * by this reader (external param/general entity expander)
248 */
249 protected XMLResolver mEntityResolver = null;
250
251 /**
252 * This is the current depth of the input stack (same as what input
253 * element stack would return as its depth).
254 * It is used to enforce input scope constraints for nesting of
255 * elements (for xml reader) and dtd declaration (for dtd reader)
256 * with regards to input block (entity expansion) boundaries.
257 *<p>
258 * Basically this value is compared to {@link #mInputTopDepth}, which
259 * indicates what was the depth at the point where the currently active
260 * input scope/block was started.
261 */
262 protected int mCurrDepth;
263
264 protected int mInputTopDepth;
265
266 /**
267 * Number of times a parsed general entity has been expanded; used for
268 * (optionally) limiting number of expansion to guard against
269 * denial-of-service attacks like "Billion Laughs".
270 *
271 * @since 4.3
272 */
273 protected int mEntityExpansionCount;
274
275 /**
276 * Flag that indicates whether linefeeds in the input data are to
277 * be normalized or not.
278 * Xml specs mandate that the line feeds are only normalized
279 * when they are from the external entities (main doc, external
280 * general/parsed entities), so normalization has to be
281 * suppressed when expanding internal general/parsed entities.
282 */
283 protected boolean mNormalizeLFs;
284
285 /**
286 * Flag that indicates whether all escaped chars are accepted in XML 1.0.
287 *
288 * @since 5.2
289 */
290 protected boolean mAllowXml11EscapedCharsInXml10;
291
292 /*
293 ///////////////////////////////////////////////////////////////////////
294 // Buffer(s) for local name(s) and text content
295 ///////////////////////////////////////////////////////////////////////
296 */
297
298 /**
299 * Temporary buffer used if local name can not be just directly
300 * constructed from input buffer (name is on a boundary or such).
301 */
302 protected char[] mNameBuffer = null;
303
304 /*
305 ///////////////////////////////////////////////////////////////////////
306 // Information about starting location of event
307 // Reader is pointing to; updated on-demand
308 ///////////////////////////////////////////////////////////////////////
309 */
310
311 // // // Location info at point when current token was started
312
313 /**
314 * Total number of characters read before start of current token.
315 * For big (gigabyte-sized) sizes are possible, needs to be long,
316 * unlike pointers and sizes related to in-memory buffers.
317 */
318 protected long mTokenInputTotal = 0;
319
320 /**
321 * Input row on which current token starts, 1-based
322 */
323 protected int mTokenInputRow = 1;
324
325 /**
326 * Column on input row that current token starts; 0-based (although
327 * in the end it'll be converted to 1-based)
328 */
329 protected int mTokenInputCol = 0;
330
331 /*
332 ///////////////////////////////////////////////////////////////////////
333 // XML document information (from doc decl if one was found) common to
334 // all entities (main xml document, external DTD subset)
335 ///////////////////////////////////////////////////////////////////////
336 */
337
338 /**
339 * Input stream encoding, if known (passed in, or determined by
340 * auto-detection); null if not.
341 */
342 protected String mDocInputEncoding = null;
343
344 /**
345 * Character encoding from xml declaration, if any; null if no
346 * declaration, or it didn't specify encoding.
347 */
348 protected String mDocXmlEncoding = null;
349
350 /**
351 * XML version as declared by the document; one of constants
352 * from {@link XmlConsts} (like {@link XmlConsts#XML_V_10}).
353 */
354 protected int mDocXmlVersion = XmlConsts.XML_V_UNKNOWN;
355
356 /**
357 * Cache of internal character entities;
358 */
359 protected Map<String,IntEntity> mCachedEntities;
360
361 /**
362 * Flag for whether or not character references should be treated as entities
363 */
364 protected boolean mCfgTreatCharRefsAsEntities;
365
366 /**
367 * Entity reference stream currently points to.
368 */
369 protected EntityDecl mCurrEntity;
370
371 /*
372 ///////////////////////////////////////////////////////////////////////
373 // Life-cycle
374 ///////////////////////////////////////////////////////////////////////
375 */
376
377 /**
378 * Constructor used when creating a complete new (main-level) reader that
379 * does not share its input buffers or state with another reader.
380 */
381 protected StreamScanner(WstxInputSource input, ReaderConfig cfg,
382 XMLResolver res)
383 {
384 super();
385 mInput = input;
386 // 17-Jun-2004, TSa: Need to know root-level input source
387 mRootInput = input;
388
389 mConfig = cfg;
390 mSymbols = cfg.getSymbols();
391 int cf = cfg.getConfigFlags();
392 mCfgNsEnabled = (cf & CFG_NAMESPACE_AWARE) != 0;
393 mCfgReplaceEntities = (cf & CFG_REPLACE_ENTITY_REFS) != 0;
394
395 mAllowXml11EscapedCharsInXml10 = mConfig.willAllowXml11EscapedCharsInXml10();
396
397 mNormalizeLFs = mConfig.willNormalizeLFs();
398 mInputBuffer = null;
399 mInputPtr = mInputEnd = 0;
400 mEntityResolver = res;
401
402 mCfgTreatCharRefsAsEntities = mConfig.willTreatCharRefsAsEnts();
403 if (mCfgTreatCharRefsAsEntities) {
404 mCachedEntities = new HashMap<String,IntEntity>();
405 } else {
406 mCachedEntities = Collections.emptyMap();
407 }
408 }
409
410 /**
411 * @since 5.2
412 */
413 public ReaderConfig getConfig() {
414 return mConfig;
415 }
416
417 /*
418 ///////////////////////////////////////////////////////////////////////
419 // Package API
420 ///////////////////////////////////////////////////////////////////////
421 */
422
423 /**
424 * Method that returns location of the last character returned by this
425 * reader; that is, location "one less" than the currently pointed to
426 * location.
427 */
428 protected WstxInputLocation getLastCharLocation()
429 {
430 return mInput.getLocation(mCurrInputProcessed + mInputPtr - 1,
431 mCurrInputRow, mInputPtr - mCurrInputRowStart);
432 }
433
434 protected URL getSource() throws IOException {
435 return mInput.getSource();
436 }
437
438 protected String getSystemId() {
439 return mInput.getSystemId();
440 }
441
442 /*
443 ///////////////////////////////////////////////////////////////////////
444 // Partial `LocationInfo` implementation (not implemented
445 // by this base class, but is by some sub-classes)
446 ///////////////////////////////////////////////////////////////////////
447 */
448
449 /**
450 * Returns location of last properly parsed token; as per StAX specs,
451 * apparently needs to be the end of current event, which is the same
452 * as the start of the following event (or EOF if that's next).
453 */
454 @Override
455 public abstract Location getLocation();
456
457 public XMLStreamLocation2 getStartLocation()
458 {
459 // note: +1 is used as columns are 1-based...
460 return mInput.getLocation(mTokenInputTotal,
461 mTokenInputRow, mTokenInputCol + 1);
462 }
463
464 public XMLStreamLocation2 getCurrentLocation()
465 {
466 return mInput.getLocation(mCurrInputProcessed + mInputPtr,
467 mCurrInputRow, mInputPtr - mCurrInputRowStart + 1);
468 }
469
470 /*
471 ///////////////////////////////////////////////////////////////////////
472 // InputProblemReporter implementation
473 ///////////////////////////////////////////////////////////////////////
474 */
475
476 public WstxException throwWfcException(String msg, boolean deferErrors)
477 throws WstxException
478 {
479 WstxException ex = constructWfcException(msg);
480 if (!deferErrors) {
481 throw ex;
482 }
483 return ex;
484 }
485
486 @Override
487 public void throwParseError(String msg) throws XMLStreamException {
488 throwParseError(msg, null, null);
489 }
490
491 /**
492 * Throws generic parse error with specified message and current parsing
493 * location.
494 *<p>
495 * Note: public access only because core code in other packages needs
496 * to access it.
497 */
498 @Override
499 public void throwParseError(String format, Object arg, Object arg2)
500 throws XMLStreamException
501 {
502 String msg = (arg == null && arg2 == null) ? format
503 : MessageFormat.format(format, new Object[] { arg, arg2 });
504 throw constructWfcException(msg);
505 }
506
507 public void reportProblem(String probType, String format, Object arg, Object arg2)
508 throws XMLStreamException
509 {
510 XMLReporter rep = mConfig.getXMLReporter();
511 if (rep != null) {
512 _reportProblem(rep, probType,
513 MessageFormat.format(format, new Object[] { arg, arg2 }), null);
514 }
515 }
516
517 @Override
518 public void reportProblem(Location loc, String probType,
519 String format, Object arg, Object arg2)
520 throws XMLStreamException
521 {
522 XMLReporter rep = mConfig.getXMLReporter();
523 if (rep != null) {
524 String msg = (arg != null || arg2 != null) ?
525 MessageFormat.format(format, new Object[] { arg, arg2 }) : format;
526 _reportProblem(rep, probType, msg, loc);
527 }
528 }
529
530 protected void _reportProblem(XMLReporter rep, String probType, String msg, Location loc)
531 throws XMLStreamException
532 {
533 if (loc == null) {
534 loc = getLastCharLocation();
535 }
536 _reportProblem(rep, new XMLValidationProblem(loc, msg, XMLValidationProblem.SEVERITY_ERROR, probType));
537 }
538
539 protected void _reportProblem(XMLReporter rep, XMLValidationProblem prob)
540 throws XMLStreamException
541 {
542 if (rep != null) {
543 Location loc = prob.getLocation();
544 if (loc == null) {
545 loc = getLastCharLocation();
546 prob.setLocation(loc);
547 }
548 // Backwards-compatibility fix: add non-null type, if missing:
549 if (prob.getType() == null) {
550 prob.setType(ErrorConsts.WT_VALIDATION);
551 }
552 // [WSTX-154]: was catching and dropping thrown exception: shouldn't.
553 // [WTSX-157]: need to support XMLReporter2
554 if (rep instanceof XMLReporter2) {
555 ((XMLReporter2) rep).report(prob);
556 } else {
557 rep.report(prob.getMessage(), prob.getType(), prob, loc);
558 }
559 }
560 }
561
562 /**
563 *<p>
564 * Note: this is the base implementation used for implementing
565 * <code>ValidationContext</code>
566 */
567 @Override
568 public void reportValidationProblem(XMLValidationProblem prob)
569 throws XMLStreamException
570 {
571 // !!! TBI: Fail-fast vs. deferred modes?
572 /* For now let's implement basic functionality: warnings get
573 * reported via XMLReporter, errors and fatal errors result in
574 * immediate exceptions.
575 */
576 /* 27-May-2008, TSa: [WSTX-153] Above is incorrect: as per Stax
577 * javadocs for XMLReporter, both warnings and non-fatal errors
578 * (which includes all validation errors) should be reported via
579 * XMLReporter interface, and only fatals should cause an
580 * immediate stream exception (by-passing reporter)
581 */
582 if (prob.getSeverity() > XMLValidationProblem.SEVERITY_ERROR) {
583 throw WstxValidationException.create(prob);
584 }
585 XMLReporter rep = mConfig.getXMLReporter();
586 if (rep != null) {
587 _reportProblem(rep, prob);
588 } else {
589 /* If no reporter, regular non-fatal errors are to be reported
590 * as exceptions as well, for backwards compatibility
591 */
592 if (prob.getSeverity() >= XMLValidationProblem.SEVERITY_ERROR) {
593 throw WstxValidationException.create(prob);
594 }
595 }
596 }
597
598 public void reportValidationProblem(String msg, int severity)
599 throws XMLStreamException
600 {
601 reportValidationProblem(new XMLValidationProblem(getLastCharLocation(),
602 msg, severity));
603 }
604
605 @Override
606 public void reportValidationProblem(String msg)
607 throws XMLStreamException
608 {
609 reportValidationProblem(new XMLValidationProblem(getLastCharLocation(), msg,
610 XMLValidationProblem.SEVERITY_ERROR));
611 }
612
613 public void reportValidationProblem(Location loc, String msg)
614 throws XMLStreamException
615 {
616 reportValidationProblem(new XMLValidationProblem(loc, msg));
617 }
618
619 @Override
620 public void reportValidationProblem(String format, Object arg, Object arg2)
621 throws XMLStreamException
622 {
623 reportValidationProblem(MessageFormat.format(format, new Object[] { arg, arg2 }));
624 }
625
626 /*
627 ///////////////////////////////////////////////////////////////////////
628 // Other error reporting methods
629 ///////////////////////////////////////////////////////////////////////
630 */
631
632 protected WstxException constructWfcException(String msg)
633 {
634 return new WstxParsingException(msg, getLastCharLocation());
635 }
636
637 /**
638 * Construct and return a {@link XMLStreamException} to throw
639 * as a result of a failed Typed Access operation (but one not
640 * caused by a Well-Formedness Constraint or Validation Constraint
641 * problem)
642 */
643 /*
644 protected WstxException _constructTypeException(String msg)
645 {
646 // Hmmh. Should there be a distinct sub-type?
647 return new WstxParsingException(msg, getLastCharLocation());
648 }
649 */
650
651 protected WstxException constructFromIOE(IOException ioe)
652 {
653 return new WstxIOException(ioe);
654 }
655
656 protected WstxException constructNullCharException()
657 {
658 return new WstxUnexpectedCharException("Illegal character (NULL, unicode 0) encountered: not valid in any content",
659 getLastCharLocation(), CHAR_NULL);
660 }
661
662 protected void throwUnexpectedChar(int i, String msg) throws WstxException
663 {
664 char c = (char) i;
665 String excMsg = "Unexpected character "+getCharDesc(c)+msg;
666 throw new WstxUnexpectedCharException(excMsg, getLastCharLocation(), c);
667 }
668
669 protected void throwNullChar() throws WstxException {
670 throw constructNullCharException();
671 }
672
673 protected void throwInvalidSpace(int i) throws WstxException {
674 throwInvalidSpace(i, false);
675 }
676
677 protected WstxException throwInvalidSpace(int i, boolean deferErrors)
678 throws WstxException
679 {
680 char c = (char) i;
681 WstxException ex;
682 if (c == CHAR_NULL) {
683 ex = constructNullCharException();
684 } else {
685 String msg = "Illegal character ("+getCharDesc(c)+")";
686 if (mXml11) {
687 msg += " [note: in XML 1.1, it could be included via entity expansion]";
688 }
689 ex = new WstxUnexpectedCharException(msg, getLastCharLocation(), c);
690 }
691 if (!deferErrors) {
692 throw ex;
693 }
694 return ex;
695 }
696
697 protected void throwUnexpectedEOF(String msg)
698 throws WstxException
699 {
700 throw new WstxEOFException("Unexpected EOF"+(msg == null ? "" : msg),
701 getLastCharLocation());
702 }
703
704 /**
705 * Similar to {@link #throwUnexpectedEOF}, but only indicates ending
706 * of an input block. Used when reading a token that can not span
707 * input block boundaries (ie. can not continue past end of an
708 * entity expansion).
709 */
710 protected void throwUnexpectedEOB(String msg)
711 throws WstxException
712 {
713 throw new WstxEOFException("Unexpected end of input block"+(msg == null ? "" : msg),
714 getLastCharLocation());
715 }
716
717 protected void throwFromIOE(IOException ioe) throws WstxException {
718 throw new WstxIOException(ioe);
719 }
720
721 protected void throwFromStrE(XMLStreamException strex)
722 throws WstxException
723 {
724 if (strex instanceof WstxException) {
725 throw (WstxException) strex;
726 }
727 throw new WstxException(strex);
728 }
729
730 /**
731 * Method called to report an error, when caller's signature only
732 * allows runtime exceptions to be thrown.
733 */
734 protected void throwLazyError(Exception e)
735 {
736 if (e instanceof XMLStreamException) {
737 WstxLazyException.throwLazily((XMLStreamException) e);
738 }
739 ExceptionUtil.throwRuntimeException(e);
740 }
741
742 protected String tokenTypeDesc(int type) {
743 return ErrorConsts.tokenTypeDesc(type);
744 }
745
746 /*
747 ///////////////////////////////////////////////////////////////////////
748 // Input buffer handling
749 ///////////////////////////////////////////////////////////////////////
750 */
751
752 /**
753 * Returns current input source this source uses.
754 *<p>
755 * Note: public only because some implementations are on different
756 * package.
757 */
758 public final WstxInputSource getCurrentInput() {
759 return mInput;
760 }
761
762 protected final int inputInBuffer() {
763 return mInputEnd - mInputPtr;
764 }
765
766 @SuppressWarnings("cast")
767 protected final int getNext() throws XMLStreamException
768 {
769 if (mInputPtr >= mInputEnd) {
770 if (!loadMore()) {
771 return -1;
772 }
773 }
774 return (int) mInputBuffer[mInputPtr++];
775 }
776
777 /**
778 * Similar to {@link #getNext}, but does not advance pointer
779 * in input buffer.
780 *<p>
781 * Note: this method only peeks within current input source;
782 * it does not close it and check nested input source (if any).
783 * This is necessary when checking keywords, since they can never
784 * cross input block boundary.
785 */
786 @SuppressWarnings("cast")
787 protected final int peekNext()
788 throws XMLStreamException
789 {
790 if (mInputPtr >= mInputEnd) {
791 if (!loadMoreFromCurrent()) {
792 return -1;
793 }
794 }
795 return (int) mInputBuffer[mInputPtr];
796 }
797
798 protected final char getNextChar(String errorMsg)
799 throws XMLStreamException
800 {
801 if (mInputPtr >= mInputEnd) {
802 loadMore(errorMsg);
803 }
804 return mInputBuffer[mInputPtr++];
805 }
806
807 /**
808 * Similar to {@link #getNextChar}, but will not read more characters
809 * from parent input source(s) if the current input source doesn't
810 * have more content. This is often needed to prevent "runaway" content,
811 * such as comments that start in an entity but do not have matching
812 * close marker inside entity; XML specification specifically states
813 * such markup is not legal.
814 */
815 protected final char getNextCharFromCurrent(String errorMsg)
816 throws XMLStreamException
817 {
818 if (mInputPtr >= mInputEnd) {
819 loadMoreFromCurrent(errorMsg);
820 }
821 return mInputBuffer[mInputPtr++];
822 }
823
824 /**
825 * Method that will skip through zero or more white space characters,
826 * and return either the character following white space, or -1 to
827 * indicate EOF (end of the outermost input source)/
828 */
829 @SuppressWarnings("cast")
830 protected final int getNextAfterWS()
831 throws XMLStreamException
832 {
833 if (mInputPtr >= mInputEnd) {
834 if (!loadMore()) {
835 return -1;
836 }
837 }
838 char c = mInputBuffer[mInputPtr++];
839 while (c <= CHAR_SPACE) {
840 // Linefeed?
841 if (c == '\n' || c == '\r') {
842 skipCRLF(c);
843 } else if (c != CHAR_SPACE && c != '\t') {
844 throwInvalidSpace(c);
845 }
846 // Still a white space?
847 if (mInputPtr >= mInputEnd) {
848 if (!loadMore()) {
849 return -1;
850 }
851 }
852 c = mInputBuffer[mInputPtr++];
853 }
854 return (int) c;
855 }
856
857 protected final char getNextCharAfterWS(String errorMsg)
858 throws XMLStreamException
859 {
860 if (mInputPtr >= mInputEnd) {
861 loadMore(errorMsg);
862 }
863
864 char c = mInputBuffer[mInputPtr++];
865 while (c <= CHAR_SPACE) {
866 // Linefeed?
867 if (c == '\n' || c == '\r') {
868 skipCRLF(c);
869 } else if (c != CHAR_SPACE && c != '\t') {
870 throwInvalidSpace(c);
871 }
872
873 // Still a white space?
874 if (mInputPtr >= mInputEnd) {
875 loadMore(errorMsg);
876 }
877 c = mInputBuffer[mInputPtr++];
878 }
879 return c;
880 }
881
882 protected final char getNextInCurrAfterWS(String errorMsg)
883 throws XMLStreamException
884 {
885 return getNextInCurrAfterWS(errorMsg, getNextCharFromCurrent(errorMsg));
886 }
887
888 protected final char getNextInCurrAfterWS(String errorMsg, char c)
889 throws XMLStreamException
890 {
891 while (c <= CHAR_SPACE) {
892 // Linefeed?
893 if (c == '\n' || c == '\r') {
894 skipCRLF(c);
895 } else if (c != CHAR_SPACE && c != '\t') {
896 throwInvalidSpace(c);
897 }
898
899 // Still a white space?
900 if (mInputPtr >= mInputEnd) {
901 loadMoreFromCurrent(errorMsg);
902 }
903 c = mInputBuffer[mInputPtr++];
904 }
905 return c;
906 }
907
908 /**
909 * Method called when a CR has been spotted in input; checks if next
910 * char is LF, and if so, skips it. Note that next character has to
911 * come from the current input source, to qualify; it can never come
912 * from another (nested) input source.
913 *
914 * @return True, if passed in char is '\r' and next one is '\n'.
915 */
916 protected final boolean skipCRLF(char c)
917 throws XMLStreamException
918 {
919 boolean result;
920
921 if (c == '\r' && peekNext() == '\n') {
922 ++mInputPtr;
923 result = true;
924 } else {
925 result = false;
926 }
927 ++mCurrInputRow;
928 mCurrInputRowStart = mInputPtr;
929 return result;
930 }
931
932 protected final void markLF() {
933 ++mCurrInputRow;
934 mCurrInputRowStart = mInputPtr;
935 }
936
937 protected final void markLF(int inputPtr) {
938 ++mCurrInputRow;
939 mCurrInputRowStart = inputPtr;
940 }
941
942 /**
943 * Method to push back last character read; can only be called once,
944 * that is, no more than one char can be guaranteed to be succesfully
945 * returned.
946 */
947 protected final void pushback() { --mInputPtr; }
948
949 /*
950 ///////////////////////////////////////////////////////////////////////
951 // Sub-class overridable input handling methods
952 ///////////////////////////////////////////////////////////////////////
953 */
954
955 /**
956 * Method called when an entity has been expanded (new input source
957 * has been created). Needs to initialize location information and change
958 * active input source.
959 *
960 * @param entityId Name of the entity being expanded
961 */
962 protected void initInputSource(WstxInputSource newInput, boolean isExt,
963 String entityId)
964 throws XMLStreamException
965 {
966 // Let's make sure new input will be read next time input is needed:
967 mInputPtr = 0;
968 mInputEnd = 0;
969 /* Plus, reset the input location so that'll be accurate for
970 * error reporting etc.
971 */
972 mInputTopDepth = mCurrDepth;
973
974 // [WSTX-296]: Check for entity expansion depth against configurable limit
975 int entityDepth = mInput.getEntityDepth() + 1;
976 verifyLimit("Maximum entity expansion depth", mConfig.getMaxEntityDepth(), entityDepth);
977 mInput = newInput;
978 mInput.initInputLocation(this, mCurrDepth, entityDepth);
979
980 /* 21-Feb-2006, TSa: Linefeeds are NOT normalized when expanding
981 * internal entities (XML, 2.11)
982 */
983 if (isExt) {
984 mNormalizeLFs = true;
985 } else {
986 mNormalizeLFs = false;
987 }
988 }
989
/**
 * Method that will try to read one or more characters from currently
 * open input sources; closing input sources if necessary.
 *<p>
 * When the current source yields nothing it is closed and, unless it is
 * the root input, reading continues from its parent after the parent's
 * saved context has been restored. This repeats until either the buffer
 * has unread content or the root input itself is exhausted.
 *
 * @return true if reading succeeded (or may succeed), false if
 *   we reached EOF.
 *
 * @throws XMLStreamException if the character limit is exceeded, if an
 *   I/O problem occurs (wrapped via constructFromIOE), or if a problem
 *   is reported while closing a nested input
 */
protected boolean loadMore()
    throws XMLStreamException
{
    WstxInputSource input = mInput;
    do {
        /* Need to make sure offsets are properly updated for error
         * reporting purposes, and do this now while previous amounts
         * are still known.
         */
        mCurrInputProcessed += mInputEnd;
        verifyLimit("Maximum document characters", mConfig.getMaxCharacters(), mCurrInputProcessed);
        mCurrInputRowStart -= mInputEnd;
        int count;
        try {
            count = input.readInto(this);
            if (count > 0) {
                // Got new content from the current source; nothing more to do
                return true;
            }
            // Current source has nothing more to give: release it
            input.close();
        } catch (IOException ioe) {
            throw constructFromIOE(ioe);
        }
        if (input == mRootInput) {
            /* Note: no need to check entity/input nesting in this
             * particular case, since it will be handled by higher level
             * parsing code (results in an unexpected EOF)
             */
            return false;
        }
        WstxInputSource parent = input.getParent();
        if (parent == null) { // sanity check!
            throwNullParent(input);
        }
        /* 13-Feb-2006, TSa: Ok, do we violate a proper nesting constraints
         *   with this input block closure? (current depth must match the
         *   scope id of the input being closed)
         */
        if (mCurrDepth != input.getScopeId()) {
            handleIncompleteEntityProblem(input);
        }

        // Pop back to the enclosing source and let it restore its saved state
        mInput = input = parent;
        input.restoreContext(this);
        mInputTopDepth = input.getScopeId();
        /* 21-Feb-2006, TSa: Since linefeed normalization needs to be
         *   suppressed for internal entity expansion, we may need to
         *   change the state...
         */
        if (!mNormalizeLFs) {
            mNormalizeLFs = !input.fromInternalEntity();
        }
        // Maybe there are leftovers from that input in buffer now?
    } while (mInputPtr >= mInputEnd);

    return true;
}
1052
1053 protected final boolean loadMore(String errorMsg)
1054 throws XMLStreamException
1055 {
1056 if (!loadMore()) {
1057 throwUnexpectedEOF(errorMsg);
1058 }
1059 return true;
1060 }
1061
1062 protected boolean loadMoreFromCurrent()
1063 throws XMLStreamException
1064 {
1065 // Need to update offsets properly
1066 mCurrInputProcessed += mInputEnd;
1067 mCurrInputRowStart -= mInputEnd;
1068 verifyLimit("Maximum document characters", mConfig.getMaxCharacters(), mCurrInputProcessed);
1069 try {
1070 int count = mInput.readInto(this);
1071 return (count > 0);
1072 } catch (IOException ie) {
1073 throw constructFromIOE(ie);
1074 }
1075 }
1076
1077 protected final boolean loadMoreFromCurrent(String errorMsg)
1078 throws XMLStreamException
1079 {
1080 if (!loadMoreFromCurrent()) {
1081 throwUnexpectedEOB(errorMsg);
1082 }
1083 return true;
1084 }
1085
1086 /**
1087 * Method called to make sure current main-level input buffer has at
1088 * least specified number of characters available consequtively,
1089 * without having to call {@link #loadMore}. It can only be called
1090 * when input comes from main-level buffer; further, call can shift
1091 * content in input buffer, so caller has to flush any data still
1092 * pending. In short, caller has to know exactly what it's doing. :-)
1093 *<p>
1094 * Note: method does not check for any other input sources than the
1095 * current one -- if current source can not fulfill the request, a
1096 * failure is indicated.
1097 *
1098 * @return true if there's now enough data; false if not (EOF)
1099 */
1100 protected boolean ensureInput(int minAmount)
1101 throws XMLStreamException
1102 {
1103 int currAmount = mInputEnd - mInputPtr;
1104 if (currAmount >= minAmount) {
1105 return true;
1106 }
1107 try {
1108 return mInput.readMore(this, minAmount);
1109 } catch (IOException ie) {
1110 throw constructFromIOE(ie);
1111 }
1112 }
1113
1114 protected void closeAllInput(boolean force)
1115 throws XMLStreamException
1116 {
1117 WstxInputSource input = mInput;
1118 while (true) {
1119 try {
1120 if (force) {
1121 input.closeCompletely();
1122 } else {
1123 input.close();
1124 }
1125 } catch (IOException ie) {
1126 throw constructFromIOE(ie);
1127 }
1128 if (input == mRootInput) {
1129 break;
1130 }
1131 WstxInputSource parent = input.getParent();
1132 if (parent == null) { // sanity check!
1133 throwNullParent(input);
1134 }
1135 mInput = input = parent;
1136 }
1137 }
1138
1139 /**
1140 * @param curr Input source currently in use
1141 */
1142 protected void throwNullParent(WstxInputSource curr)
1143 {
1144 throw new IllegalStateException(ErrorConsts.ERR_INTERNAL);
1145 //throw new IllegalStateException("Internal error: null parent for input source '"+curr+"'; should never occur (should have stopped at root input '"+mRootInput+"').");
1146 }
1147
1148 /*
1149 ///////////////////////////////////////////////////////////////////////
1150 // Entity resolution
1151 ///////////////////////////////////////////////////////////////////////
1152 */
1153
/**
 * Method that tries to resolve a character entity, or (if caller so
 * specifies), a pre-defined internal entity (lt, gt, amp, apos, quot).
 * It will succeed iff:
 * <ol>
 *  <li>Entity in question is a simple character entity (either one of
 *    5 pre-defined ones, or using decimal/hex notation), AND
 *   </li>
 *  <li>Entity fits completely inside current input buffer.
 *   </li>
 * </ol>
 * If so, character value of entity is returned. Character 0 is returned
 * otherwise; if so, caller needs to do full resolution. In the latter
 * case the input pointer is left where it was on entry; it is only
 * advanced when a value is returned (or just before an error is
 * reported).
 *<p>
 * Note: On entry we are guaranteed there are at least 3 more characters
 * in this buffer; otherwise we shouldn't be called. This is why the
 * first three characters are read without bounds checks below.
 *
 * @param checkStd If true, will check pre-defined internal entities
 *   (gt, lt, amp, apos, quot); if false, will only check actual
 *   character entities.
 *
 * @return (Valid) character value, if entity is a character reference,
 *   and could be resolved from current input buffer (does not span
 *   buffer boundary); null char (code 0) if not (either non-char
 *   entity, or spans input buffer boundary).
 */
protected int resolveSimpleEntity(boolean checkStd)
    throws XMLStreamException
{
    // Work on local copies; mInputPtr is only updated once we commit
    char[] buf = mInputBuffer;
    int ptr = mInputPtr;
    char c = buf[ptr++];

    // Numeric reference?
    if (c == '#') {
        c = buf[ptr++];
        int value = 0;
        int inputLen = mInputEnd;
        if (c == 'x') { // hex
            while (ptr < inputLen) {
                c = buf[ptr++];
                if (c == ';') {
                    break;
                }
                value = value << 4;
                if (c <= '9' && c >= '0') {
                    value += (c - '0');
                } else if (c >= 'a' && c <= 'f') {
                    value += (10 + (c - 'a'));
                } else if (c >= 'A' && c <= 'F') {
                    value += (10 + (c - 'A'));
                } else {
                    mInputPtr = ptr; // so error points to correct char
                    throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
                }
                /* Need to check for overflow; easiest to do right as
                 * it happens...
                 */
                if (value > MAX_UNICODE_CHAR) {
                    reportUnicodeOverflow();
                }
            }
        } else { // numeric (decimal)
            while (c != ';') {
                if (c <= '9' && c >= '0') {
                    value = (value * 10) + (c - '0');
                    // Overflow?
                    if (value > MAX_UNICODE_CHAR) {
                        reportUnicodeOverflow();
                    }
                } else {
                    mInputPtr = ptr; // so error points to correct char
                    throwUnexpectedChar(c, "; expected a decimal number.");
                }
                if (ptr >= inputLen) {
                    // Out of buffered input before seeing the semicolon
                    break;
                }
                c = buf[ptr++];
            }
        }
        /* We get here either if we got it all, OR if we ran out of
         * input in current buffer.
         */
        if (c == ';') { // got the full thing
            mInputPtr = ptr;
            validateChar(value);
            return value;
        }

        /* If we ran out of input, need to just fall back, gets
         * resolved via 'full' resolution mechanism.
         */
    } else if (checkStd) {
        /* Caller may not want to resolve these quite yet...
         * (when it wants separate events for non-char entities)
         */
        if (c == 'a') { // amp or apos?
            c = buf[ptr++];

            if (c == 'm') { // amp?
                if (buf[ptr++] == 'p') {
                    // From the 4th character on, reads need a bounds check
                    if (ptr < mInputEnd && buf[ptr++] == ';') {
                        mInputPtr = ptr;
                        return '&';
                    }
                }
            } else if (c == 'p') { // apos?
                if (buf[ptr++] == 'o') {
                    int len = mInputEnd;
                    if (ptr < len && buf[ptr++] == 's') {
                        if (ptr < len && buf[ptr++] == ';') {
                            mInputPtr = ptr;
                            return '\'';
                        }
                    }
                }
            }
        } else if (c == 'g') { // gt?
            if (buf[ptr++] == 't' && buf[ptr++] == ';') {
                mInputPtr = ptr;
                return '>';
            }
        } else if (c == 'l') { // lt?
            if (buf[ptr++] == 't' && buf[ptr++] == ';') {
                mInputPtr = ptr;
                return '<';
            }
        } else if (c == 'q') { // quot?
            if (buf[ptr++] == 'u' && buf[ptr++] == 'o') {
                int len = mInputEnd;
                if (ptr < len && buf[ptr++] == 't') {
                    if (ptr < len && buf[ptr++] == ';') {
                        mInputPtr = ptr;
                        return '"';
                    }
                }
            }
        }
    }
    // Not resolvable here; mInputPtr untouched, caller does full resolution
    return 0;
}
1295
1296 /**
1297 * Method called to resolve character entities, and only character
1298 * entities (except that pre-defined char entities -- amp, apos, lt,
1299 * gt, quote -- MAY be "char entities" in this sense, depending on
1300 * arguments).
1301 * Otherwise it is to return the null char; if so,
1302 * the input pointer will point to the same point as when method
1303 * entered (char after ampersand), plus the ampersand itself is
1304 * guaranteed to be in the input buffer (so caller can just push it
1305 * back if necessary).
1306 *<p>
1307 * Most often this method is called when reader is not to expand
1308 * non-char entities automatically, but to return them as separate
1309 * events.
1310 *<p>
1311 * Main complication here is that we need to do 5-char lookahead. This
1312 * is problematic if chars are on input buffer boundary. This is ok
1313 * for the root level input buffer, but not for some nested buffers.
1314 * However, according to XML specs, such split entities are actually
1315 * illegal... so we can throw an exception in those cases.
1316 *
1317 * @param checkStd If true, will check pre-defined internal entities
1318 * (gt, lt, amp, apos, quot) as character entities; if false, will only
1319 * check actual 'real' character entities.
1320 *
1321 * @return (Valid) character value, if entity is a character reference,
1322 * and could be resolved from current input buffer (does not span
1323 * buffer boundary); null char (code 0) if not (either non-char
1324 * entity, or spans input buffer boundary).
1325 */
1326 protected int resolveCharOnlyEntity(boolean checkStd)
1327 throws XMLStreamException
1328 {
1329 //int avail = inputInBuffer();
1330 int avail = mInputEnd - mInputPtr;
1331 if (avail < 6) {
1332 // split entity, or buffer boundary
1333 /* Don't want to lose leading '&' (in case we can not expand
1334 * the entity), so let's push it back first
1335 */
1336 --mInputPtr;
1337 /* Shortest valid reference would be 3 chars ('&a;'); which
1338 * would only be legal from an expanded entity...
1339 */
1340 if (!ensureInput(6)) {
1341 avail = inputInBuffer();
1342 if (avail < 3) {
1343 throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF);
1344 }
1345 } else {
1346 avail = 6;
1347 }
1348 // ... and now we can move pointer back as well:
1349 ++mInputPtr;
1350 }
1351
1352 /* Ok, now we have one more character to check, and that's enough
1353 * to determine type decisively.
1354 */
1355 char c = mInputBuffer[mInputPtr];
1356
1357 // A char reference?
1358 if (c == '#') { // yup
1359 ++mInputPtr;
1360 return resolveCharEnt(null);
1361 }
1362
1363 // nope... except may be a pre-def?
1364 if (checkStd) {
1365 if (c == 'a') {
1366 char d = mInputBuffer[mInputPtr+1];
1367 if (d == 'm') {
1368 if (avail >= 4
1369 && mInputBuffer[mInputPtr+2] == 'p'
1370 && mInputBuffer[mInputPtr+3] == ';') {
1371 mInputPtr += 4;
1372 return '&';
1373 }
1374 } else if (d == 'p') {
1375 if (avail >= 5
1376 && mInputBuffer[mInputPtr+2] == 'o'
1377 && mInputBuffer[mInputPtr+3] == 's'
1378 && mInputBuffer[mInputPtr+4] == ';') {
1379 mInputPtr += 5;
1380 return '\'';
1381 }
1382 }
1383 } else if (c == 'l') {
1384 if (avail >= 3
1385 && mInputBuffer[mInputPtr+1] == 't'
1386 && mInputBuffer[mInputPtr+2] == ';') {
1387 mInputPtr += 3;
1388 return '<';
1389 }
1390 } else if (c == 'g') {
1391 if (avail >= 3
1392 && mInputBuffer[mInputPtr+1] == 't'
1393 && mInputBuffer[mInputPtr+2] == ';') {
1394 mInputPtr += 3;
1395 return '>';
1396 }
1397 } else if (c == 'q') {
1398 if (avail >= 5
1399 && mInputBuffer[mInputPtr+1] == 'u'
1400 && mInputBuffer[mInputPtr+2] == 'o'
1401 && mInputBuffer[mInputPtr+3] == 't'
1402 && mInputBuffer[mInputPtr+4] == ';') {
1403 mInputPtr += 5;
1404 return '"';
1405 }
1406 }
1407 }
1408 return 0;
1409 }
1410
1411 /**
1412 * Reverse of {@link #resolveCharOnlyEntity}; will only resolve entity
1413 * if it is NOT a character entity (or pre-defined 'generic' entity;
1414 * amp, apos, lt, gt or quot). Only used in cases where entities
1415 * are to be separately returned unexpanded (in non-entity-replacing
1416 * mode); which means it's never called from dtd handler.
1417 */
1418 protected EntityDecl resolveNonCharEntity()
1419 throws XMLStreamException
1420 {
1421 //int avail = inputInBuffer();
1422 int avail = mInputEnd - mInputPtr;
1423 if (avail < 6) {
1424 // split entity, or buffer boundary
1425 /* Don't want to lose leading '&' (in case we can not expand
1426 * the entity), so let's push it back first
1427 */
1428 --mInputPtr;
1429
1430 /* Shortest valid reference would be 3 chars ('&a;'); which
1431 * would only be legal from an expanded entity...
1432 */
1433 if (!ensureInput(6)) {
1434 avail = inputInBuffer();
1435 if (avail < 3) {
1436 throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF);
1437 }
1438 } else {
1439 avail = 6;
1440 }
1441 // ... and now we can move pointer back as well:
1442 ++mInputPtr;
1443 }
1444
1445 // We don't care about char entities:
1446 char c = mInputBuffer[mInputPtr];
1447 if (c == '#') {
1448 return null;
1449 }
1450
1451 /* 19-Aug-2004, TSa: Need special handling for pre-defined
1452 * entities; they are not counted as 'real' general parsed
1453 * entities, but more as character entities...
1454 */
1455
1456 // have chars at least up to mInputPtr+4 by now
1457 if (c == 'a') {
1458 char d = mInputBuffer[mInputPtr+1];
1459 if (d == 'm') {
1460 if (avail >= 4
1461 && mInputBuffer[mInputPtr+2] == 'p'
1462 && mInputBuffer[mInputPtr+3] == ';') {
1463 // If not automatically expanding:
1464 //return sEntityAmp;
1465 // mInputPtr += 4;
1466 return null;
1467 }
1468 } else if (d == 'p') {
1469 if (avail >= 5
1470 && mInputBuffer[mInputPtr+2] == 'o'
1471 && mInputBuffer[mInputPtr+3] == 's'
1472 && mInputBuffer[mInputPtr+4] == ';') {
1473 return null;
1474 }
1475 }
1476 } else if (c == 'l') {
1477 if (avail >= 3
1478 && mInputBuffer[mInputPtr+1] == 't'
1479 && mInputBuffer[mInputPtr+2] == ';') {
1480 return null;
1481 }
1482 } else if (c == 'g') {
1483 if (avail >= 3
1484 && mInputBuffer[mInputPtr+1] == 't'
1485 && mInputBuffer[mInputPtr+2] == ';') {
1486 return null;
1487 }
1488 } else if (c == 'q') {
1489 if (avail >= 5
1490 && mInputBuffer[mInputPtr+1] == 'u'
1491 && mInputBuffer[mInputPtr+2] == 'o'
1492 && mInputBuffer[mInputPtr+3] == 't'
1493 && mInputBuffer[mInputPtr+4] == ';') {
1494 return null;
1495 }
1496 }
1497
1498 // Otherwise, let's just parse in generic way:
1499 ++mInputPtr; // since we already read the first letter
1500 String id = parseEntityName(c);
1501 mCurrName = id;
1502
1503 return findEntity(id, null);
1504 }
1505
1506 /**
1507 * Method that does full resolution of an entity reference, be it
1508 * character entity, internal entity or external entity, including
1509 * updating of input buffers, and depending on whether result is
1510 * a character entity (or one of 5 pre-defined entities), returns
1511 * char in question, or null character (code 0) to indicate it had
1512 * to change input source.
1513 *
1514 * @param allowExt If true, is allowed to expand external entities
1515 * (expanding text); if false, is not (expanding attribute value).
1516 *
1517 * @return Either single-character replacement (which is NOT to be
1518 * reparsed), or null char (0) to indicate expansion is done via
1519 * input source.
1520 */
1521 protected int fullyResolveEntity(boolean allowExt)
1522 throws XMLStreamException
1523 {
1524 char c = getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
1525 // Do we have a (numeric) character entity reference?
1526 if (c == '#') { // numeric
1527 final StringBuffer originalSurface = new StringBuffer("#");
1528 int ch = resolveCharEnt(originalSurface);
1529 if (mCfgTreatCharRefsAsEntities) {
1530 final char[] originalChars = new char[originalSurface.length()];
1531 originalSurface.getChars(0, originalSurface.length(), originalChars, 0);
1532 mCurrEntity = getIntEntity(ch, originalChars);
1533 return 0;
1534 }
1535 return ch;
1536 }
1537
1538 String id = parseEntityName(c);
1539
1540 // Perhaps we have a pre-defined char reference?
1541 c = id.charAt(0);
1542 /*
1543 * 16-May-2004, TSa: Should custom entities (or ones defined in int/ext subset) override
1544 * pre-defined settings for these?
1545 */
1546 char d = CHAR_NULL;
1547 if (c == 'a') { // amp or apos?
1548 if (id.equals("amp")) {
1549 d = '&';
1550 } else if (id.equals("apos")) {
1551 d = '\'';
1552 }
1553 } else if (c == 'g') { // gt?
1554 if (id.length() == 2 && id.charAt(1) == 't') {
1555 d = '>';
1556 }
1557 } else if (c == 'l') { // lt?
1558 if (id.length() == 2 && id.charAt(1) == 't') {
1559 d = '<';
1560 }
1561 } else if (c == 'q') { // quot?
1562 if (id.equals("quot")) {
1563 d = '"';
1564 }
1565 }
1566
1567 if (d != CHAR_NULL) {
1568 if (mCfgTreatCharRefsAsEntities) {
1569 final char[] originalChars = new char[id.length()];
1570 id.getChars(0, id.length(), originalChars, 0);
1571 mCurrEntity = getIntEntity(d, originalChars);
1572 return 0;
1573 }
1574 return d;
1575 }
1576
1577 final EntityDecl e = expandEntity(id, allowExt, null);
1578 if (mCfgTreatCharRefsAsEntities) {
1579 mCurrEntity = e;
1580 }
1581 return 0;
1582 }
1583
1584 /**
1585 * Returns an entity (possibly from cache) for the argument character using the encoded
1586 * representation in mInputBuffer[entityStartPos ... mInputPtr-1].
1587 */
1588 protected EntityDecl getIntEntity(int ch, final char[] originalChars)
1589 {
1590 String cacheKey = new String(originalChars);
1591
1592 IntEntity entity = mCachedEntities.get(cacheKey);
1593 if (entity == null) {
1594 String repl;
1595 if (ch <= 0xFFFF) {
1596 repl = Character.toString((char) ch);
1597 } else {
1598 StringBuffer sb = new StringBuffer(2);
1599 ch -= 0x10000;
1600 sb.append((char) ((ch >> 10) + 0xD800));
1601 sb.append((char) ((ch & 0x3FF) + 0xDC00));
1602 repl = sb.toString();
1603 }
1604 entity = IntEntity.create(new String(originalChars), repl);
1605 mCachedEntities.put(cacheKey, entity);
1606 }
1607 return entity;
1608 }
1609
1610
1611 /**
1612 * Helper method that will try to expand a parsed entity (parameter or
1613 * generic entity).
1614 *<p>
1615 * note: called by sub-classes (dtd parser), needs to be protected.
1616 *
1617 * @param id Name of the entity being expanded
1618 * @param allowExt Whether external entities can be expanded or not; if
1619 * not, and the entity to expand would be external one, an exception
1620 * will be thrown
1621 */
1622 protected EntityDecl expandEntity(String id, boolean allowExt,
1623 Object extraArg)
1624 throws XMLStreamException
1625 {
1626 mCurrName = id;
1627
1628 EntityDecl ed = findEntity(id, extraArg);
1629
1630 if (ed == null) {
1631 /* 30-Sep-2005, TSa: As per [WSTX-5], let's only throw exception
1632 * if we have to resolve it (otherwise it's just best-effort,
1633 * and null is ok)
1634 */
1635 /* 02-Oct-2005, TSa: Plus, [WSTX-4] adds "undeclared entity
1636 * resolver"
1637 */
1638 if (mCfgReplaceEntities) {
1639 mCurrEntity = expandUnresolvedEntity(id);
1640 }
1641 return null;
1642 }
1643
1644 if (!mCfgTreatCharRefsAsEntities || this instanceof MinimalDTDReader) {
1645 expandEntity(ed, allowExt);
1646 }
1647
1648 return ed;
1649 }
1650
1651 /**
1652 *<p>
1653 * note: defined as private for documentation, ie. it's just called
1654 * from within this class (not sub-classes), from one specific method
1655 * (see above)
1656 *
1657 * @param ed Entity to be expanded
1658 * @param allowExt Whether external entities are allowed or not.
1659 */
1660 private void expandEntity(EntityDecl ed, boolean allowExt)
1661 throws XMLStreamException
1662 {
1663 String id = ed.getName();
1664
1665 /* Very first thing; we can immediately check if expanding
1666 * this entity would result in infinite recursion:
1667 */
1668 if (mInput.isOrIsExpandedFrom(id)) {
1669 throwRecursionError(id);
1670 }
1671
1672 /* Should not refer unparsed entities from attribute values
1673 * or text content (except via notation mechanism, but that's
1674 * not parsed here)
1675 */
1676 if (!ed.isParsed()) {
1677 throwParseError("Illegal reference to unparsed external entity \"{0}\"", id, null);
1678 }
1679
1680 // 28-Jun-2004, TSa: Do we support external entity expansion?
1681 boolean isExt = ed.isExternal();
1682 if (isExt) {
1683 if (!allowExt) { // never ok in attribute value...
1684 throwParseError("Encountered a reference to external parsed entity \"{0}\" when expanding attribute value: not legal as per XML 1.0/1.1 #3.1", id, null);
1685 }
1686 if (!mConfig.willSupportExternalEntities()) {
1687 throwParseError("Encountered a reference to external entity \"{0}\", but stream reader has feature \"{1}\" disabled",
1688 id, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES);
1689 }
1690 }
1691 verifyLimit("Maximum entity expansion count", mConfig.getMaxEntityCount(), ++mEntityExpansionCount);
1692 // First, let's give current context chance to save its stuff
1693 WstxInputSource oldInput = mInput;
1694 oldInput.saveContext(this);
1695 WstxInputSource newInput = null;
1696 try {
1697 newInput = ed.expand(oldInput, mEntityResolver, mConfig, mDocXmlVersion);
1698 } catch (FileNotFoundException fex) {
1699 /* Let's catch and rethrow this just so we get more meaningful
1700 * description (with input source position etc)
1701 */
1702 throwParseError("(was {0}) {1}", fex.getClass().getName(), fex.getMessage());
1703 } catch (IOException ioe) {
1704 throw constructFromIOE(ioe);
1705 }
1706 /* And then we'll need to make sure new input comes from the new
1707 * input source
1708 */
1709 initInputSource(newInput, isExt, id);
1710 }
1711
1712 /**
1713 *<p>
1714 * note: only called from the local expandEntity() method
1715 */
1716 private EntityDecl expandUnresolvedEntity(String id)
1717 throws XMLStreamException
1718 {
1719 XMLResolver resolver = mConfig.getUndeclaredEntityResolver();
1720 if (resolver != null) {
1721 /* Ok, we can check for recursion here; but let's only do that
1722 * if there is any chance that it might get resolved by
1723 * the special resolver (it must have been resolved this way
1724 * earlier, too...)
1725 */
1726 if (mInput.isOrIsExpandedFrom(id)) {
1727 throwRecursionError(id);
1728 }
1729
1730 WstxInputSource oldInput = mInput;
1731 oldInput.saveContext(this);
1732 // null, null -> no public or system ids
1733 int xmlVersion = mDocXmlVersion;
1734 // 05-Feb-2006, TSa: If xmlVersion not explicitly known, defaults to 1.0
1735 if (xmlVersion == XmlConsts.XML_V_UNKNOWN) {
1736 xmlVersion = XmlConsts.XML_V_10;
1737 }
1738 WstxInputSource newInput;
1739 try {
1740 newInput = DefaultInputResolver.resolveEntityUsing
1741 (oldInput, id, null, null, resolver, mConfig, xmlVersion);
1742 if (mCfgTreatCharRefsAsEntities) {
1743 return new IntEntity(WstxInputLocation.getEmptyLocation(), newInput.getEntityId(),
1744 newInput.getSource(), new char[]{}, WstxInputLocation.getEmptyLocation());
1745 }
1746 } catch (IOException ioe) {
1747 throw constructFromIOE(ioe);
1748 }
1749 if (newInput != null) {
1750 // true -> is external
1751 initInputSource(newInput, true, id);
1752 return null;
1753 }
1754 }
1755 handleUndeclaredEntity(id);
1756 return null;
1757 }
1758
1759 /*
1760 ///////////////////////////////////////////////////////////////////////
1761 // Abstract methods for sub-classes to implement
1762 ///////////////////////////////////////////////////////////////////////
1763 */
1764
/**
 * Abstract method for sub-classes to implement, for finding
 * a declared general or parsed entity.
 *<p>
 * Within this class it is called when expanding an entity reference
 * and when resolving a non-character entity; callers here treat a null
 * return value as "no declaration found".
 *
 * @param id Identifier of the entity to find
 * @param arg Optional argument passed from caller; needed by DTD
 *   reader.
 *
 * @return Declaration of the entity, or null if none was found
 */
protected abstract EntityDecl findEntity(String id, Object arg)
    throws XMLStreamException;
1775
/**
 * This method gets called if a declaration for an entity was not
 * found in entity expanding mode (enabled by default for xml reader,
 * always enabled for dtd reader).
 *<p>
 * Within this class the call is made only after the configured
 * "undeclared entity resolver" (if any) has failed to produce an
 * input source for the entity.
 *
 * @param id Name of the entity for which no declaration was found
 */
protected abstract void handleUndeclaredEntity(String id)
    throws XMLStreamException;
1783
/**
 * Called from {@link #loadMore()} when a nested (non-root) input source
 * has been exhausted and closed, but the current nesting depth differs
 * from the scope id of that source; that is, when closing the input
 * block would violate proper nesting constraints.
 *
 * @param closing Input source that was just closed
 */
protected abstract void handleIncompleteEntityProblem(WstxInputSource closing)
    throws XMLStreamException;
1786
1787 /*
1788 ///////////////////////////////////////////////////////////////////////
1789 // Basic tokenization
1790 ///////////////////////////////////////////////////////////////////////
1791 */
1792
1793 /**
1794 * Method that will parse name token (roughly equivalent to XML specs;
1795 * although bit lenier for more efficient handling); either uri prefix,
1796 * or local name.
1797 *<p>
1798 * Much of complexity in this method has to do with the intention to
1799 * try to avoid any character copies. In this optimal case algorithm
1800 * would be fairly simple. However, this only works if all data is
1801 * already in input buffer... if not, copy has to be made halfway
1802 * through parsing, and that complicates things.
1803 *<p>
1804 * One thing to note is that String returned has been canonicalized
1805 * and (if necessary) added to symbol table. It can thus be compared
1806 * against other such (usually id) Strings, with simple equality operator.
1807 *
1808 * @param c First character of the name; not yet checked for validity
1809 *
1810 * @return Canonicalized name String (which may have length 0, if
1811 * EOF or non-name-start char encountered)
1812 */
1813 protected String parseLocalName(char c)
1814 throws XMLStreamException
1815 {
1816 /* Has to start with letter, or '_' (etc); we won't allow ':' as that
1817 * is taken as namespace separator; no use trying to optimize
1818 * heavily as it's 98% likely it is a valid char...
1819 */
1820 if (!isNameStartChar(c)) {
1821 if (c == ':') {
1822 throwUnexpectedChar(c, " (missing namespace prefix?)");
1823 }
1824 throwUnexpectedChar(c, " (expected a name start character)");
1825 }
1826
1827 int ptr = mInputPtr;
1828 int hash = c;
1829 final int inputLen = mInputEnd;
1830 int startPtr = ptr-1; // already read previous char
1831 final char[] inputBuf = mInputBuffer;
1832
1833 /* After which there may be zero or more name chars
1834 * we have to consider
1835 */
1836 while (true) {
1837 if (ptr >= inputLen) {
1838 /* Ok, identifier may continue past buffer end, need
1839 * to continue with part 2 (separate method, as this is
1840 * not as common as having it all in buffer)
1841 */
1842 mInputPtr = ptr;
1843 return parseLocalName2(startPtr, hash);
1844 }
1845 // Ok, we have the char... is it a name char?
1846 c = inputBuf[ptr];
1847 if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
1848 break;
1849 }
1850 if (!isNameChar(c)) {
1851 break;
1852 }
1853 hash = (hash * 31) + c;
1854 ++ptr;
1855 }
1856 mInputPtr = ptr;
1857 return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash);
1858 }
1859
1860 /**
1861 * Second part of name token parsing; called when name can continue
1862 * past input buffer end (so only part was read before calling this
1863 * method to read the rest).
1864 *<p>
1865 * Note that this isn't heavily optimized, on assumption it's not
1866 * called very often.
1867 */
1868 protected String parseLocalName2(int start, int hash)
1869 throws XMLStreamException
1870 {
1871 int ptr = mInputEnd - start;
1872 // Let's assume fairly short names
1873 char[] outBuf = getNameBuffer(ptr+8);
1874
1875 if (ptr > 0) {
1876 System.arraycopy(mInputBuffer, start, outBuf, 0, ptr);
1877 }
1878
1879 int outLen = outBuf.length;
1880 while (true) {
1881 // note: names can not cross input block (entity) boundaries...
1882 if (mInputPtr >= mInputEnd) {
1883 if (!loadMoreFromCurrent()) {
1884 break;
1885 }
1886 }
1887 char c = mInputBuffer[mInputPtr];
1888 if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
1889 break;
1890 }
1891 if (!isNameChar(c)) {
1892 break;
1893 }
1894 ++mInputPtr;
1895 if (ptr >= outLen) {
1896 mNameBuffer = outBuf = expandBy50Pct(outBuf);
1897 outLen = outBuf.length;
1898 }
1899 outBuf[ptr++] = c;
1900 hash = (hash * 31) + c;
1901 }
1902 // Still need to canonicalize the name:
1903 return mSymbols.findSymbol(outBuf, 0, ptr, hash);
1904 }
1905
1906 /**
1907 * Method that will parse 'full' name token; what full means depends on
1908 * whether reader is namespace aware or not. If it is, full name means
1909 * local name with no namespace prefix (PI target, entity/notation name);
1910 * if not, name can contain arbitrary number of colons. Note that
1911 * element and attribute names are NOT parsed here, so actual namespace
1912 * prefix separation can be handled properly there.
1913 *<p>
1914 * Similar to {@link #parseLocalName}, much of complexity stems from
1915 * trying to avoid copying name characters from input buffer.
1916 *<p>
1917 * Note that returned String will be canonicalized, similar to
1918 * {@link #parseLocalName}, but without separating prefix/local name.
1919 *
1920 * @return Canonicalized name String (which may have length 0, if
1921 * EOF or non-name-start char encountered)
1922 */
1923 protected String parseFullName()
1924 throws XMLStreamException
1925 {
1926 if (mInputPtr >= mInputEnd) {
1927 loadMoreFromCurrent();
1928 }
1929 return parseFullName(mInputBuffer[mInputPtr++]);
1930 }
1931
1932 protected String parseFullName(char c)
1933 throws XMLStreamException
1934 {
1935 // First char has special handling:
1936 if (!isNameStartChar(c)) {
1937 if (c == ':') { // no name.... generally an error:
1938 if (mCfgNsEnabled) {
1939 throwNsColonException(parseFNameForError());
1940 }
1941 // Ok, that's fine actually
1942 } else {
1943 if (c <= CHAR_SPACE) {
1944 throwUnexpectedChar(c, " (missing name?)");
1945 }
1946 throwUnexpectedChar(c, " (expected a name start character)");
1947 }
1948 }
1949
1950 int ptr = mInputPtr;
1951 int hash = c;
1952 int inputLen = mInputEnd;
1953 int startPtr = ptr-1; // to account for the first char
1954
1955 /* After which there may be zero or more name chars
1956 * we have to consider
1957 */
1958 while (true) {
1959 if (ptr >= inputLen) {
1960 /* Ok, identifier may continue past buffer end, need
1961 * to continue with part 2 (separate method, as this is
1962 * not as common as having it all in buffer)
1963 */
1964 mInputPtr = ptr;
1965 return parseFullName2(startPtr, hash);
1966 }
1967 c = mInputBuffer[ptr];
1968 if (c == ':') { // colon only allowed in non-NS mode
1969 if (mCfgNsEnabled) {
1970 mInputPtr = ptr;
1971 throwNsColonException(new String(mInputBuffer, startPtr, ptr - startPtr) + parseFNameForError());
1972 }
1973 } else {
1974 if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
1975 break;
1976 }
1977 if (!isNameChar(c)) {
1978 break;
1979 }
1980 }
1981 hash = (hash * 31) + c;
1982 ++ptr;
1983 }
1984 mInputPtr = ptr;
1985 return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash);
1986 }
1987
/**
 * Second part of 'full' name parsing; called by
 * {@link #parseFullName(char)} when the name may continue past the end
 * of the input buffer. The part already seen is copied to the name
 * buffer, after which remaining name characters are appended one by
 * one, loading more content from the current input source as needed.
 *
 * @param start Offset in input buffer where the name starts
 * @param hash Hash calculated over the name characters seen so far
 *
 * @return Canonicalized name String
 */
@SuppressWarnings("cast")
protected String parseFullName2(int start, int hash)
    throws XMLStreamException
{
    // Number of name characters already seen (up to end of the buffer)
    int ptr = mInputEnd - start;
    // Let's assume fairly short names
    char[] outBuf = getNameBuffer(ptr+8);

    if (ptr > 0) {
        System.arraycopy(mInputBuffer, start, outBuf, 0, ptr);
    }

    int outLen = outBuf.length;
    while (true) {
        /* 06-Sep-2004, TSa: Name tokens are not allowed to continue
         *   past entity expansion ranges... that is, all characters
         *   have to come from the same input source. Thus, let's only
         *   load things from same input level
         */
        if (mInputPtr >= mInputEnd) {
            if (!loadMoreFromCurrent()) {
                break;
            }
        }
        char c = mInputBuffer[mInputPtr];
        if (c == ':') { // colon only allowed in non-NS mode
            if (mCfgNsEnabled) {
                throwNsColonException(new String(outBuf, 0, ptr) + c + parseFNameForError());
            }
        } else if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
            break;
        } else if (!isNameChar(c)) {
            break;
        }
        // Character is part of the name: consume it
        ++mInputPtr;

        if (ptr >= outLen) {
            // Out of room: grow, and keep the grown buffer for later reuse
            mNameBuffer = outBuf = expandBy50Pct(outBuf);
            outLen = outBuf.length;
        }
        outBuf[ptr++] = c;
        hash = (hash * 31) + (int) c;
    }

    // Still need to canonicalize the name:
    return mSymbols.findSymbol(outBuf, 0, ptr, hash);
}
2035
2036 /**
2037 * Method called to read in full name, including unlimited number of
2038 * namespace separators (':'), for the purpose of displaying name in
2039 * an error message. Won't do any further validations, and parsing
2040 * is not optimized: main need is just to get more meaningful error
2041 * messages.
2042 */
2043 protected String parseFNameForError()
2044 throws XMLStreamException
2045 {
2046 StringBuilder sb = new StringBuilder(100);
2047 while (true) {
2048 char c;
2049
2050 if (mInputPtr < mInputEnd) {
2051 c = mInputBuffer[mInputPtr++];
2052 } else { // can't error here, so let's accept EOF for now:
2053 int i = getNext();
2054 if (i < 0) {
2055 break;
2056 }
2057 c = (char) i;
2058 }
2059 if (c != ':' && !isNameChar(c)) {
2060 --mInputPtr;
2061 break;
2062 }
2063 sb.append(c);
2064 }
2065 return sb.toString();
2066 }
2067
2068 protected final String parseEntityName(char c)
2069 throws XMLStreamException
2070 {
2071 String id = parseFullName(c);
2072 // Needs to be followed by a semi-colon, too.. from same input source:
2073 if (mInputPtr >= mInputEnd) {
2074 if (!loadMoreFromCurrent()) {
2075 throwParseError("Missing semicolon after reference for entity \"{0}\"", id, null);
2076 }
2077 }
2078 c = mInputBuffer[mInputPtr++];
2079 if (c != ';') {
2080 throwUnexpectedChar(c, "; expected a semi-colon after the reference for entity '"+id+"'");
2081 }
2082 return id;
2083 }
2084
2085 /**
2086 * Note: does not check for number of colons, amongst other things.
2087 * Main idea is to skip through what superficially seems like a valid
2088 * id, nothing more. This is only done when really skipping through
2089 * something we do not care about at all: not even whether names/ids
2090 * would be valid (for example, when ignoring internal DTD subset).
2091 *
2092 * @return Length of skipped name.
2093 */
2094 protected int skipFullName(char c)
2095 throws XMLStreamException
2096 {
2097 if (!isNameStartChar(c)) {
2098 --mInputPtr;
2099 return 0;
2100 }
2101
2102 /* After which there may be zero or more name chars
2103 * we have to consider
2104 */
2105 int count = 1;
2106 while (true) {
2107 c = (mInputPtr < mInputEnd) ?
2108 mInputBuffer[mInputPtr++] : getNextChar(SUFFIX_EOF_EXP_NAME);
2109 if (c != ':' && !isNameChar(c)) {
2110 break;
2111 }
2112 ++count;
2113 }
2114 return count;
2115 }
2116
2117 /**
2118 * Simple parsing method that parses system ids, which are generally
2119 * used in entities (from DOCTYPE declaration to internal/external
2120 * subsets).
2121 *<p>
2122 * NOTE: returned String is not canonicalized, on assumption that
2123 * external ids may be longish, and are not shared all that often, as
2124 * they are generally just used for resolving paths, if anything.
2125 *<br>
2126 * Also note that this method is not heavily optimized, as it's not
2127 * likely to be a bottleneck for parsing.
2128 */
2129 protected final String parseSystemId(char quoteChar, boolean convertLFs,
2130 String errorMsg)
2131 throws XMLStreamException
2132 {
2133 char[] buf = getNameBuffer(-1);
2134 int ptr = 0;
2135
2136 while (true) {
2137 char c = (mInputPtr < mInputEnd) ?
2138 mInputBuffer[mInputPtr++] : getNextChar(errorMsg);
2139 if (c == quoteChar) {
2140 break;
2141 }
2142 /* ??? 14-Jun-2004, TSa: Should we normalize linefeeds or not?
2143 * It seems like we should, for all input... so that's the way it
2144 * works.
2145 */
2146 if (c == '\n') {
2147 markLF();
2148 } else if (c == '\r') {
2149 if (peekNext() == '\n') {
2150 ++mInputPtr;
2151 if (!convertLFs) {
2152 /* The only tricky thing; need to preserve 2-char LF; need to
2153 * output one char from here, then can fall back to default:
2154 */
2155 if (ptr >= buf.length) {
2156 buf = expandBy50Pct(buf);
2157 }
2158 buf[ptr++] = '\r';
2159 }
2160 c = '\n';
2161 } else if (convertLFs) {
2162 c = '\n';
2163 }
2164 }
2165
2166 // Other than that, let's just append it:
2167 if (ptr >= buf.length) {
2168 buf = expandBy50Pct(buf);
2169 }
2170 buf[ptr++] = c;
2171 }
2172
2173 return (ptr == 0) ? "" : new String(buf, 0, ptr);
2174 }
2175
2176 /**
 * Simple parsing method that parses public ids, which are generally
 * used in entities (from DOCTYPE declaration to internal/external
 * subsets).
2180 *<p>
2181 * As per xml specs, the contents are actually normalized.
2182 *<p>
2183 * NOTE: returned String is not canonicalized, on assumption that
2184 * external ids may be longish, and are not shared all that often, as
2185 * they are generally just used for resolving paths, if anything.
2186 *<br>
2187 * Also note that this method is not heavily optimized, as it's not
2188 * likely to be a bottleneck for parsing.
2189 */
2190 protected final String parsePublicId(char quoteChar, String errorMsg)
2191 throws XMLStreamException
2192 {
2193 char[] buf = getNameBuffer(-1);
2194 int ptr = 0;
2195 boolean spaceToAdd = false;
2196
2197 while (true) {
2198 char c = (mInputPtr < mInputEnd) ?
2199 mInputBuffer[mInputPtr++] : getNextChar(errorMsg);
2200 if (c == quoteChar) {
2201 break;
2202 }
2203 if (c == '\n') {
2204 markLF();
2205 spaceToAdd = true;
2206 continue;
2207 } else if (c == '\r') {
2208 if (peekNext() == '\n') {
2209 ++mInputPtr;
2210 }
2211 spaceToAdd = true;
2212 continue;
2213 } else if (c == CHAR_SPACE) {
2214 spaceToAdd = true;
2215 continue;
2216 } else {
2217 // Verify it's a legal pubid char (see XML spec, #13, from 2.3)
2218 if ((c >= VALID_PUBID_CHAR_COUNT)
2219 || sPubidValidity[c] != PUBID_CHAR_VALID_B) {
2220 throwUnexpectedChar(c, " in public identifier");
2221 }
2222 }
2223
2224 // Other than that, let's just append it:
2225 if (ptr >= buf.length) {
2226 buf = expandBy50Pct(buf);
2227 }
2228 /* Space-normalization means scrapping leading and trailing
2229 * white space, and coalescing remaining ws into single spaces.
2230 */
2231 if (spaceToAdd) { // pending white space to add?
2232 if (c == CHAR_SPACE) { // still a space; let's skip
2233 continue;
2234 }
2235 /* ok: if we have non-space, we'll either forget about
2236 * space(s) (if nothing has been output, ie. leading space),
2237 * or output a single space (in-between non-white space)
2238 */
2239 spaceToAdd = false;
2240 if (ptr > 0) {
2241 buf[ptr++] = CHAR_SPACE;
2242 if (ptr >= buf.length) {
2243 buf = expandBy50Pct(buf);
2244 }
2245 }
2246 }
2247 buf[ptr++] = c;
2248 }
2249
2250 return (ptr == 0) ? "" : new String(buf, 0, ptr);
2251 }
2252
2253 protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs,
2254 String errorMsg)
2255 throws XMLStreamException
2256 {
2257 // Let's first ensure we have some data in there...
2258 if (mInputPtr >= mInputEnd) {
2259 loadMore(errorMsg);
2260 }
2261 while (true) {
2262 // Let's loop consequtive 'easy' spans:
2263 char[] inputBuf = mInputBuffer;
2264 int inputLen = mInputEnd;
2265 int ptr = mInputPtr;
2266 int startPtr = ptr;
2267 while (ptr < inputLen) {
2268 char c = inputBuf[ptr++];
2269 if (c == endChar) {
2270 int thisLen = ptr - startPtr - 1;
2271 if (thisLen > 0) {
2272 tb.append(inputBuf, startPtr, thisLen);
2273 }
2274 mInputPtr = ptr;
2275 return;
2276 }
2277 if (c == '\n') {
2278 mInputPtr = ptr; // markLF() requires this
2279 markLF();
2280 } else if (c == '\r') {
2281 if (!convertLFs && ptr < inputLen) {
2282 if (inputBuf[ptr] == '\n') {
2283 ++ptr;
2284 }
2285 mInputPtr = ptr;
2286 markLF();
2287 } else {
2288 int thisLen = ptr - startPtr - 1;
2289 if (thisLen > 0) {
2290 tb.append(inputBuf, startPtr, thisLen);
2291 }
2292 mInputPtr = ptr;
2293 c = getNextChar(errorMsg);
2294 if (c != '\n') {
2295 --mInputPtr; // pusback
2296 tb.append(convertLFs ? '\n' : '\r');
2297 } else {
2298 if (convertLFs) {
2299 tb.append('\n');
2300 } else {
2301 tb.append('\r');
2302 tb.append('\n');
2303 }
2304 }
2305 startPtr = ptr = mInputPtr;
2306 markLF();
2307 }
2308 }
2309 }
2310 int thisLen = ptr - startPtr;
2311 if (thisLen > 0) {
2312 tb.append(inputBuf, startPtr, thisLen);
2313 }
2314 loadMore(errorMsg);
2315 startPtr = ptr = mInputPtr;
2316 inputBuf = mInputBuffer;
2317 inputLen = mInputEnd;
2318 }
2319 }
2320
2321 /*
2322 ///////////////////////////////////////////////////////////////////////
2323 // Internal methods
2324 ///////////////////////////////////////////////////////////////////////
2325 */
2326
2327 private int resolveCharEnt(StringBuffer originalCharacters)
2328 throws XMLStreamException
2329 {
2330 int value = 0;
2331 char c = getNextChar(SUFFIX_IN_ENTITY_REF);
2332
2333 if (originalCharacters != null) {
2334 originalCharacters.append(c);
2335 }
2336
2337 if (c == 'x') { // hex
2338 while (true) {
2339 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
2340 : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
2341 if (c == ';') {
2342 break;
2343 }
2344
2345 if (originalCharacters != null) {
2346 originalCharacters.append(c);
2347 }
2348 value = value << 4;
2349 if (c <= '9' && c >= '0') {
2350 value += (c - '0');
2351 } else if (c >= 'a' && c <= 'f') {
2352 value += 10 + (c - 'a');
2353 } else if (c >= 'A' && c <= 'F') {
2354 value += 10 + (c - 'A');
2355 } else {
2356 throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
2357 }
2358 // Overflow?
2359 if (value > MAX_UNICODE_CHAR) {
2360 reportUnicodeOverflow();
2361 }
2362 }
2363 } else { // numeric (decimal)
2364 while (c != ';') {
2365 if (c <= '9' && c >= '0') {
2366 value = (value * 10) + (c - '0');
2367 // Overflow?
2368 if (value > MAX_UNICODE_CHAR) {
2369 reportUnicodeOverflow();
2370 }
2371 } else {
2372 throwUnexpectedChar(c, "; expected a decimal number.");
2373 }
2374 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
2375 : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
2376
2377 if (originalCharacters != null && c != ';') {
2378 originalCharacters.append(c);
2379 }
2380 }
2381 }
2382 validateChar(value);
2383 return value;
2384 }
2385
2386 /**
2387 * Method that will verify that expanded Unicode codepoint is a valid
2388 * XML content character.
2389 */
2390 private final void validateChar(int value)
2391 throws XMLStreamException
2392 {
2393 /* 24-Jan-2006, TSa: Ok, "high" Unicode chars are problematic,
2394 * need to be reported by a surrogate pair..
2395 */
2396 if (value >= 0xD800) {
2397 if (value < 0xE000) { // no surrogates via entity expansion
2398 reportIllegalChar(value);
2399 }
2400 if (value > 0xFFFF) {
2401 // Within valid range at all?
2402 if (value > MAX_UNICODE_CHAR) {
2403 reportUnicodeOverflow();
2404 }
2405 } else if (value >= 0xFFFE) { // 0xFFFE and 0xFFFF are illegal too
2406 reportIllegalChar(value);
2407 }
2408 // Ok, fine as is
2409 } else if (value < 32) {
2410 if (value == 0) {
2411 throwParseError("Invalid character reference: null character not allowed in XML content.");
2412 }
2413 // XML 1.1 allows most other chars; 1.0 does not:
2414 if (!mXml11 && !mAllowXml11EscapedCharsInXml10
2415 && (value != 0x9 && value != 0xA && value != 0xD)) {
2416 reportIllegalChar(value);
2417 }
2418 }
2419 }
2420
2421 protected final char[] getNameBuffer(int minSize)
2422 {
2423 char[] buf = mNameBuffer;
2424
2425 if (buf == null) {
2426 mNameBuffer = buf = new char[(minSize > 48) ? (minSize+16) : 64];
2427 } else if (minSize >= buf.length) { // let's allow one char extra...
2428 int len = buf.length;
2429 len += (len >> 1); // grow by 50%
2430 mNameBuffer = buf = new char[(minSize >= len) ? (minSize+16) : len];
2431 }
2432 return buf;
2433 }
2434
2435 protected final char[] expandBy50Pct(char[] buf)
2436 {
2437 int len = buf.length;
2438 char[] newBuf = new char[len + (len >> 1)];
2439 System.arraycopy(buf, 0, newBuf, 0, len);
2440 return newBuf;
2441 }
2442
2443 /**
2444 * Method called to throw an exception indicating that a name that
2445 * should not be namespace-qualified (PI target, entity/notation name)
2446 * is one, and reader is namespace aware.
2447 */
2448 private void throwNsColonException(String name)
2449 throws XMLStreamException
2450 {
2451 throwParseError("Illegal name \"{0}\" (PI target, entity/notation name): can not contain a colon (XML Namespaces 1.0#6)", name, null);
2452 }
2453
2454 private void throwRecursionError(String entityName)
2455 throws XMLStreamException
2456 {
2457 throwParseError("Illegal entity expansion: entity \"{0}\" expands itself recursively.", entityName, null);
2458 }
2459
2460 private void reportUnicodeOverflow()
2461 throws XMLStreamException
2462 {
2463 throwParseError("Illegal character entity: value higher than max allowed (0x{0})", Integer.toHexString(MAX_UNICODE_CHAR), null);
2464 }
2465
2466 private void reportIllegalChar(int value)
2467 throws XMLStreamException
2468 {
2469 throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null);
2470 }
2471
2472 protected void verifyLimit(String type, long maxValue, long currentValue)
2473 throws XMLStreamException
2474 {
2475 if (currentValue > maxValue) {
2476 throw constructLimitViolation(type, maxValue);
2477 }
2478 }
2479
2480 protected XMLStreamException constructLimitViolation(String type, long limit)
2481 throws XMLStreamException
2482 {
2483 return new XMLStreamException(type+" limit ("+limit+") exceeded");
2484 }
2485 }
2486