1 /* Woodstox XML processor
2 *
3 * Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi
4 *
5 * Licensed under the License specified in file LICENSE, included with
6 * the source code.
7 * You may not use this file except in compliance with the License.
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package com.ctc.wstx.sr;
17
18 import java.io.*;
19 import java.text.MessageFormat;
20 import java.util.Map;
21
22 import org.xml.sax.Attributes;
23 import org.xml.sax.ContentHandler;
24 import org.xml.sax.SAXException;
25 import org.xml.sax.ext.LexicalHandler;
26
27 import javax.xml.namespace.NamespaceContext;
28 import javax.xml.namespace.QName;
29 import javax.xml.stream.Location;
30 import javax.xml.stream.XMLStreamException;
31
32 import org.codehaus.stax2.AttributeInfo;
33 import org.codehaus.stax2.DTDInfo;
34 import org.codehaus.stax2.LocationInfo;
35 import org.codehaus.stax2.XMLStreamLocation2;
36 import org.codehaus.stax2.XMLStreamReader2;
37 import org.codehaus.stax2.typed.TypedXMLStreamException;
38 import org.codehaus.stax2.validation.*;
39
40 import com.ctc.wstx.api.ReaderConfig;
41 import com.ctc.wstx.api.WstxInputProperties;
42 import com.ctc.wstx.cfg.ErrorConsts;
43 import com.ctc.wstx.cfg.XmlConsts;
44 import com.ctc.wstx.dtd.MinimalDTDReader;
45 import com.ctc.wstx.ent.EntityDecl;
46 import com.ctc.wstx.exc.WstxException;
47 import com.ctc.wstx.io.*;
48 import com.ctc.wstx.util.DefaultXmlSymbolTable;
49 import com.ctc.wstx.util.ExceptionUtil;
50 import com.ctc.wstx.util.TextBuffer;
51 import com.ctc.wstx.util.TextBuilder;
52
53 /**
54 * Partial implementation of {@link XMLStreamReader2} consisting of
55 * all functionality other than DTD-validation-specific parts, and
56 * Typed Access API (Stax2 v3.0), which are implemented at
57 * sub-classes.
58 *
59 * @author Tatu Saloranta
60 */
61 public abstract class BasicStreamReader
62 extends StreamScanner
63 implements StreamReaderImpl, DTDInfo, LocationInfo
64 {
65 /*
66 ///////////////////////////////////////////////////////////////////////
67 // Constants
68 ///////////////////////////////////////////////////////////////////////
69 */
70
71 // // // Standalone values:
72
73 final static int DOC_STANDALONE_UNKNOWN = 0;
74 final static int DOC_STANDALONE_YES = 1;
75 final static int DOC_STANDALONE_NO = 2;
76
77 // // // Main state consts:
78
79 final static int STATE_PROLOG = 0; // Before root element
80 final static int STATE_TREE = 1; // Parsing actual XML tree
81 final static int STATE_EPILOG = 2; // After root element has been closed
82 final static int STATE_MULTIDOC_HACK = 3; // State "between" multiple documents (in multi-doc mode)
83 final static int STATE_CLOSED = 4; // After reader has been closed
84
85 // // // Tokenization state consts:
86
87 // no idea as to what comes next (unknown type):
88 final static int TOKEN_NOT_STARTED = 0;
89
90 // token type figured out, but not long enough:
91 final static int TOKEN_STARTED = 1;
92
93 /* minimum token length returnable achieved; only used for
94 * CHARACTERS event which allow fragments to be returned (and for
95 * CDATA in some limited cases)
96 */
97 final static int TOKEN_PARTIAL_SINGLE = 2;
98
99 /* a single physical event has been successfully tokenized; as with
100 * partial, only used with CDATA and CHARACTERS (meaningless for others,
101 * which should only use TOKEN_FULL_COALESCED, TOKEN_NOT_STARTED or
102 * TOKEN_STARTED.
103 */
104 final static int TOKEN_FULL_SINGLE = 3;
105
106 /* all adjacent (text) events have been tokenized and coalesced (for
107 * CDATA and CHARACTERS), or that the full event has been parsed (for
108 * others)
109 */
110 final static int TOKEN_FULL_COALESCED = 4;
111
112 // // // Bit masks used for quick type comparisons
113
114 /**
115 * This mask covers all types for which basic {@link #getText} method
116 * can be called.
117 */
118 final protected static int MASK_GET_TEXT =
119 (1 << CHARACTERS) | (1 << CDATA) | (1 << SPACE)
120 | (1 << COMMENT) | (1 << DTD) | (1 << ENTITY_REFERENCE);
121
122 /**
123 * This mask covers all types for which extends <code>getTextXxx</code>
124 * methods can be called; which is less than those for which
125 * {@link #getText} can be called. Specifically, <code>DTD</code> and
126 * <code>ENTITY_REFERENCE</code> types do not support these extended
127 */
128 final protected static int MASK_GET_TEXT_XXX =
129 (1 << CHARACTERS) | (1 << CDATA) | (1 << SPACE) | (1 << COMMENT);
130
131 /**
132 * This mask is used with Stax2 getText() method (one that takes
133 * Writer as an argument): accepts even wider range of event types.
134 */
135 final protected static int MASK_GET_TEXT_WITH_WRITER =
136 (1 << CHARACTERS) | (1 << CDATA) | (1 << SPACE)
137 | (1 << COMMENT) | (1 << DTD) | (1 << ENTITY_REFERENCE)
138 | (1 << PROCESSING_INSTRUCTION);
139
140 final protected static int MASK_GET_ELEMENT_TEXT =
141 (1 << CHARACTERS) | (1 << CDATA) | (1 << SPACE)
142 | (1 << ENTITY_REFERENCE);
143
144
145 // // // Indicator of type of text in text event (WRT white space)
146
147 final static int ALL_WS_UNKNOWN = 0x0000;
148 final static int ALL_WS_YES = 0x0001;
149 final static int ALL_WS_NO = 0x0002;
150
151 /* 2 magic constants used for enabling/disabling indentation checks:
152 * (to minimize negative impact for both small docs, and large
153 * docs with non-regular white space)
154 */
155
156 private final static int INDENT_CHECK_START = 16;
157
158 private final static int INDENT_CHECK_MAX = 40;
159
160 // // // Shared namespace symbols
161
162 final protected static String sPrefixXml = DefaultXmlSymbolTable.getXmlSymbol();
163
164 final protected static String sPrefixXmlns = DefaultXmlSymbolTable.getXmlnsSymbol();
165
166 /*
167 ///////////////////////////////////////////////////////////////////////
168 // Configuration
169 ///////////////////////////////////////////////////////////////////////
170 */
171
// note: mConfig defined in base class

/**
 * Set of locally stored configuration flags; copied from the reader
 * configuration by the constructor.
 */
protected final int mConfigFlags;

// // // Various extracted settings (all derived in the constructor):

/**
 * True when <code>CFG_COALESCE_TEXT</code> is set in
 * {@link #mConfigFlags}.
 */
protected final boolean mCfgCoalesceText;

/**
 * True when <code>CFG_REPORT_CDATA</code> is NOT set in
 * {@link #mConfigFlags}; CDATA events are then reported as CHARACTERS
 * by {@link #getEventType}.
 */
protected final boolean mCfgReportTextAsChars;
/**
 * True when <code>CFG_LAZY_PARSING</code> is set in
 * {@link #mConfigFlags} AND this reader was not constructed for use
 * by an event reader.
 */
protected final boolean mCfgLazyParsing;

/**
 * Minimum number of characters parser can return as partial text
 * segment, IF it's not required to coalesce adjacent text
 * segments. Set to <code>Integer.MAX_VALUE</code> when coalescing, or
 * when the reader is used by an event reader.
 */
protected final int mShortestTextSegment;
192
193 /*
194 ///////////////////////////////////////////////////////////////////////
195 // Symbol handling
196 ///////////////////////////////////////////////////////////////////////
197 */
198
199 /**
200 * Object to notify about shared stuff, such as symbol tables, as well
201 * as to query for additional config settings if necessary.
202 */
203 final protected ReaderCreator mOwner;
204
205 /*
206 ///////////////////////////////////////////////////////////////////////
207 // Additional XML document information, in addition to what StreamScanner has
208 ///////////////////////////////////////////////////////////////////////
209 */
210
/**
 * Status about "stand-aloneness" of document; set to 'yes'/'no'/'unknown'
 * (one of the <code>DOC_STANDALONE_xxx</code> constants) based on
 * whether there was xml declaration, and if so, whether
 * it had standalone attribute.
 */
protected int mDocStandalone = DOC_STANDALONE_UNKNOWN;
217
218 /*
219 ///////////////////////////////////////////////////////////////////////
220 // DOCTYPE information from document type declaration (if any found)
221 ///////////////////////////////////////////////////////////////////////
222 */
223
/**
 * Prefix of root element, as dictated by DOCTYPE declaration; null
 * if no DOCTYPE declaration, or no root prefix.
 */
protected String mRootPrefix;

/**
 * Local name of root element, as dictated by DOCTYPE declaration; null
 * if no DOCTYPE declaration.
 */
protected String mRootLName;

/**
 * Public id of the DTD, if one exists and has been parsed.
 */
protected String mDtdPublicId;

/**
 * System id of the DTD, if one exists and has been parsed.
 */
protected String mDtdSystemId;
245
246 /*
247 ///////////////////////////////////////////////////////////////////////
248 // Information about currently open subtree, content
249 ///////////////////////////////////////////////////////////////////////
250 */
251
252 /**
253 * TextBuffer mostly used to collect non-element textual content
254 * (text, CDATA, comment content, pi data)
255 */
256 final protected TextBuffer mTextBuffer;
257
258 /**
259 * Currently open element tree
260 */
261 final protected InputElementStack mElementStack;
262
263 /**
264 * Object that stores information about currently accessible attributes.
265 */
266 final protected AttributeCollector mAttrCollector;
267
268 /*
269 ///////////////////////////////////////////////////////////////////////
270 // Tokenization state
271 ///////////////////////////////////////////////////////////////////////
272 */
273
/// Flag set when DOCTYPE declaration has been parsed
protected boolean mStDoctypeFound = false;

/**
 * State of the current token; one of the <code>TOKEN_xxx</code>
 * constants from above.
 *<p>
 * Initially set to fully tokenized, since it's the virtual
 * START_DOCUMENT event that we fully know by now (parsed by
 * bootstrapper)
 */
protected int mTokenState = TOKEN_FULL_COALESCED;

/**
 * Threshold value that defines tokenization state that needs to be
 * achieved to "finish" current <b>logical</b> text segment (which
 * may consist of adjacent CDATA and text segments; or be a complete
 * physical segment; or just even a fragment of such a segment)
 */
protected final int mStTextThreshold;

/**
 * Size of the current text for CDATA, CHARACTERS, WHITESPACE.
 * When segmenting, this records the size of all the segments
 * so we can track if the text length has exceeded limits.
 */
protected int mCurrTextLength;

/// Flag that indicates current start element is an empty element
protected boolean mStEmptyElem = false;

/**
 * Main parsing/tokenization state (one of the <code>STATE_xxx</code>
 * constants)
 */
protected int mParseState;

/**
 * Current state of the stream, ie token value returned by
 * {@link #getEventType}. Needs to be initialized to START_DOCUMENT,
 * since that's the state it starts in.
 */
protected int mCurrToken = START_DOCUMENT;

/**
 * Additional information sometimes stored (when generating dummy
 * events in multi-doc mode, for example) temporarily when
 * {@link #mCurrToken} is already populated.
 */
protected int mSecondaryToken = START_DOCUMENT;

/**
 * Status of current (text) token's "whitespaceness", that is,
 * whether it is or is not all white space (one of the
 * <code>ALL_WS_xxx</code> constants).
 */
protected int mWsStatus;

/**
 * Flag that indicates that textual content (CDATA, CHARACTERS) is to
 * be validated within current element's scope. Enabled if one of
 * validators returns {@link XMLValidator#CONTENT_ALLOW_VALIDATABLE_TEXT},
 * and will prevent lazy parsing of text.
 */
protected boolean mValidateText = false;

/**
 * Counter used for determining whether we are to try to heuristically
 * "intern" white space that seems to be used for indentation purposes
 */
protected int mCheckIndentation;

/**
 * Due to the way Stax API does not allow throwing stream exceptions
 * from many methods for which Woodstox would need to throw one
 * (especially <code>getText</code> and its variations), we may need
 * to delay throwing an exception until {@link #next} is called next
 * time. If so, this variable holds the pending stream exception.
 */
protected XMLStreamException mPendingException = null;
351
352 /*
353 ///////////////////////////////////////////////////////////////////////
354 // DTD information (entities, content spec stub)
355 ///////////////////////////////////////////////////////////////////////
356 */
357
/**
 * Entities parsed from internal/external DTD subsets. Although it
 * will remain null for this class, extended classes make use of it,
 * plus, to be able to share some of entity resolution code, instance
 * is left here even though it semantically belongs to the sub-class.
 */
protected Map<String, EntityDecl> mGeneralEntities = null;

/**
 * Mode information needed at this level; mostly to check what kind
 * of textual content (if any) is allowed in current element
 * context. Constants come from
 * {@link XMLValidator},
 * (like {@link XMLValidator#CONTENT_ALLOW_VALIDATABLE_TEXT}).
 * Only used inside tree; ignored for prolog/epilog (which
 * have straight-forward static rules).
 */
protected int mVldContent = XMLValidator.CONTENT_ALLOW_ANY_TEXT;

/**
 * Configuration from {@code WstxInputProperties#RETURN_NULL_FOR_DEFAULT_NAMESPACE};
 * when true, {@link #getNamespacePrefix} reports the default namespace
 * prefix as null instead of an empty String.
 *
 * @since 4.1.2
 */
protected boolean mReturnNullForDefaultNamespace;
383
384 /*
385 ///////////////////////////////////////////////////////////////////////
386 // Instance construction, initialization
387 ///////////////////////////////////////////////////////////////////////
388 */
389
390 /**
391 * @param elemStack Input element stack to use; if null, will create
392 * instance locally.
393 * @param forER Override indicator; if true, this stream reader will be
394 * used by an event reader, and should modify some of the base config
395 * settings appropriately. If false, configuration settings are to
396 * be used as is.
397 */
398 protected BasicStreamReader(InputBootstrapper bs,
399 BranchingReaderSource input, ReaderCreator owner,
400 ReaderConfig cfg, InputElementStack elemStack,
401 boolean forER)
402 throws XMLStreamException
403 {
404 super(input, cfg, cfg.getEntityResolver());
405
406 mOwner = owner;
407
408 mTextBuffer = TextBuffer.createRecyclableBuffer(cfg);
409
410 // // // First, configuration settings:
411
412 mConfigFlags = cfg.getConfigFlags();
413 mCfgCoalesceText = (mConfigFlags & CFG_COALESCE_TEXT) != 0;
414 mCfgReportTextAsChars = (mConfigFlags & CFG_REPORT_CDATA) == 0;
415 mXml11 = cfg.isXml11();
416
417 // Can only use canonical white space if we are normalizing lfs
418 mCheckIndentation = mNormalizeLFs ? 16 : 0;
419
420 /* 30-Sep-2005, TSa: Let's not do lazy parsing when access is via
421 * Event API. Reason is that there will be no performance benefit
422 * (event objects always access full info right after traversal),
423 * but the wrapping of stream exceptions within runtime exception
424 * wrappers would happen, which is inconvenient (loss of stack trace,
425 * not catching all exceptions as expected)
426 */
427 mCfgLazyParsing = !forER && ((mConfigFlags & CFG_LAZY_PARSING) != 0);
428
429 /* There are a few derived settings used during tokenization that
430 * need to be initialized now...
431 */
432 if (mCfgCoalesceText) {
433 mStTextThreshold = TOKEN_FULL_COALESCED;
434 mShortestTextSegment = Integer.MAX_VALUE;
435 } else {
436 mStTextThreshold = TOKEN_PARTIAL_SINGLE;
437 if (forER) {
438 /* 30-Sep-2005, TSa: No point in returning runt segments for event readers
439 * (due to event object overhead, less convenient); let's just force
440 * returning of full length segments.
441 */
442 mShortestTextSegment = Integer.MAX_VALUE;
443 } else {
444 mShortestTextSegment = cfg.getShortestReportedTextSegment();
445 }
446 }
447
448 // // // Then handling of xml declaration data:
449
450 mDocXmlVersion = bs.getDeclaredVersion();
451 mDocInputEncoding = bs.getInputEncoding();
452 mDocXmlEncoding = bs.getDeclaredEncoding();
453
454 String sa = bs.getStandalone();
455 if (sa == null) {
456 mDocStandalone = DOC_STANDALONE_UNKNOWN;
457 } else {
458 if (XmlConsts.XML_SA_YES.equals(sa)) {
459 mDocStandalone = DOC_STANDALONE_YES;
460 } else {
461 mDocStandalone = DOC_STANDALONE_NO;
462 }
463 }
464
465 /* Ok; either we got declaration or not, but in either case we can
466 * now initialize prolog parsing settings, without having to really
467 * parse anything more.
468 */
469 /* 07-Oct-2005, TSa: Except, if we are in fragment mode, in which
470 * case we are kind of "in tree" mode...
471 */
472 mParseState = mConfig.inputParsingModeFragment() ?
473 STATE_TREE : STATE_PROLOG;
474
475 // // // And then connecting element stack and attribute collector
476
477 mElementStack = elemStack;
478 mAttrCollector = elemStack.getAttrCollector();
479
480 // And finally, location information may have offsets:
481 input.initInputLocation(this, mCurrDepth, 0);
482
483 elemStack.connectReporter(this);
484 mReturnNullForDefaultNamespace = mConfig.returnNullForDefaultNamespace();
485 }
486
487 protected static InputElementStack createElementStack(ReaderConfig cfg)
488 {
489 return new InputElementStack(cfg, cfg.willSupportNamespaces());
490 }
491
492 /*
493 ///////////////////////////////////////////////////////////////////////
494 // XMLStreamReader, document info
495 ///////////////////////////////////////////////////////////////////////
496 */
497
498 /**
499 * As per Stax (1.0) specs, needs to return whatever xml declaration
500 * claimed encoding is, if any; or null if no xml declaration found.
501 *<p>
502 * Note: method name is rather confusing (compare to {@link #getEncoding}).
503 */
504 @Override
505 public String getCharacterEncodingScheme() {
506 return mDocXmlEncoding;
507 }
508
509 /**
510 * As per Stax (1.0) specs, needs to return whatever parser determined
511 * the encoding was, if it was able to figure it out. If not (there are
512 * cases where this can not be found; specifically when being passed a
513 * {@link Reader}), it should return null.
514 */
515 @Override
516 public String getEncoding() {
517 return mDocInputEncoding;
518 }
519
520 @Override
521 public String getVersion()
522 {
523 if (mDocXmlVersion == XmlConsts.XML_V_10) {
524 return XmlConsts.XML_V_10_STR;
525 }
526 if (mDocXmlVersion == XmlConsts.XML_V_11) {
527 return XmlConsts.XML_V_11_STR;
528 }
529 return null; // unknown
530 }
531
532 @Override
533 public boolean isStandalone() {
534 return mDocStandalone == DOC_STANDALONE_YES;
535 }
536
537 @Override
538 public boolean standaloneSet() {
539 return mDocStandalone != DOC_STANDALONE_UNKNOWN;
540 }
541
542 /*
543 ///////////////////////////////////////////////////////////////////////
544 // Public API, configuration
545 ///////////////////////////////////////////////////////////////////////
546 */
547
548 @Override
549 public Object getProperty(String name)
550 {
551 /* 18-Nov-2008, TSa: As per [WSTX-50], should report the
552 * actual Base URL. It can be overridden by matching
553 * setProperty, but if not, is set to actual source
554 * of content being parsed.
555 */
556 if (WstxInputProperties.P_BASE_URL.equals(name)) {
557 try {
558 return mInput.getSource();
559 } catch (IOException e) { // not optimal but...
560 throw new IllegalStateException(e);
561 }
562 }
563 /* 23-Apr-2008, TSa: Let's NOT throw IllegalArgumentException
564 * for unknown property; JavaDocs do not suggest it needs
565 * to be done (different from that of XMLInputFactory
566 * and XMLStreamWriter specification)
567 */
568 return mConfig.safeGetProperty(name);
569 }
570
571 /*
572 ///////////////////////////////////////////////////////////////////////
573 // XMLStreamReader, current state
574 ///////////////////////////////////////////////////////////////////////
575 */
576
577 // // // Attribute access:
578
579 @Override
580 public int getAttributeCount() {
581 if (mCurrToken != START_ELEMENT) {
582 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
583 }
584 return mAttrCollector.getCount();
585 }
586
587 @Override
588 public String getAttributeLocalName(int index) {
589 if (mCurrToken != START_ELEMENT) {
590 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
591 }
592 return mAttrCollector.getLocalName(index);
593 }
594
595 @Override
596 public QName getAttributeName(int index) {
597 if (mCurrToken != START_ELEMENT) {
598 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
599 }
600 return mAttrCollector.getQName(index);
601 }
602
603 @Override
604 public String getAttributeNamespace(int index) {
605 if (mCurrToken != START_ELEMENT) {
606 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
607 }
608 // Internally it's marked as null, externally need to see ""
609 String uri = mAttrCollector.getURI(index);
610 return (uri == null) ? XmlConsts.ATTR_NO_NS_URI : uri;
611 }
612
613 @Override
614 public String getAttributePrefix(int index) {
615 if (mCurrToken != START_ELEMENT) {
616 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
617 }
618 // Internally it's marked as null, externally need to see ""
619 String p = mAttrCollector.getPrefix(index);
620 return (p == null) ? XmlConsts.ATTR_NO_PREFIX : p;
621 }
622
623 @Override
624 public String getAttributeType(int index) {
625 if (mCurrToken != START_ELEMENT) {
626 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
627 }
628 // Attr. collector doesn't know it, elem stack does:
629 return mElementStack.getAttributeType(index);
630 }
631
632 @Override
633 public String getAttributeValue(int index) {
634 if (mCurrToken != START_ELEMENT) {
635 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
636 }
637 return mAttrCollector.getValue(index);
638 }
639
640 @Override
641 public String getAttributeValue(String nsURI, String localName) {
642 if (mCurrToken != START_ELEMENT) {
643 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
644 }
645 // 22-Aug-2018, tatu: As per [woodstox-core#53], need different logic
646 // for `null` namespace URI argument
647 if (nsURI == null) {
648 return mAttrCollector.getValueByLocalName(localName);
649 }
650 return mAttrCollector.getValue(nsURI, localName);
651 }
652
653 /**
654 * From StAX specs:
655 *<blockquote>
656 * Reads the content of a text-only element, an exception is thrown if
657 * this is not a text-only element.
658 * Regardless of value of javax.xml.stream.isCoalescing this method always
659 * returns coalesced content.
660 *<br>Precondition: the current event is START_ELEMENT.
661 *<br>Postcondition: the current event is the corresponding END_ELEMENT.
662 *</blockquote>
663 */
664 @Override
665 public String getElementText()
666 throws XMLStreamException
667 {
668 if (mCurrToken != START_ELEMENT) {
669 throwParseError(ErrorConsts.ERR_STATE_NOT_STELEM, null, null);
670 }
671 /* Ok, now: with START_ELEMENT we know that it's not partially
672 * processed; that we are in-tree (not prolog or epilog).
673 * The only possible complication would be:
674 */
675 if (mStEmptyElem) {
676 /* And if so, we'll then get 'virtual' close tag; things
677 * are simple as location info was set when dealing with
678 * empty start element; and likewise, validation (if any)
679 * has been taken care of
680 */
681 mStEmptyElem = false;
682 mCurrToken = END_ELEMENT;
683 return "";
684 }
685
686 // First need to find a textual event
687 while (true) {
688 int type = next();
689 if (type == END_ELEMENT) {
690 return "";
691 }
692 if (type == COMMENT || type == PROCESSING_INSTRUCTION) {
693 continue;
694 }
695 if (((1 << type) & MASK_GET_ELEMENT_TEXT) == 0) {
696 throw _constructUnexpectedInTyped(type);
697 }
698 break;
699 }
700
701 if (mTokenState < TOKEN_FULL_COALESCED) {
702 readCoalescedText(mCurrToken, false);
703 }
704
705 /* Ok: then a quick check; if it looks like we are directly
706 * followed by the end tag, we need not construct String
707 * quite yet.
708 */
709 if ((mInputPtr + 1) < mInputEnd &&
710 mInputBuffer[mInputPtr] == '<' && mInputBuffer[mInputPtr+1] == '/') {
711 // Note: next() has validated text, no need for more validation
712 mInputPtr += 2;
713 mCurrToken = END_ELEMENT;
714 // must first get text, as call to readEndElem may break it:
715 String result = mTextBuffer.contentsAsString();
716 // Can by-pass next(), nextFromTree(), in this case:
717 readEndElem();
718 // and then return results
719 return result;
720 }
721
722 // Otherwise, we'll need to do slower processing
723 int extra = 1 + (mTextBuffer.size() >> 1); // let's add 50% space
724 StringBuilder sb = mTextBuffer.contentsAsStringBuilder(extra);
725 int type;
726
727 while ((type = next()) != END_ELEMENT) {
728 if (((1 << type) & MASK_GET_ELEMENT_TEXT) != 0) {
729 if (mTokenState < mStTextThreshold) {
730 finishToken(false);
731 }
732 verifyLimit("Text size", mConfig.getMaxTextLength(), sb.length());
733 mTextBuffer.contentsToStringBuilder(sb);
734 continue;
735 }
736 if (type != COMMENT && type != PROCESSING_INSTRUCTION) {
737 throw _constructUnexpectedInTyped(type);
738 }
739 }
740 // Note: calls next() have validated text, no need for more validation
741 return sb.toString();
742 }
743
744 /**
745 * Returns type of the last event returned; or START_DOCUMENT before
746 * any events has been explicitly returned.
747 */
748 @Override
749 public int getEventType()
750 {
751 /* Only complication -- multi-part coalesced text is to be reported
752 * as CHARACTERS always, never as CDATA (StAX specs).
753 */
754 if (mCurrToken == CDATA) {
755 if (mCfgCoalesceText || mCfgReportTextAsChars) {
756 return CHARACTERS;
757 }
758 }
759 return mCurrToken;
760 }
761
762 @Override
763 public String getLocalName()
764 {
765 // Note: for this we need not (yet) finish reading element
766 if (mCurrToken == START_ELEMENT || mCurrToken == END_ELEMENT) {
767 return mElementStack.getLocalName();
768 }
769 if (mCurrToken == ENTITY_REFERENCE) {
770 /* 30-Sep-2005, TSa: Entity will be null in non-expanding mode
771 * if no definition was found:
772 */
773 return (mCurrEntity == null) ? mCurrName: mCurrEntity.getName();
774 }
775 throw new IllegalStateException("Current state not START_ELEMENT, END_ELEMENT or ENTITY_REFERENCE");
776 }
777
778 // // // getLocation() defined in StreamScanner
779
780 @Override
781 public QName getName()
782 {
783 if (mCurrToken != START_ELEMENT && mCurrToken != END_ELEMENT) {
784 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_ELEM);
785 }
786 return mElementStack.getCurrentElementName();
787 }
788
789 // // // Namespace access
790
791 @Override
792 public NamespaceContext getNamespaceContext() {
793 /* Unlike other getNamespaceXxx methods, this is available
794 * for all events.
795 * Note that the context is "live", ie. remains active (but not
796 * static) even through calls to next(). StAX compliant apps
797 * should not count on this behaviour, however.
798 */
799 return mElementStack;
800 }
801
802 @Override
803 public int getNamespaceCount() {
804 if (mCurrToken != START_ELEMENT && mCurrToken != END_ELEMENT) {
805 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_ELEM);
806 }
807 return mElementStack.getCurrentNsCount();
808 }
809
810 @Override
811 public String getNamespacePrefix(int index) {
812 if (mCurrToken != START_ELEMENT && mCurrToken != END_ELEMENT) {
813 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_ELEM);
814 }
815 // Internally it's marked as null, externally need to see "" or null, depending
816 String p = mElementStack.getLocalNsPrefix(index);
817 if (p == null) {
818 return mReturnNullForDefaultNamespace ? null : XmlConsts.ATTR_NO_PREFIX;
819 }
820 return p;
821 }
822
823 @Override
824 public String getNamespaceURI() {
825 if (mCurrToken != START_ELEMENT && mCurrToken != END_ELEMENT) {
826 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_ELEM);
827 }
828 // Internally it's marked as null, externally need to see ""
829 String uri = mElementStack.getNsURI();
830 return (uri == null) ? XmlConsts.ELEM_NO_NS_URI : uri;
831 }
832
833 @Override
834 public String getNamespaceURI(int index)
835 {
836 if (mCurrToken != START_ELEMENT && mCurrToken != END_ELEMENT) {
837 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_ELEM);
838 }
839 // Internally it's marked as null, externally need to see ""
840 String uri = mElementStack.getLocalNsURI(index);
841 return (uri == null) ? XmlConsts.ATTR_NO_NS_URI : uri;
842 }
843
844 @Override
845 public String getNamespaceURI(String prefix)
846 {
847 if (mCurrToken != START_ELEMENT && mCurrToken != END_ELEMENT) {
848 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_ELEM);
849 }
850 /* Note: this will need to return null if no URI found for
851 * the prefix, so we can't mask it.
852 */
853 return mElementStack.getNamespaceURI(prefix);
854 }
855
856 @Override
857 public String getPIData() {
858 if (mCurrToken != PROCESSING_INSTRUCTION) {
859 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_PI);
860 }
861 if (mTokenState <= TOKEN_STARTED) {
862 safeFinishToken();
863 }
864 return mTextBuffer.contentsAsString();
865 }
866
867 @Override
868 public String getPITarget() {
869 if (mCurrToken != PROCESSING_INSTRUCTION) {
870 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_PI);
871 }
872 // Target is always parsed automatically, not lazily...
873 return mCurrName;
874 }
875
876 @Override
877 public String getPrefix() {
878 if (mCurrToken != START_ELEMENT && mCurrToken != END_ELEMENT) {
879 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_ELEM);
880 }
881 // Internally it's marked as null, externally need to see ""
882 String p = mElementStack.getPrefix();
883 return (p == null) ? XmlConsts.ELEM_NO_PREFIX : p;
884 }
885
886 @Override
887 public String getText()
888 {
889 final int currToken = mCurrToken;
890 if (((1 << currToken) & MASK_GET_TEXT) == 0) {
891 throwNotTextual(currToken);
892 }
893 if (mTokenState < mStTextThreshold) {
894 safeFinishToken();
895 }
896 if (currToken == ENTITY_REFERENCE) {
897 return (mCurrEntity == null) ? null : mCurrEntity.getReplacementText();
898 }
899 if (currToken == DTD) {
900 // 16-Aug-2004, TSa: Hmmh. Specs are bit ambiguous on whether this
901 // should return just the internal subset, or the whole thing...
902 return getDTDInternalSubset();
903 }
904 return mTextBuffer.contentsAsString();
905 }
906
907 @Override
908 public char[] getTextCharacters()
909 {
910 final int currToken = mCurrToken;
911 if (((1 << currToken) & MASK_GET_TEXT_XXX) == 0) {
912 throwNotTextXxx(currToken);
913 }
914 if (mTokenState < mStTextThreshold) {
915 safeFinishToken();
916 }
917 if (currToken == ENTITY_REFERENCE) {
918 return mCurrEntity.getReplacementChars();
919 }
920 if (currToken == DTD) {
921 return getDTDInternalSubsetArray();
922 }
923 return mTextBuffer.getTextBuffer();
924 }
925
926 @Override
927 public int getTextCharacters(int sourceStart, char[] target, int targetStart, int len)
928 {
929 final int currToken = mCurrToken;
930 if (((1 << currToken) & MASK_GET_TEXT_XXX) == 0) {
931 throwNotTextXxx(currToken);
932 }
933 if (mTokenState < mStTextThreshold) {
934 safeFinishToken();
935 }
936 return mTextBuffer.contentsToArray(sourceStart, target, targetStart, len);
937 }
938
939 @Override
940 public int getTextLength()
941 {
942 final int currToken = mCurrToken;
943 if (((1 << currToken) & MASK_GET_TEXT_XXX) == 0) {
944 throwNotTextXxx(currToken);
945 }
946 if (mTokenState < mStTextThreshold) {
947 safeFinishToken();
948 }
949 return mTextBuffer.size();
950 }
951
952 @Override
953 public int getTextStart()
954 {
955 final int currToken = mCurrToken;
956 if (((1 << currToken) & MASK_GET_TEXT_XXX) == 0) {
957 throwNotTextXxx(currToken);
958 }
959 if (mTokenState < mStTextThreshold) {
960 safeFinishToken();
961 }
962 return mTextBuffer.getTextStart();
963 }
964
965 @Override
966 public boolean hasName() {
967 return (mCurrToken == START_ELEMENT) || (mCurrToken == END_ELEMENT);
968 }
969
970 @Override
971 public boolean hasNext() {
972 // 08-Oct-2005, TSa: In multi-doc mode, we have different criteria...
973 return (mCurrToken != END_DOCUMENT)
974 || (mParseState == STATE_MULTIDOC_HACK);
975 }
976
977 @Override
978 public boolean hasText() {
979 return (((1 << mCurrToken) & MASK_GET_TEXT) != 0);
980 }
981
982 @Override
983 public boolean isAttributeSpecified(int index)
984 {
985 /* No need to check for ATTRIBUTE since we never return that...
986 */
987 if (mCurrToken != START_ELEMENT) {
988 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
989 }
990 return mAttrCollector.isSpecified(index);
991 }
992
993 @Override
994 public boolean isCharacters()
995 {
996 /* 21-Dec-2005, TSa: Changed for 3.0 to work the same way as stax
997 * ref impl.
998 */
999 //return (mCurrToken == CHARACTERS || mCurrToken == CDATA || mCurrToken == SPACE);
1000 /* 21-Apr-2009, TSa: As per [WSTX-201], should be consistent with
1001 * what getEventType() returns (affects CDATA, SPACE, in
1002 * coalescing mode or when explicitly asked to return CDATA
1003 * as CHARACTERS)
1004 */
1005 return (CHARACTERS == getEventType());
1006 }
1007
1008 @Override
1009 public boolean isEndElement() {
1010 return (mCurrToken == END_ELEMENT);
1011 }
1012
1013 @Override
1014 public boolean isStartElement() {
1015 return (mCurrToken == START_ELEMENT);
1016 }
1017
1018 /**
1019 *<p>
1020 * 05-Apr-2004, TSa: Could try to determine status when text is actually
1021 * read. That'd prevent double reads... but would it slow down that
1022 * one reading so that net effect would be negative?
1023 */
1024 @Override
1025 public boolean isWhiteSpace()
1026 {
1027 final int currToken = mCurrToken;
1028 if (currToken == CHARACTERS || currToken == CDATA) {
1029 if (mTokenState < mStTextThreshold) {
1030 safeFinishToken();
1031 }
1032 if (mWsStatus == ALL_WS_UNKNOWN) {
1033 mWsStatus = mTextBuffer.isAllWhitespace() ?
1034 ALL_WS_YES : ALL_WS_NO;
1035 }
1036 return mWsStatus == ALL_WS_YES;
1037 }
1038 return (currToken == SPACE);
1039 }
1040
1041 @Override
1042 public void require(int type, String nsUri, String localName)
1043 throws XMLStreamException
1044 {
1045 int curr = mCurrToken;
1046
1047 /* There are some special cases; specifically, CDATA
1048 * is sometimes reported as CHARACTERS. Let's be lenient by
1049 * allowing both 'real' and reported types, for now.
1050 */
1051 if (curr != type) {
1052 if (curr == CDATA) {
1053 if (mCfgCoalesceText || mCfgReportTextAsChars) {
1054 curr = CHARACTERS;
1055 }
1056 } else if (curr == SPACE) {
1057 // Hmmh. Should we require it to be empty or something?
1058 //curr = CHARACTERS;
1059 // For now, let's not change the check
1060 }
1061 }
1062
1063 if (type != curr) {
1064 throwParseError("Expected type "+tokenTypeDesc(type)
1065 +", current type "
1066 +tokenTypeDesc(curr));
1067 }
1068
1069 if (localName != null) {
1070 if (curr != START_ELEMENT && curr != END_ELEMENT
1071 && curr != ENTITY_REFERENCE) {
1072 throwParseError("Expected non-null local name, but current token not a START_ELEMENT, END_ELEMENT or ENTITY_REFERENCE (was "+tokenTypeDesc(mCurrToken)+")");
1073 }
1074 String n = getLocalName();
1075 if (n != localName && !n.equals(localName)) {
1076 throwParseError("Expected local name '"+localName+"'; current local name '"+n+"'.");
1077 }
1078 }
1079 if (nsUri != null) {
1080 if (curr != START_ELEMENT && curr != END_ELEMENT) {
1081 throwParseError("Expected non-null NS URI, but current token not a START_ELEMENT or END_ELEMENT (was "+tokenTypeDesc(curr)+")");
1082 }
1083 String uri = mElementStack.getNsURI();
1084 // No namespace?
1085 if (nsUri.length() == 0) {
1086 if (uri != null && uri.length() > 0) {
1087 throwParseError("Expected empty namespace, instead have '"+uri+"'.");
1088 }
1089 } else {
1090 if ((nsUri != uri) && !nsUri.equals(uri)) {
1091 throwParseError("Expected namespace '"+nsUri+"'; have '"
1092 +uri+"'.");
1093 }
1094 }
1095 }
1096 // Ok, fine, all's good
1097 }
1098
1099 /*
1100 ///////////////////////////////////////////////////////////////////////
1101 // XMLStreamReader, iterating
1102 ///////////////////////////////////////////////////////////////////////
1103 */
1104
1105 @Override
1106 public final int next() throws XMLStreamException
1107 {
1108 /* 24-Sep-2006, TSa: We may have deferred an exception that occurred
1109 * during parsing of the previous event. If so, now it needs to
1110 * be thrown.
1111 */
1112 if (mPendingException != null) {
1113 XMLStreamException strEx = mPendingException;
1114 mPendingException = null;
1115 throw strEx;
1116 }
1117
1118 /* Note: can not yet accurately record the location, since the
1119 * previous event might not yet be completely finished...
1120 */
1121 if (mParseState == STATE_TREE) {
1122 int type = nextFromTree();
1123 mCurrToken = type;
1124 if (mTokenState < mStTextThreshold) { // incomplete?
1125 /* Can remain incomplete if lazy parsing is enabled,
1126 * and this is not a validatable text segment; otherwise
1127 * must finish
1128 */
1129 if (!mCfgLazyParsing ||
1130 (mValidateText && (type == CHARACTERS || type == CDATA))) {
1131 finishToken(false);
1132 }
1133 }
1134
1135 /* Special cases -- sometimes (when coalescing text, or
1136 * when specifically configured to do so), CDATA and SPACE are
1137 * to be reported as CHARACTERS, although we still will
1138 * internally keep track of the real type.
1139 */
1140 if (type == CDATA) {
1141 if (mValidateText) {
1142 mElementStack.validateText(mTextBuffer, false);
1143 }
1144 if (mCfgCoalesceText || mCfgReportTextAsChars) {
1145 return CHARACTERS;
1146 }
1147 /*
1148 } else if (type == SPACE) {
1149 //if (mValidateText) { throw new IllegalStateException("Internal error: trying to validate SPACE event"); }
1150 */
1151 mCurrTextLength += mTextBuffer.size();
1152 verifyLimit("Text size", mConfig.getMaxTextLength(), mCurrTextLength);
1153 } else if (type == CHARACTERS) {
1154 if (mValidateText) {
1155 /* We may be able to determine that there will be
1156 * no more text coming for this element: but only
1157 * seeing the end tag marker ("</") is certain
1158 * (PIs and comments won't do, nor CDATA; start
1159 * element possibly... but that indicates mixed
1160 * content that's generally non-validatable)
1161 */
1162 if ((mInputPtr+1) < mInputEnd
1163 && mInputBuffer[mInputPtr] == '<'
1164 && mInputBuffer[mInputPtr+1] == '/') {
1165 // yup, it's all there is
1166 mElementStack.validateText(mTextBuffer, true);
1167 } else {
1168 mElementStack.validateText(mTextBuffer, false);
1169 }
1170 }
1171 mCurrTextLength += mTextBuffer.size();
1172 verifyLimit("Text size", mConfig.getMaxTextLength(), mCurrTextLength);
1173 } else if (type == START_ELEMENT || type == END_ELEMENT) {
1174 this.mCurrTextLength = 0;
1175 }
1176 return type;
1177 }
1178
1179 if (mParseState == STATE_PROLOG) {
1180 nextFromProlog(true);
1181 } else if (mParseState == STATE_EPILOG) {
1182 if (nextFromProlog(false)) {
1183 // We'll return END_DOCUMENT, need to mark it 'as consumed'
1184 mSecondaryToken = 0;
1185
1186 }
1187 } else if (mParseState == STATE_MULTIDOC_HACK) {
1188 mCurrToken = nextFromMultiDocState();
1189 } else { // == STATE_CLOSED
1190 if (mSecondaryToken == END_DOCUMENT) { // marker
1191 mSecondaryToken = 0; // mark end doc as consumed
1192 return END_DOCUMENT;
1193 }
1194 throw new java.util.NoSuchElementException();
1195 }
1196 return mCurrToken;
1197 }
1198
1199 @Override
1200 public int nextTag() throws XMLStreamException
1201 {
1202 while (true) {
1203 int next = next();
1204
1205 switch (next) {
1206 case SPACE:
1207 case COMMENT:
1208 case PROCESSING_INSTRUCTION:
1209 continue;
1210 case CDATA:
1211 case CHARACTERS:
1212 // inlined version of "isWhiteSpace()", so that exceptions can be passed as-is
1213 // without suppression
1214 if (mTokenState < mStTextThreshold) {
1215 finishToken(false);
1216 }
1217 if (mWsStatus == ALL_WS_UNKNOWN) {
1218 mWsStatus = mTextBuffer.isAllWhitespace() ? ALL_WS_YES : ALL_WS_NO;
1219 }
1220 if (mWsStatus == ALL_WS_YES) {
1221 continue;
1222 }
1223 throwParseError("Received non-all-whitespace CHARACTERS or CDATA event in nextTag().");
1224 break; // never gets here, but jikes complains without
1225 case START_ELEMENT:
1226 case END_ELEMENT:
1227 return next;
1228 }
1229 throwParseError("Received event "+ErrorConsts.tokenTypeDesc(next)
1230 +", instead of START_ELEMENT or END_ELEMENT.");
1231 }
1232 }
1233
1234 /**
1235 *<p>
1236 * Note: as per StAX 1.0 specs, this method does NOT close the underlying
1237 * input reader. That is, unless the new StAX2 property
1238 * {@link org.codehaus.stax2.XMLInputFactory2#P_AUTO_CLOSE_INPUT} is
1239 * set to true.
1240 */
1241 @Override
1242 public void close() throws XMLStreamException
1243 {
1244 if (mParseState != STATE_CLOSED) {
1245 mParseState = STATE_CLOSED;
1246 /* Let's see if we should notify factory that symbol table
1247 * has new entries, and may want to reuse this symbol table
1248 * instead of current root.
1249 */
1250 if (mCurrToken != END_DOCUMENT) {
1251 mCurrToken = mSecondaryToken = END_DOCUMENT;
1252 if (mSymbols.isDirty()) {
1253 mOwner.updateSymbolTable(mSymbols);
1254 }
1255 }
1256 /* Hmmh. Actually, we need to close all the dependant input
1257 * sources, first, and then also call close()
1258 * on the root input source object; it
1259 * will only do real close if that was enabled earlier.
1260 * The root input source also prevents multiple close() calls
1261 * for the underlying source, so we need not check that here.
1262 */
1263 closeAllInput(false);
1264 // And finally, can now recycle low-level (text) buffers
1265 mTextBuffer.recycle(true);
1266 }
1267 }
1268
1269 /*
1270 ///////////////////////////////////////////////////////////////////////
1271 // XMLStreamReader2 (StAX2) implementation
1272 ///////////////////////////////////////////////////////////////////////
1273 */
1274
1275 @Override
1276 @Deprecated
1277 public Object getFeature(String name) {
1278 throw new IllegalArgumentException(MessageFormat.format(ErrorConsts.ERR_UNKNOWN_FEATURE, new Object[] { name }));
1279 }
1280
1281 @Override
1282 @Deprecated
1283 public void setFeature(String name, Object value) {
1284 throw new IllegalArgumentException(MessageFormat.format(ErrorConsts.ERR_UNKNOWN_FEATURE, new Object[] { name }));
1285 }
1286
1287 // NOTE: getProperty() defined in Stax 1.0 interface
1288
1289 @Override
1290 public boolean isPropertySupported(String name) {
1291 // !!! TBI: not all these properties are really supported
1292 return mConfig.isPropertySupported(name);
1293 }
1294
1295 /**
1296 * @param name Name of the property to set
1297 * @param value Value to set property to.
1298 *
1299 * @return True, if the specified property was <b>succesfully</b>
1300 * set to specified value; false if its value was not changed
1301 */
1302 @Override
1303 public boolean setProperty(String name, Object value)
1304 {
1305 boolean ok = mConfig.setProperty(name, value);
1306 /* To make [WSTX-50] work fully dynamically (i.e. allow
1307 * setting BASE_URL after stream reader has been constructed)
1308 * need to force
1309 */
1310 if (ok && WstxInputProperties.P_BASE_URL.equals(name)) {
1311 // Easiest to just access from config: may come in as a String etc
1312 mInput.overrideSource(mConfig.getBaseURL());
1313 }
1314 return ok;
1315 }
1316
1317 // // // StAX2, additional traversal methods
1318
1319 @Override
1320 public void skipElement() throws XMLStreamException
1321 {
1322 if (mCurrToken != START_ELEMENT) {
1323 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
1324 }
1325 int nesting = 1; // need one more end elements than start elements
1326
1327 while (true) {
1328 int type = next();
1329 if (type == START_ELEMENT) {
1330 ++nesting;
1331 } else if (type == END_ELEMENT) {
1332 if (--nesting == 0) {
1333 break;
1334 }
1335 }
1336 }
1337 }
1338
1339 // // // StAX2, additional attribute access
1340
1341 @Override
1342 public AttributeInfo getAttributeInfo() throws XMLStreamException
1343 {
1344 if (mCurrToken != START_ELEMENT) {
1345 throw new IllegalStateException(ErrorConsts.ERR_STATE_NOT_STELEM);
1346 }
1347 /* Although attribute collector knows about specific parsed
1348 * information, the element stack has DTD-derived information (if
1349 * any)... and knows how to call attribute collector when necessary.
1350 */
1351 return mElementStack;
1352 }
1353
1354 // // // StAX2, Additional DTD access
1355
1356 /**
1357 * Since this class implements {@link DTDInfo}, method can just
1358 * return <code>this</code>.
1359 */
1360 @Override
1361 public DTDInfo getDTDInfo() throws XMLStreamException
1362 {
1363 /* Let's not allow it to be accessed during other events -- that
1364 * way callers won't count on it being available afterwards.
1365 */
1366 if (mCurrToken != DTD) {
1367 return null;
1368 }
1369 if (mTokenState < TOKEN_FULL_SINGLE) { // need to fully read it in now
1370 finishToken(false);
1371 }
1372 return this;
1373 }
1374
1375 // // // StAX2, Additional location information
1376
1377 /**
1378 * Location information is always accessible, for this reader.
1379 */
1380 @Override
1381 public final LocationInfo getLocationInfo() {
1382 return this;
1383 }
1384
1385 // // // StAX2, Pass-through text accessors
1386
1387
1388 /**
1389 * Method similar to {@link #getText()}, except
1390 * that it just uses provided Writer to write all textual content.
1391 * For further optimization, it may also be allowed to do true
1392 * pass-through, thus possibly avoiding one temporary copy of the
1393 * data.
1394 *<p>
1395 * TODO: try to optimize to allow completely streaming pass-through:
1396 * currently will still read all data in memory buffers before
1397 * outputting
1398 *
1399 * @param w Writer to use for writing textual contents
1400 * @param preserveContents If true, reader has to preserve contents
1401 * so that further calls to <code>getText</code> will return
1402 * proper conntets. If false, reader is allowed to skip creation
1403 * of such copies: this can improve performance, but it also means
1404 * that further calls to <code>getText</code> is not guaranteed to
1405 * return meaningful data.
1406 *
1407 * @return Number of characters written to the reader
1408 */
1409 @Override
1410 public int getText(Writer w, boolean preserveContents)
1411 throws IOException, XMLStreamException
1412 {
1413 final int currToken = mCurrToken;
1414 if (((1 << currToken) & MASK_GET_TEXT_WITH_WRITER) == 0) {
1415 throwNotTextual(currToken);
1416 }
1417 /* May need to be able to do fully streaming... but only for
1418 * text events that have not yet been fully read; for other
1419 * types there's less benefit, and for fully read ones, we
1420 * already have everything ready.
1421 */
1422 if (!preserveContents) {
1423 if (currToken == CHARACTERS) {
1424 int count = mTextBuffer.rawContentsTo(w);
1425 /* Let's also clear whatever was collected (as allowed by
1426 * method contract) previously, to both save memory, and
1427 * to ensure caller doesn't accidentally try to access it
1428 * (and get otherwise 'random' results).
1429 */
1430 mTextBuffer.resetWithEmpty();
1431 if (mTokenState < TOKEN_FULL_SINGLE) {
1432 count += readAndWriteText(w);
1433 }
1434 if (mCfgCoalesceText &&
1435 (mTokenState < TOKEN_FULL_COALESCED)) {
1436 if (mCfgCoalesceText) {
1437 count += readAndWriteCoalesced(w, false);
1438 }
1439 }
1440 return count;
1441 } else if (currToken == CDATA) {
1442 int count = mTextBuffer.rawContentsTo(w);
1443 mTextBuffer.resetWithEmpty(); // same as with CHARACTERS
1444 if (mTokenState < TOKEN_FULL_SINGLE) {
1445 count += readAndWriteCData(w);
1446 }
1447 if (mCfgCoalesceText &&
1448 (mTokenState < TOKEN_FULL_COALESCED)) {
1449 if (mCfgCoalesceText) {
1450 count += readAndWriteCoalesced(w, true);
1451 }
1452 }
1453 return count;
1454 }
1455 }
1456 if (mTokenState < mStTextThreshold) {
1457 /* Otherwise, let's just finish the token; and due to guarantee
1458 * by streaming method, let's try ensure we get it all.
1459 */
1460 finishToken(false); // false -> shouldn't defer errors
1461 }
1462 if (currToken == ENTITY_REFERENCE) {
1463 return mCurrEntity.getReplacementText(w);
1464 }
1465 if (currToken == DTD) {
1466 char[] ch = getDTDInternalSubsetArray();
1467 if (ch != null) {
1468 w.write(ch);
1469 return ch.length;
1470 }
1471 return 0;
1472 }
1473 return mTextBuffer.rawContentsTo(w);
1474 }
1475
1476 // // // StAX 2, Other accessors
1477
1478 /**
1479 * @return Number of open elements in the stack; 0 when parser is in
1480 * prolog/epilog, 1 inside root element and so on.
1481 */
1482 @Override
1483 public int getDepth() {
1484 /* Note: we can not necessarily use mCurrDepth, since it is
1485 * directly synchronized to the input (to catch unbalanced entity
1486 * expansion WRT element nesting), and not to actual token values
1487 * returned.
1488 */
1489 return mElementStack.getDepth();
1490 }
1491
1492 /**
1493 * @return True, if cursor points to a start or end element that is
1494 * constructed from 'empty' element (ends with {@code '/>'});
1495 * false otherwise.
1496 */
1497 @Override
1498 public boolean isEmptyElement() throws XMLStreamException {
1499 return (mCurrToken == START_ELEMENT) ? mStEmptyElem : false;
1500 }
1501
1502 @Override
1503 public NamespaceContext getNonTransientNamespaceContext() {
1504 // null -> no Location info, not needed with basic API
1505 return mElementStack.createNonTransientNsContext(null);
1506 }
1507
1508 @Override
1509 public String getPrefixedName()
1510 {
1511 switch (mCurrToken) {
1512 case START_ELEMENT:
1513 case END_ELEMENT:
1514 {
1515 String prefix = mElementStack.getPrefix();
1516 String ln = mElementStack.getLocalName();
1517
1518 if (prefix == null) {
1519 return ln;
1520 }
1521 StringBuilder sb = new StringBuilder(ln.length() + 1 + prefix.length());
1522 sb.append(prefix);
1523 sb.append(':');
1524 sb.append(ln);
1525 return sb.toString();
1526 }
1527 case ENTITY_REFERENCE:
1528 return getLocalName();
1529 case PROCESSING_INSTRUCTION:
1530 return getPITarget();
1531 case DTD:
1532 return getDTDRootName();
1533
1534 }
1535 throw new IllegalStateException("Current state not START_ELEMENT, END_ELEMENT, ENTITY_REFERENCE, PROCESSING_INSTRUCTION or DTD");
1536 }
1537
1538 @Override
1539 public void closeCompletely() throws XMLStreamException {
1540 closeAllInput(true);
1541 }
1542
1543 /*
1544 ///////////////////////////////////////////////////////////////////////
1545 // DTDInfo implementation (StAX 2)
1546 ///////////////////////////////////////////////////////////////////////
1547 */
1548
1549 /**
1550 *<p>
1551 * Note: DTD-handling sub-classes need to override this method.
1552 */
1553 @Override
1554 public Object getProcessedDTD() {
1555 return null;
1556 }
1557
1558 @Override
1559 public String getDTDRootName() {
1560 if (mRootPrefix == null) {
1561 return mRootLName;
1562 }
1563 return mRootPrefix + ":" + mRootLName;
1564 }
1565
1566 @Override
1567 public String getDTDPublicId() {
1568 return mDtdPublicId;
1569 }
1570
1571 @Override
1572 public String getDTDSystemId() {
1573 return mDtdSystemId;
1574 }
1575
1576 /**
1577 * @return Internal subset portion of the DOCTYPE declaration, if any;
1578 * empty String if none
1579 */
1580 @Override
1581 public String getDTDInternalSubset() {
1582 if (mCurrToken != DTD) {
1583 return null;
1584 }
1585 return mTextBuffer.contentsAsString();
1586 }
1587
1588 /**
1589 * Internal method used by implementation
1590 */
1591 private char[] getDTDInternalSubsetArray() {
1592 /* Note: no checks for current state, but only because it's
1593 * an internal method and callers are known to ensure it's ok
1594 * to call this
1595 */
1596 return mTextBuffer.contentsAsArray();
1597 }
1598
1599 // // StAX2, v2.0
1600
1601 /**
1602 * Sub-class will override this method
1603 */
1604 @Override
1605 public DTDValidationSchema getProcessedDTDSchema() {
1606 return null;
1607 }
1608
1609 /*
1610 ///////////////////////////////////////////////////////////////////////
1611 // LocationInfo implementation (StAX 2)
1612 ///////////////////////////////////////////////////////////////////////
1613 */
1614
1615 // // // First, the "raw" offset accessors:
1616
1617 @Override
1618 public long getStartingByteOffset() {
1619 /* 15-Apr-2005, TSa: No way to reliably keep track of byte offsets,
1620 * at least for variable-length encodings... so let's just
1621 * return -1 for now
1622 */
1623 return -1L;
1624 }
1625
1626 @Override
1627 public long getStartingCharOffset() {
1628 return mTokenInputTotal;
1629 }
1630
1631 @Override
1632 public long getEndingByteOffset() throws XMLStreamException
1633 {
1634 /* 15-Apr-2005, TSa: No way to reliably keep track of byte offsets,
1635 * at least for variable-length encodings... so let's just
1636 * return -1 for now
1637 */
1638 return -1;
1639 }
1640
1641 @Override
1642 public long getEndingCharOffset() throws XMLStreamException
1643 {
1644 // Need to get to the end of the token, if not there yet
1645 if (mTokenState < mStTextThreshold) {
1646 finishToken(false);
1647 }
1648 return mCurrInputProcessed + mInputPtr;
1649 }
1650
1651 // // // and then the object-based access methods:
1652
1653 @Override
1654 public final Location getLocation() {
1655 return getStartLocation();
1656 }
1657
1658 // public XMLStreamLocation2 getStartLocation() // from base class
1659 // public XMLStreamLocation2 getCurrentLocation() // - "" -
1660
1661 @Override
1662 public final XMLStreamLocation2 getEndLocation()
1663 throws XMLStreamException
1664 {
1665 // Need to get to the end of the token, if not there yet
1666 if (mTokenState < mStTextThreshold) {
1667 finishToken(false);
1668 }
1669 // And then we just need the current location!
1670 return getCurrentLocation();
1671 }
1672
1673 /*
1674 ///////////////////////////////////////////////////////////////////////
1675 // Stax2 validation
1676 ///////////////////////////////////////////////////////////////////////
1677 */
1678
1679 @Override
1680 public XMLValidator validateAgainst(XMLValidationSchema schema)
1681 throws XMLStreamException
1682 {
1683 // Not implemented by the basic reader:
1684 return null;
1685 }
1686
1687 @Override
1688 public XMLValidator stopValidatingAgainst(XMLValidationSchema schema)
1689 throws XMLStreamException
1690 {
1691 // Not implemented by the basic reader:
1692 return null;
1693 }
1694
1695 @Override
1696 public XMLValidator stopValidatingAgainst(XMLValidator validator)
1697 throws XMLStreamException
1698 {
1699 // Not implemented by the basic reader:
1700 return null;
1701 }
1702
1703 @Override
1704 public ValidationProblemHandler setValidationProblemHandler(ValidationProblemHandler h)
1705 {
1706 // Not implemented by the basic reader:
1707 return null;
1708 }
1709
1710 /*
1711 ///////////////////////////////////////////////////////////////////////
1712 // StreamReaderImpl implementation
1713 ///////////////////////////////////////////////////////////////////////
1714 */
1715
1716 @Override
1717 public EntityDecl getCurrentEntityDecl() {
1718 return mCurrEntity;
1719 }
1720
1721 /**
1722 * Method called by {@link com.ctc.wstx.evt.DefaultEventAllocator}
1723 * to get double-indirection necessary for constructing start element
1724 * events.
1725 *
1726 * @return Null, if stream does not point to start element; whatever
1727 * callback returns otherwise.
1728 */
1729 @Override
1730 public Object withStartElement(ElemCallback cb, Location loc)
1731 {
1732 if (mCurrToken != START_ELEMENT) {
1733 return null;
1734 }
1735 return cb.withStartElement(loc, getName(),
1736 mElementStack.createNonTransientNsContext(loc),
1737 mAttrCollector.buildAttrOb(),
1738 mStEmptyElem);
1739 }
1740
1741 @Override
1742 public boolean isNamespaceAware() {
1743 return mCfgNsEnabled;
1744 }
1745
1746 /**
1747 * Method needed by classes (like stream writer implementations)
1748 * that want to have efficient direct access to element stack
1749 * implementation
1750 */
1751 @Override
1752 public InputElementStack getInputElementStack() {
1753 return mElementStack;
1754 }
1755
1756 /**
1757 * Method needed by classes (like stream writer implementations)
1758 * that want to have efficient direct access to attribute collector
1759 * Object, for optimal attribute name and value access.
1760 */
1761 @Override
1762 public AttributeCollector getAttributeCollector() {
1763 return mAttrCollector;
1764 }
1765
1766 /*
1767 ///////////////////////////////////////////////////////////////////////
1768 // Support for SAX XMLReader implementation
1769 ///////////////////////////////////////////////////////////////////////
1770 */
1771
1772 public void fireSaxStartElement(ContentHandler h, Attributes attrs)
1773 throws SAXException
1774 {
1775 if (h != null) {
1776 // First; any ns declarations?
1777 int nsCount = mElementStack.getCurrentNsCount();
1778 for (int i = 0; i < nsCount; ++i) {
1779 String prefix = mElementStack.getLocalNsPrefix(i);
1780 String uri = mElementStack.getLocalNsURI(i);
1781 h.startPrefixMapping((prefix == null) ? "" : prefix, uri);
1782 }
1783
1784 // Then start-elem event itself:
1785 String uri = mElementStack.getNsURI();
1786 // Sax requires "" (not null) for ns uris...
1787 h.startElement((uri == null) ? "" : uri,
1788 mElementStack.getLocalName(), getPrefixedName(), attrs);
1789 }
1790 }
1791
1792 public void fireSaxEndElement(ContentHandler h)
1793 throws SAXException
1794 {
1795 if (h != null) {
1796 /* Order of events is reversed (wrt. start-element): first
1797 * the end tag event, then unbound prefixes
1798 */
1799 String uri = mElementStack.getNsURI();
1800 // Sax requires "" (not null) for ns uris...
1801 h.endElement((uri == null) ? "" : uri,
1802 mElementStack.getLocalName(), getPrefixedName());
1803 // Any expiring ns declarations?
1804 int nsCount = mElementStack.getCurrentNsCount();
1805 for (int i = 0; i < nsCount; ++i) {
1806 String prefix = mElementStack.getLocalNsPrefix(i);
1807 //String nsUri = mElementStack.getLocalNsURI(i);
1808 h.endPrefixMapping((prefix == null) ? "" : prefix);
1809 }
1810 }
1811 }
1812
1813 public void fireSaxCharacterEvents(ContentHandler h)
1814 throws XMLStreamException, SAXException
1815 {
1816 if (h != null) {
1817 if (mPendingException != null) {
1818 XMLStreamException sex = mPendingException;
1819 mPendingException = null;
1820 throw sex;
1821 }
1822 /* Let's not defer errors; SAXTest implies
1823 * it's expected errors are thrown right away
1824 */
1825 if (mTokenState < mStTextThreshold) {
1826 finishToken(false);
1827 }
1828 mTextBuffer.fireSaxCharacterEvents(h);
1829 }
1830 }
1831
1832 public void fireSaxSpaceEvents(ContentHandler h)
1833 throws XMLStreamException, SAXException
1834 {
1835 if (h != null) {
1836 if (mTokenState < mStTextThreshold) {
1837 finishToken(false); // no error deferring
1838 }
1839 mTextBuffer.fireSaxSpaceEvents(h);
1840 }
1841 }
1842
1843 public void fireSaxCommentEvent(LexicalHandler h)
1844 throws XMLStreamException, SAXException
1845 {
1846 if (h != null) {
1847 if (mTokenState < mStTextThreshold) {
1848 finishToken(false); // no error deferring
1849 }
1850 mTextBuffer.fireSaxCommentEvent(h);
1851 }
1852 }
1853
1854 public void fireSaxPIEvent(ContentHandler h)
1855 throws XMLStreamException, SAXException
1856 {
1857 if (h != null) {
1858 if (mTokenState < mStTextThreshold) {
1859 finishToken(false); // no error deferring
1860 }
1861 h.processingInstruction(mCurrName, mTextBuffer.contentsAsString());
1862 }
1863 }
1864
1865 /*
1866 ///////////////////////////////////////////////////////////////////////
1867 // Internal methods, config access
1868 ///////////////////////////////////////////////////////////////////////
1869 */
1870
1871 protected final boolean hasConfigFlags(int flags) {
1872 return (mConfigFlags & flags) == flags;
1873 }
1874
1875 /*
1876 ///////////////////////////////////////////////////////////////////////
1877 // Internal methods, parsing helper methods
1878 ///////////////////////////////////////////////////////////////////////
1879 */
1880
1881 /**
1882 * @return Null, if keyword matches ok; String that contains erroneous
1883 * keyword if not.
1884 */
1885 protected String checkKeyword(char c, String expected)
1886 throws XMLStreamException
1887 {
1888 int ptr = 0;
1889 int len = expected.length();
1890
1891 while (expected.charAt(ptr) == c && ++ptr < len) {
1892 if (mInputPtr < mInputEnd) {
1893 c = mInputBuffer[mInputPtr++];
1894 } else {
1895 int ci = getNext();
1896 if (ci < 0) { // EOF
1897 break;
1898 }
1899 c = (char) ci;
1900 }
1901 }
1902
1903 if (ptr == len) {
1904 // Probable match... but let's make sure keyword is finished:
1905 int i = peekNext();
1906 if (i < 0 || (!isNameChar((char) i) && i != ':')) {
1907 return null;
1908 }
1909 // Nope, continues, need to find the rest:
1910 }
1911
1912 StringBuilder sb = new StringBuilder(expected.length() + 16);
1913 sb.append(expected.substring(0, ptr));
1914 if (ptr < len) {
1915 sb.append(c);
1916 }
1917
1918 while (true) {
1919 if (mInputPtr < mInputEnd) {
1920 c = mInputBuffer[mInputPtr++];
1921 } else {
1922 int ci = getNext();
1923 if (ci < 0) { // EOF
1924 break;
1925 }
1926 c = (char) ci;
1927 }
1928 if (!isNameChar(c)) {
1929 // Let's push it back then
1930 --mInputPtr;
1931 break;
1932 }
1933 sb.append(c);
1934 }
1935
1936 return sb.toString();
1937 }
1938
1939 protected void checkCData() throws XMLStreamException
1940 {
1941 String wrong = checkKeyword(getNextCharFromCurrent(SUFFIX_IN_CDATA), "CDATA");
1942 if (wrong != null) {
1943 throwParseError("Unrecognized XML directive '"+wrong+"'; expected 'CDATA'.");
1944 }
1945 // Plus, need the bracket too:
1946 char c = getNextCharFromCurrent(SUFFIX_IN_CDATA);
1947 if (c != '[') {
1948 throwUnexpectedChar(c, "excepted '[' after '<![CDATA'");
1949 }
1950 // Cool, that's it!
1951 }
1952
1953 /**
1954 * Method that will parse an attribute value enclosed in quotes, using
1955 * an {@link TextBuilder} instance. Will normalize white space inside
1956 * attribute value using default XML rules (change linefeeds to spaces
1957 * etc.; but won't use DTD information for further coalescing).
1958 *
1959 * @param openingQuote Quote character (single or double quote) for
1960 * this attribute value
1961 * @param tb TextBuilder into which attribute value will be added
1962 */
1963 private final void parseAttrValue(char openingQuote, TextBuilder tb)
1964 throws XMLStreamException
1965 {
1966 char[] outBuf = tb.getCharBuffer();
1967 int outPtr = tb.getCharSize();
1968 // important! Underlying buffer may be shared, does not necessarily start from 0
1969 final int startingOffset = outPtr;
1970 final int maxAttrSize = mConfig.getMaxAttributeSize();
1971 int outLimit = Math.min(startingOffset+maxAttrSize, outBuf.length);
1972 final WstxInputSource currScope = mInput;
1973
1974 while (true) {
1975 char c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
1976 : getNextChar(SUFFIX_IN_ATTR_VALUE);
1977 // Let's do a quick for most attribute content chars:
1978 if (c <= '\'') {
1979 if (c < CHAR_SPACE) {
1980 if (c == '\n') {
1981 markLF();
1982 } else if (c == '\r') {
1983 // 04-Mar-2006, TSa: Linefeed normalization only done if enabled -
1984 // specifically, 2-char lfs from int. entities are not coalesced.
1985 // Now... whether to try to count them as one or not... easier not to;
1986 // esp. since we may not be able to distinguish char entity originated ones
1987 // from real ones.
1988 if (mNormalizeLFs) {
1989 c = getNextChar(SUFFIX_IN_ATTR_VALUE);
1990 if (c != '\n') { // nope, not 2-char lf (Mac?)
1991 --mInputPtr;
1992 }
1993 }
1994 markLF();
1995 } else if (c != '\t') {
1996 throwInvalidSpace(c);
1997 }
1998 // Whatever it was, it'll be 'normal' space now.
1999 c = CHAR_SPACE;
2000 } else if (c == openingQuote) {
2001 // 06-Aug-2004, TSa: Can get these via entities; only "real" end quotes in same
2002 // scope count. Note, too, that since this will only be done at root level,
2003 // there's no need to check for "runaway" values; they'll hit EOF
2004 if (mInput == currScope) {
2005 break;
2006 }
2007 } else if (c == '&') { // an entity of some sort...
2008 int ch;
2009 if (inputInBuffer() >= 3
2010 && (ch = resolveSimpleEntity(true)) != 0) {
2011 // Ok, fine, c is whatever it is
2012 ;
2013 } else { // full entity just changes buffer...
2014 ch = fullyResolveEntity(false);
2015 if (ch == 0) {
2016 // need to skip output, thusly (expanded to new input source)
2017 continue;
2018 }
2019 }
2020 if (ch <= 0xFFFF) {
2021 c = (char) ch;
2022 } else {
2023 ch -= 0x10000;
2024 if (outPtr >= outLimit) {
2025 outBuf = _checkAttributeLimit(tb, outBuf, outPtr, outPtr - startingOffset, maxAttrSize);
2026 outLimit = Math.min(startingOffset+maxAttrSize, outBuf.length);
2027 }
2028 outBuf[outPtr++] = (char) ((ch >> 10) + 0xD800);
2029 c = (char) ((ch & 0x3FF) + 0xDC00);
2030 }
2031 }
2032 } else if (c == '<') {
2033 throwUnexpectedChar(c, SUFFIX_IN_ATTR_VALUE);
2034 }
2035
2036 // Ok, let's just add char in, whatever it was
2037 if (outPtr >= outLimit) {
2038 outBuf = _checkAttributeLimit(tb, outBuf, outPtr, outPtr - startingOffset, maxAttrSize);
2039 outLimit = Math.min(startingOffset+maxAttrSize, outBuf.length);
2040 }
2041 outBuf[outPtr++] = c;
2042 }
2043
2044 // Fine; let's tell TextBuild we're done:
2045 tb.setBufferSize(outPtr);
2046 }
2047
2048 private final char[] _checkAttributeLimit(TextBuilder tb,
2049 char[] outBuf, int outPtr, int currAttrSize, int maxAttrSize)
2050 throws XMLStreamException
2051 {
2052 // Add +1 since we are at point where we are to append (at least) one more character
2053 verifyLimit("Maximum attribute size", maxAttrSize , currAttrSize+1);
2054 // just sanity check
2055 if (outPtr < outBuf.length) {
2056 ExceptionUtil.throwInternal("Expected either attr limit ("+maxAttrSize
2057 +") >= currAttrSize ("+currAttrSize+") OR >= outBuf.length ("+outBuf.length+")");
2058 }
2059 return tb.bufferFull(1);
2060 }
2061
2062 /*
2063 ///////////////////////////////////////////////////////////////////////
2064 // Internal methods, parsing prolog (before root) and epilog
2065 ///////////////////////////////////////////////////////////////////////
2066 */
2067
2068 /**
2069 * Method called to find type of next token in prolog; either reading
2070 * just enough information to know the type (lazy parsing), or the
2071 * full contents (non-lazy)
2072 *
2073 * @return True if we hit EOI, false otherwise
2074 */
2075 private boolean nextFromProlog(boolean isProlog)
2076 throws XMLStreamException
2077 {
2078 int i;
2079
2080 // First, do we need to finish currently open token?
2081 if (mTokenState < mStTextThreshold) {
2082 mTokenState = TOKEN_FULL_COALESCED;
2083 i = skipToken();
2084 // note: skipToken() updates the start location
2085 } else {
2086 // Need to update the start location...
2087 mTokenInputTotal = mCurrInputProcessed + mInputPtr;
2088 mTokenInputRow = mCurrInputRow;
2089 mTokenInputCol = mInputPtr - mCurrInputRowStart;
2090 i = getNext();
2091 }
2092
2093 // Any white space to parse or skip?
2094 if (i <= CHAR_SPACE && i >= 0) {
2095 // Need to return as an event?
2096 if (hasConfigFlags(CFG_REPORT_PROLOG_WS)) {
2097 mCurrToken = SPACE;
2098 if (readSpacePrimary((char) i, true)) {
2099 /* no need to worry about coalescing, since CDATA is not
2100 * allowed at this level...
2101 */
2102 mTokenState = TOKEN_FULL_COALESCED;
2103 } else {
2104 if (mCfgLazyParsing) {
2105 /* Let's not even bother checking if it's
2106 * "long enough"; shouldn't usually matter, but few
2107 * apps care to get multiple adjacent SPACE events...
2108 */
2109 mTokenState = TOKEN_STARTED;
2110 } else {
2111 readSpaceSecondary(true);
2112 mTokenState = TOKEN_FULL_COALESCED;
2113 }
2114 }
2115 return false;
2116 }
2117 // If not, can skip it right away
2118 --mInputPtr; // to handle linefeeds gracefully
2119 i = getNextAfterWS();
2120 if (i >= 0) {
2121 // ... after which location has to be reset properly:
2122 /* 11-Apr-2005, TSa: But note that we need to "move back"
2123 * column and total offset values by one, to compensate
2124 * for the char that was read (row can not have changed,
2125 * since it's non-WS, and thus non-lf/cr char)
2126 */
2127 mTokenInputTotal = mCurrInputProcessed + mInputPtr - 1;
2128 mTokenInputRow = mCurrInputRow;
2129 mTokenInputCol = mInputPtr - mCurrInputRowStart - 1;
2130 }
2131 }
2132
2133 // Did we hit EOI?
2134 if (i < 0) {
2135 handleEOF(isProlog);
2136 mParseState = STATE_CLOSED;
2137 return true;
2138 }
2139
2140 // Now we better have a lt...
2141 if (i != '<') {
2142 throwUnexpectedChar(i, (isProlog ? SUFFIX_IN_PROLOG : SUFFIX_IN_EPILOG)
2143 +"; expected '<'");
2144 }
2145
2146 // And then it should be easy to figure out type:
2147 char c = getNextChar(isProlog ? SUFFIX_IN_PROLOG : SUFFIX_IN_EPILOG);
2148
2149 if (c == '?') { // proc. inst
2150 mCurrToken = readPIPrimary();
2151 } else if (c == '!') { // DOCTYPE or comment (or CDATA, but not legal here)
2152 // Need to figure out bit more first...
2153 nextFromPrologBang(isProlog);
2154 } else if (c == '/') { // end tag not allowed...
2155 if (isProlog) {
2156 throwParseError("Unexpected character combination '</' in prolog.");
2157 }
2158 throwParseError("Unexpected character combination '</' in epilog (extra close tag?).");
2159 } else if (c == ':' || isNameStartChar(c)) {
2160 // Root element, only allowed after prolog
2161 if (!isProlog) {
2162 /* This call will throw an exception if there's a problem;
2163 * otherwise set up everything properly
2164 */
2165 mCurrToken = handleExtraRoot(c); // will check input parsing mode...
2166 return false;
2167 }
2168 handleRootElem(c);
2169 mCurrToken = START_ELEMENT;
2170 } else {
2171 throwUnexpectedChar(c, (isProlog ? SUFFIX_IN_PROLOG : SUFFIX_IN_EPILOG)
2172 +", after '<'.");
2173 }
2174
2175 // Ok; final twist, maybe we do NOT want lazy parsing?
2176 if (!mCfgLazyParsing && mTokenState < mStTextThreshold) {
2177 finishToken(false);
2178 }
2179
2180 return false;
2181 }
2182
2183 protected void handleRootElem(char c)
2184 throws XMLStreamException
2185 {
2186 mParseState = STATE_TREE;
2187 initValidation();
2188 handleStartElem(c);
2189 // Does name match with DOCTYPE declaration (if any)?
2190 // 20-Jan-2006, TSa: Only check this is we are (DTD) validating...
2191 if (mRootLName != null) {
2192 if (hasConfigFlags(CFG_VALIDATE_AGAINST_DTD)) {
2193 if (!mElementStack.matches(mRootPrefix, mRootLName)) {
2194 String actual = (mRootPrefix == null) ? mRootLName
2195 : (mRootPrefix + ":" + mRootLName);
2196 reportValidationProblem(ErrorConsts.ERR_VLD_WRONG_ROOT, actual, mRootLName);
2197 }
2198 }
2199 }
2200 }
2201
2202 /**
2203 * Method called right before the document root element is handled.
2204 * The default implementation is empty; validating stream readers
2205 * should override the method and do whatever initialization is
2206 * necessary
2207 */
2208 protected void initValidation()
2209 throws XMLStreamException
2210 {
2211 ; // nothing to do here
2212 }
2213
2214 protected int handleEOF(boolean isProlog)
2215 throws XMLStreamException
2216 {
2217 /* 19-Aug-2006, TSa: mSecondaryToken needs to be initialized to
2218 * END_DOCUMENT so we'll know it hasn't been yet accessed.
2219 */
2220 mCurrToken = mSecondaryToken = END_DOCUMENT;
2221
2222 /* Although buffers have most likely already been recycled,
2223 * let's call this again just in case. At this point we can
2224 * safely discard any contents
2225 */
2226 mTextBuffer.recycle(true); // true -> clean'n recycle
2227 // It's ok to get EOF from epilog but not from prolog
2228 if (isProlog) {
2229 throwUnexpectedEOF(SUFFIX_IN_PROLOG);
2230 }
2231 return mCurrToken;
2232 }
2233
2234 /**
2235 * Method called if a root-level element is found after the main
2236 * root element was closed. This is legal in multi-doc parsing
2237 * mode (and in fragment mode), but not in the default single-doc
2238 * mode.
2239 * @param c Character passed in (not currently used)
2240 *
2241 * @return Token to return
2242 */
2243 private int handleExtraRoot(char c)
2244 throws XMLStreamException
2245 {
2246 if (!mConfig.inputParsingModeDocuments()) {
2247 /* Has to be single-doc mode, since fragment mode
2248 * should never get here (since fragment mode never has epilog
2249 * or prolog modes)
2250 */
2251 throwParseError("Illegal to have multiple roots (start tag in epilog?).");
2252 }
2253 // Need to push back the char, since it is the first char of elem name
2254 --mInputPtr;
2255 return handleMultiDocStart(START_ELEMENT);
2256 }
2257
2258 /**
2259 * Method called when an event was encountered that indicates document
2260 * boundary in multi-doc mode. Needs to trigger dummy
2261 * END_DOCUMENT/START_DOCUMENT event combination, followed by the
2262 * handling of the original event.
2263 *
2264 * @return Event type to return
2265 */
2266 protected int handleMultiDocStart(int nextEvent)
2267 {
2268 mParseState = STATE_MULTIDOC_HACK;
2269 mTokenState = TOKEN_FULL_COALESCED; // this is a virtual event after all...
2270 mSecondaryToken = nextEvent;
2271 return END_DOCUMENT;
2272 }
2273
2274 /**
2275 * Method called to get the next event when we are "multi-doc hack" mode,
2276 * during which extra END_DOCUMENT/START_DOCUMENT events need to be
2277 * returned.
2278 */
2279 private int nextFromMultiDocState()
2280 throws XMLStreamException
2281 {
2282 if (mCurrToken == END_DOCUMENT) {
2283 /* Ok; this is the initial step; need to advance: need to parse
2284 * xml declaration if that was the cause, otherwise just clear
2285 * up values.
2286 */
2287 if (mSecondaryToken == START_DOCUMENT) {
2288 handleMultiDocXmlDecl();
2289 } else { // Nah, DOCTYPE or start element... just need to clear decl info:
2290 mDocXmlEncoding = null;
2291 mDocXmlVersion = XmlConsts.XML_V_UNKNOWN;
2292 mDocStandalone = DOC_STANDALONE_UNKNOWN;
2293 }
2294 return START_DOCUMENT;
2295 }
2296 if (mCurrToken == START_DOCUMENT) {
2297 mParseState = STATE_PROLOG; // yup, we are now officially in prolog again...
2298
2299 // Had an xml decl (ie. "real" START_DOCUMENT event)
2300 if (mSecondaryToken == START_DOCUMENT) { // was a real xml decl
2301 nextFromProlog(true);
2302 return mCurrToken;
2303 }
2304 // Nah, start elem or DOCTYPE
2305 if (mSecondaryToken == START_ELEMENT) {
2306 handleRootElem(getNextChar(SUFFIX_IN_ELEMENT));
2307 return START_ELEMENT;
2308 }
2309 if (mSecondaryToken == DTD) {
2310 mStDoctypeFound = true;
2311 startDTD();
2312 return DTD;
2313 }
2314 }
2315 throw new IllegalStateException("Internal error: unexpected state; current event "
2316 +tokenTypeDesc(mCurrToken)+", sec. state: "+tokenTypeDesc(mSecondaryToken));
2317 }
2318
2319 protected void handleMultiDocXmlDecl()
2320 throws XMLStreamException
2321 {
2322 // Let's default these first
2323 mDocStandalone = DOC_STANDALONE_UNKNOWN;
2324 mDocXmlEncoding = null;
2325
2326 char c = getNextInCurrAfterWS(SUFFIX_IN_XML_DECL);
2327 String wrong = checkKeyword(c, XmlConsts.XML_DECL_KW_VERSION);
2328 if (wrong != null) {
2329 throwParseError(ErrorConsts.ERR_UNEXP_KEYWORD, wrong, XmlConsts.XML_DECL_KW_VERSION);
2330 }
2331 c = skipEquals(XmlConsts.XML_DECL_KW_VERSION, SUFFIX_IN_XML_DECL);
2332 TextBuffer tb = mTextBuffer;
2333 tb.resetInitialized();
2334 parseQuoted(XmlConsts.XML_DECL_KW_VERSION, c, tb);
2335
2336 if (tb.equalsString(XmlConsts.XML_V_10_STR)) {
2337 mDocXmlVersion = XmlConsts.XML_V_10;
2338 mXml11 = false;
2339 } else if (tb.equalsString(XmlConsts.XML_V_11_STR)) {
2340 mDocXmlVersion = XmlConsts.XML_V_11;
2341 mXml11 = true;
2342 } else {
2343 mDocXmlVersion = XmlConsts.XML_V_UNKNOWN;
2344 mXml11 = false;
2345 throwParseError("Unexpected xml version '"+tb.toString()+"'; expected '"+XmlConsts.XML_V_10_STR+"' or '"+XmlConsts.XML_V_11_STR+"'");
2346 }
2347
2348 c = getNextInCurrAfterWS(SUFFIX_IN_XML_DECL);
2349
2350 if (c != '?') { // '?' signals end...
2351 if (c == 'e') { // encoding
2352 wrong = checkKeyword(c, XmlConsts.XML_DECL_KW_ENCODING);
2353 if (wrong != null) {
2354 throwParseError(ErrorConsts.ERR_UNEXP_KEYWORD, wrong, XmlConsts.XML_DECL_KW_ENCODING);
2355 }
2356 c = skipEquals(XmlConsts.XML_DECL_KW_ENCODING, SUFFIX_IN_XML_DECL);
2357 tb.resetWithEmpty();
2358 parseQuoted(XmlConsts.XML_DECL_KW_ENCODING, c, tb);
2359 mDocXmlEncoding = tb.toString();
2360 /* should we verify encoding at this point? let's not, for now;
2361 * since it's for information only, first declaration from
2362 * bootstrapper is used for the whole stream.
2363 */
2364 c = getNextInCurrAfterWS(SUFFIX_IN_XML_DECL);
2365 } else if (c != 's') {
2366 throwUnexpectedChar(c, " in xml declaration; expected either 'encoding' or 'standalone' pseudo-attribute");
2367 }
2368
2369 // Standalone?
2370 if (c == 's') {
2371 wrong = checkKeyword(c, XmlConsts.XML_DECL_KW_STANDALONE);
2372 if (wrong != null) {
2373 throwParseError(ErrorConsts.ERR_UNEXP_KEYWORD, wrong, XmlConsts.XML_DECL_KW_STANDALONE);
2374 }
2375 c = skipEquals(XmlConsts.XML_DECL_KW_STANDALONE, SUFFIX_IN_XML_DECL);
2376 tb.resetWithEmpty();
2377 parseQuoted(XmlConsts.XML_DECL_KW_STANDALONE, c, tb);
2378 if (tb.equalsString(XmlConsts.XML_SA_YES)) {
2379 mDocStandalone = DOC_STANDALONE_YES;
2380 } else if (tb.equalsString(XmlConsts.XML_SA_NO)) {
2381 mDocStandalone = DOC_STANDALONE_NO;
2382 } else {
2383 throwParseError("Unexpected xml '"+XmlConsts.XML_DECL_KW_STANDALONE+"' pseudo-attribute value '"
2384 +tb.toString()+"'; expected '"+XmlConsts.XML_SA_YES+"' or '"+
2385 XmlConsts.XML_SA_NO+"'");
2386 }
2387 c = getNextInCurrAfterWS(SUFFIX_IN_XML_DECL);
2388 }
2389 }
2390
2391 if (c != '?') {
2392 throwUnexpectedChar(c, " in xml declaration; expected '?>' as the end marker");
2393 }
2394 c = getNextCharFromCurrent(SUFFIX_IN_XML_DECL);
2395 if (c != '>') {
2396 throwUnexpectedChar(c, " in xml declaration; expected '>' to close the declaration");
2397 }
2398 }
2399
2400 /**
2401 * Method that checks that input following is of form
2402 * '[S]* '=' [S]*' (as per XML specs, production #25).
2403 * Will push back non-white space characters as necessary, in
2404 * case no equals char is encountered.
2405 */
2406 protected final char skipEquals(String name, String eofMsg)
2407 throws XMLStreamException
2408 {
2409 char c = getNextInCurrAfterWS(eofMsg);
2410 if (c != '=') {
2411 throwUnexpectedChar(c, " in xml declaration; expected '=' to follow pseudo-attribute '"+name+"'");
2412 }
2413 // trailing space?
2414 return getNextInCurrAfterWS(eofMsg);
2415 }
2416
2417 /**
2418 * Method called to parse quoted xml declaration pseudo-attribute values.
2419 * Works similar to attribute value parsing, except no entities can be
2420 * included, and in general need not be as picky (since caller is to
2421 * verify contents). One exception is that we do check for linefeeds
2422 * and lt chars, since they generally would indicate problems and
2423 * are useful to catch early on (can happen if a quote is missed etc)
2424 *<p>
2425 * Note: since it'll be called at most 3 times per document, this method
2426 * is not optimized too much.
2427 */
2428 protected final void parseQuoted(String name, char quoteChar, TextBuffer tbuf)
2429 throws XMLStreamException
2430 {
2431 if (quoteChar != '"' && quoteChar != '\'') {
2432 throwUnexpectedChar(quoteChar, " in xml declaration; waited ' or \" to start a value for pseudo-attribute '"+name+"'");
2433 }
2434 char[] outBuf = tbuf.getCurrentSegment();
2435 int outPtr = 0;
2436
2437 while (true) {
2438 char c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
2439 : getNextChar(SUFFIX_IN_XML_DECL);
2440
2441 if (c == quoteChar) {
2442 break;
2443 }
2444 if (c < CHAR_SPACE || c == '<') {
2445 throwUnexpectedChar(c, SUFFIX_IN_XML_DECL);
2446 } else if (c == CHAR_NULL) {
2447 throwNullChar();
2448 }
2449 if (outPtr >= outBuf.length) {
2450 outBuf = tbuf.finishCurrentSegment();
2451 outPtr = 0;
2452 }
2453 outBuf[outPtr++] = c;
2454 }
2455 tbuf.setCurrentLength(outPtr);
2456 }
2457
2458 /**
2459 * Called after character sequence '<!' has been found; expectation is
2460 * that it'll either be DOCTYPE declaration (if we are in prolog and
2461 * haven't yet seen one), or a comment. CDATA is not legal here;
2462 * it would start same way otherwise.
2463 */
2464 private void nextFromPrologBang(boolean isProlog)
2465 throws XMLStreamException
2466 {
2467 int i = getNext();
2468 if (i < 0) {
2469 throwUnexpectedEOF(SUFFIX_IN_PROLOG);
2470 }
2471 if (i == 'D') { // Doctype declaration?
2472 String keyw = checkKeyword('D', "DOCTYPE");
2473 if (keyw != null) {
2474 throwParseError("Unrecognized XML directive '<!"+keyw+"' (misspelled DOCTYPE?).");
2475 }
2476
2477 if (!isProlog) {
2478 // Still possibly ok in multidoc mode...
2479 if (mConfig.inputParsingModeDocuments()) {
2480 if (!mStDoctypeFound) {
2481 mCurrToken = handleMultiDocStart(DTD);
2482 return;
2483 }
2484 } else {
2485 throwParseError(ErrorConsts.ERR_DTD_IN_EPILOG);
2486 }
2487 }
2488 if (mStDoctypeFound) {
2489 throwParseError(ErrorConsts.ERR_DTD_DUP);
2490 }
2491 mStDoctypeFound = true;
2492 // Ok; let's read main input (all but internal subset)
2493 mCurrToken = DTD;
2494 startDTD();
2495 return;
2496 } else if (i == '-') { // comment
2497 char c = getNextChar(isProlog ? SUFFIX_IN_PROLOG : SUFFIX_IN_EPILOG);
2498 if (c != '-') {
2499 throwUnexpectedChar(i, " (malformed comment?)");
2500 }
2501 // Likewise, let's delay actual parsing/skipping.
2502 mTokenState = TOKEN_STARTED;
2503 mCurrToken = COMMENT;
2504 return;
2505 } else if (i == '[') { // erroneous CDATA?
2506 i = peekNext();
2507 // Let's just add bit of heuristics, to get better error msg
2508 if (i == 'C') {
2509 throwUnexpectedChar(i, ErrorConsts.ERR_CDATA_IN_EPILOG);
2510 }
2511 }
2512
2513 throwUnexpectedChar(i, " after '<!' (malformed comment?)");
2514 }
2515
2516 /**
2517 * Method called to parse through most of DOCTYPE declaration; excluding
2518 * optional internal subset.
2519 */
2520 private void startDTD()
2521 throws XMLStreamException
2522 {
2523 /* 21-Nov-2004, TSa: Let's make sure that the buffer gets cleared
2524 * at this point. Need not start branching yet, however, since
2525 * DTD event is often skipped.
2526 */
2527 mTextBuffer.resetInitialized();
2528
2529 /* So, what we need is:<code>
2530 * <!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>
2531 *</code>. And we have already read the DOCTYPE token.
2532 */
2533
2534 char c = getNextInCurrAfterWS(SUFFIX_IN_DTD);
2535 if (mCfgNsEnabled) {
2536 String str = parseLocalName(c);
2537 c = getNextChar(SUFFIX_IN_DTD);
2538 if (c == ':') { // Ok, got namespace and local name
2539 mRootPrefix = str;
2540 mRootLName = parseLocalName(getNextChar(SUFFIX_EOF_EXP_NAME));
2541 } else if (c <= CHAR_SPACE || c == '[' || c == '>') {
2542 // ok to get white space or '[', or closing '>'
2543 --mInputPtr; // pushback
2544 mRootPrefix = null;
2545 mRootLName = str;
2546 } else {
2547 throwUnexpectedChar(c, " in DOCTYPE declaration; expected '[' or white space.");
2548 }
2549 } else {
2550 mRootLName = parseFullName(c);
2551 mRootPrefix = null;
2552 }
2553
2554 // Ok, fine, what next?
2555 c = getNextInCurrAfterWS(SUFFIX_IN_DTD);
2556 if (c != '[' && c != '>') {
2557 String keyw = null;
2558
2559 if (c == 'P') {
2560 keyw = checkKeyword(getNextChar(SUFFIX_IN_DTD), "UBLIC");
2561 if (keyw != null) {
2562 keyw = "P" + keyw;
2563 } else {
2564 if (!skipWS(getNextChar(SUFFIX_IN_DTD))) {
2565 throwUnexpectedChar(c, SUFFIX_IN_DTD+"; expected a space between PUBLIC keyword and public id");
2566 }
2567 c = getNextCharFromCurrent(SUFFIX_IN_DTD);
2568 if (c != '"' && c != '\'') {
2569 throwUnexpectedChar(c, SUFFIX_IN_DTD+"; expected a public identifier.");
2570 }
2571 mDtdPublicId = parsePublicId(c, SUFFIX_IN_DTD);
2572 if (mDtdPublicId.length() == 0) {
2573 // According to XML specs, this isn't illegal?
2574 // however, better report it as empty, not null.
2575 //mDtdPublicId = null;
2576 }
2577 if (!skipWS(getNextChar(SUFFIX_IN_DTD))) {
2578 throwUnexpectedChar(c, SUFFIX_IN_DTD+"; expected a space between public and system identifiers");
2579 }
2580 c = getNextCharFromCurrent(SUFFIX_IN_DTD);
2581 if (c != '"' && c != '\'') {
2582 throwParseError(SUFFIX_IN_DTD+"; expected a system identifier.");
2583 }
2584 mDtdSystemId = parseSystemId(c, mNormalizeLFs, SUFFIX_IN_DTD);
2585 if (mDtdSystemId.length() == 0) {
2586 // According to XML specs, this isn't illegal?
2587 // however, better report it as empty, not null.
2588 //mDtdSystemId = null;
2589 }
2590 }
2591 } else if (c == 'S') {
2592 mDtdPublicId = null;
2593 keyw = checkKeyword(getNextChar(SUFFIX_IN_DTD), "YSTEM");
2594 if (keyw != null) {
2595 keyw = "S" + keyw;
2596 } else {
2597 c = getNextInCurrAfterWS(SUFFIX_IN_DTD);
2598 if (c != '"' && c != '\'') {
2599 throwUnexpectedChar(c, SUFFIX_IN_DTD+"; expected a system identifier.");
2600 }
2601 mDtdSystemId = parseSystemId(c, mNormalizeLFs, SUFFIX_IN_DTD);
2602 if (mDtdSystemId.length() == 0) {
2603 // According to XML specs, this isn't illegal?
2604 mDtdSystemId = null;
2605 }
2606 }
2607 } else {
2608 if (!isNameStartChar(c)) {
2609 throwUnexpectedChar(c, SUFFIX_IN_DTD+"; expected keywords 'PUBLIC' or 'SYSTEM'.");
2610 } else {
2611 --mInputPtr;
2612 keyw = checkKeyword(c, "SYSTEM"); // keyword passed in doesn't matter
2613 }
2614 }
2615
2616 if (keyw != null) { // error:
2617 throwParseError("Unexpected keyword '"+keyw+"'; expected 'PUBLIC' or 'SYSTEM'");
2618 }
2619
2620 // Ok, should be done with external DTD identifier:
2621 c = getNextInCurrAfterWS(SUFFIX_IN_DTD);
2622 }
2623
2624 if (c == '[') { // internal subset
2625 ;
2626 } else {
2627 if (c != '>') {
2628 throwUnexpectedChar(c, SUFFIX_IN_DTD+"; expected closing '>'.");
2629 }
2630 }
2631
2632 /* Actually, let's just push whatever char it is, back; this way
2633 * we can lazily initialize text buffer with DOCTYPE declaration
2634 * if/as necessary, even if there's no internal subset.
2635 */
2636 --mInputPtr; // pushback
2637 mTokenState = TOKEN_STARTED;
2638 }
2639
2640 /**
2641 * This method gets called to handle remainder of DOCTYPE declaration,
2642 * essentially the optional internal subset. This class implements the
2643 * basic "ignore it" functionality, but can optionally still store copy
2644 * of the contents to the read buffer.
2645 *<p>
2646 * NOTE: Since this default implementation will be overridden by
2647 * some sub-classes, make sure you do NOT change the method signature.
2648 *
2649 * @param copyContents If true, will copy contents of the internal
2650 * subset of DOCTYPE declaration
2651 * in the text buffer; if false, will just completely ignore the
2652 * subset (if one found).
2653 */
2654 protected void finishDTD(boolean copyContents)
2655 throws XMLStreamException
2656 {
2657 /* We know there are no spaces, as this char was read and pushed
2658 * back earlier...
2659 */
2660 char c = getNextChar(SUFFIX_IN_DTD);
2661 if (c == '[') {
2662 // Do we need to get contents as text too?
2663 if (copyContents) {
2664 ((BranchingReaderSource) mInput).startBranch(mTextBuffer, mInputPtr, mNormalizeLFs);
2665 }
2666
2667 try {
2668 MinimalDTDReader.skipInternalSubset(this, mInput, mConfig);
2669 } finally {
2670 /* Let's close branching in any and every case (may allow
2671 * graceful recovery in error cases in future
2672 */
2673 if (copyContents) {
2674 /* Need to "push back" ']' got in the succesful case
2675 * (that's -1 part below);
2676 * in error case it'll just be whatever last char was.
2677 */
2678 ((BranchingReaderSource) mInput).endBranch(mInputPtr-1);
2679 }
2680 }
2681
2682 // And then we need closing '>'
2683 c = getNextCharAfterWS(SUFFIX_IN_DTD_INTERNAL);
2684 }
2685
2686 if (c != '>') {
2687 throwUnexpectedChar(c, "; expected '>' to finish DOCTYPE declaration.");
2688 }
2689 }
2690
2691 /*
2692 ///////////////////////////////////////////////////////////////////////
2693 // Internal methods, main parsing (inside root)
2694 ///////////////////////////////////////////////////////////////////////
2695 */
2696
/**
 * Method called to parse beginning of the next event within
 * document tree, and return its type.
 *<p>
 * Processing goes through following steps: first, the previous token
 * is either skipped (if it was not fully read) or its follow-up is
 * handled (virtual end element for an empty element, popping of the
 * element stack after an end element, continuation of a partially
 * returned CDATA section); then end-of-input is checked for; then
 * entity references, markup and finally textual content are handled
 * based on the first character of the new token.
 *
 * @return Type of the event that was started
 */
private final int nextFromTree()
    throws XMLStreamException
{
    int i;

    // First, do we need to finish currently open token?
    if (mTokenState < mStTextThreshold) {
        // No need to update state... will get taken care of
        /* 03-Mar-2006, TSa: Let's add a sanity check here, temporarily,
         *   to ensure we never skip any textual content when it is
         *   to be validated
         */
        if (mVldContent == XMLValidator.CONTENT_ALLOW_VALIDATABLE_TEXT) {
            if (mCurrToken == CHARACTERS || mCurrToken == CDATA) { // should never happen
                throwParseError("Internal error: skipping validatable text");
            }
        }
        i = skipToken();
        // note: skipToken() updates the start location
    } else {
        // Start/end elements are never unfinished (ie. are always
        // completely read in)
        if (mCurrToken == START_ELEMENT) {
            // Start tag may be an empty tag:
            if (mStEmptyElem) {
                // and if so, we'll then get 'virtual' close tag:
                mStEmptyElem = false;
                // ... and location info is correct already
                // 27-Feb-2009, TSa: but we do have to handle validation of the end tag now
                int vld = mElementStack.validateEndElement();
                mVldContent = vld;
                mValidateText = (vld == XMLValidator.CONTENT_ALLOW_VALIDATABLE_TEXT);
                return END_ELEMENT;
            }
        } else if (mCurrToken == END_ELEMENT) {
            // Close tag removes current element from stack
            if (!mElementStack.pop()) { // false if root closed
                // if so, we'll get to epilog, unless in fragment mode
                if (!mConfig.inputParsingModeFragment()) {
                    return closeContentTree();
                }
                // in fragment mode, fine, we'll just continue
            }
        } else if (mCurrToken == CDATA && mTokenState <= TOKEN_PARTIAL_SINGLE) {
            /* Just returned a partial CDATA... that's ok, just need to
             * know we won't get opening marker etc.
             * The tricky part here is just to ensure there's at least
             * one character; if not, need to just discard the empty
             * 'event' (note that it is possible to have an initial
             * empty CDATA event for truly empty CDATA block; but not
             * partial ones!). Let's just read it like a new
             * CData section first:
             */
            // First, need to update the start location...
            mTokenInputTotal = mCurrInputProcessed + mInputPtr;
            mTokenInputRow = mCurrInputRow;
            mTokenInputCol = mInputPtr - mCurrInputRowStart;
            char c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
                : getNextChar(SUFFIX_IN_CDATA);
            if (readCDataPrimary(c)) { // got it all!
                // note: can not be in coalescing mode at this point;
                // as we can never have partial cdata without unfinished token
                // ... still need to have gotten at least 1 char though:
                if (mTextBuffer.size() > 0) {
                    return CDATA;
                }
                // otherwise need to continue and parse the next event
            } else {
                // Hmmh. Have to verify we get at least one char from
                // CData section; if so, we are good to go for now;
                // if not, need to get that damn char first:
                // (secondary read is only attempted when buffer is still empty)
                if (mTextBuffer.size() == 0
                    && readCDataSecondary(mCfgLazyParsing
                                          ? 1 : mShortestTextSegment)) {
                    // Ok, all of it read
                    if (mTextBuffer.size() > 0) {
                        // And had some contents
                        mTokenState = TOKEN_FULL_SINGLE;
                        return CDATA;
                    }
                    // if nothing read, we'll just fall back (see below)
                } else { // good enough!
                    mTokenState = TOKEN_PARTIAL_SINGLE;
                    return CDATA;
                }
            }

            /* If we get here, it was the end of the section, without
             * any more text inside CDATA, so let's just continue
             */
        }
        // Once again, need to update the start location info:
        mTokenInputTotal = mCurrInputProcessed + mInputPtr;
        mTokenInputRow = mCurrInputRow;
        mTokenInputCol = mInputPtr - mCurrInputRowStart;
        i = getNext();
    }

    // Negative value indicates end of input
    if (i < 0) {
        // 07-Oct-2005, TSa: May be ok in fragment mode (not otherwise),
        //   but we can just check if element stack has anything, as that handles all cases
        if (!mElementStack.isEmpty()) {
            throwUnexpectedEOF();
        }
        return handleEOF(false);
    }

    /* 26-Aug-2004, TSa: We have to deal with entities, usually, if
     *   they are the next thing; even in non-expanding mode there
     *   are entities and then there are entities... :-)
     *   Let's start with char entities; they can be expanded right away.
     */
    // (loop, since input that follows an expanded entity may start with another reference)
    while (i == '&') {
        mWsStatus = ALL_WS_UNKNOWN;

        /* 30-Aug-2004, TSa: In some contexts entities are not
         *    allowed in any way, shape or form:
         */
        if (mVldContent == XMLValidator.CONTENT_ALLOW_NONE) {
            /* May be char entity, general entity; whatever it is it's
             * invalid!
             */
            reportInvalidContent(ENTITY_REFERENCE);
        }

        /* Need to call different methods based on whether we can do
         * automatic entity expansion or not:
         */
        int ch = mCfgReplaceEntities ?
            fullyResolveEntity(true) : resolveCharOnlyEntity(true);

        // Non-zero value is the character the reference resolved to
        if (ch != 0) {
            /* Char-entity... need to initialize text output buffer, then;
             * independent of whether it'll be needed or not.
             */
            /* 30-Aug-2004, TSa: In some contexts only white space is
             *    accepted...
             */
            if (mVldContent <= XMLValidator.CONTENT_ALLOW_WS) {
                // As per xml specs, only straight white space is legal
                if (ch > CHAR_SPACE) {
                    /* 21-Sep-2008, TSa: Used to also require a call to
                     *   'mElementStack.reallyValidating', if only ws
                     *   allowed, to cover the case where non-typing-dtd
                     *   was only used to discover SPACE type. But
                     *   now that we have CONTENT_ALLOW_WS_NONSTRICT,
                     *   shouldn't be needed.
                     */
                    //if (mVldContent < XMLValidator.CONTENT_ALLOW_WS || mElementStack.reallyValidating()) {
                    reportInvalidContent(CHARACTERS);
                }
            }
            TextBuffer tb = mTextBuffer;
            tb.resetInitialized();
            if (ch <= 0xFFFF) {
                tb.append((char) ch);
            } else {
                // Does not fit in a single char: append as a surrogate pair
                ch -= 0x10000;
                tb.append((char) ((ch >> 10)  + 0xD800));
                tb.append((char) ((ch & 0x3FF)  + 0xDC00));
            }
            mTokenState = TOKEN_STARTED;
            return CHARACTERS;
        }

        /* Nope; was a general entity... in auto-mode, it's now been
         * expanded; in non-auto, need to figure out entity itself.
         */
        if (!mCfgReplaceEntities|| mCfgTreatCharRefsAsEntities) {
            if (!mCfgTreatCharRefsAsEntities) {
                final EntityDecl ed = resolveNonCharEntity();
                // Note: ed may still be null at this point
                mCurrEntity = ed;
            }
            // Note: ed may still be null at this point
            mTokenState = TOKEN_FULL_COALESCED;
            /*
            // let's not worry about non-parsed entities, since this is unexpanded mode
            // ... although it'd be an error either way? Should we report it?
            if (ed != null && !ed.isParsed()) {
                throwParseError("Reference to unparsed entity '"+ed.getName()+"' from content not allowed.");
            }
            */
            return ENTITY_REFERENCE;
        }

        // Otherwise automatic expansion fine; just need the next char:
        i = getNextChar(SUFFIX_IN_DOC);
    }

    if (i == '<') { // Markup
        // And then it should be easy to figure out type:
        char c = getNextChar(SUFFIX_IN_ELEMENT);
        if (c == '?') { // proc. inst
            // 30-Aug-2004, TSa: Not legal for EMPTY elements
            if (mVldContent == XMLValidator.CONTENT_ALLOW_NONE) {
                reportInvalidContent(PROCESSING_INSTRUCTION);
            }
            return readPIPrimary();
        }

        if (c == '!') { // CDATA or comment
            // Need to figure out bit more first...
            int type = nextFromTreeCommentOrCData();
            // 30-Aug-2004, TSa: Not legal for EMPTY elements
            if (mVldContent == XMLValidator.CONTENT_ALLOW_NONE) {
                reportInvalidContent(type);
            }
            return type;
        }
        if (c == '/') { // always legal (if name matches etc)
            readEndElem();
            return END_ELEMENT;
        }

        if (c == ':' || isNameStartChar(c)) {
            /* Note: checking for EMPTY content type is done by the
             * validator, no need to check here
             */
            handleStartElem(c);
            return START_ELEMENT;
        }
        // Anything else is an error; '[' gets a more specific message
        if (c == '[') {
            throwUnexpectedChar(c, " in content after '<' (malformed <![CDATA[]] directive?)");
        }
        throwUnexpectedChar(c, " in content after '<' (malformed start element?).");
    }

    /* Text... ok; better parse the 'easy' (consecutive) portions right
     * away, since that's practically free (still need to scan those
     * characters no matter what, even if skipping).
     */
    /* But first, do we expect to get ignorable white space (only happens
     * in validating mode)? If so, needs bit different handling:
     */
    if (mVldContent <= XMLValidator.CONTENT_ALLOW_WS_NONSTRICT) {
        if (mVldContent == XMLValidator.CONTENT_ALLOW_NONE) {
            if (mElementStack.reallyValidating()) {
                reportInvalidContent(CHARACTERS);
            }
        }
        if (i <= CHAR_SPACE) {
            /* Note: need not worry about coalescing, since non-whitespace
             * text is illegal (ie. can not have CDATA)
             */
            mTokenState = (readSpacePrimary((char) i, false)) ?
                TOKEN_FULL_COALESCED : TOKEN_STARTED;
            return SPACE;
        }
        // Problem if we are really validating; otherwise not
        if (mElementStack.reallyValidating()) {
            reportInvalidContent(CHARACTERS);
        }
        /* otherwise, we know it's supposed to contain just space (or
         * be empty), but as we are not validating it's not an error
         * for this not to be true. Type should be changed to
         * CHARACTERS tho.
         */
    }

    // Further, when coalescing, can not be sure if we REALLY got it all
    if (readTextPrimary((char) i)) { // reached following markup
        mTokenState = TOKEN_FULL_SINGLE;
    } else {
        // If not coalescing, this may be enough for current event
        if (!mCfgCoalesceText
            && mTextBuffer.size() >= mShortestTextSegment) {
            mTokenState = TOKEN_PARTIAL_SINGLE;
        } else {
            mTokenState = TOKEN_STARTED;
        }
    }
    return CHARACTERS;
}
2975
2976 /**
2977 * Method called when advancing stream past the end tag that closes
2978 * the root element of the open document.
2979 * Document can be either the singular one, in regular mode, or one of
2980 * possibly multiple, in multi-doc mode: this method is never called
2981 * in fragment mode. Method needs to update state properly and
2982 * parse following epilog event (if any).
2983 *
2984 * @return Event following end tag of the root elemennt, if any;
2985 * END_DOCUMENT otherwis.e
2986 */
2987 private int closeContentTree()
2988 throws XMLStreamException
2989 {
2990 mParseState = STATE_EPILOG;
2991 // this call will update the location too...
2992 if (nextFromProlog(false)) {
2993 mSecondaryToken = 0;
2994 }
2995 /* 10-Apr-2006, TSa: Let's actually try to update
2996 * SymbolTable here (after main xml tree); caller
2997 * may not continue parsing after this.
2998 */
2999 if (mSymbols.isDirty()) {
3000 mOwner.updateSymbolTable(mSymbols);
3001 }
3002 // May be able to recycle, but not certain; and definitely can not just
3003 // clean contents (may contain space(s) read)
3004 mTextBuffer.recycle(false);
3005 return mCurrToken;
3006 }
3007
3008 /**
3009 * Method that takes care of parsing of start elements; including
3010 * full parsing of namespace declarations and attributes, as well as
3011 * namespace resolution.
3012 */
3013 private final void handleStartElem(char c)
3014 throws XMLStreamException
3015 {
3016 mTokenState = TOKEN_FULL_COALESCED;
3017 boolean empty;
3018
3019 if (mCfgNsEnabled) {
3020 String str = parseLocalName(c);
3021 c = (mInputPtr < mInputEnd) ?
3022 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_EOF_EXP_NAME);
3023 if (c == ':') { // Ok, got namespace and local name
3024 c = (mInputPtr < mInputEnd) ?
3025 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_EOF_EXP_NAME);
3026 mElementStack.push(str, parseLocalName(c));
3027 c = (mInputPtr < mInputEnd) ?
3028 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3029 } else {
3030 mElementStack.push(null, str);
3031 // c is fine as
3032 }
3033 /* Enough about element name itself; let's then parse attributes
3034 * and namespace declarations. Split into another method for clarity,
3035 * and so that maybe JIT has easier time to optimize it separately.
3036 */
3037 /* 04-Jul-2005, TSa: But hold up: we can easily check for a fairly
3038 * common case of no attributes showing up, and us getting the
3039 * closing '>' right away. Let's do that, since it can save
3040 * a call to a rather long method.
3041 */
3042 empty = (c == '>') ? false : handleNsAttrs(c);
3043 } else { // Namespace handling not enabled:
3044 mElementStack.push(null, parseFullName(c));
3045 c = (mInputPtr < mInputEnd) ?
3046 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3047 empty = (c == '>') ? false : handleNonNsAttrs(c);
3048 }
3049 if (!empty) {
3050 ++mCurrDepth; // needed to match nesting with entity expansion
3051 }
3052 mStEmptyElem = empty;
3053
3054 /* 27-Feb-2009, TSa: [WSTX-191]: We used to validate virtual
3055 * end element here for empty elements, but it really should
3056 * occur later on when actually returning that end element.
3057 */
3058 int vld = mElementStack.resolveAndValidateElement();
3059 mVldContent = vld;
3060 mValidateText = (vld == XMLValidator.CONTENT_ALLOW_VALIDATABLE_TEXT);
3061 }
3062
3063 /**
3064 * @return True if this is an empty element; false if not
3065 */
3066 private final boolean handleNsAttrs(char c)
3067 throws XMLStreamException
3068 {
3069 AttributeCollector ac = mAttrCollector;
3070
3071 while (true) {
3072 if (c <= CHAR_SPACE) {
3073 c = getNextInCurrAfterWS(SUFFIX_IN_ELEMENT, c);
3074 } else if (c != '/' && c != '>') {
3075 throwUnexpectedChar(c, " excepted space, or '>' or \"/>\"");
3076 }
3077
3078 if (c == '/') {
3079 c = getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3080 if (c != '>') {
3081 throwUnexpectedChar(c, " expected '>'");
3082 }
3083 return true;
3084 } else if (c == '>') {
3085 return false;
3086 } else if (c == '<') {
3087 throwParseError("Unexpected '<' character in element (missing closing '>'?)");
3088 }
3089
3090 String prefix, localName;
3091 String str = parseLocalName(c);
3092 c = (mInputPtr < mInputEnd) ?
3093 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_EOF_EXP_NAME);
3094 if (c == ':') { // Ok, got namespace and local name
3095 prefix = str;
3096 c = (mInputPtr < mInputEnd) ?
3097 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_EOF_EXP_NAME);
3098 localName = parseLocalName(c);
3099 } else {
3100 --mInputPtr; // pushback
3101 prefix = null;
3102 localName = str;
3103 }
3104
3105 c = (mInputPtr < mInputEnd) ?
3106 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3107 if (c <= CHAR_SPACE) {
3108 c = getNextInCurrAfterWS(SUFFIX_IN_ELEMENT, c);
3109 }
3110 if (c != '=') {
3111 throwUnexpectedChar(c, " expected '='");
3112 }
3113 c = (mInputPtr < mInputEnd) ?
3114 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3115 if (c <= CHAR_SPACE) {
3116 c = getNextInCurrAfterWS(SUFFIX_IN_ELEMENT, c);
3117 }
3118
3119 // And then a quote:
3120 if (c != '"' && c != '\'') {
3121 throwUnexpectedChar(c, SUFFIX_IN_ELEMENT+" Expected a quote");
3122 }
3123
3124 // And then the actual value
3125 int startLen = -1;
3126 TextBuilder tb;
3127
3128 if (prefix == sPrefixXmlns) { // non-default namespace declaration
3129 tb = ac.getNsBuilder(localName);
3130 // returns null if it's a dupe:
3131 if (null == tb) {
3132 throwParseError("Duplicate declaration for namespace prefix '"+localName+"'.");
3133 }
3134 startLen = tb.getCharSize();
3135 } else if (localName == sPrefixXmlns && prefix == null) {
3136 tb = ac.getDefaultNsBuilder();
3137 // returns null if default ns was already declared
3138 if (null == tb) {
3139 throwParseError("Duplicate default namespace declaration.");
3140 }
3141 } else {
3142 tb = ac.getAttrBuilder(prefix, localName);
3143 }
3144 parseAttrValue(c, tb);
3145
3146 /* 19-Jul-2004, TSa: Need to check that non-default namespace
3147 * URI is NOT empty, as per XML namespace specs, #2,
3148 * ("...In such declarations, the namespace name may not
3149 * be empty.")
3150 */
3151 /* (note: startLen is only set to first char position for
3152 * non-default NS declarations, see above...)
3153 */
3154 /* 04-Feb-2005, TSa: Namespaces 1.1 does allow this, though,
3155 * so for xml 1.1 documents we need to allow it
3156 */
3157 if (!mXml11) {
3158 if (startLen >= 0 && tb.getCharSize() == startLen) { // is empty!
3159 throwParseError(ErrorConsts.ERR_NS_EMPTY);
3160 }
3161 }
3162
3163 // and then we need to iterate some more
3164 c = (mInputPtr < mInputEnd) ?
3165 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3166 }
3167 // never gets here
3168 }
3169
3170 /**
3171 * @return True if this is an empty element; false if not
3172 */
3173 private final boolean handleNonNsAttrs(char c)
3174 throws XMLStreamException
3175 {
3176 AttributeCollector ac = mAttrCollector;
3177
3178 while (true) {
3179 if (c <= CHAR_SPACE) {
3180 c = getNextInCurrAfterWS(SUFFIX_IN_ELEMENT, c);
3181 } else if (c != '/' && c != '>') {
3182 throwUnexpectedChar(c, " excepted space, or '>' or \"/>\"");
3183 }
3184 if (c == '/') {
3185 c = getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3186 if (c != '>') {
3187 throwUnexpectedChar(c, " expected '>'");
3188 }
3189 return true;
3190 } else if (c == '>') {
3191 return false;
3192 } else if (c == '<') {
3193 throwParseError("Unexpected '<' character in element (missing closing '>'?)");
3194 }
3195
3196 String name = parseFullName(c);
3197 TextBuilder tb = ac.getAttrBuilder(null, name);
3198 c = (mInputPtr < mInputEnd) ?
3199 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3200 if (c <= CHAR_SPACE) {
3201 c = getNextInCurrAfterWS(SUFFIX_IN_ELEMENT, c);
3202 }
3203 if (c != '=') {
3204 throwUnexpectedChar(c, " expected '='");
3205 }
3206 c = (mInputPtr < mInputEnd) ?
3207 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3208 if (c <= CHAR_SPACE) {
3209 c = getNextInCurrAfterWS(SUFFIX_IN_ELEMENT, c);
3210 }
3211
3212 // And then a quote:
3213 if (c != '"' && c != '\'') {
3214 throwUnexpectedChar(c, SUFFIX_IN_ELEMENT+" Expected a quote");
3215 }
3216
3217 // And then the actual value
3218 parseAttrValue(c, tb);
3219 // and then we need to iterate some more
3220 c = (mInputPtr < mInputEnd) ?
3221 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_ELEMENT);
3222 }
3223 // never gets here
3224 }
3225
/**
 * Method called to completely read a close tag, and update element
 * stack appropriately (including checking that tag matches etc).
 *<p>
 * The name found in input is compared, one character at a time,
 * against the prefix and local name of the element that is currently
 * open according to the element stack; mismatches are handed to
 * <code>reportWrongEndPrefix</code> or <code>reportWrongEndElem</code>.
 * After a successful match the end element is validated via the
 * element stack, and current depth is decremented.
 */
protected final void readEndElem()
    throws XMLStreamException
{
    mTokenState = TOKEN_FULL_COALESCED; // will be read completely

    if (mElementStack.isEmpty()) {
        // Let's just offline this for clarity
        reportExtraEndElem();
        return; // never gets here
    }

    char c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
        : getNextCharFromCurrent(SUFFIX_IN_CLOSE_ELEMENT);
    // Quick check first; missing name?
    if (!isNameStartChar(c) && c != ':') {
        if (c <= CHAR_SPACE) { // space
            throwUnexpectedChar(c, "; missing element name?");
        }
        throwUnexpectedChar(c, "; expected an element name.");
    }

    /* Ok, now; good thing is we know exactly what to compare
     * against...
     */
    String expPrefix = mElementStack.getPrefix();
    String expLocalName = mElementStack.getLocalName();

    // Prefix to match?
    if (expPrefix != null && expPrefix.length() > 0) {
        int len = expPrefix.length();
        int i = 0;

        // On entry c holds the first name char; loop reads one more
        // char per iteration until all prefix chars have been compared
        while (true){
            if (c != expPrefix.charAt(i)) {
                reportWrongEndPrefix(expPrefix, expLocalName, i);
                return; // never gets here
            }
            if (++i >= len) {
                break;
            }
            c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
                : getNextCharFromCurrent(SUFFIX_IN_CLOSE_ELEMENT);
        }
        // And then we should get a colon
        c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
            : getNextCharFromCurrent(SUFFIX_IN_CLOSE_ELEMENT);
        if (c != ':') {
            reportWrongEndPrefix(expPrefix, expLocalName, i);
            return;
        }
        // First char of the local name
        c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
            : getNextCharFromCurrent(SUFFIX_IN_CLOSE_ELEMENT);
    }

    // Ok, then, does the local name match?
    int len = expLocalName.length();
    int i = 0;

    while (true){
        if (c != expLocalName.charAt(i)) {
            // Not a match...
            reportWrongEndElem(expPrefix, expLocalName, i);
            return; // never gets here
        }
        if (++i >= len) {
            break;
        }
        c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
            : getNextCharFromCurrent(SUFFIX_IN_CLOSE_ELEMENT);
    }

    // Let's see if end element still continues, however?
    // (a further name char means the actual name is longer than expected)
    c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
        : getNextCharFromCurrent(SUFFIX_IN_CLOSE_ELEMENT);
    if (c <= CHAR_SPACE) {
        c = getNextInCurrAfterWS(SUFFIX_IN_CLOSE_ELEMENT, c);
    } else if (c == '>') {
        ;
    } else if (c == ':' || isNameChar(c)) {
        reportWrongEndElem(expPrefix, expLocalName, len);
    }

    // Ok, fine, match ok; now we just need the closing gt char.
    if (c != '>') {
        throwUnexpectedChar(c, SUFFIX_IN_CLOSE_ELEMENT+" Expected '>'.");
    }

    // Finally, let's let validator detect if things are ok
    int vld = mElementStack.validateEndElement();
    mVldContent = vld;
    mValidateText = (vld == XMLValidator.CONTENT_ALLOW_VALIDATABLE_TEXT);

    // Plus verify WFC that start and end tags came from same entity
    /* 13-Feb-2006, TSa: Are we about to close an element that
     *   started within a parent element?
     *   That's a GE/element nesting WFC violation...
     */
    if (mCurrDepth == mInputTopDepth) {
        handleGreedyEntityProblem(mInput);
    }
    --mCurrDepth;
}
3332
3333 private void reportExtraEndElem()
3334 throws XMLStreamException
3335 {
3336 String name = parseFNameForError();
3337 throwParseError("Unbalanced close tag </"+name+">; no open start tag.");
3338 }
3339
3340 private void reportWrongEndPrefix(String prefix, String localName, int done)
3341 throws XMLStreamException
3342 {
3343 --mInputPtr; // pushback
3344 String fullName = prefix + ":" + localName;
3345 String rest = parseFNameForError();
3346 String actName = fullName.substring(0, done) + rest;
3347 throwParseError("Unexpected close tag </"+actName+">; expected </"
3348 +fullName+">.");
3349 }
3350
3351 private void reportWrongEndElem(String prefix, String localName, int done)
3352 throws XMLStreamException
3353 {
3354 --mInputPtr; // pushback
3355 String fullName;
3356 if (prefix != null && prefix.length() > 0) {
3357 fullName = prefix + ":" + localName;
3358 done += 1 + prefix.length();
3359 } else {
3360 fullName = localName;
3361 }
3362 String rest = parseFNameForError();
3363 String actName = fullName.substring(0, done) + rest;
3364 throwParseError("Unexpected close tag </"+actName+">; expected </"
3365 +fullName+">.");
3366 }
3367
3368 /**
3369 *<p>
3370 * Note: According to StAX 1.0, coalesced text events are always to be
3371 * returned as CHARACTERS, never as CDATA. And since at this point we
3372 * don't really know if there's anything to coalesce (but there may
3373 * be), let's convert CDATA if necessary.
3374 */
3375 private int nextFromTreeCommentOrCData()
3376 throws XMLStreamException
3377 {
3378 char c = getNextCharFromCurrent(SUFFIX_IN_DOC);
3379 if (c == '[') {
3380 checkCData();
3381 /* Good enough; it is a CDATA section... but let's just also
3382 * parse the easy ("free") stuff:
3383 */
3384 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
3385 : getNextCharFromCurrent(SUFFIX_IN_CDATA);
3386 readCDataPrimary(c); // sets token state appropriately...
3387 return CDATA;
3388 }
3389 if (c == '-' && getNextCharFromCurrent(SUFFIX_IN_DOC) == '-') {
3390 mTokenState = TOKEN_STARTED;
3391 return COMMENT;
3392 }
3393 throwParseError("Unrecognized XML directive; expected CDATA or comment ('<![CDATA[' or '<!--').");
3394 return 0; // never gets here, but compilers don't know it...
3395 }
3396
3397 /*
3398 ///////////////////////////////////////////////////////////////////////
3399 // Internal methods, skipping
3400 ///////////////////////////////////////////////////////////////////////
3401 */
3402
/**
 * Method called to skip last part of current token, when full token
 * has not been parsed. Generally happens when caller is not interested
 * in current token and just calls next() to iterate to next token.
 *<p>
 * Note: this method is to accurately update the location information
 * to reflect where the next event will start (or, in case of EOF, where
 * EOF was encountered, ie. where event would start, if there was one).
 *
 * @return Next character after node has been skipped, or -1 if EOF
 *    follows
 *
 * @throws IllegalStateException if current token is of a type that
 *    is never skipped through this method
 */
private int skipToken()
    throws XMLStreamException
{
    // 0 -> token fully skipped, next char not yet read; -1 -> EOF;
    // anything else is the next character itself (see end of method)
    int result;

    main_switch:
    switch (mCurrToken) {
    case CDATA:
        {
            /* 30-Aug-2004, TSa: Need to be careful here: we may
             *    actually have finished with CDATA, but are just
             *    coalescing... if so, need to skip first part of
             *    skipping
             */
            if (mTokenState <= TOKEN_PARTIAL_SINGLE) {
                // Skipping CDATA is easy; just need to spot closing ]]>
                skipCommentOrCData(SUFFIX_IN_CDATA, ']', false);
            }
            result = getNext();
            // ... except if coalescing, may need to skip more:
            if (mCfgCoalesceText) {
                result = skipCoalescedText(result);
            }
        }
        break;

    case COMMENT:
        // Comments end with "-->", and may not contain "--" otherwise
        skipCommentOrCData(SUFFIX_IN_COMMENT, '-', true);
        result = 0;
        break;

    case CHARACTERS:
        {
            result = skipTokenText(getNext());
            // ... except if coalescing, need to skip more:
            if (mCfgCoalesceText) {
                result = skipCoalescedText(result);
            }
        }
        break;

    case DTD:
        finishDTD(false);
        result = 0;
        break;

    case PROCESSING_INSTRUCTION:
        // Scan for the closing "?>"; any number of '?' may precede '>'
        while (true) {
            char c = (mInputPtr < mInputEnd)
                ? mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_PROC_INSTR);
            if (c == '?') {
                do {
                    c = (mInputPtr < mInputEnd)
                        ? mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_PROC_INSTR);
                } while (c == '?');
                if (c == '>') {
                    result = 0;
                    break main_switch;
                }
            }
            // Linefeeds need handling by skipCRLF; of other chars
            // below space, only tab is accepted
            if (c < CHAR_SPACE) {
                if (c == '\n' || c == '\r') {
                    skipCRLF(c);
                } else if (c != '\t') {
                    throwInvalidSpace(c);
                }
            }
        }
        // never gets in here

    case SPACE:

        while (true) {
            // Fairly easy to skip through white space...
            while (mInputPtr < mInputEnd) {
                char c = mInputBuffer[mInputPtr++];
                if (c > CHAR_SPACE) { // non-EOF non-WS?
                    result = c;
                    break main_switch;
                }
                if (c == '\n' || c == '\r') {
                    skipCRLF(c);
                } else if (c != CHAR_SPACE && c != '\t') {
                    throwInvalidSpace(c);
                }
            }
            // Current buffer exhausted; failure to load more means EOF
            if (!loadMore()) {
                result = -1;
                break main_switch;
            }
        }
        // never gets in here

    case ENTITY_REFERENCE: // these should never end up in here...
    case ENTITY_DECLARATION:
    case NOTATION_DECLARATION:
    case START_DOCUMENT:
    case END_DOCUMENT:
        // As are start/end document
        throw new IllegalStateException("skipToken() called when current token is "+tokenTypeDesc(mCurrToken));

    case ATTRIBUTE:
    case NAMESPACE:
        // These two are never returned by this class
    case START_ELEMENT:
    case END_ELEMENT:
        /* Never called for elements tokens; start token handled
         * differently, end token always completely read in the first place
         */

        // (all of the above intentionally fall through to default)
    default:
        throw new IllegalStateException("Internal error: unexpected token "+tokenTypeDesc(mCurrToken));

    }

    /* Ok; now we have 3 possibilities; result is:
     *
     * + 0 -> could reliably read the prev event, now need the
     *   following char/EOF
     * + -1 -> hit EOF; can return it
     * + something else -> this is the next char, return it.
     *
     * In first 2 cases, next event start offset is the current location;
     * in third case, it needs to be backtracked by one char
     */
    if (result < 1) {
        mTokenInputRow = mCurrInputRow;
        mTokenInputTotal = mCurrInputProcessed + mInputPtr;
        mTokenInputCol = mInputPtr - mCurrInputRowStart;
        return (result < 0) ? result : getNext();
    }

    // Ok, need to offset location, and return whatever we got:
    mTokenInputRow = mCurrInputRow;
    mTokenInputTotal = mCurrInputProcessed + mInputPtr - 1;
    mTokenInputCol = mInputPtr - mCurrInputRowStart - 1;
    return result;
}
3553
/**
 * Skips the rest of a comment or CDATA section, through the end
 * marker that consists of two <code>endChar</code> characters
 * followed by '&gt;'.
 *
 * @param errorMsg Passed on to the methods that read the next
 *   character (presumably describes the context for EOF problems --
 *   TODO confirm against those methods)
 * @param endChar Character that, when doubled and followed by '&gt;',
 *   ends the section; callers in this class pass '-' for comments
 *   and ']' for CDATA
 * @param preventDoubles If true, two consecutive end characters that
 *   are not followed by '&gt;' result in a parse error; if false,
 *   scanning just continues
 */
private void skipCommentOrCData(String errorMsg, char endChar, boolean preventDoubles)
    throws XMLStreamException
{
    /* Let's skip all chars except for double-ending chars in
     * question (hyphen for comments, right brack for cdata)
     */
    // Number of chars skipped by the inner loop; only checked against
    // the maximum text length when the input buffer runs out
    int count = 0;
    while (true) {
        char c;
        // Inner loop: advance to the next occurrence of endChar
        while (true) {
            if (mInputPtr >= mInputEnd) {
                verifyLimit("Text size", mConfig.getMaxTextLength(), count);
                c = getNextCharFromCurrent(errorMsg);
            } else {
                c = mInputBuffer[mInputPtr++];
            }
            if (c < CHAR_SPACE) {
                if (c == '\n' || c == '\r') {
                    skipCRLF(c);
                } else if (c != '\t') {
                    throwInvalidSpace(c);
                }
            } else if (c == endChar) {
                break;
            }
            ++count;
        }

        // Now, we may be getting end mark; first need second marker char:.
        c = getNextChar(errorMsg);
        if (c == endChar) { // Probably?
            // Now; we should be getting a '>', most likely.
            c = getNextChar(errorMsg);
            if (c == '>') {
                break;
            }
            if (preventDoubles) { // if not, it may be a problem...
                throwParseError("String '--' not allowed in comment (missing '>'?)");
            }
            // Otherwise, let's loop to see if there is end
            // (more than two end chars may precede the '>')
            while (c == endChar) {
                c = (mInputPtr < mInputEnd)
                    ? mInputBuffer[mInputPtr++] : getNextCharFromCurrent(errorMsg);
            }
            if (c == '>') {
                break;
            }
        }

        // No match, did we get a linefeed?
        if (c < CHAR_SPACE) {
            if (c == '\n' || c == '\r') {
                skipCRLF(c);
            } else if (c != '\t') {
                throwInvalidSpace(c);
            }
        }
        // Let's continue from beginning, then
    }
}
3614
3615 /**
3616 * Method called to skip past all following text and CDATA segments,
3617 * until encountering something else (including a general entity,
3618 * which may in turn expand to text).
3619 *
3620 * @return Character following all the skipped text and CDATA segments,
3621 * if any; or -1 to denote EOF
3622 */
3623 private int skipCoalescedText(int i)
3624 throws XMLStreamException
3625 {
3626 while (true) {
3627 // Ok, plain text or markup?
3628 if (i == '<') { // markup, maybe CDATA?
3629 // Need to distinguish "<![" from other tags/directives
3630 if (!ensureInput(3)) {
3631 /* Most likely an error condition, but let's leave
3632 * it up for other parts of code to complain.
3633 */
3634 return i;
3635 }
3636 if (mInputBuffer[mInputPtr] != '!'
3637 || mInputBuffer[mInputPtr+1] != '[') {
3638 // Nah, some other tag or directive
3639 return i;
3640 }
3641 // Let's skip beginning parts, then:
3642 mInputPtr += 2;
3643 // And verify we get proper CDATA directive
3644 checkCData();
3645 skipCommentOrCData(SUFFIX_IN_CDATA, ']', false);
3646 i = getNext();
3647 } else if (i < 0) { // eof
3648 return i;
3649 } else { // nah, normal text, gotta skip
3650 i = skipTokenText(i);
3651 /* Did we hit an unexpandable entity? If so, need to
3652 * return ampersand to the caller...
3653 * (and same for EOF too)
3654 */
3655 if (i == '&' || i < 0) {
3656 return i;
3657 }
3658 }
3659 }
3660 }
3661
/**
 * Skips a segment of regular text, starting with the given character.
 * Entity references encountered are resolved (and their results
 * ignored) when that can be done: always if automatic entity
 * replacement is enabled, otherwise only for character entities.
 *
 * @param i First character of the text to skip; may also be negative
 *   to denote EOF
 *
 * @return Character that ended the skipping: '&lt;' for markup,
 *   '&amp;' for an entity reference that could not be skipped here
 *   (input then points to the character after the ampersand), or
 *   a negative value for EOF
 */
private int skipTokenText(int i)
    throws XMLStreamException
{
    /* Fairly easy; except for potential to have entities
     * expand to some crap?
     */
    // Incremented once per pass of the main loop (not once per char
    // skipped) and verified against the maximum text length
    int count = 0;

    main_loop:
    while (true) {
        if (i == '<') {
            return i;
        }
        if (i == '&') {
            // Can entities be resolved automatically?
            if (mCfgReplaceEntities) {
                // Let's first try quick resolution:
                if ((mInputEnd - mInputPtr) >= 3
                    && resolveSimpleEntity(true) != 0) {
                    ;
                } else {
                    i = fullyResolveEntity(true);
                    /* Either way, it's just fine; we don't care about
                     * returned single-char value.
                     */
                }
            } else {
                /* Can only skip character entities; others need to
                 * be returned separately.
                 */
                if (resolveCharOnlyEntity(true) == 0) {
                    /* Now points to the char after ampersand, and we need
                     * to return the ampersand itself
                     */
                    return i;
                }
            }
        } else if (i < CHAR_SPACE) {
            if (i == '\r' || i == '\n') {
                skipCRLF((char) i);
            } else if (i < 0) { // EOF
                return i;
            } else if (i != '\t') {
                throwInvalidSpace(i);
            }

        }
        ++count;
        verifyLimit("Text size", mConfig.getMaxTextLength(), count);

        // Hmmh... let's do quick looping here:
        // (chars below CHAR_FIRST_PURE_TEXT go back to the checks above)
        while (mInputPtr < mInputEnd) {
            char c = mInputBuffer[mInputPtr++];
            if (c < CHAR_FIRST_PURE_TEXT) { // need to check it
                i = c;
                continue main_loop;
            }
        }

        // Buffer exhausted without finding a char that needs checking
        i = getNext();
    }
    // never gets here...
}
3725
3726 /*
3727 ///////////////////////////////////////////////////////////////////////
3728 // Internal methods, parsing
3729 ///////////////////////////////////////////////////////////////////////
3730 */
3731
3732 protected void ensureFinishToken() throws XMLStreamException
3733 {
3734 if (mTokenState < mStTextThreshold) {
3735 finishToken(false);
3736 }
3737 }
3738
3739 protected void safeEnsureFinishToken()
3740 {
3741 if (mTokenState < mStTextThreshold) {
3742 safeFinishToken();
3743 }
3744 }
3745
3746 protected void safeFinishToken()
3747 {
3748 try {
3749 /* 24-Sep-2006, TSa: Let's try to reduce number of unchecked
3750 * (wrapped) exceptions we throw, and defer some. For now,
3751 * this is only for CHARACTERS (since it's always legal to
3752 * split CHARACTERS segment); could be expanded in future.
3753 */
3754 boolean deferErrors = (mCurrToken == CHARACTERS);
3755 finishToken(deferErrors);
3756 } catch (XMLStreamException strex) {
3757 throwLazyError(strex);
3758 }
3759 }
3760
3761 /**
3762 * Method called to read in contents of the token completely, if not
3763 * yet read. Generally called when caller needs to access anything
3764 * other than basic token type (except for elements), text contents
3765 * or such.
3766 *
3767 * @param deferErrors Flag to enable storing an exception to a
3768 * variable, instead of immediately throwing it. If true, will
3769 * just store the exception; if false, will not store, just throw.
3770 */
3771 protected void finishToken(boolean deferErrors)
3772 throws XMLStreamException
3773 {
3774 switch (mCurrToken) {
3775 case CDATA:
3776 if (mCfgCoalesceText) {
3777 readCoalescedText(mCurrToken, deferErrors);
3778 } else {
3779 if (readCDataSecondary(Integer.MAX_VALUE)) {
3780 mTokenState = TOKEN_FULL_SINGLE;
3781 } else { // can this ever happen?
3782 mTokenState = TOKEN_PARTIAL_SINGLE;
3783 }
3784 }
3785 return;
3786
3787 case CHARACTERS:
3788 if (mCfgCoalesceText) {
3789 /* 21-Sep-2005, TSa: It is often possible to optimize
3790 * here: if we get '<' NOT followed by '!', it can not
3791 * be CDATA, and thus we are done.
3792 */
3793 if (mTokenState == TOKEN_FULL_SINGLE
3794 && (mInputPtr + 1) < mInputEnd
3795 && mInputBuffer[mInputPtr+1] != '!') {
3796 mTokenState = TOKEN_FULL_COALESCED;
3797 return;
3798 }
3799 readCoalescedText(mCurrToken, deferErrors);
3800 } else {
3801 if (readTextSecondary(mShortestTextSegment, deferErrors)) {
3802 mTokenState = TOKEN_FULL_SINGLE;
3803 } else {
3804 mTokenState = TOKEN_PARTIAL_SINGLE;
3805 }
3806 }
3807 return;
3808
3809 case SPACE:
3810 {
3811 /* Only need to ensure there's no non-whitespace text
3812 * when parsing 'real' ignorable white space (in validating
3813 * mode, but that's implicit here)
3814 */
3815 boolean prolog = (mParseState != STATE_TREE);
3816 readSpaceSecondary(prolog);
3817 mTokenState = TOKEN_FULL_COALESCED;
3818 }
3819 return;
3820
3821 case COMMENT:
3822 readComment();
3823 mTokenState = TOKEN_FULL_COALESCED;
3824 return;
3825
3826 case DTD:
3827
3828 /* 05-Jan-2006, TSa: Although we shouldn't have to use finally
3829 * here, it's probably better to do that for robustness
3830 * (specifically, in case of a parsing problem, we don't want
3831 * to remain in 'DTD partially read' case -- it's better
3832 * to get in panic mode and skip the rest)
3833 */
3834 try {
3835 finishDTD(true);
3836 } finally {
3837 mTokenState = TOKEN_FULL_COALESCED;
3838 }
3839 return;
3840
3841 case PROCESSING_INSTRUCTION:
3842 readPI();
3843 mTokenState = TOKEN_FULL_COALESCED;
3844 return;
3845
3846 case START_ELEMENT:
3847 case END_ELEMENT: // these 2 should never end up in here...
3848 case ENTITY_REFERENCE:
3849 case ENTITY_DECLARATION:
3850 case NOTATION_DECLARATION:
3851 case START_DOCUMENT:
3852 case END_DOCUMENT:
3853 throw new IllegalStateException("finishToken() called when current token is "+tokenTypeDesc(mCurrToken));
3854
3855 case ATTRIBUTE:
3856 case NAMESPACE:
3857 // These two are never returned by this class
3858 default:
3859 }
3860
3861 throw new IllegalStateException("Internal error: unexpected token "+tokenTypeDesc(mCurrToken));
3862 }
3863
/**
 * Reads the contents of a comment into the text buffer. First tries
 * to locate the end of the comment within the current input buffer,
 * in which case the text buffer is made to share that part of the
 * input buffer; if that does not work out, what was scanned so far
 * is copied and the rest is read by <code>readComment2</code>.
 */
private void readComment()
    throws XMLStreamException
{
    char[] inputBuf = mInputBuffer;
    int inputLen = mInputEnd;
    int ptr = mInputPtr;
    int start = ptr;

    // Let's first see if we can just share input buffer:
    while (ptr < inputLen) {
        char c = inputBuf[ptr++];
        // Chars above hyphen can be neither white space nor end marker
        if (c > '-') {
            continue;
        }

        if (c < CHAR_SPACE) {
            if (c == '\n') {
                markLF(ptr);
            } else if (c == '\r') {
                if (!mNormalizeLFs && ptr < inputLen) {
                    if (inputBuf[ptr] == '\n') {
                        ++ptr;
                    }
                    markLF(ptr);
                } else {
                    // Either linefeeds are to be normalized, or next
                    // char is not available; leave to the slower method
                    --ptr; // pushback
                    break;
                }
            } else if (c != '\t') {
                throwInvalidSpace(c);
            }
        } else if (c == '-') {
            // Ok; need to get '->', can not get '--'

            if ((ptr + 1) >= inputLen) {
                // Can't check next 2, let's push '-' back, for rest of
                // code to take care of
                --ptr;
                break;
            }

            if (inputBuf[ptr] != '-') {
                // Can't skip, might be LF/CR
                continue;
            }
            // Ok; either get '>' or error:
            c = inputBuf[ptr+1];
            if (c != '>') {
                throwParseError("String '--' not allowed in comment (missing '>'?)");
            }
            // ptr points past the first hyphen, which is not content
            mTextBuffer.resetWithShared(inputBuf, start, ptr-start-1);
            // Skip second hyphen and '>'
            mInputPtr = ptr + 2;
            return;
        }
    }
    // Could not share: copy what was scanned, read rest the slower way
    mInputPtr = ptr;
    mTextBuffer.resetWithCopy(inputBuf, start, ptr-start);
    readComment2(mTextBuffer);
}
3923
3924 private void readComment2(TextBuffer tb)
3925 throws XMLStreamException
3926 {
3927 /* Output pointers; calls will also ensure that the buffer is
3928 * not shared, AND has room for at least one more char
3929 */
3930 char[] outBuf = tb.getCurrentSegment();
3931 int outPtr = tb.getCurrentSegmentSize();
3932 int outLen = outBuf.length;
3933
3934 while (true) {
3935 char c = (mInputPtr < mInputEnd) ?
3936 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_COMMENT);
3937
3938 if (c < CHAR_SPACE) {
3939 if (c == '\n') {
3940 markLF();
3941 } else if (c == '\r') {
3942 if (skipCRLF(c)) { // got 2 char LF
3943 if (!mNormalizeLFs) {
3944 if (outPtr >= outLen) { // need more room?
3945 outBuf = mTextBuffer.finishCurrentSegment();
3946 outLen = outBuf.length;
3947 outPtr = 0;
3948 }
3949 outBuf[outPtr++] = c;
3950 }
3951 // And let's let default output the 2nd char
3952 c = '\n';
3953 } else if (mNormalizeLFs) { // just \r, but need to convert
3954 c = '\n'; // For Mac text
3955 }
3956 } else if (c != '\t') {
3957 throwInvalidSpace(c);
3958 }
3959 } else if (c == '-') { // Ok; need to get '->', can not get '--'
3960 c = getNextCharFromCurrent(SUFFIX_IN_COMMENT);
3961 if (c == '-') { // Ok, has to be end marker then:
3962 // Either get '>' or error:
3963 c = getNextCharFromCurrent(SUFFIX_IN_COMMENT);
3964 if (c != '>') {
3965 throwParseError(ErrorConsts.ERR_HYPHENS_IN_COMMENT);
3966 }
3967 break;
3968 }
3969
3970 /* Not the end marker; let's just output the first hyphen,
3971 * push the second char back , and let main
3972 * code handle it.
3973 */
3974 c = '-';
3975 --mInputPtr;
3976 }
3977
3978 // Need more room?
3979 if (outPtr >= outLen) {
3980 outBuf = mTextBuffer.finishCurrentSegment();
3981 outLen = outBuf.length;
3982 outPtr = 0;
3983 verifyLimit("Text size", mConfig.getMaxTextLength(), mTextBuffer.size());
3984 }
3985 // Ok, let's add char to output:
3986 outBuf[outPtr++] = c;
3987 }
3988
3989 // Ok, all done, then!
3990 mTextBuffer.setCurrentLength(outPtr);
3991 }
3992
3993 /**
3994 * Method that reads the primary part of a PI, ie. target, and also
3995 * skips white space between target and data (if any data)
3996 *
3997 * @return Usually <code>PROCESSING_INSTRUCTION</code>; but may be
3998 * different in multi-doc mode, if we actually hit a secondary
3999 * xml declaration.
4000 */
4001 private final int readPIPrimary()
4002 throws XMLStreamException
4003 {
4004 // Ok, first we need the name:
4005 String target = parseFullName();
4006 mCurrName = target;
4007
4008 if (target.length() == 0) {
4009 throwParseError(ErrorConsts.ERR_WF_PI_MISSING_TARGET);
4010 }
4011
4012 // As per XML specs, #17, case-insensitive 'xml' is illegal:
4013 if (target.equalsIgnoreCase("xml")) {
4014 // 07-Oct-2005, TSa: Still legal in multi-doc mode...
4015 if (!mConfig.inputParsingModeDocuments()) {
4016 throwParseError(ErrorConsts.ERR_WF_PI_XML_TARGET, target, null);
4017 }
4018 // Ok, let's just verify we get space then
4019 char c = getNextCharFromCurrent(SUFFIX_IN_XML_DECL);
4020 if (!isSpaceChar(c)) {
4021 throwUnexpectedChar(c, "excepted a space in xml declaration after 'xml'");
4022 }
4023 return handleMultiDocStart(START_DOCUMENT);
4024 }
4025
4026 // And then either white space before data, or end marker:
4027 char c = (mInputPtr < mInputEnd) ?
4028 mInputBuffer[mInputPtr++] : getNextCharFromCurrent(SUFFIX_IN_PROC_INSTR);
4029 if (isSpaceChar(c)) { // Ok, space to skip
4030 mTokenState = TOKEN_STARTED;
4031 // Need to skip the WS...
4032 skipWS(c);
4033 } else { // Nope; apparently finishes right away...
4034 mTokenState = TOKEN_FULL_COALESCED;
4035 mTextBuffer.resetWithEmpty();
4036 // or does it?
4037 if (c != '?' || getNextCharFromCurrent(SUFFIX_IN_PROC_INSTR) != '>') {
4038 throwUnexpectedChar(c, ErrorConsts.ERR_WF_PI_XML_MISSING_SPACE);
4039 }
4040 }
4041
4042 return PROCESSING_INSTRUCTION;
4043 }
4044
4045 /**
4046 * Method that parses a processing instruction's data portion; at this
4047 * point target has been parsed.
4048 */
4049 private void readPI()
4050 throws XMLStreamException
4051 {
4052 int ptr = mInputPtr;
4053 int start = ptr;
4054 char[] inputBuf = mInputBuffer;
4055 int inputLen = mInputEnd;
4056
4057 outer_loop:
4058 while (ptr < inputLen) {
4059 char c = inputBuf[ptr++];
4060 if (c < CHAR_SPACE) {
4061 if (c == '\n') {
4062 markLF(ptr);
4063 } else if (c == '\r') {
4064 if (ptr < inputLen && !mNormalizeLFs) {
4065 if (inputBuf[ptr] == '\n') {
4066 ++ptr;
4067 }
4068 markLF(ptr);
4069 } else {
4070 --ptr; // pushback
4071 break;
4072 }
4073 } else if (c != '\t') {
4074 throwInvalidSpace(c);
4075 }
4076 } else if (c == '?') {
4077 // K; now just need '>' after zero or more '?'s
4078 while (true) {
4079 if (ptr >= inputLen) {
4080 /* end of buffer; need to push back at least one of
4081 * question marks (not all, since just one is needed
4082 * to close the PI)
4083 */
4084 --ptr;
4085 break outer_loop;
4086 }
4087 c = inputBuf[ptr++];
4088 if (c == '>') {
4089 mInputPtr = ptr;
4090 // Need to discard trailing '?>'
4091 mTextBuffer.resetWithShared(inputBuf, start, ptr-start-2);
4092 return;
4093 }
4094 if (c != '?') {
4095 // Not end, can continue, but need to push back last char, in case it's LF/CR
4096 --ptr;
4097 break;
4098 }
4099 }
4100 }
4101 }
4102
4103 mInputPtr = ptr;
4104 // No point in trying to share... let's just append
4105 mTextBuffer.resetWithCopy(inputBuf, start, ptr-start);
4106 readPI2(mTextBuffer);
4107 }
4108
4109 private void readPI2(TextBuffer tb)
4110 throws XMLStreamException
4111 {
4112 char[] inputBuf = mInputBuffer;
4113 int inputLen = mInputEnd;
4114 int inputPtr = mInputPtr;
4115
4116 /* Output pointers; calls will also ensure that the buffer is
4117 * not shared, AND has room for one more char
4118 */
4119 char[] outBuf = tb.getCurrentSegment();
4120 int outPtr = tb.getCurrentSegmentSize();
4121
4122 main_loop:
4123 while (true) {
4124 // Let's first ensure we have some data in there...
4125 if (inputPtr >= inputLen) {
4126 loadMoreFromCurrent(SUFFIX_IN_PROC_INSTR);
4127 inputBuf = mInputBuffer;
4128 inputPtr = mInputPtr;
4129 inputLen = mInputEnd;
4130 }
4131
4132 // And then do chunks
4133 char c = inputBuf[inputPtr++];
4134 if (c < CHAR_SPACE) {
4135 if (c == '\n') {
4136 markLF(inputPtr);
4137 } else if (c == '\r') {
4138 mInputPtr = inputPtr;
4139 if (skipCRLF(c)) { // got 2 char LF
4140 if (!mNormalizeLFs) {
4141 // Special handling, to output 2 chars at a time:
4142 if (outPtr >= outBuf.length) { // need more room?
4143 outBuf = mTextBuffer.finishCurrentSegment();
4144 outPtr = 0;
4145 }
4146 outBuf[outPtr++] = c;
4147 }
4148 // And let's let default output the 2nd char, either way
4149 c = '\n';
4150 } else if (mNormalizeLFs) { // just \r, but need to convert
4151 c = '\n'; // For Mac text
4152 }
4153 /* Since skipCRLF() needs to peek(), buffer may have
4154 * changed, even if there was no CR+LF.
4155 */
4156 inputPtr = mInputPtr;
4157 inputBuf = mInputBuffer;
4158 inputLen = mInputEnd;
4159 } else if (c != '\t') {
4160 throwInvalidSpace(c);
4161 }
4162 } else if (c == '?') { // Ok, just need '>' after zero or more '?'s
4163 mInputPtr = inputPtr; // to allow us to call getNextChar
4164
4165 qmLoop:
4166 while (true) {
4167 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
4168 : getNextCharFromCurrent(SUFFIX_IN_PROC_INSTR);
4169 if (c == '>') { // got it!
4170 break main_loop;
4171 } else if (c == '?') {
4172 if (outPtr >= outBuf.length) { // need more room?
4173 outBuf = tb.finishCurrentSegment();
4174 outPtr = 0;
4175 }
4176 outBuf[outPtr++] = c;
4177 } else {
4178 /* Hmmh. Wasn't end mark after all. Thus, need to
4179 * fall back to normal processing, with one more
4180 * question mark (first one matched that wasn't
4181 * yet output),
4182 * reset variables, and go back to main loop.
4183 */
4184 inputPtr = --mInputPtr; // push back last char
4185 inputBuf = mInputBuffer;
4186 inputLen = mInputEnd;
4187 c = '?';
4188 break qmLoop;
4189 }
4190 }
4191 } // if (c == '?)
4192
4193 // Need more room?
4194 if (outPtr >= outBuf.length) {
4195 outBuf = tb.finishCurrentSegment();
4196 outPtr = 0;
4197 }
4198 // Ok, let's add char to output:
4199 outBuf[outPtr++] = c;
4200
4201 } // while (true)
4202
4203 tb.setCurrentLength(outPtr);
4204 }
4205
4206 /**
4207 * Method called to read the content of both current CDATA/CHARACTERS
4208 * events, and all following consequtive events into the text buffer.
4209 * At this point the current type is known, prefix (for CDATA) skipped,
4210 * and initial consequtive contents (if any) read in.
4211 *
4212 * @param deferErrors Flag to enable storing an exception to a
4213 * variable, instead of immediately throwing it. If true, will
4214 * just store the exception; if false, will not store, just throw.
4215 */
4216 protected void readCoalescedText(int currType, boolean deferErrors)
4217 throws XMLStreamException
4218 {
4219 boolean wasCData;
4220
4221 // Ok; so we may need to combine adjacent text/CDATA chunks.
4222 if (currType == CHARACTERS || currType == SPACE) {
4223 readTextSecondary(Integer.MAX_VALUE, deferErrors);
4224 wasCData = false;
4225 } else if (currType == CDATA) {
4226 /* We may have actually really finished it, but just left
4227 * the 'unfinished' flag due to need to coalesce...
4228 */
4229 if (mTokenState <= TOKEN_PARTIAL_SINGLE) {
4230 readCDataSecondary(Integer.MAX_VALUE);
4231 }
4232 wasCData = true;
4233 } else {
4234 throw new IllegalStateException("Internal error: unexpected token "+tokenTypeDesc(mCurrToken)+"; expected CHARACTERS, CDATA or SPACE.");
4235 }
4236
4237 // But how about additional text?
4238 while (!deferErrors || (mPendingException == null)) {
4239 if (mInputPtr >= mInputEnd) {
4240 mTextBuffer.ensureNotShared();
4241 if (!loadMore()) {
4242 // ??? Likely an error but let's just break
4243 break;
4244 }
4245 }
4246 // Let's peek, ie. not advance it yet
4247 char c = mInputBuffer[mInputPtr];
4248 if (c == '<') { // CDATA, maybe?
4249 // Need to distinguish "<![" from other tags/directives
4250 // 26-Feb-2014, tatu: Wrt [WSTX-294], need to unshare buffer
4251 // unless whole leading CDATA marker fits in buffer
4252 if ((mInputEnd - mInputPtr) < 9) { // 3 for "<![" and 6 more for "CDATA["
4253 mTextBuffer.ensureNotShared();
4254 if (!ensureInput(3)) {
4255 break;
4256 }
4257 }
4258 if (mInputBuffer[mInputPtr+1] != '!'
4259 || mInputBuffer[mInputPtr+2] != '[') {
4260 // Nah, some other tag or directive
4261 break;
4262 }
4263 // Let's skip beginning parts, then:
4264 mInputPtr += 3;
4265 // And verify we get proper CDATA directive
4266 checkCData();
4267 /* No need to call the primary data; it's only useful if
4268 * there's a chance for sharing buffers... so let's call
4269 * the secondary loop straight on.
4270 */
4271 readCDataSecondary(Integer.MAX_VALUE);
4272 wasCData = true;
4273 } else { // text
4274 /* Did we hit an 'unexpandable' entity? If so, need to
4275 * just bail out.
4276 */
4277 if (c == '&' && !wasCData) {
4278 break;
4279 }
4280 // Likewise, can't share buffers, let's call secondary loop:
4281 readTextSecondary(Integer.MAX_VALUE, deferErrors);
4282 wasCData = false;
4283 }
4284 }
4285
4286 mTokenState = TOKEN_FULL_COALESCED;
4287 }
4288
4289 /**
4290 * Method called to read in consecutive beginning parts of a CDATA
4291 * segment, up to either end of the segment (]] and >) or until
4292 * first 'hole' in text (buffer end, 2-char lf to convert, entity).
4293 *<p>
4294 * When the method is called, it's expected that the first character
4295 * has been read as is in the current input buffer just before current
4296 * pointer
4297 *
4298 * @param c First character in the CDATA segment (possibly part of end
4299 * marker for empty segments
4300 *
4301 * @return True if the whole CDATA segment was completely read; this
4302 * happens only if lt-char is hit; false if it's possible that
4303 * it wasn't read (ie. end-of-buffer or entity encountered).
4304 */
4305 private final boolean readCDataPrimary(char c)
4306 throws XMLStreamException
4307 {
4308 mWsStatus = (c <= CHAR_SPACE) ? ALL_WS_UNKNOWN : ALL_WS_NO;
4309
4310 int ptr = mInputPtr;
4311 int inputLen = mInputEnd;
4312 char[] inputBuf = mInputBuffer;
4313 int start = ptr-1;
4314
4315 while (true) {
4316 if (c < CHAR_SPACE) {
4317 if (c == '\n') {
4318 markLF(ptr);
4319 } else if (c == '\r') {
4320 if (ptr >= inputLen) { // can't peek?
4321 --ptr;
4322 break;
4323 }
4324 if (mNormalizeLFs) { // can we do in-place Mac replacement?
4325 if (inputBuf[ptr] == '\n') { // nope, 2 char lf
4326 --ptr;
4327 break;
4328 }
4329 inputBuf[ptr-1] = '\n'; // yup
4330 } else {
4331 // No LF normalization... can we just skip it?
4332 if (inputBuf[ptr] == '\n') {
4333 ++ptr;
4334 }
4335 }
4336 markLF(ptr);
4337 } else if (c != '\t') {
4338 throwInvalidSpace(c);
4339 }
4340 } else if (c == ']') {
4341 // Ok; need to get one or more ']'s, then '>'
4342 if ((ptr + 1) >= inputLen) { // not enough room? need to push it back
4343 --ptr;
4344 break;
4345 }
4346
4347 // Needs to be followed by another ']'...
4348 if (inputBuf[ptr] == ']') {
4349 ++ptr;
4350 inner_loop:
4351 while (true) {
4352 if (ptr >= inputLen) {
4353 /* Need to push back last 2 right brackets; it may
4354 * be end marker divided by input buffer boundary
4355 */
4356 ptr -= 2;
4357 break inner_loop;
4358 }
4359 c = inputBuf[ptr++];
4360 if (c == '>') { // Ok, got it!
4361 mInputPtr = ptr;
4362 ptr -= (start+3);
4363 mTextBuffer.resetWithShared(inputBuf, start, ptr);
4364 mTokenState = TOKEN_FULL_SINGLE;
4365 return true;
4366 }
4367 if (c != ']') {
4368 // Need to re-check this char (may be linefeed)
4369 --ptr;
4370 break inner_loop;
4371 }
4372 // Fall through to next round
4373 }
4374 }
4375 }
4376
4377 if (ptr >= inputLen) { // end-of-buffer?
4378 break;
4379 }
4380 c = inputBuf[ptr++];
4381 }
4382
4383 mInputPtr = ptr;
4384
4385 /* If we end up here, we either ran out of input, or hit something
4386 * which would leave 'holes' in buffer... fine, let's return then;
4387 * we can still update shared buffer copy: would be too early to
4388 * make a copy since caller may not even be interested in the
4389 * stuff.
4390 */
4391 int len = ptr - start;
4392 mTextBuffer.resetWithShared(inputBuf, start, len);
4393 if (mCfgCoalesceText ||
4394 (mTextBuffer.size() < mShortestTextSegment)) {
4395 mTokenState = TOKEN_STARTED;
4396 } else {
4397 mTokenState = TOKEN_PARTIAL_SINGLE;
4398 }
4399 return false;
4400 }
4401
4402 /**
4403 * @return True if the whole CData section was completely read (we
4404 * hit the end marker); false if a shorter segment was returned.
4405 */
4406 protected boolean readCDataSecondary(int shortestSegment)
4407 throws XMLStreamException
4408 {
4409 // Input pointers
4410 char[] inputBuf = mInputBuffer;
4411 int inputLen = mInputEnd;
4412 int inputPtr = mInputPtr;
4413
4414 /* Output pointers; calls will also ensure that the buffer is
4415 * not shared, AND has room for one more char
4416 */
4417 char[] outBuf = mTextBuffer.getCurrentSegment();
4418 int outPtr = mTextBuffer.getCurrentSegmentSize();
4419
4420 while (true) {
4421 if (inputPtr >= inputLen) {
4422 loadMore(SUFFIX_IN_CDATA);
4423 inputBuf = mInputBuffer;
4424 inputPtr = mInputPtr;
4425 inputLen = mInputEnd;
4426 }
4427 char c = inputBuf[inputPtr++];
4428
4429 if (c < CHAR_SPACE) {
4430 if (c == '\n') {
4431 markLF(inputPtr);
4432 } else if (c == '\r') {
4433 mInputPtr = inputPtr;
4434 if (skipCRLF(c)) { // got 2 char LF
4435 if (!mNormalizeLFs) {
4436 // Special handling, to output 2 chars at a time:
4437 outBuf[outPtr++] = c;
4438 if (outPtr >= outBuf.length) { // need more room?
4439 outBuf = mTextBuffer.finishCurrentSegment();
4440 outPtr = 0;
4441 }
4442 }
4443 // And let's let default output the 2nd char, either way
4444 c = '\n';
4445 } else if (mNormalizeLFs) { // just \r, but need to convert
4446 c = '\n'; // For Mac text
4447 }
4448 /* Since skipCRLF() needs to peek(), buffer may have
4449 * changed, even if there was no CR+LF.
4450 */
4451 inputPtr = mInputPtr;
4452 inputBuf = mInputBuffer;
4453 inputLen = mInputEnd;
4454 } else if (c != '\t') {
4455 throwInvalidSpace(c);
4456 }
4457 } else if (c == ']') {
4458 // Ok; need to get ']>'
4459 mInputPtr = inputPtr;
4460 if (checkCDataEnd(outBuf, outPtr)) {
4461 return true;
4462 }
4463 inputPtr = mInputPtr;
4464 inputBuf = mInputBuffer;
4465 inputLen = mInputEnd;
4466
4467 outBuf = mTextBuffer.getCurrentSegment();
4468 outPtr = mTextBuffer.getCurrentSegmentSize();
4469 continue; // need to re-process last (non-bracket) char
4470 }
4471
4472 // Ok, let's add char to output:
4473 outBuf[outPtr++] = c;
4474
4475 // Need more room?
4476 if (outPtr >= outBuf.length) {
4477 TextBuffer tb = mTextBuffer;
4478 // Perhaps we have now enough to return?
4479 if (!mCfgCoalesceText) {
4480 tb.setCurrentLength(outBuf.length);
4481 if (tb.size() >= shortestSegment) {
4482 mInputPtr = inputPtr;
4483 return false;
4484 }
4485 }
4486 // If not, need more buffer space:
4487 outBuf = tb.finishCurrentSegment();
4488 outPtr = 0;
4489 // 17-Aug-2016, tatu: need to make sure to enforce size limits here too
4490 verifyLimit("Text size", mConfig.getMaxTextLength(), mTextBuffer.size());
4491 }
4492 }
4493 // never gets here
4494 }
4495
4496 /**
4497 * Method that will check, given the starting ']', whether there is
4498 * ending ']]>' (including optional extra ']'s); if so, will updated
4499 * output buffer with extra ]s, if not, will make sure input and output
4500 * are positioned for further checking.
4501 *
4502 * @return True, if we hit the end marker; false if not.
4503 */
4504 private boolean checkCDataEnd(char[] outBuf, int outPtr)
4505 throws XMLStreamException
4506 {
4507 int bracketCount = 0;
4508 char c;
4509 do {
4510 ++bracketCount;
4511 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
4512 : getNextCharFromCurrent(SUFFIX_IN_CDATA);
4513 } while (c == ']');
4514
4515 boolean match = (bracketCount >= 2 && c == '>');
4516 if (match) {
4517 bracketCount -= 2;
4518 }
4519 while (bracketCount > 0) {
4520 --bracketCount;
4521 outBuf[outPtr++] = ']';
4522 if (outPtr >= outBuf.length) {
4523 /* Can't really easily return, even if we have enough
4524 * stuff here, since we've more than one char...
4525 */
4526 outBuf = mTextBuffer.finishCurrentSegment();
4527 outPtr = 0;
4528 }
4529 }
4530 mTextBuffer.setCurrentLength(outPtr);
4531 // Match? Can break, then:
4532 if (match) {
4533 return true;
4534 }
4535 // No match, need to push the last char back and admit defeat...
4536 --mInputPtr;
4537 return false;
4538 }
4539
/**
 * Method called to read in consecutive beginning parts of a text
 * segment, up to either end of the segment (lt char) or until
 * first 'hole' in text (buffer end, 2-char lf to convert, entity).
 *<p>
 * When the method is called, it's expected that the first character
 * has been read as is in the current input buffer just before current
 * pointer
 *<p>
 * In all cases the text buffer ends up as a shared view on the input
 * buffer (or, for recognized indentation, as set up by
 * {@link #readIndentation}); nothing is copied here. Two kinds of
 * well-formedness problems (invalid control character, "]]&gt;" in
 * text) are stored as a pending exception instead of being thrown
 * directly, and the text preceding them is returned as a complete
 * segment.
 *
 * @param c First character of the text segment
 *
 * @return True if the whole text segment was completely read; this
 *   happens if lt-char is hit, indentation was matched, or a problem
 *   was recorded as pending exception; false if it's possible that
 *   it wasn't read (ie. end-of-buffer or entity encountered).
 */
private final boolean readTextPrimary(char c) throws XMLStreamException
{
    int ptr = mInputPtr;
    // Offset of the first char of this segment (already consumed by caller)
    int start = ptr-1;

    // First: can we heuristically canonicalize ws used for indentation?
    if (c <= CHAR_SPACE) {
        int len = mInputEnd;
        /* Even without indentation removal, it's good idea to
         * 'convert' \r or \r\n into \n (by replacing or skipping first
         * char): this may allow reusing the buffer.
         * But note that conversion MUST be enabled -- this is toggled
         * by code that includes internal entities, to prevent replacement
         * of CRs from int. general entities, as applicable.
         */
        do { // dummy loop, only to allow use of 'break'
            // We'll need at least one char, no matter what:
            if (ptr < len && mNormalizeLFs) {
                if (c == '\r') {
                    c = '\n';
                    if (mInputBuffer[ptr] == c) {
                        // Ok, whatever happens, can 'skip' \r, to point to following \n:
                        ++start;
                        // But if that's buffer end, can't skip that
                        if (++ptr >= len) {
                            break;
                        }
                    } else {
                        // Lone \r: overwrite it with \n in the input buffer
                        mInputBuffer[start] = c;
                    }
                } else if (c != '\n') {
                    // Not a linefeed: nothing to do with indentation
                    break;
                }
                markLF(ptr);
                if (mCheckIndentation > 0) {
                    // Returns negative value on match, else offset to continue from
                    ptr = readIndentation(c, ptr);
                    if (ptr < 0) { // success!
                        return true;
                    }
                }
                // If we got this far, we skipped a lf, need to read next char
                c = mInputBuffer[ptr++];
            }
        } while (false);

        // can we figure out indentation?
        mWsStatus = ALL_WS_UNKNOWN;
    } else {
        mWsStatus = ALL_WS_NO;
    }

    char[] inputBuf = mInputBuffer;
    int inputLen = mInputEnd;

    // Let's first see if we can just share input buffer:
    while (true) {
        // Chars at or above this constant need no checks at all
        if (c < CHAR_FIRST_PURE_TEXT) {
            if (c == '<') {
                // End of text: leave '<' unread, expose everything before it
                mInputPtr = --ptr;
                mTextBuffer.resetWithShared(inputBuf, start, ptr-start);
                return true;
            }
            if (c < CHAR_SPACE) {
                if (c == '\n') {
                    markLF(ptr);
                } else if (c == '\r') {
                    if (ptr >= inputLen) { // can't peek?
                        --ptr;
                        break;
                    }
                    if (mNormalizeLFs) { // can we do in-place Mac replacement?
                        if (inputBuf[ptr] == '\n') { // nope, 2 char lf
                            --ptr;
                            break;
                        }
                        /* This would otherwise be risky (may modify value of
                         * a shared entity value), but since DTDs are cached/accessed
                         * based on properties including lf-normalization there's no
                         * harm in 'fixing' it in place.
                         */
                        inputBuf[ptr-1] = '\n'; // yup
                    } else {
                        // No LF normalization... can we just skip it?
                        if (inputBuf[ptr] == '\n') {
                            ++ptr;
                        }
                    }
                    markLF(ptr);
                } else if (c != '\t') {
                    // Should consume invalid char, but not include in result
                    mInputPtr = ptr;
                    mTextBuffer.resetWithShared(inputBuf, start, ptr-start-1);
                    /* Let's defer exception, provided we got at least
                     * one valid character (if not, better throw
                     * exception right away)
                     */
                    boolean deferErrors = (ptr - start) > 1;
                    mPendingException = throwInvalidSpace(c, deferErrors);
                    return true;
                }
            } else if (c == '&') {
                // Let's push it back and break
                --ptr;
                break;
            } else if (c == '>') {
                // Let's see if we got ']]>'?
                // (only looks at chars that belong to this segment)
                if ((ptr - start) >= 3) {
                    if (inputBuf[ptr-3] == ']' && inputBuf[ptr-2] == ']') {
                        /* Let's include ']]' in there, not '>' (since that
                         * makes it non-wellformed): but need to consume
                         * that char nonetheless
                         */
                        mInputPtr = ptr;
                        mTextBuffer.resetWithShared(inputBuf, start, ptr-start-1);
                        mPendingException = throwWfcException(ErrorConsts.ERR_BRACKET_IN_TEXT, true);
                        return true; // and we are fully done
                    }
                }
            }
        } // if (char in lower code range)

        if (ptr >= inputLen) { // end-of-buffer?
            break;
        }
        c = inputBuf[ptr++];
    }
    mInputPtr = ptr;

    /* If we end up here, we either ran out of input, or hit something
     * which would leave 'holes' in buffer... fine, let's return then;
     * we can still update shared buffer copy: would be too early to
     * make a copy since caller may not even be interested in the
     * stuff.
     */
    mTextBuffer.resetWithShared(inputBuf, start, ptr - start);
    return false;
}
4692
/**
 * Secondary (copying) reader for text content: appends characters to
 * the text buffer, loading more input and expanding entities as
 * needed, until the end of the text segment, or until enough content
 * has been collected.
 *<p>
 * Reading ends when '&lt;' is seen (left unread), when no more input
 * can be loaded, when an entity reference is met that can not be
 * expanded here (left unread), when a problem is recorded as pending
 * exception, or when an output segment fills up and at least
 * <code>shortestSegment</code> characters are buffered.
 *
 * @param shortestSegment Minimum amount of buffered text that allows
 *   returning before the segment end is reached; only checked when
 *   an output segment becomes full
 * @param deferErrors Flag to enable storing an exception to a
 *   variable, instead of immediately throwing it. If true, will
 *   just store the exception; if false, will not store, just throw.
 *
 * @return True if the text segment was completely read ({@code '<'} was hit,
 *   or in non-entity-expanding mode, a non-char entity); false if
 *   it may still continue
 */
protected final boolean readTextSecondary(int shortestSegment, boolean deferErrors)
    throws XMLStreamException
{
    /* Output pointers; calls will also ensure that the buffer is
     * not shared, AND has room for at least one more char
     */
    char[] outBuf = mTextBuffer.getCurrentSegment();
    int outPtr = mTextBuffer.getCurrentSegmentSize();
    // Local copies of input state; re-synced after calls that may change it
    int inputPtr = mInputPtr;
    char[] inputBuffer = mInputBuffer;
    int inputLen = mInputEnd;

    while (true) {
        if (inputPtr >= inputLen) {
            /* 07-Oct-2005, TSa: Let's not throw an exception for EOF from
             * here -- in fragment mode, it shouldn't be thrown, and in
             * other modes we might as well first return text, and only
             * then throw an exception: no need to do that yet.
             */
            mInputPtr = inputPtr;
            if (!loadMore()) {
                break;
            }
            inputPtr = mInputPtr;
            inputBuffer = mInputBuffer;
            inputLen = mInputEnd;
        }
        char c = inputBuffer[inputPtr++];

        // Most common case is we don't have special char, thus:
        if (c < CHAR_FIRST_PURE_TEXT) {
            if (c < CHAR_SPACE) {
                if (c == '\n') {
                    markLF(inputPtr);
                } else if (c == '\r') {
                    mInputPtr = inputPtr;
                    if (skipCRLF(c)) { // got 2 char LF
                        if (!mNormalizeLFs) {
                            // Special handling, to output 2 chars at a time:
                            // (room for this first one is guaranteed)
                            outBuf[outPtr++] = c;
                            if (outPtr >= outBuf.length) { // need more room?
                                outBuf = mTextBuffer.finishCurrentSegment();
                                outPtr = 0;
                            }
                        }
                        // And let's let default output the 2nd char
                        c = '\n';
                    } else if (mNormalizeLFs) { // just \r, but need to convert
                        c = '\n'; // For Mac text
                    }
                    /* note: skipCRLF() may change ptr and len, but since
                     * it does not close input source, it won't change
                     * actual buffer object:
                     */
                    //inputBuffer = mInputBuffer;
                    inputLen = mInputEnd;
                    inputPtr = mInputPtr;
                } else if (c != '\t') {
                    // Invalid control char: consumed, but not added to output
                    mTextBuffer.setCurrentLength(outPtr);
                    mInputPtr = inputPtr;
                    mPendingException = throwInvalidSpace(c, deferErrors);
                    break;
                }
            } else if (c == '<') { // end is nigh!
                // Leave '<' unread for whoever handles the next event
                mInputPtr = inputPtr-1;
                break;
            } else if (c == '&') {
                mInputPtr = inputPtr;
                // Resolved character, as int since it may be beyond 16-bit range
                int ch;
                if (mCfgReplaceEntities) { // can we expand all entities?
                    if ((inputLen - inputPtr) >= 3
                        && (ch = resolveSimpleEntity(true)) != 0) {
                        // Ok, it's fine then
                    } else {
                        ch = fullyResolveEntity(true);
                        if (ch == 0) {
                            // Input buffer changed, nothing to output quite yet:
                            inputBuffer = mInputBuffer;
                            inputLen = mInputEnd;
                            inputPtr = mInputPtr;
                            continue;
                        }
                        // otherwise char is now fine...
                    }
                } else {
                    /* Nope, can only expand char entities; others need
                     * to be separately handled.
                     */
                    ch = resolveCharOnlyEntity(true);
                    if (ch == 0) { // some other entity...
                        /* can't expand; underlying pointer now points to
                         * char after ampersand, need to rewind
                         */
                        --mInputPtr;
                        break;
                    }
                    // .. otherwise we got char we needed
                }
                if (ch <= 0xFFFF) {
                    c = (char) ch;
                } else {
                    // Beyond 16 bits: output as a surrogate pair; first
                    // surrogate here, second one via the common path below
                    ch -= 0x10000;
                    // need more room?
                    if (outPtr >= outBuf.length) {
                        outBuf = mTextBuffer.finishCurrentSegment();
                        outPtr = 0;
                    }
                    outBuf[outPtr++] = (char) ((ch >> 10) + 0xD800);
                    if (outPtr >= outBuf.length) {
                        // Max value as the minimum length: should not get to
                        // return from between the two surrogates
                        if ((outBuf = _expandOutputForText(inputPtr, outBuf, Integer.MAX_VALUE)) == null) { // got enough, leave
                            return false;
                        }
                        outPtr = 0;
                    }
                    c = (char) ((ch & 0x3FF) + 0xDC00);
                }
                inputPtr = mInputPtr;
                // not quite sure why this is needed... but it is:
                inputLen = mInputEnd;
            } else if (c == '>') {
                // Let's see if we got ']]>'?
                /* 21-Apr-2005, TSa: But we can NOT check the output buffer
                 * as it contains _expanded_ stuff... only input side.
                 * For now, 98% accuracy has to do, as we may not be able
                 * to access previous buffer's contents. But at least we
                 * won't produce false positives from entity expansion
                 */
                if (inputPtr > 2) { // can we do it here?
                    // Since mInputPtr has been advanced, -1 refers to '>'
                    if (inputBuffer[inputPtr-3] == ']'
                        && inputBuffer[inputPtr-2] == ']') {
                        mInputPtr = inputPtr;
                        /* We have already added ']]' into output buffer...
                         * should be ok, since only with '>' does it become
                         * non-wellformed.
                         */
                        mTextBuffer.setCurrentLength(outPtr);
                        mPendingException = throwWfcException(ErrorConsts.ERR_BRACKET_IN_TEXT, deferErrors);
                        break;
                    }
                } else {
                    /* 21-Apr-2005, TSa: No good way to verify it,
                     * at this point. Should come back and think of how
                     * to properly handle this (rare) possibility.
                     */
                    ;
                }
            }
        }
        // Ok, let's add char to output:
        outBuf[outPtr++] = c;

        // Need more room?
        if (outPtr >= outBuf.length) {
            if ((outBuf = _expandOutputForText(inputPtr, outBuf, shortestSegment)) == null) { // got enough, leave
                return false;
            }
            verifyLimit("Text size", mConfig.getMaxTextLength(), mTextBuffer.size());
            outPtr = 0;
        }
    }
    // All exits via 'break' count as having read the segment completely
    mTextBuffer.setCurrentLength(outPtr);
    return true;
}
4867
4868 private final char[] _expandOutputForText(int inputPtr, char[] outBuf,
4869 int shortestSegment)
4870 {
4871 TextBuffer tb = mTextBuffer;
4872 // Perhaps we have now enough to return?
4873 tb.setCurrentLength(outBuf.length);
4874 if (tb.size() >= shortestSegment) {
4875 mInputPtr = inputPtr;
4876 return null;
4877 }
4878 // If not, need more buffer space:
4879 return tb.finishCurrentSegment();
4880 }
4881
4882 /**
4883 * Method called to try to parse and canonicalize white space that
4884 * has a good chance of being white space with somewhat regular
4885 * structure; specifically, something that looks like typical
4886 * indentation.
4887 *<p>
4888 * Note: Caller guarantees that there will be at least 2 characters
4889 * available in the input buffer. And method has to ensure that if
4890 * it does not find a match, it will return pointer value such
4891 * that there is at least one valid character remaining.
4892 *
4893 * @return -1, if the content was determined to be canonicalizable
4894 * (indentation) white space; and thus fully parsed. Otherwise
4895 * pointer (value to set to mInputPtr) to the next character
4896 * to process (not processed by this method)
4897 */
4898 private final int readIndentation(char c, int ptr)
4899 throws XMLStreamException
4900 {
4901 /* We need to verify that:
4902 * (a) we can read enough contiguous data to do determination
4903 * (b) sequence is a linefeed, with either zero or more following
4904 * spaces, or zero or more tabs; and followed by non-directive
4905 * tag (start/end tag)
4906 * and if so, we can use a canonical shared representation of
4907 * this even.
4908 */
4909 final int inputLen = mInputEnd;
4910 final char[] inputBuf = mInputBuffer;
4911 int start = ptr-1;
4912 final char lf = c;
4913
4914 // Note: caller guarantees at least one more char in the input buffer
4915 ws_loop:
4916 do { // dummy loop to allow for break (which indicates failure)
4917 c = inputBuf[ptr++];
4918 if (c == ' ' || c == '\t') { // indentation?
4919 // Need to limit to maximum
4920 int lastIndCharPos = (c == ' ') ? TextBuffer.MAX_INDENT_SPACES : TextBuffer.MAX_INDENT_TABS;
4921 lastIndCharPos += ptr;
4922 if (lastIndCharPos > inputLen) {
4923 lastIndCharPos = inputLen;
4924 }
4925
4926 inner_loop:
4927 while (true) {
4928 if (ptr >= lastIndCharPos) { // overflow; let's backtrack
4929 --ptr;
4930 break ws_loop;
4931 }
4932 char d = inputBuf[ptr++];
4933 if (d != c) {
4934 if (d == '<') { // yup, got it!
4935 break inner_loop;
4936 }
4937 --ptr; // caller needs to reprocess it
4938 break ws_loop; // nope, blew it
4939 }
4940 }
4941 // This means we had success case; let's fall through
4942 } else if (c != '<') { // nope, can not be
4943 --ptr; // simpler if we just push it back; needs to be processed later on
4944 break ws_loop;
4945 }
4946
4947 // Ok; we got '<'... just need any other char than '!'...
4948 if (ptr < inputLen && inputBuf[ptr] != '!') {
4949 // Voila!
4950 mInputPtr = --ptr; // need to push back that '<' too
4951 mTextBuffer.resetWithIndentation(ptr - start - 1, c);
4952 // One more thing: had a positive match, need to note it
4953 if (mCheckIndentation < INDENT_CHECK_MAX) {
4954 mCheckIndentation += INDENT_CHECK_START;
4955 }
4956 mWsStatus = ALL_WS_YES;
4957 return -1;
4958 }
4959 // Nope: need to push '<' back, then
4960 --ptr;
4961 } while (false);
4962
4963 // Ok, nope... caller can/need to take care of it:
4964 /* Also, we may need to subtract indentation check count to possibly
4965 * disable this check if it doesn't seem to work.
4966 */
4967 --mCheckIndentation;
4968 /* Also; if lf we got was \r, need to convert it now (this
4969 * method only gets called in lf converting mode)
4970 * (and yes, it is safe to modify input buffer at this point;
4971 * see calling method for details)
4972 */
4973 if (lf == '\r') {
4974 inputBuf[start] = '\n';
4975 }
4976 return ptr;
4977 }
4978
4979 /**
4980 * Reading whitespace should be very similar to reading normal text;
4981 * although couple of simplifications can be made. Further, since this
4982 * method is very unlikely to be of much performance concern, some
4983 * optimizations are left out, where it simplifies code.
4984 *
4985 * @param c First white space characters; known to contain white space
4986 * at this point
4987 * @param prologWS If true, is reading white space outside XML tree,
4988 * and as such can get EOF. If false, should not get EOF, nor be
4989 * followed by any other char than <
4990 *
4991 * @return True if the whole white space segment was read; false if
4992 * something prevented that (end of buffer, replaceable 2-char lf)
4993 */
4994 private final boolean readSpacePrimary(char c, boolean prologWS)
4995 throws XMLStreamException
4996 {
4997 int ptr = mInputPtr;
4998 char[] inputBuf = mInputBuffer;
4999 int inputLen = mInputEnd;
5000 int start = ptr-1;
5001
5002 // Let's first see if we can just share input buffer:
5003 while (true) {
5004 /* 30-Aug-2006, TSa: Let's not check for validity errors yet,
5005 * even if we could detect problems at this point.
5006 * This because it's not always
5007 * an error (in dtd-aware, non-validating mode); but also since
5008 * that way we can first return all space we got, and only
5009 * indicate error when next token is to be accessed.
5010 */
5011 if (c > CHAR_SPACE) { // End of whitespace
5012 mInputPtr = --ptr;
5013 mTextBuffer.resetWithShared(mInputBuffer, start, ptr-start);
5014 return true;
5015 }
5016
5017 if (c == '\n') {
5018 markLF(ptr);
5019 } else if (c == '\r') {
5020 if (ptr >= mInputEnd) { // can't peek?
5021 --ptr;
5022 break;
5023 }
5024 if (mNormalizeLFs) { // can we do in-place Mac replacement?
5025 if (inputBuf[ptr] == '\n') { // nope, 2 char lf
5026 --ptr;
5027 break;
5028 }
5029 inputBuf[ptr-1] = '\n'; // yup
5030 } else {
5031 // No LF normalization... can we just skip it?
5032 if (inputBuf[ptr] == '\n') {
5033 ++ptr;
5034 }
5035 }
5036 markLF(ptr);
5037 } else if (c != CHAR_SPACE && c != '\t') {
5038 throwInvalidSpace(c);
5039 }
5040 if (ptr >= inputLen) { // end-of-buffer?
5041 break;
5042 }
5043 c = inputBuf[ptr++];
5044 }
5045
5046 mInputPtr = ptr;
5047
5048 /* Ok, couldn't read it completely, let's just return whatever
5049 * we did get as shared data
5050 */
5051 mTextBuffer.resetWithShared(inputBuf, start, ptr - start);
5052 return false;
5053 }
5054
5055 /**
5056 * This is very similar to readSecondaryText(); called when we need
5057 * to read in rest of (ignorable) white space segment.
5058 *
5059 * @param prologWS True if the ignorable white space is within prolog
5060 * (or epilog); false if it's within xml tree.
5061 */
5062 private void readSpaceSecondary(boolean prologWS)
5063 throws XMLStreamException
5064 {
5065 /* Let's not bother optimizing input. However, we can easily optimize
5066 * output, since it's easy to do, yet has more effect on performance
5067 * than localizing input variables.
5068 */
5069 char[] outBuf = mTextBuffer.getCurrentSegment();
5070 int outPtr = mTextBuffer.getCurrentSegmentSize();
5071
5072 while (true) {
5073 if (mInputPtr >= mInputEnd) {
5074 /* 07-Oct-2005, TSa: Let's not throw an exception yet --
5075 * can return SPACE, and let exception be thrown
5076 * when trying to fetch next event.
5077 */
5078 if (!loadMore()) {
5079 break;
5080 }
5081 }
5082 char c = mInputBuffer[mInputPtr];
5083 if (c > CHAR_SPACE) { // end of WS?
5084 break;
5085 }
5086 ++mInputPtr;
5087 if (c == '\n') {
5088 markLF();
5089 } else if (c == '\r') {
5090 if (skipCRLF(c)) {
5091 if (!mNormalizeLFs) {
5092 // Special handling, to output 2 chars at a time:
5093 outBuf[outPtr++] = c;
5094 if (outPtr >= outBuf.length) { // need more room?
5095 outBuf = mTextBuffer.finishCurrentSegment();
5096 outPtr = 0;
5097 }
5098 }
5099 c = '\n';
5100 } else if (mNormalizeLFs) {
5101 c = '\n'; // For Mac text
5102 }
5103 } else if (c != CHAR_SPACE && c != '\t') {
5104 throwInvalidSpace(c);
5105 }
5106
5107 // Ok, let's add char to output:
5108 outBuf[outPtr++] = c;
5109
5110 // Need more room?
5111 if (outPtr >= outBuf.length) {
5112 outBuf = mTextBuffer.finishCurrentSegment();
5113 outPtr = 0;
5114 }
5115 }
5116 mTextBuffer.setCurrentLength(outPtr);
5117 }
5118
5119 /**
5120 * Method called to read the contents of the current CHARACTERS
5121 * event, and write all contents using the specified Writer.
5122 *
5123 * @param w Writer to use for writing out textual content parsed
5124 *
5125 * @return Total number of characters written using the writer
5126 */
5127 private int readAndWriteText(Writer w)
5128 throws IOException, XMLStreamException
5129 {
5130 mTokenState = TOKEN_FULL_SINGLE; // we'll read it all
5131
5132 /* We should be able to mostly just use the input buffer at this
5133 * point; exceptions being two-char linefeeds (when converting
5134 * to single ones) and entities (which likewise can expand or
5135 * shrink), both of which require flushing and/or single byte
5136 * output.
5137 */
5138 int start = mInputPtr;
5139 int count = 0;
5140
5141 main_loop:
5142 while (true) {
5143 char c;
5144 // Reached the end of buffer? Need to flush, then
5145 if (mInputPtr >= mInputEnd) {
5146 int len = mInputPtr - start;
5147 if (len > 0) {
5148 w.write(mInputBuffer, start, len);
5149 count += len;
5150 }
5151 c = getNextChar(SUFFIX_IN_TEXT);
5152 start = mInputPtr-1; // needs to be prior to char we got
5153 } else {
5154 c = mInputBuffer[mInputPtr++];
5155 }
5156 // Most common case is we don't have a special char, thus:
5157 if (c < CHAR_FIRST_PURE_TEXT) {
5158 if (c < CHAR_SPACE) {
5159 if (c == '\n') {
5160 markLF();
5161 } else if (c == '\r') {
5162 char d;
5163 final boolean atBoundary = (mInputPtr >= mInputEnd);
5164 if (atBoundary) {
5165 // If we can't peek easily, let's flush past stuff and load
5166 // more... (have to flush, since new read will overwrite input buffers)
5167 // 06-Dec-2019, tatu: [woodstox-core#97] Need to avoid copying \r tho:
5168 int len = mInputPtr - start - 1;
5169 if (len > 0) {
5170 w.write(mInputBuffer, start, len);
5171 count += len;
5172 }
5173 d = getNextChar(SUFFIX_IN_TEXT);
5174 start = mInputPtr; // to mark 'no past content'
5175 } else {
5176 d = mInputBuffer[mInputPtr++];
5177 }
5178 if (d == '\n') {
5179 if (mNormalizeLFs) {
5180 // Let's flush content prior to 2-char LF, and start the new
5181 // segment on the second char... this way, no mods are needed
5182 // for the buffer, AND it'll also work on split 2-char lf!
5183 int len = mInputPtr - start - 2;
5184 if (len > 0) {
5185 w.write(mInputBuffer, start, len);
5186 count += len;
5187 }
5188 start = mInputPtr-1; // so '\n' is the first char
5189 } else {
5190 // otherwise it's good as is... almost
5191 if (atBoundary) { // except, we don't want to lose that \r!
5192 w.write(c);
5193 }
5194 }
5195 } else { // not 2-char... need to replace?
5196 // First: push back whatever non-linefeed we got:
5197 --mInputPtr;
5198 // 06-Dec-2019, tatu: But beware [woodstox-core#97]
5199 if (atBoundary) {
5200 // If at boundary, no room to replace; must write single lf char
5201 w.write(mNormalizeLFs ? '\n' : c);
5202 ++count;
5203 } else { // but if not at boundary, can just replace lone '\r' if need be
5204 if (mNormalizeLFs) { // replace \r with \n
5205 mInputBuffer[mInputPtr-1] = '\n';
5206 }
5207 }
5208 }
5209 markLF();
5210 } else if (c != '\t') {
5211 throwInvalidSpace(c);
5212 }
5213 } else if (c == '<') { // end is nigh!
5214 break main_loop;
5215 } else if (c == '&') {
5216 // Have to flush all stuff, since entities pretty much
5217 // force it; input buffer won't be contiguous
5218 int len = mInputPtr - 1 - start; // -1 to remove ampersand
5219 if (len > 0) {
5220 w.write(mInputBuffer, start, len);
5221 count += len;
5222 }
5223 int ch;
5224 if (mCfgReplaceEntities) { // can we expand all entities?
5225 if ((mInputEnd - mInputPtr) < 3
5226 || (ch = resolveSimpleEntity(true)) == 0) {
5227 ch = fullyResolveEntity(true);
5228 }
5229 } else {
5230 ch = resolveCharOnlyEntity(true);
5231 if (ch == 0) { // some other entity...
5232 /* can't expand, so, let's just bail out... but
5233 * let's also ensure no text is added twice, as
5234 * all prev text was just flushed, but resolve
5235 * may have moved input buffer around.
5236 */
5237 start = mInputPtr;
5238 break main_loop;
5239 }
5240 }
5241 if (ch != 0) {
5242 if (ch <= 0xFFFF) {
5243 c = (char) ch;
5244 } else {
5245 ch -= 0x10000;
5246 w.write((char) ((ch >> 10) + 0xD800));
5247 c = (char) ((ch & 0x3FF) + 0xDC00);
5248 }
5249 w.write(c);
5250 ++count;
5251 }
5252 start = mInputPtr;
5253 } else if (c == '>') { // did we get ']]>'?
5254 /* 21-Apr-2005, TSa: But we can NOT check the output buffer
5255 * (see comments in readTextSecondary() for details)
5256 */
5257 if (mInputPtr >= 2) { // can we do it here?
5258 if (mInputBuffer[mInputPtr-2] == ']'
5259 && mInputBuffer[mInputPtr-1] == ']') {
5260 // Anything to flush?
5261 int len = mInputPtr - start;
5262 if (len > 0) {
5263 w.write(mInputBuffer, start, len);
5264 }
5265 throwParseError(ErrorConsts.ERR_BRACKET_IN_TEXT);
5266 }
5267 } else {
5268 ; // !!! TBI: how to check past boundary?
5269 }
5270 } else if (c == CHAR_NULL) {
5271 throwNullChar();
5272 }
5273 }
5274 } // while (true)
5275
5276 /* Need to push back '<' or '&', whichever caused us to
5277 * get out...
5278 */
5279 --mInputPtr;
5280
5281 // Anything left to flush?
5282 int len = mInputPtr - start;
5283 if (len > 0) {
5284 w.write(mInputBuffer, start, len);
5285 count += len;
5286 }
5287 return count;
5288 }
5289
5290 /**
5291 * Method called to read the contents of the current (possibly partially
5292 * read) CDATA
5293 * event, and write all contents using the specified Writer.
5294 *
5295 * @param w Writer to use for writing out textual content parsed
5296 *
5297 * @return Total number of characters written using the writer for
5298 * the current CDATA event
5299 */
5300 private int readAndWriteCData(Writer w)
5301 throws IOException, XMLStreamException
5302 {
5303 mTokenState = TOKEN_FULL_SINGLE; // we'll read it all
5304
5305 /* Ok; here we can basically have 2 modes; first the big loop to
5306 * gather all data up until a ']'; and then another loop to see
5307 * if ']' is part of ']]>', and after this if no end marker found,
5308 * go back to the first part.
5309 */
5310 char c = (mInputPtr < mInputEnd) ?
5311 mInputBuffer[mInputPtr++] : getNextChar(SUFFIX_IN_CDATA);
5312 int count = 0;
5313
5314 main_loop:
5315 while (true) {
5316 int start = mInputPtr-1;
5317
5318 quick_loop:
5319 while (true) {
5320 if (c > CHAR_CR_LF_OR_NULL) {
5321 if (c == ']') {
5322 break quick_loop;
5323 }
5324 } else {
5325 if (c < CHAR_SPACE) {
5326 if (c == '\n') {
5327 markLF();
5328 } else if (c == '\r') {
5329 char d;
5330 if (mInputPtr >= mInputEnd) {
5331 /* If we can't peek easily, let's flush past stuff
5332 * and load more... (have to flush, since new read
5333 * will overwrite inbut buffers)
5334 */
5335 int len = mInputPtr - start;
5336 if (len > 0) {
5337 w.write(mInputBuffer, start, len);
5338 count += len;
5339 }
5340 d = getNextChar(SUFFIX_IN_CDATA);
5341 start = mInputPtr; // to mark 'no past content'
5342 } else {
5343 d = mInputBuffer[mInputPtr++];
5344 }
5345 if (d == '\n') {
5346 if (mNormalizeLFs) {
5347 /* Let's flush content prior to 2-char LF, and
5348 * start the new segment on the second char...
5349 * this way, no mods are needed for the buffer,
5350 * AND it'll also work on split 2-char lf!
5351 */
5352 int len = mInputPtr - 2 - start;
5353 if (len > 0) {
5354 w.write(mInputBuffer, start, len);
5355 count += len;
5356 }
5357 start = mInputPtr-1; // so '\n' is the first char
5358 } else {
5359 // otherwise it's good as is
5360 }
5361 } else { // not 2-char... need to replace?
5362 --mInputPtr;
5363 if (mNormalizeLFs) {
5364 mInputBuffer[mInputPtr-1] = '\n';
5365 }
5366 }
5367 markLF();
5368 } else if (c != '\t') {
5369 throwInvalidSpace(c);
5370 }
5371 }
5372 }
5373 // Reached the end of buffer? Need to flush, then
5374 if (mInputPtr >= mInputEnd) {
5375 int len = mInputPtr - start;
5376 if (len > 0) {
5377 w.write(mInputBuffer, start, len);
5378 count += len;
5379 }
5380 start = 0;
5381 c = getNextChar(SUFFIX_IN_CDATA);
5382 } else {
5383 c = mInputBuffer[mInputPtr++];
5384 }
5385 } // while (true)
5386
5387 // Anything to flush once we hit ']'?
5388 {
5389 /* -1 since the last char in there (a '[') is NOT to be
5390 * output at this point
5391 */
5392 int len = mInputPtr - start - 1;
5393 if (len > 0) {
5394 w.write(mInputBuffer, start, len);
5395 count += len;
5396 }
5397 }
5398
5399 /* Ok; we only get this far when we hit a ']'. We got one,
5400 * so let's see if we can find at least one more bracket,
5401 * immediately followed by '>'...
5402 */
5403 int bracketCount = 0;
5404 do {
5405 ++bracketCount;
5406 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
5407 : getNextCharFromCurrent(SUFFIX_IN_CDATA);
5408 } while (c == ']');
5409
5410 boolean match = (bracketCount >= 2 && c == '>');
5411 if (match) {
5412 bracketCount -= 2;
5413 }
5414 while (bracketCount > 0) {
5415 --bracketCount;
5416 w.write(']');
5417 ++count;
5418 }
5419 if (match) {
5420 break main_loop;
5421 }
5422 /* Otherwise we'll just loop; now c is properly set to be
5423 * the next char as well.
5424 */
5425 } // while (true)
5426
5427 return count;
5428 }
5429
5430 /**
5431 * @return Number of characters written to Writer during the call
5432 */
5433 private int readAndWriteCoalesced(Writer w, boolean wasCData)
5434 throws IOException, XMLStreamException
5435 {
5436 mTokenState = TOKEN_FULL_COALESCED;
5437 int count = 0;
5438
5439 /* Ok, so what do we have next? CDATA, CHARACTERS, or something
5440 * else?
5441 */
5442 main_loop:
5443 while (true) {
5444 if (mInputPtr >= mInputEnd) {
5445 if (!loadMore()) {
5446 /* Shouldn't normally happen, but let's just let
5447 * caller deal with it...
5448 */
5449 break main_loop;
5450 }
5451 }
5452 // Let's peek, ie. not advance it yet
5453 char c = mInputBuffer[mInputPtr];
5454 if (c == '<') { // CDATA, maybe?
5455 // Need to distinguish "<![" from other tags/directives
5456 if ((mInputEnd - mInputPtr) < 3) {
5457 if (!ensureInput(3)) { // likewise, probably an error...
5458 break main_loop;
5459 }
5460 }
5461 if (mInputBuffer[mInputPtr+1] != '!'
5462 || mInputBuffer[mInputPtr+2] != '[') {
5463 // Nah, some other tag or directive
5464 break main_loop;
5465 }
5466 // Let's skip beginning parts, then:
5467 mInputPtr += 3;
5468 // And verify we get proper CDATA directive
5469 checkCData();
5470 // cool, let's just handle it then
5471 count += readAndWriteCData(w);
5472 wasCData = true;
5473 } else { // text
5474 /* Did we hit an 'unexpandable' entity? If so, need to
5475 * just bail out (only happens when Coalescing AND not
5476 * expanding -- a rather unlikely combination)
5477 */
5478 if (c == '&' && !wasCData) {
5479 break;
5480 }
5481 count += readAndWriteText(w);
5482 wasCData = false;
5483 }
5484 }
5485
5486 return count;
5487 }
5488
5489 /*
5490 ///////////////////////////////////////////////////////////////////////
5491 // Internal methods, low-level input access
5492 ///////////////////////////////////////////////////////////////////////
5493 */
5494
5495 /**
5496 * Method that will skip any white space from input source(s)
5497 *
5498 * @return true If at least one white space was skipped; false
5499 * if not (character passed was not white space)
5500 */
5501 protected final boolean skipWS(char c)
5502 throws XMLStreamException
5503 {
5504 if (c > CHAR_SPACE) {
5505 return false;
5506 }
5507 while (true) {
5508 // Linefeed?
5509 if (c == '\n' || c == '\r') {
5510 skipCRLF(c);
5511 } else if (c != CHAR_SPACE && c != '\t') {
5512 throwInvalidSpace(c);
5513 }
5514 if (mInputPtr >= mInputEnd) {
5515 // Let's see if current source has more
5516 if (!loadMoreFromCurrent()) {
5517 return true;
5518 }
5519 }
5520 c = mInputBuffer[mInputPtr];
5521 if (c > CHAR_SPACE) { // not WS? Need to return
5522 return true;
5523 }
5524 ++mInputPtr;
5525 }
5526 }
5527
5528 /*
5529 ///////////////////////////////////////////////////////////////////////
5530 // Abstract method implementations
5531 ///////////////////////////////////////////////////////////////////////
5532 */
5533
5534 @Override
5535 protected EntityDecl findEntity(String id, Object arg)
5536 throws XMLStreamException
5537 {
5538 EntityDecl ed = mConfig.findCustomInternalEntity(id);
5539 if (ed == null && mGeneralEntities != null) {
5540 ed = mGeneralEntities.get(id);
5541 }
5542 /* 05-Mar-2006, TSa: Externally declared entities are illegal
5543 * if we were declared as "standalone='yes'"...
5544 */
5545 if (mDocStandalone == DOC_STANDALONE_YES) {
5546 if (ed != null && ed.wasDeclaredExternally()) {
5547 throwParseError(ErrorConsts.ERR_WF_ENTITY_EXT_DECLARED, ed.getName(), null);
5548 }
5549 }
5550 return ed;
5551 }
5552
5553 @Override
5554 protected void handleUndeclaredEntity(String id)
5555 throws XMLStreamException
5556 {
5557 throwParseError(((mDocStandalone == DOC_STANDALONE_YES) ?
5558 ErrorConsts.ERR_WF_GE_UNDECLARED_SA :
5559 ErrorConsts.ERR_WF_GE_UNDECLARED),
5560 id, null);
5561 }
5562
5563 @Override
5564 protected void handleIncompleteEntityProblem(WstxInputSource closing)
5565 throws XMLStreamException
5566 {
5567 String top = mElementStack.isEmpty() ? "[ROOT]" : mElementStack.getTopElementDesc();
5568 throwParseError("Unexpected end of entity expansion for entity &{0}; was expecting a close tag for element <{1}>",
5569 closing.getEntityId(), top);
5570 }
5571
5572 /*
5573 ///////////////////////////////////////////////////////////////////////
5574 // Internal methods, validation, error handling and reporting
5575 ///////////////////////////////////////////////////////////////////////
5576 */
5577
5578 /**
5579 * This problem gets reported if an entity tries to expand to
5580 * a close tag matching start tag that did not came from the same
5581 * entity (but from parent).
5582 */
5583 protected void handleGreedyEntityProblem(WstxInputSource input)
5584 throws XMLStreamException
5585 {
5586 String top = mElementStack.isEmpty() ? "[ROOT]" : mElementStack.getTopElementDesc();
5587 throwParseError("Improper GE/element nesting: entity &"
5588 +input.getEntityId()+" contains closing tag for <"+top+">");
5589 }
5590
5591 private void throwNotTextual(int type) {
5592 throw new IllegalStateException("Not a textual event ("
5593 +tokenTypeDesc(type)+")");
5594 }
5595
5596 private void throwNotTextXxx(int type) {
5597 throw new IllegalStateException("getTextXxx() methods can not be called on "
5598 +tokenTypeDesc(type));
5599 }
5600
5601 protected void throwNotTextualOrElem(int type) {
5602 throw new IllegalStateException(MessageFormat.format(ErrorConsts.ERR_STATE_NOT_ELEM_OR_TEXT,
5603 new Object[] { tokenTypeDesc(type) }));
5604 }
5605
5606 /**
5607 * Method called when we get an EOF within content tree
5608 */
5609 protected void throwUnexpectedEOF() throws WstxException {
5610 throwUnexpectedEOF("; was expecting a close tag for element <"+mElementStack.getTopElementDesc()+">");
5611 }
5612
5613 /**
5614 * Method called to report a problem with
5615 */
5616 protected XMLStreamException _constructUnexpectedInTyped(int nextToken) {
5617 if (nextToken == START_ELEMENT) {
5618 return _constructTypeException("Element content can not contain child START_ELEMENT when using Typed Access methods", null);
5619 }
5620 return _constructTypeException("Expected a text token, got "+tokenTypeDesc(nextToken), null);
5621 }
5622
5623 protected TypedXMLStreamException _constructTypeException(String msg, String lexicalValue) {
5624 return new TypedXMLStreamException(lexicalValue, msg, getStartLocation());
5625 }
5626
5627 /**
5628 * Stub method implemented by validating parsers, to report content
5629 * that's not valid for current element context. Defined at this
5630 * level since some such problems need to be caught at low-level;
5631 * however, details of error reports are not needed here.
5632 *
5633 * @param evtType Type of event that contained unexpected content
5634 */
5635 protected void reportInvalidContent(int evtType) throws XMLStreamException {
5636 // should never happen; sub-class has to override:
5637 throwParseError("Internal error: sub-class should override method");
5638 }
5639 }
5640