1 /* Woodstox XML processor
2 *
3 * Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi
4 *
5 * Licensed under the License specified in the file LICENSE which is
6 * included with the source code.
7 * You may not use this file except in compliance with the License.
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package com.ctc.wstx.evt;
17
18 import java.util.NoSuchElementException;
19
20 import javax.xml.stream.*;
21 import javax.xml.stream.events.Characters;
22 import javax.xml.stream.events.XMLEvent;
23 import javax.xml.stream.util.XMLEventAllocator;
24
25 import org.codehaus.stax2.XMLEventReader2;
26 import org.codehaus.stax2.XMLStreamReader2;
27
28 import com.ctc.wstx.cfg.ErrorConsts;
29 import com.ctc.wstx.exc.WstxParsingException;
30 import com.ctc.wstx.sr.StreamScanner;
31
/**
 * Woodstox version of {@link XMLEventReader2} (and {@link XMLEventReader}).
 *<p>
 * NOTE: up to Woodstox 5.1, this was based on the Stax2 Reference Implementation
 * ({@link org.codehaus.stax2.ri.Stax2EventReaderImpl}), but due to various issues
 * it has temporarily (?) been cut-paste-modified here. Ideally it would be reconciled
 * once Stax2-api version 4.2 can be relied upon as the baseline, but that may take time.
 */
40 public class WstxEventReader
41 // extends Stax2EventReaderImpl // before 5.2
42 implements XMLEventReader2, XMLStreamConstants
43 {
// // // Enumerated state ids

/**
 * Initial value of {@link #mState}: nothing has been returned or peeked
 * yet, so the START_DOCUMENT event still needs to be created.
 */
protected final static int STATE_INITIAL = 1;
/**
 * End state: {@link #hasNext} returns false and {@link #peek}
 * returns null once {@link #mState} has this value.
 */
protected final static int STATE_END_OF_INPUT = 2;
/**
 * Normal operation; state that follows {@link #STATE_INITIAL}.
 */
protected final static int STATE_CONTENT = 3;


// // // Enumerated error case ids

/**
 * Current event was not START_ELEMENT when
 * <code>getElementText()</code> was called
 */
protected final static int ERR_GETELEMTEXT_NOT_START_ELEM = 1;

/**
 * Encountered a non-textual event (other than the closing END_ELEMENT)
 * when collecting text for <code>getElementText()</code>
 */
protected final static int ERR_GETELEMTEXT_NON_TEXT_EVENT = 2;

/**
 * Encountered CHARACTERS or CDATA that contains non-white space
 * char(s), when trying to locate a tag with <code>nextTag()</code>
 */
protected final static int ERR_NEXTTAG_NON_WS_TEXT = 3;

/**
 * Encountered a non-skippable non-text/element event with
 * <code>nextTag()</code>
 */
protected final static int ERR_NEXTTAG_WRONG_TYPE = 4;
75
76 /*
77 /**********************************************************************
78 /* Configuration
79 /**********************************************************************
80 */
81
/**
 * Allocator that is asked to build {@link XMLEvent} instances from the
 * current state of {@link #mReader}.
 */
protected final XMLEventAllocator mAllocator;

/**
 * Underlying stream reader; advanced by this event reader and passed
 * to {@link #mAllocator} whenever an event object is needed.
 */
protected final XMLStreamReader2 mReader;
85
86 /*
87 /**********************************************************************
88 /* State
89 /**********************************************************************
90 */
91
/**
 * Event that has been peeked, i.e. loaded without a call to
 * {@link #nextEvent}; will be returned and cleared by a
 * call to {@link #nextEvent} (or returned again if peeked
 * again)
 */
protected XMLEvent mPeekedEvent = null;

/**
 * High-level state indicator, with currently three values:
 * whether we are initializing (need to synthesize START_DOCUMENT),
 * at END_OF_INPUT (end-of-doc), or otherwise, normal operation.
 * Useful in simplifying some methods, as well as to make sure
 * that independent of how the stream reader handles things, the event
 * reader can reliably detect End-Of-Document.
 */
protected int mState = STATE_INITIAL;

/**
 * This variable keeps track of the type of the 'previous' event
 * when peeking for the next event. It is needed for some functionality,
 * to remember state even when the underlying parser has to move to peek
 * the next event.
 */
protected int mPrePeekEvent = START_DOCUMENT;
117
118 /*
119 /**********************************************************************
120 /* Woodstox-specific
121 /**********************************************************************
122 */
123
/**
 * Marker flag to allow specialized handling in "multi-document" reading
 * mode. Assigned once in the constructor and consulted by
 * {@link #updateStateEndDocument} when END_DOCUMENT is reached.
 */
protected final boolean mCfgMultiDocMode;
129
130 /*
131 /**********************************************************************
132 /* Construction
133 /**********************************************************************
134 */
135
136 public WstxEventReader(XMLEventAllocator a, XMLStreamReader2 r)
137 {
138 mAllocator = a;
139 mReader = r;
140 mCfgMultiDocMode = (r instanceof StreamScanner)
141 && ((StreamScanner) r).getConfig().inputParsingModeDocuments();
142 }
143
144 /*
145 /**********************************************************************
146 /* Abstract methods that Stax2EventReaderImpl would expose
147 /**********************************************************************
148 */
149
150 @Override
151 public boolean isPropertySupported(String name)
152 {
153 return ((XMLStreamReader2)getStreamReader()).isPropertySupported(name);
154 }
155
156 @Override
157 public boolean setProperty(String name, Object value)
158 {
159 return ((XMLStreamReader2)getStreamReader()).setProperty(name, value);
160 }
161
162 /**
163 * Method called upon encountering a problem that should result
164 * in an exception being thrown. If non-null String is returned.
165 * that will be used as the message of exception thrown; if null,
166 * a standard message will be used instead.
167 *
168 * @param errorType Type of the problem, one of <code>ERR_</code>
169 * constants
170 * @param currEvent Type of the event that triggered the problem,
171 * if any; -1 if not available.
172 */
173 protected String getErrorDesc(int errorType, int currEvent)
174 {
175 // Defaults are mostly fine, except we can easily add event type desc
176 switch (errorType) {
177 case ERR_GETELEMTEXT_NOT_START_ELEM:
178 return ErrorConsts.ERR_STATE_NOT_STELEM+", got "+ErrorConsts.tokenTypeDesc(currEvent);
179 case ERR_GETELEMTEXT_NON_TEXT_EVENT:
180 return "Expected a text token, got "+ErrorConsts.tokenTypeDesc(currEvent);
181 case ERR_NEXTTAG_NON_WS_TEXT:
182 return "Only all-whitespace CHARACTERS/CDATA (or SPACE) allowed for nextTag(), got "+ErrorConsts.tokenTypeDesc(currEvent);
183 case ERR_NEXTTAG_WRONG_TYPE:
184 return "Got "+ErrorConsts.tokenTypeDesc(currEvent)+", instead of START_ELEMENT, END_ELEMENT or SPACE";
185 }
186 return null;
187 }
188
189 /*
190 /**********************************************************************
191 /* XMLEventReader API
192 /**********************************************************************
193 */
194
195 @Override
196 public void close() throws XMLStreamException
197 {
198 mReader.close();
199 }
200
201 @Override
202 public String getElementText() throws XMLStreamException
203 {
204 /* Simple, if no peeking occured: can just forward this to the
205 * underlying parser
206 */
207 if (mPeekedEvent == null) {
208 return mReader.getElementText();
209 }
210
211 XMLEvent evt = mPeekedEvent;
212 mPeekedEvent = null;
213
214 /* Otherwise need to verify that we are currently over START_ELEMENT.
215 * Problem is we have already went past it...
216 */
217 if (mPrePeekEvent != START_ELEMENT) {
218 reportProblem(findErrorDesc(ERR_GETELEMTEXT_NOT_START_ELEM, mPrePeekEvent));
219 }
220 // ??? do we need to update mPrePeekEvent now
221
222 String str = null;
223 StringBuffer sb = null;
224
225 // Ok, fine, then just need to loop through and get all the text...
226 for (; true; evt = nextEvent()) {
227 if (evt.isEndElement()) {
228 break;
229 }
230 int type = evt.getEventType();
231 if (type == COMMENT || type == PROCESSING_INSTRUCTION) {
232 // can/should just ignore them
233 continue;
234 }
235 if (!evt.isCharacters()) {
236 reportProblem(findErrorDesc(ERR_GETELEMTEXT_NON_TEXT_EVENT, type));
237 }
238 String curr = evt.asCharacters().getData();
239 if (str == null) {
240 str = curr;
241 } else {
242 if (sb == null) {
243 sb = new StringBuffer(str.length() + curr.length());
244 sb.append(str);
245 }
246 sb.append(curr);
247 }
248 }
249
250 if (sb != null) {
251 return sb.toString();
252 }
253 return (str == null) ? "" : str;
254 }
255
256 @Override
257 public Object getProperty(String name) {
258 return mReader.getProperty(name);
259 }
260
261 @Override
262 public boolean hasNext() {
263 return (mState != STATE_END_OF_INPUT);
264 }
265
266 @Override
267 public XMLEvent nextEvent() throws XMLStreamException
268 {
269 if (mState == STATE_END_OF_INPUT) {
270 throwEndOfInput();
271 } else if (mState == STATE_INITIAL) {
272 mState = STATE_CONTENT;
273 return createStartDocumentEvent();
274 }
275 if (mPeekedEvent != null) {
276 XMLEvent evt = mPeekedEvent;
277 mPeekedEvent = null;
278 if (evt.isEndDocument()) {
279 updateStateEndDocument();
280 }
281 return evt;
282 }
283 return createNextEvent(true, mReader.next());
284 }
285
286 @Override
287 public Object next() {
288 try {
289 return nextEvent();
290 } catch (XMLStreamException sex) {
291 throwUnchecked(sex);
292 return null;
293 }
294 }
295
296 @Override
297 public XMLEvent nextTag() throws XMLStreamException
298 {
299 // If we have peeked something, need to process it
300 if (mPeekedEvent != null) {
301 XMLEvent evt = mPeekedEvent;
302 mPeekedEvent = null;
303 int type = evt.getEventType();
304 switch (type) {
305 case END_DOCUMENT:
306 return null;
307 case START_DOCUMENT:
308 // Need to skip START_DOCUMENT to get the root elem
309 break;
310 case SPACE:
311 // Ignorable WS is just fine
312 break;
313
314 /* !!! 07-Dec-2004, TSa: Specs are mum about Comments and PIs.
315 * But why would they not be skipped just like what
316 * the stream reader does?
317 */
318 case COMMENT:
319 case PROCESSING_INSTRUCTION:
320 break;
321 case CDATA:
322 case CHARACTERS:
323 if (((Characters) evt).isWhiteSpace()) {
324 break;
325 }
326 reportProblem(findErrorDesc(ERR_NEXTTAG_NON_WS_TEXT, type));
327 break; // never gets here, but some compilers whine without...
328 case START_ELEMENT:
329 case END_ELEMENT:
330 return evt;
331
332 default:
333 reportProblem(findErrorDesc(ERR_NEXTTAG_WRONG_TYPE, type));
334 }
335 } else {
336 /* 13-Sep-2005, TSa: As pointed out by Patrick, we may need to
337 * initialize the state here, too; otherwise peek() won't work
338 * correctly. The problem is that following loop's get method
339 * does not use event reader's method but underlying reader's.
340 * As such, it won't update state: most importantly, initial
341 * state may not be changed to non-initial.
342 */
343 if (mState == STATE_INITIAL) {
344 mState = STATE_CONTENT;
345 }
346 }
347
348 while (true) {
349 int next = mReader.next();
350
351 switch (next) {
352 case END_DOCUMENT:
353 return null;
354 case SPACE:
355 case COMMENT:
356 case PROCESSING_INSTRUCTION:
357 continue;
358 case CDATA:
359 case CHARACTERS:
360 if (mReader.isWhiteSpace()) {
361 continue;
362 }
363 reportProblem(findErrorDesc(ERR_NEXTTAG_NON_WS_TEXT, next));
364 break; // just to keep Jikes happy...
365
366 case START_ELEMENT:
367 case END_ELEMENT:
368 return createNextEvent(false, next);
369
370 default:
371 reportProblem(findErrorDesc(ERR_NEXTTAG_WRONG_TYPE, next));
372 }
373 }
374 }
375
376 @Override
377 public XMLEvent peek() throws XMLStreamException
378 {
379 if (mPeekedEvent == null) {
380 if (mState == STATE_END_OF_INPUT) {
381 // 06-Mar-2006, TSa: Fixed as per Arjen's suggestion:
382 //throwEndOfInput();
383 return null;
384 }
385 if (mState == STATE_INITIAL) {
386 // Not sure what it should be... but this should do:
387 mPrePeekEvent = START_DOCUMENT;
388 mPeekedEvent = createStartDocumentEvent();
389 mState = STATE_CONTENT;
390 } else {
391 mPrePeekEvent = mReader.getEventType();
392 mPeekedEvent = createNextEvent(false, mReader.next());
393 }
394 }
395 return mPeekedEvent;
396 }
397
398 /**
399 * Note: only here because we implement Iterator interface. Will not
400 * work, don't bother calling it.
401 */
402 @Override
403 public void remove() {
404 throw new UnsupportedOperationException("Can not remove events from XMLEventReader.");
405 }
406
407 /**
408 * Method called when we are about to return <code>END_DOCUMENT</code> event.
409 * Usually this should change state to <code>STATE_END_OF_INPUT</code>, but
410 * may vary for some alternative read modes (like multi-document)
411 *
412 * @since 4.2
413 */
414 protected void updateStateEndDocument() throws XMLStreamException {
415 if (mCfgMultiDocMode) {
416 // As per [woodstox-core#42] should allow reading over multiple documents...
417 if (mReader.hasNext()) {
418 // Let's sanity-check that we get token we expect however:
419 int next = mReader.next();
420 if (next == START_DOCUMENT) {
421 mPrePeekEvent = START_DOCUMENT;
422 mPeekedEvent = createStartDocumentEvent();
423 mState = STATE_CONTENT;
424 return;
425 }
426 reportProblem("Unexpected token ("+ErrorConsts.tokenTypeDesc(next)
427 +") after END_DOCUMENT in multi-document mode, XMLStreamReader.hasNext() returning true");
428 }
429 }
430 mState = STATE_END_OF_INPUT;
431 }
432
433 /*
434 /**********************************************************************
435 /* XMLEventReader2 API
436 /**********************************************************************
437 */
438
439 /**
440 *<p>
441 * Note: although the interface allows implementations to
442 * throw an {@link XMLStreamException}, the reference implementation
443 * doesn't currently need to.
444 * It's still declared, in case in future there is need to throw
445 * such an exception.
446 */
447 @Override
448 public boolean hasNextEvent() throws XMLStreamException
449 {
450 return (mState != STATE_END_OF_INPUT);
451 }
452
453 /*
454 /**********************************************************************
455 /* Overridable factory methods
456 /**********************************************************************
457 */
458
459 protected XMLEvent createNextEvent(boolean checkEOD, int type)
460 throws XMLStreamException
461 {
462 try {
463 XMLEvent evt = mAllocator.allocate(mReader);
464 if (checkEOD && type == END_DOCUMENT) {
465 updateStateEndDocument();
466 }
467 return evt;
468 } catch (RuntimeException rex) {
469 throw _checkUnwrap(rex);
470 }
471 }
472
473 protected XMLStreamException _checkUnwrap(RuntimeException rex)
474 {
475 /* 29-Mar-2008, TSa: Due to some problems with Stax API
476 * (lack of 'throws XMLStreamException' in signature of
477 * XMLStreamReader.getText(), for one) it is possible
478 * we will get a wrapped XMLStreamException. If so,
479 * we should be able to unwrap it.
480 */
481 Throwable t = rex.getCause();
482 while (t != null) {
483 if (t instanceof XMLStreamException) {
484 return (XMLStreamException) t;
485 }
486 t = t.getCause();
487 }
488 // Nope, need to re-throw as is
489 throw rex;
490 }
491
492 /**
493 * Method called to create the very first event (START_DOCUMENT).
494 */
495 protected XMLEvent createStartDocumentEvent()
496 throws XMLStreamException
497 {
498 XMLEvent start = mAllocator.allocate(mReader);
499 return start;
500 }
501
502 /*
503 /**********************************************************************
504 /* Overridable error reporting methods
505 /**********************************************************************
506 */
507
508 // note: `private` before 4.2
509 protected void throwEndOfInput()
510 {
511 throw new NoSuchElementException();
512 }
513
514 protected void throwUnchecked(XMLStreamException sex)
515 {
516 // Wrapped root cause? Let's only unwrap one layer; one that
517 // must have been used to expose the problem (if any)
518 Throwable t = (sex.getNestedException() == null) ? sex : sex.getNestedException();
519 // Unchecked? Can re-throw as is
520 if (t instanceof RuntimeException) {
521 throw (RuntimeException) t;
522 }
523 if (t instanceof Error) {
524 throw (Error) t;
525 }
526 // Otherwise, let's just wrap it
527 throw new RuntimeException("[was "+t.getClass()+"] "+t.getMessage(), t);
528 }
529
530 protected void reportProblem(String msg)
531 throws XMLStreamException
532 {
533 reportProblem(msg, mReader.getLocation());
534 }
535
536 protected void reportProblem(String msg, Location loc)
537 throws XMLStreamException
538 {
539 if (loc == null) {
540 throw new WstxParsingException(msg);
541 }
542 throw new WstxParsingException(msg, loc);
543 }
544
545 /*
546 /**********************************************************************
547 /* Package methods for sub-classes
548 /**********************************************************************
549 */
550
551 protected XMLStreamReader getStreamReader()
552 {
553 return mReader;
554 }
555
556 /*
557 /**********************************************************************
558 /* Other internal methods
559 /**********************************************************************
560 */
561
562 // note: `private` before 4.2
563 /**
564 * Method used to locate error message description to use.
565 * Calls sub-classes <code>getErrorDesc()</code> first, and only
566 * if no message found, uses default messages defined here.
567 */
568 protected final String findErrorDesc(int errorType, int currEvent)
569 {
570 String msg = getErrorDesc(errorType, currEvent);
571 if (msg != null) {
572 return msg;
573 }
574 switch (errorType) {
575 case ERR_GETELEMTEXT_NOT_START_ELEM:
576 return "Current state not START_ELEMENT when calling getElementText()";
577 case ERR_GETELEMTEXT_NON_TEXT_EVENT:
578 return "Expected a text token";
579 case ERR_NEXTTAG_NON_WS_TEXT:
580 return "Only all-whitespace CHARACTERS/CDATA (or SPACE) allowed for nextTag()";
581 case ERR_NEXTTAG_WRONG_TYPE:
582 return "Should only encounter START_ELEMENT/END_ELEMENT, SPACE, or all-white-space CHARACTERS";
583 }
584
585 // should never happen, but it'd be bad to throw another exception...
586 return "Internal error (unrecognized error type: "+errorType+")";
587 }
588 }
589