1 /*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
3 *
4 * Copyright (c) 1997-2017 Oracle and/or its affiliates. All rights reserved.
5 *
6 * The contents of this file are subject to the terms of either the GNU
7 * General Public License Version 2 only ("GPL") or the Common Development
8 * and Distribution License("CDDL") (collectively, the "License"). You
9 * may not use this file except in compliance with the License. You can
10 * obtain a copy of the License at
11 * https://oss.oracle.com/licenses/CDDL+GPL-1.1
12 * or LICENSE.txt. See the License for the specific
13 * language governing permissions and limitations under the License.
14 *
15 * When distributing the software, include this License Header Notice in each
16 * file and include the License file at LICENSE.txt.
17 *
18 * GPL Classpath Exception:
19 * Oracle designates this particular file as subject to the "Classpath"
20 * exception as provided by Oracle in the GPL Version 2 section of the License
21 * file that accompanied this code.
22 *
23 * Modifications:
24 * If applicable, add the following below the License Header, with the fields
25 * enclosed by brackets [] replaced by your own identifying information:
26 * "Portions Copyright [year] [name of copyright owner]"
27 *
28 * Contributor(s):
29 * If you wish your version of this file to be governed by only the CDDL or
30 * only the GPL Version 2, indicate your decision by adding "[Contributor]
31 * elects to include this software in this distribution under the [CDDL or GPL
32 * Version 2] license." If you don't indicate a single choice of license, a
33 * recipient has the option to distribute your version of this file under
34 * either the CDDL, the GPL Version 2 or to extend the choice of license to
35 * its licensees as provided above. However, if you add GPL Version 2 code
36 * and therefore, elected the GPL Version 2 license, then the option applies
37 * only if the new code is made subject to such option by the copyright
38 * holder.
39 */
40
41 package javax.mail.internet;
42
43 import java.util.*;
44
45 /**
46 * This class tokenizes RFC822 and MIME headers into the basic
47 * symbols specified by RFC822 and MIME. <p>
48 *
49 * This class handles folded headers (ie headers with embedded
50 * CRLF SPACE sequences). The folds are removed in the returned
51 * tokens.
52 *
53 * @author John Mani
54 * @author Bill Shannon
55 */
56
57 public class HeaderTokenizer {
58
59 /**
60 * The Token class represents tokens returned by the
61 * HeaderTokenizer.
62 */
63 public static class Token {
64
65 private int type;
66 private String value;
67
68 /**
69 * Token type indicating an ATOM.
70 */
71 public static final int ATOM = -1;
72
73 /**
74 * Token type indicating a quoted string. The value
75 * field contains the string without the quotes.
76 */
77 public static final int QUOTEDSTRING = -2;
78
79 /**
80 * Token type indicating a comment. The value field
81 * contains the comment string without the comment
82 * start and end symbols.
83 */
84 public static final int COMMENT = -3;
85
86 /**
87 * Token type indicating end of input.
88 */
89 public static final int EOF = -4;
90
91 /**
92 * Constructor.
93 * @param type Token type
94 * @param value Token value
95 */
96 public Token(int type, String value) {
97 this.type = type;
98 this.value = value;
99 }
100
101 /**
102 * Return the type of the token. If the token represents a
103 * delimiter or a control character, the type is that character
104 * itself, converted to an integer. Otherwise, it's value is
105 * one of the following:
106 * <ul>
107 * <li><code>ATOM</code> A sequence of ASCII characters
108 * delimited by either SPACE, CTL, "(", <"> or the
109 * specified SPECIALS
110 * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
111 * within quotes
112 * <li><code>COMMENT</code> A sequence of ASCII characters
113 * within "(" and ")".
114 * <li><code>EOF</code> End of header
115 * </ul>
116 *
117 * @return the token type
118 */
119 public int getType() {
120 return type;
121 }
122
123 /**
124 * Returns the value of the token just read. When the current
125 * token is a quoted string, this field contains the body of the
126 * string, without the quotes. When the current token is a comment,
127 * this field contains the body of the comment.
128 *
129 * @return token value
130 */
131 public String getValue() {
132 return value;
133 }
134 }
135
136 private String string; // the string to be tokenized
137 private boolean skipComments; // should comments be skipped ?
138 private String delimiters; // delimiter string
139 private int currentPos; // current parse position
140 private int maxPos; // string length
141 private int nextPos; // track start of next Token for next()
142 private int peekPos; // track start of next Token for peek()
143
144 /**
145 * RFC822 specials
146 */
147 public final static String RFC822 = "()<>@,;:\\\"\t .[]";
148
149 /**
150 * MIME specials
151 */
152 public final static String MIME = "()<>@,;:\\\"\t []/?=";
153
154 // The EOF Token
155 private final static Token EOFToken = new Token(Token.EOF, null);
156
157 /**
158 * Constructor that takes a rfc822 style header.
159 *
160 * @param header The rfc822 header to be tokenized
161 * @param delimiters Set of delimiter characters
162 * to be used to delimit ATOMS. These
163 * are usually <code>RFC822</code> or
164 * <code>MIME</code>
165 * @param skipComments If true, comments are skipped and
166 * not returned as tokens
167 */
168 public HeaderTokenizer(String header, String delimiters,
169 boolean skipComments) {
170 string = (header == null) ? "" : header; // paranoia ?!
171 this.skipComments = skipComments;
172 this.delimiters = delimiters;
173 currentPos = nextPos = peekPos = 0;
174 maxPos = string.length();
175 }
176
177 /**
178 * Constructor. Comments are ignored and not returned as tokens
179 *
180 * @param header The header that is tokenized
181 * @param delimiters The delimiters to be used
182 */
183 public HeaderTokenizer(String header, String delimiters) {
184 this(header, delimiters, true);
185 }
186
187 /**
188 * Constructor. The RFC822 defined delimiters - RFC822 - are
189 * used to delimit ATOMS. Also comments are skipped and not
190 * returned as tokens
191 *
192 * @param header the header string
193 */
194 public HeaderTokenizer(String header) {
195 this(header, RFC822);
196 }
197
198 /**
199 * Parses the next token from this String. <p>
200 *
201 * Clients sit in a loop calling next() to parse successive
202 * tokens until an EOF Token is returned.
203 *
204 * @return the next Token
205 * @exception ParseException if the parse fails
206 */
207 public Token next() throws ParseException {
208 return next('\0', false);
209 }
210
211 /**
212 * Parses the next token from this String.
213 * If endOfAtom is not NUL, the token extends until the
214 * endOfAtom character is seen, or to the end of the header.
215 * This method is useful when parsing headers that don't
216 * obey the MIME specification, e.g., by failing to quote
217 * parameter values that contain spaces.
218 *
219 * @param endOfAtom if not NUL, character marking end of token
220 * @return the next Token
221 * @exception ParseException if the parse fails
222 * @since JavaMail 1.5
223 */
224 public Token next(char endOfAtom) throws ParseException {
225 return next(endOfAtom, false);
226 }
227
228 /**
229 * Parses the next token from this String.
230 * endOfAtom is handled as above. If keepEscapes is true,
231 * any backslash escapes are preserved in the returned string.
232 * This method is useful when parsing headers that don't
233 * obey the MIME specification, e.g., by failing to escape
234 * backslashes in the filename parameter.
235 *
236 * @param endOfAtom if not NUL, character marking end of token
237 * @param keepEscapes keep all backslashes in returned string?
238 * @return the next Token
239 * @exception ParseException if the parse fails
240 * @since JavaMail 1.5
241 */
242 public Token next(char endOfAtom, boolean keepEscapes)
243 throws ParseException {
244 Token tk;
245
246 currentPos = nextPos; // setup currentPos
247 tk = getNext(endOfAtom, keepEscapes);
248 nextPos = peekPos = currentPos; // update currentPos and peekPos
249 return tk;
250 }
251
252 /**
253 * Peek at the next token, without actually removing the token
254 * from the parse stream. Invoking this method multiple times
255 * will return successive tokens, until <code>next()</code> is
256 * called. <p>
257 *
258 * @return the next Token
259 * @exception ParseException if the parse fails
260 */
261 public Token peek() throws ParseException {
262 Token tk;
263
264 currentPos = peekPos; // setup currentPos
265 tk = getNext('\0', false);
266 peekPos = currentPos; // update peekPos
267 return tk;
268 }
269
270 /**
271 * Return the rest of the Header.
272 *
273 * @return String rest of header. null is returned if we are
274 * already at end of header
275 */
276 public String getRemainder() {
277 if (nextPos >= string.length())
278 return null;
279 return string.substring(nextPos);
280 }
281
282 /*
283 * Return the next token starting from 'currentPos'. After the
284 * parse, 'currentPos' is updated to point to the start of the
285 * next token.
286 */
287 private Token getNext(char endOfAtom, boolean keepEscapes)
288 throws ParseException {
289 // If we're already at end of string, return EOF
290 if (currentPos >= maxPos)
291 return EOFToken;
292
293 // Skip white-space, position currentPos beyond the space
294 if (skipWhiteSpace() == Token.EOF)
295 return EOFToken;
296
297 char c;
298 int start;
299 boolean filter = false;
300
301 c = string.charAt(currentPos);
302
303 // Check or Skip comments and position currentPos
304 // beyond the comment
305 while (c == '(') {
306 // Parsing comment ..
307 int nesting;
308 for (start = ++currentPos, nesting = 1;
309 nesting > 0 && currentPos < maxPos;
310 currentPos++) {
311 c = string.charAt(currentPos);
312 if (c == '\\') { // Escape sequence
313 currentPos++; // skip the escaped character
314 filter = true;
315 } else if (c == '\r')
316 filter = true;
317 else if (c == '(')
318 nesting++;
319 else if (c == ')')
320 nesting--;
321 }
322 if (nesting != 0)
323 throw new ParseException("Unbalanced comments");
324
325 if (!skipComments) {
326 // Return the comment, if we are asked to.
327 // Note that the comment start & end markers are ignored.
328 String s;
329 if (filter) // need to go thru the token again.
330 s = filterToken(string, start, currentPos-1, keepEscapes);
331 else
332 s = string.substring(start,currentPos-1);
333
334 return new Token(Token.COMMENT, s);
335 }
336
337 // Skip any whitespace after the comment.
338 if (skipWhiteSpace() == Token.EOF)
339 return EOFToken;
340 c = string.charAt(currentPos);
341 }
342
343 // Check for quoted-string and position currentPos
344 // beyond the terminating quote
345 if (c == '"') {
346 currentPos++; // skip initial quote
347 return collectString('"', keepEscapes);
348 }
349
350 // Check for SPECIAL or CTL
351 if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
352 if (endOfAtom > 0 && c != endOfAtom) {
353 // not expecting a special character here,
354 // pretend it's a quoted string
355 return collectString(endOfAtom, keepEscapes);
356 }
357 currentPos++; // re-position currentPos
358 char ch[] = new char[1];
359 ch[0] = c;
360 return new Token((int)c, new String(ch));
361 }
362
363 // Check for ATOM
364 for (start = currentPos; currentPos < maxPos; currentPos++) {
365 c = string.charAt(currentPos);
366 // ATOM is delimited by either SPACE, CTL, "(", <">
367 // or the specified SPECIALS
368 if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
369 c == '"' || delimiters.indexOf(c) >= 0) {
370 if (endOfAtom > 0 && c != endOfAtom) {
371 // not the expected atom after all;
372 // back up and pretend it's a quoted string
373 currentPos = start;
374 return collectString(endOfAtom, keepEscapes);
375 }
376 break;
377 }
378 }
379 return new Token(Token.ATOM, string.substring(start, currentPos));
380 }
381
382 private Token collectString(char eos, boolean keepEscapes)
383 throws ParseException {
384 int start;
385 boolean filter = false;
386 for (start = currentPos; currentPos < maxPos; currentPos++) {
387 char c = string.charAt(currentPos);
388 if (c == '\\') { // Escape sequence
389 currentPos++;
390 filter = true;
391 } else if (c == '\r')
392 filter = true;
393 else if (c == eos) {
394 currentPos++;
395 String s;
396
397 if (filter)
398 s = filterToken(string, start, currentPos-1, keepEscapes);
399 else
400 s = string.substring(start, currentPos-1);
401
402 if (c != '"') { // not a real quoted string
403 s = trimWhiteSpace(s);
404 currentPos--; // back up before the eos char
405 }
406
407 return new Token(Token.QUOTEDSTRING, s);
408 }
409 }
410
411 // ran off the end of the string
412
413 // if we're looking for a matching quote, that's an error
414 if (eos == '"')
415 throw new ParseException("Unbalanced quoted string");
416
417 // otherwise, just return whatever's left
418 String s;
419 if (filter)
420 s = filterToken(string, start, currentPos, keepEscapes);
421 else
422 s = string.substring(start, currentPos);
423 s = trimWhiteSpace(s);
424 return new Token(Token.QUOTEDSTRING, s);
425 }
426
427 // Skip SPACE, HT, CR and NL
428 private int skipWhiteSpace() {
429 char c;
430 for (; currentPos < maxPos; currentPos++)
431 if (((c = string.charAt(currentPos)) != ' ') &&
432 (c != '\t') && (c != '\r') && (c != '\n'))
433 return currentPos;
434 return Token.EOF;
435 }
436
437 // Trim SPACE, HT, CR and NL from end of string
438 private static String trimWhiteSpace(String s) {
439 char c;
440 int i;
441 for (i = s.length() - 1; i >= 0; i--) {
442 if (((c = s.charAt(i)) != ' ') &&
443 (c != '\t') && (c != '\r') && (c != '\n'))
444 break;
445 }
446 if (i <= 0)
447 return "";
448 else
449 return s.substring(0, i + 1);
450 }
451
452 /* Process escape sequences and embedded LWSPs from a comment or
453 * quoted string.
454 */
455 private static String filterToken(String s, int start, int end,
456 boolean keepEscapes) {
457 StringBuffer sb = new StringBuffer();
458 char c;
459 boolean gotEscape = false;
460 boolean gotCR = false;
461
462 for (int i = start; i < end; i++) {
463 c = s.charAt(i);
464 if (c == '\n' && gotCR) {
465 // This LF is part of an unescaped
466 // CRLF sequence (i.e, LWSP). Skip it.
467 gotCR = false;
468 continue;
469 }
470
471 gotCR = false;
472 if (!gotEscape) {
473 // Previous character was NOT '\'
474 if (c == '\\') // skip this character
475 gotEscape = true;
476 else if (c == '\r') // skip this character
477 gotCR = true;
478 else // append this character
479 sb.append(c);
480 } else {
481 // Previous character was '\'. So no need to
482 // bother with any special processing, just
483 // append this character. If keepEscapes is
484 // set, keep the backslash. IE6 fails to escape
485 // backslashes in quoted strings in HTTP headers,
486 // e.g., in the filename parameter.
487 if (keepEscapes)
488 sb.append('\\');
489 sb.append(c);
490 gotEscape = false;
491 }
492 }
493 return sb.toString();
494 }
495 }
496