1 /* Copyright (c) 2008 Google Inc.
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package org.yaml.snakeyaml.external.com.google.gdata.util.common.base;
17
18 import java.io.IOException;
19
20 /**
21 * An {@link Escaper} that converts literal text into a format safe for
22 * inclusion in a particular context (such as an XML document). Typically (but
23 * not always), the inverse process of "unescaping" the text is performed
24 * automatically by the relevant parser.
25 *
26 * <p>
27 * For example, an XML escaper would convert the literal string
28 * {@code "Foo<Bar>"} into {@code "Foo<Bar>"} to prevent {@code "<Bar>"}
29 * from being confused with an XML tag. When the resulting XML document is
30 * parsed, the parser API will return this text as the original literal string
31 * {@code "Foo<Bar>"}.
32 *
33 * <p>
34 * <b>Note:</b> This class is similar to {@link CharEscaper} but with one very
35 * important difference. A CharEscaper can only process Java <a
36 * href="http://en.wikipedia.org/wiki/UTF-16">UTF16</a> characters in isolation
37 * and may not cope when it encounters surrogate pairs. This class facilitates
38 * the correct escaping of all Unicode characters.
39 *
40 * <p>
41 * As there are important reasons, including potential security issues, to
42 * handle Unicode correctly if you are considering implementing a new escaper
43 * you should favor using UnicodeEscaper wherever possible.
44 *
45 * <p>
46 * A {@code UnicodeEscaper} instance is required to be stateless, and safe when
47 * used concurrently by multiple threads.
48 *
49 * <p>
50 * Several popular escapers are defined as constants in the class
51 * {@link CharEscapers}. To create your own escapers extend this class and
52 * implement the {@link #escape(int)} method.
53 *
54 *
55 */
56 public abstract class UnicodeEscaper implements Escaper {
57 /** The amount of padding (chars) to use when growing the escape buffer. */
58 private static final int DEST_PAD = 32;
59
60 /**
61 * Returns the escaped form of the given Unicode code point, or {@code null}
62 * if this code point does not need to be escaped. When called as part of an
63 * escaping operation, the given code point is guaranteed to be in the range
64 * {@code 0 <= cp <= Character#MAX_CODE_POINT}.
65 *
66 * <p>
67 * If an empty array is returned, this effectively strips the input
68 * character from the resulting text.
69 *
70 * <p>
71 * If the character does not need to be escaped, this method should return
72 * {@code null}, rather than an array containing the character
73 * representation of the code point. This enables the escaping algorithm to
74 * perform more efficiently.
75 *
76 * <p>
77 * If the implementation of this method cannot correctly handle a particular
78 * code point then it should either throw an appropriate runtime exception
79 * or return a suitable replacement character. It must never silently
80 * discard invalid input as this may constitute a security risk.
81 *
82 * @param cp
83 * the Unicode code point to escape if necessary
84 * @return the replacement characters, or {@code null} if no escaping was
85 * needed
86 */
87 protected abstract char[] escape(int cp);
88
89 /**
90 * Scans a sub-sequence of characters from a given {@link CharSequence},
91 * returning the index of the next character that requires escaping.
92 *
93 * <p>
94 * <b>Note:</b> When implementing an escaper, it is a good idea to override
95 * this method for efficiency. The base class implementation determines
96 * successive Unicode code points and invokes {@link #escape(int)} for each
97 * of them. If the semantics of your escaper are such that code points in
98 * the supplementary range are either all escaped or all unescaped, this
99 * method can be implemented more efficiently using
100 * {@link CharSequence#charAt(int)}.
101 *
102 * <p>
103 * Note however that if your escaper does not escape characters in the
104 * supplementary range, you should either continue to validate the
105 * correctness of any surrogate characters encountered or provide a clear
106 * warning to users that your escaper does not validate its input.
107 *
108 * <p>
109 * See {@link PercentEscaper} for an example.
110 *
111 * @param csq
112 * a sequence of characters
113 * @param start
114 * the index of the first character to be scanned
115 * @param end
116 * the index immediately after the last character to be scanned
117 * @throws IllegalArgumentException
118 * if the scanned sub-sequence of {@code csq} contains invalid
119 * surrogate pairs
120 */
121 protected int nextEscapeIndex(CharSequence csq, int start, int end) {
122 int index = start;
123 while (index < end) {
124 int cp = codePointAt(csq, index, end);
125 if (cp < 0 || escape(cp) != null) {
126 break;
127 }
128 index += Character.isSupplementaryCodePoint(cp) ? 2 : 1;
129 }
130 return index;
131 }
132
133 /**
134 * Returns the escaped form of a given literal string.
135 *
136 * <p>
137 * If you are escaping input in arbitrary successive chunks, then it is not
138 * generally safe to use this method. If an input string ends with an
139 * unmatched high surrogate character, then this method will throw
140 * {@link IllegalArgumentException}. You should either ensure your input is
141 * valid <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a> before
142 * calling this method or use an escaped {@link Appendable} (as returned by
143 * {@link #escape(Appendable)}) which can cope with arbitrarily split input.
144 *
145 * <p>
146 * <b>Note:</b> When implementing an escaper it is a good idea to override
147 * this method for efficiency by inlining the implementation of
148 * {@link #nextEscapeIndex(CharSequence, int, int)} directly. Doing this for
149 * {@link PercentEscaper} more than doubled the performance for unescaped
150 * strings (as measured by {@link CharEscapersBenchmark}).
151 *
152 * @param string
153 * the literal string to be escaped
154 * @return the escaped form of {@code string}
155 * @throws NullPointerException
156 * if {@code string} is null
157 * @throws IllegalArgumentException
158 * if invalid surrogate characters are encountered
159 */
160 public String escape(String string) {
161 int end = string.length();
162 int index = nextEscapeIndex(string, 0, end);
163 return index == end ? string : escapeSlow(string, index);
164 }
165
166 /**
167 * Returns the escaped form of a given literal string, starting at the given
168 * index. This method is called by the {@link #escape(String)} method when
169 * it discovers that escaping is required. It is protected to allow
170 * subclasses to override the fastpath escaping function to inline their
171 * escaping test. See {@link CharEscaperBuilder} for an example usage.
172 *
173 * <p>
174 * This method is not reentrant and may only be invoked by the top level
175 * {@link #escape(String)} method.
176 *
177 * @param s
178 * the literal string to be escaped
179 * @param index
180 * the index to start escaping from
181 * @return the escaped form of {@code string}
182 * @throws NullPointerException
183 * if {@code string} is null
184 * @throws IllegalArgumentException
185 * if invalid surrogate characters are encountered
186 */
187 protected final String escapeSlow(String s, int index) {
188 int end = s.length();
189
190 // Get a destination buffer and setup some loop variables.
191 char[] dest = DEST_TL.get();
192 int destIndex = 0;
193 int unescapedChunkStart = 0;
194
195 while (index < end) {
196 int cp = codePointAt(s, index, end);
197 if (cp < 0) {
198 throw new IllegalArgumentException("Trailing high surrogate at end of input");
199 }
200 char[] escaped = escape(cp);
201 if (escaped != null) {
202 int charsSkipped = index - unescapedChunkStart;
203
204 // This is the size needed to add the replacement, not the full
205 // size needed by the string. We only regrow when we absolutely
206 // must.
207 int sizeNeeded = destIndex + charsSkipped + escaped.length;
208 if (dest.length < sizeNeeded) {
209 int destLength = sizeNeeded + (end - index) + DEST_PAD;
210 dest = growBuffer(dest, destIndex, destLength);
211 }
212 // If we have skipped any characters, we need to copy them now.
213 if (charsSkipped > 0) {
214 s.getChars(unescapedChunkStart, index, dest, destIndex);
215 destIndex += charsSkipped;
216 }
217 if (escaped.length > 0) {
218 System.arraycopy(escaped, 0, dest, destIndex, escaped.length);
219 destIndex += escaped.length;
220 }
221 }
222 unescapedChunkStart = index + (Character.isSupplementaryCodePoint(cp) ? 2 : 1);
223 index = nextEscapeIndex(s, unescapedChunkStart, end);
224 }
225
226 // Process trailing unescaped characters - no need to account for
227 // escaped
228 // length or padding the allocation.
229 int charsSkipped = end - unescapedChunkStart;
230 if (charsSkipped > 0) {
231 int endIndex = destIndex + charsSkipped;
232 if (dest.length < endIndex) {
233 dest = growBuffer(dest, destIndex, endIndex);
234 }
235 s.getChars(unescapedChunkStart, end, dest, destIndex);
236 destIndex = endIndex;
237 }
238 return new String(dest, 0, destIndex);
239 }
240
241 /**
242 * Returns an {@code Appendable} instance which automatically escapes all
243 * text appended to it before passing the resulting text to an underlying
244 * {@code Appendable}.
245 *
246 * <p>
247 * Unlike {@link #escape(String)} it is permitted to append arbitrarily
248 * split input to this Appendable, including input that is split over a
249 * surrogate pair. In this case the pending high surrogate character will
250 * not be processed until the corresponding low surrogate is appended. This
251 * means that a trailing high surrogate character at the end of the input
252 * cannot be detected and will be silently ignored. This is unavoidable
253 * since the Appendable interface has no {@code close()} method, and it is
254 * impossible to determine when the last characters have been appended.
255 *
256 * <p>
257 * The methods of the returned object will propagate any exceptions thrown
258 * by the underlying {@code Appendable}.
259 *
260 * <p>
261 * For well formed <a href="http://en.wikipedia.org/wiki/UTF-16">UTF-16</a>
262 * the escaping behavior is identical to that of {@link #escape(String)} and
263 * the following code is equivalent to (but much slower than)
264 * {@code escaper.escape(string)}:
265 *
266 * <pre>
267 * {
268 * @code
269 * StringBuilder sb = new StringBuilder();
270 * escaper.escape(sb).append(string);
271 * return sb.toString();
272 * }
273 * </pre>
274 *
275 * @param out
276 * the underlying {@code Appendable} to append escaped output to
277 * @return an {@code Appendable} which passes text to {@code out} after
278 * escaping it
279 * @throws NullPointerException
280 * if {@code out} is null
281 * @throws IllegalArgumentException
282 * if invalid surrogate characters are encountered
283 *
284 */
285 public Appendable escape(final Appendable out) {
286 assert out != null;
287
288 return new Appendable() {
289 int pendingHighSurrogate = -1;
290 char[] decodedChars = new char[2];
291
292 public Appendable append(CharSequence csq) throws IOException {
293 return append(csq, 0, csq.length());
294 }
295
296 public Appendable append(CharSequence csq, int start, int end) throws IOException {
297 int index = start;
298 if (index < end) {
299 // This is a little subtle: index must never reference the
300 // middle of a
301 // surrogate pair but unescapedChunkStart can. The first
302 // time we enter
303 // the loop below it is possible that index !=
304 // unescapedChunkStart.
305 int unescapedChunkStart = index;
306 if (pendingHighSurrogate != -1) {
307 // Our last append operation ended halfway through a
308 // surrogate pair
309 // so we have to do some extra work first.
310 char c = csq.charAt(index++);
311 if (!Character.isLowSurrogate(c)) {
312 throw new IllegalArgumentException(
313 "Expected low surrogate character but got " + c);
314 }
315 char[] escaped = escape(Character.toCodePoint((char) pendingHighSurrogate,
316 c));
317 if (escaped != null) {
318 // Emit the escaped character and adjust
319 // unescapedChunkStart to
320 // skip the low surrogate we have consumed.
321 outputChars(escaped, escaped.length);
322 unescapedChunkStart += 1;
323 } else {
324 // Emit pending high surrogate (unescaped) but do
325 // not modify
326 // unescapedChunkStart as we must still emit the low
327 // surrogate.
328 out.append((char) pendingHighSurrogate);
329 }
330 pendingHighSurrogate = -1;
331 }
332 while (true) {
333 // Find and append the next subsequence of unescaped
334 // characters.
335 index = nextEscapeIndex(csq, index, end);
336 if (index > unescapedChunkStart) {
337 out.append(csq, unescapedChunkStart, index);
338 }
339 if (index == end) {
340 break;
341 }
342 // If we are not finished, calculate the next code
343 // point.
344 int cp = codePointAt(csq, index, end);
345 if (cp < 0) {
346 // Our sequence ended half way through a surrogate
347 // pair so just
348 // record the state and exit.
349 pendingHighSurrogate = -cp;
350 break;
351 }
352 // Escape the code point and output the characters.
353 char[] escaped = escape(cp);
354 if (escaped != null) {
355 outputChars(escaped, escaped.length);
356 } else {
357 // This shouldn't really happen if nextEscapeIndex
358 // is correct but
359 // we should cope with false positives.
360 int len = Character.toChars(cp, decodedChars, 0);
361 outputChars(decodedChars, len);
362 }
363 // Update our index past the escaped character and
364 // continue.
365 index += (Character.isSupplementaryCodePoint(cp) ? 2 : 1);
366 unescapedChunkStart = index;
367 }
368 }
369 return this;
370 }
371
372 public Appendable append(char c) throws IOException {
373 if (pendingHighSurrogate != -1) {
374 // Our last append operation ended halfway through a
375 // surrogate pair
376 // so we have to do some extra work first.
377 if (!Character.isLowSurrogate(c)) {
378 throw new IllegalArgumentException(
379 "Expected low surrogate character but got '" + c + "' with value "
380 + (int) c);
381 }
382 char[] escaped = escape(Character.toCodePoint((char) pendingHighSurrogate, c));
383 if (escaped != null) {
384 outputChars(escaped, escaped.length);
385 } else {
386 out.append((char) pendingHighSurrogate);
387 out.append(c);
388 }
389 pendingHighSurrogate = -1;
390 } else if (Character.isHighSurrogate(c)) {
391 // This is the start of a (split) surrogate pair.
392 pendingHighSurrogate = c;
393 } else {
394 if (Character.isLowSurrogate(c)) {
395 throw new IllegalArgumentException("Unexpected low surrogate character '"
396 + c + "' with value " + (int) c);
397 }
398 // This is a normal (non surrogate) char.
399 char[] escaped = escape(c);
400 if (escaped != null) {
401 outputChars(escaped, escaped.length);
402 } else {
403 out.append(c);
404 }
405 }
406 return this;
407 }
408
409 private void outputChars(char[] chars, int len) throws IOException {
410 for (int n = 0; n < len; n++) {
411 out.append(chars[n]);
412 }
413 }
414 };
415 }
416
417 /**
418 * Returns the Unicode code point of the character at the given index.
419 *
420 * <p>
421 * Unlike {@link Character#codePointAt(CharSequence, int)} or
422 * {@link String#codePointAt(int)} this method will never fail silently when
423 * encountering an invalid surrogate pair.
424 *
425 * <p>
426 * The behaviour of this method is as follows:
427 * <ol>
428 * <li>If {@code index >= end}, {@link IndexOutOfBoundsException} is thrown.
429 * <li><b>If the character at the specified index is not a surrogate, it is
430 * returned.</b>
431 * <li>If the first character was a high surrogate value, then an attempt is
432 * made to read the next character.
433 * <ol>
434 * <li><b>If the end of the sequence was reached, the negated value of the
435 * trailing high surrogate is returned.</b>
436 * <li><b>If the next character was a valid low surrogate, the code point
437 * value of the high/low surrogate pair is returned.</b>
438 * <li>If the next character was not a low surrogate value, then
439 * {@link IllegalArgumentException} is thrown.
440 * </ol>
441 * <li>If the first character was a low surrogate value,
442 * {@link IllegalArgumentException} is thrown.
443 * </ol>
444 *
445 * @param seq
446 * the sequence of characters from which to decode the code point
447 * @param index
448 * the index of the first character to decode
449 * @param end
450 * the index beyond the last valid character to decode
451 * @return the Unicode code point for the given index or the negated value
452 * of the trailing high surrogate character at the end of the
453 * sequence
454 */
455 protected static final int codePointAt(CharSequence seq, int index, int end) {
456 if (index < end) {
457 char c1 = seq.charAt(index++);
458 if (c1 < Character.MIN_HIGH_SURROGATE || c1 > Character.MAX_LOW_SURROGATE) {
459 // Fast path (first test is probably all we need to do)
460 return c1;
461 } else if (c1 <= Character.MAX_HIGH_SURROGATE) {
462 // If the high surrogate was the last character, return its
463 // inverse
464 if (index == end) {
465 return -c1;
466 }
467 // Otherwise look for the low surrogate following it
468 char c2 = seq.charAt(index);
469 if (Character.isLowSurrogate(c2)) {
470 return Character.toCodePoint(c1, c2);
471 }
472 throw new IllegalArgumentException("Expected low surrogate but got char '" + c2
473 + "' with value " + (int) c2 + " at index " + index);
474 } else {
475 throw new IllegalArgumentException("Unexpected low surrogate character '" + c1
476 + "' with value " + (int) c1 + " at index " + (index - 1));
477 }
478 }
479 throw new IndexOutOfBoundsException("Index exceeds specified range");
480 }
481
482 /**
483 * Helper method to grow the character buffer as needed, this only happens
484 * once in a while so it's ok if it's in a method call. If the index passed
485 * in is 0 then no copying will be done.
486 */
487 private static final char[] growBuffer(char[] dest, int index, int size) {
488 char[] copy = new char[size];
489 if (index > 0) {
490 System.arraycopy(dest, 0, copy, 0, index);
491 }
492 return copy;
493 }
494
495 /**
496 * A thread-local destination buffer to keep us from creating new buffers.
497 * The starting size is 1024 characters. If we grow past this we don't put
498 * it back in the threadlocal, we just keep going and grow as needed.
499 */
500 private static final ThreadLocal<char[]> DEST_TL = new ThreadLocal<char[]>() {
501 @Override
502 protected char[] initialValue() {
503 return new char[1024];
504 }
505 };
506 }
507