1 /* Copyright (c) 2008 Google Inc.
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package org.yaml.snakeyaml.external.com.google.gdata.util.common.base;
17
18 /**
19 * A {@code UnicodeEscaper} that escapes some set of Java characters using the
20 * URI percent encoding scheme. The set of safe characters (those which remain
21 * unescaped) can be specified on construction.
22 *
23 * <p>
24 * For details on escaping URIs for use in web pages, see section 2.4 of <a
25 * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
26 *
27 * <p>
28 * In most cases this class should not need to be used directly. If you have no
29 * special requirements for escaping your URIs, you should use either
30 * {@link CharEscapers#uriEscaper()} or {@link CharEscapers#uriEscaper(boolean)}.
31 *
32 * <p>
33 * When encoding a String, the following rules apply:
34 * <ul>
35 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
36 * through "9" remain the same.
37 * <li>Any additionally specified safe characters remain the same.
38 * <li>If {@code plusForSpace} was specified, the space character " " is
39 * converted into a plus sign "+".
40 * <li>All other characters are converted into one or more bytes using UTF-8
41 * encoding and each byte is then represented by the 3-character string "%XY",
42 * where "XY" is the two-digit, uppercase, hexadecimal representation of the
43 * byte value.
44 * </ul>
45 *
46 * <p>
47 * RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
48 * "~", "*", "'", "(" and ")". It goes on to state:
49 *
50 * <p>
51 * <i>Unreserved characters can be escaped without changing the semantics of the
52 * URI, but this should not be done unless the URI is being used in a context
53 * that does not allow the unescaped character to appear.</i>
54 *
55 * <p>
56 * For performance reasons the only currently supported character encoding of
57 * this class is UTF-8.
58 *
59 * <p>
60 * <b>Note</b>: This escaper produces uppercase hexidecimal sequences. From <a
61 * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
62 * <i>"URI producers and normalizers should use uppercase hexadecimal digits for
63 * all percent-encodings."</i>
64 *
65 *
66 */
67 public class PercentEscaper extends UnicodeEscaper {
68 /**
69 * A string of safe characters that mimics the behavior of
70 * {@link java.net.URLEncoder}.
71 *
72 */
73 public static final String SAFECHARS_URLENCODER = "-_.*";
74
75 /**
76 * A string of characters that do not need to be encoded when used in URI
77 * path segments, as specified in RFC 3986. Note that some of these
78 * characters do need to be escaped when used in other parts of the URI.
79 */
80 public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
81
82 /**
83 * A string of characters that do not need to be encoded when used in URI
84 * query strings, as specified in RFC 3986. Note that some of these
85 * characters do need to be escaped when used in other parts of the URI.
86 */
87 public static final String SAFEQUERYSTRINGCHARS_URLENCODER = "-_.!~*'()@:$,;/?:";
88
89 // In some uri escapers spaces are escaped to '+'
90 private static final char[] URI_ESCAPED_SPACE = { '+' };
91
92 private static final char[] UPPER_HEX_DIGITS = "0123456789ABCDEF".toCharArray();
93
94 /**
95 * If true we should convert space to the {@code +} character.
96 */
97 private final boolean plusForSpace;
98
99 /**
100 * An array of flags where for any {@code char c} if {@code safeOctets[c]}
101 * is true then {@code c} should remain unmodified in the output. If
102 * {@code c > safeOctets.length} then it should be escaped.
103 */
104 private final boolean[] safeOctets;
105
106 /**
107 * Constructs a URI escaper with the specified safe characters and optional
108 * handling of the space character.
109 *
110 * @param safeChars
111 * a non null string specifying additional safe characters for
112 * this escaper (the ranges 0..9, a..z and A..Z are always safe
113 * and should not be specified here)
114 * @param plusForSpace
115 * true if ASCII space should be escaped to {@code +} rather than
116 * {@code %20}
117 * @throws IllegalArgumentException
118 * if any of the parameters were invalid
119 */
120 public PercentEscaper(String safeChars, boolean plusForSpace) {
121 // Avoid any misunderstandings about the behavior of this escaper
122 if (safeChars.matches(".*[0-9A-Za-z].*")) {
123 throw new IllegalArgumentException(
124 "Alphanumeric characters are always 'safe' and should not be "
125 + "explicitly specified");
126 }
127 // Avoid ambiguous parameters. Safe characters are never modified so if
128 // space is a safe character then setting plusForSpace is meaningless.
129 if (plusForSpace && safeChars.contains(" ")) {
130 throw new IllegalArgumentException(
131 "plusForSpace cannot be specified when space is a 'safe' character");
132 }
133 if (safeChars.contains("%")) {
134 throw new IllegalArgumentException("The '%' character cannot be specified as 'safe'");
135 }
136 this.plusForSpace = plusForSpace;
137 this.safeOctets = createSafeOctets(safeChars);
138 }
139
140 /**
141 * Creates a boolean[] with entries corresponding to the character values
142 * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
143 * is as small as is required to hold the given character information.
144 */
145 private static boolean[] createSafeOctets(String safeChars) {
146 int maxChar = 'z';
147 char[] safeCharArray = safeChars.toCharArray();
148 for (char c : safeCharArray) {
149 maxChar = Math.max(c, maxChar);
150 }
151 boolean[] octets = new boolean[maxChar + 1];
152 for (int c = '0'; c <= '9'; c++) {
153 octets[c] = true;
154 }
155 for (int c = 'A'; c <= 'Z'; c++) {
156 octets[c] = true;
157 }
158 for (int c = 'a'; c <= 'z'; c++) {
159 octets[c] = true;
160 }
161 for (char c : safeCharArray) {
162 octets[c] = true;
163 }
164 return octets;
165 }
166
167 /*
168 * Overridden for performance. For unescaped strings this improved the
169 * performance of the uri escaper from ~760ns to ~400ns as measured by
170 * {@link CharEscapersBenchmark}.
171 */
172 @Override
173 protected int nextEscapeIndex(CharSequence csq, int index, int end) {
174 for (; index < end; index++) {
175 char c = csq.charAt(index);
176 if (c >= safeOctets.length || !safeOctets[c]) {
177 break;
178 }
179 }
180 return index;
181 }
182
183 /*
184 * Overridden for performance. For unescaped strings this improved the
185 * performance of the uri escaper from ~400ns to ~170ns as measured by
186 * {@link CharEscapersBenchmark}.
187 */
188 @Override
189 public String escape(String s) {
190 int slen = s.length();
191 for (int index = 0; index < slen; index++) {
192 char c = s.charAt(index);
193 if (c >= safeOctets.length || !safeOctets[c]) {
194 return escapeSlow(s, index);
195 }
196 }
197 return s;
198 }
199
200 /**
201 * Escapes the given Unicode code point in UTF-8.
202 */
203 @Override
204 protected char[] escape(int cp) {
205 // We should never get negative values here but if we do it will throw
206 // an
207 // IndexOutOfBoundsException, so at least it will get spotted.
208 if (cp < safeOctets.length && safeOctets[cp]) {
209 return null;
210 } else if (cp == ' ' && plusForSpace) {
211 return URI_ESCAPED_SPACE;
212 } else if (cp <= 0x7F) {
213 // Single byte UTF-8 characters
214 // Start with "%--" and fill in the blanks
215 char[] dest = new char[3];
216 dest[0] = '%';
217 dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
218 dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
219 return dest;
220 } else if (cp <= 0x7ff) {
221 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
222 // Start with "%--%--" and fill in the blanks
223 char[] dest = new char[6];
224 dest[0] = '%';
225 dest[3] = '%';
226 dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
227 cp >>>= 4;
228 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
229 cp >>>= 2;
230 dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
231 cp >>>= 4;
232 dest[1] = UPPER_HEX_DIGITS[0xC | cp];
233 return dest;
234 } else if (cp <= 0xffff) {
235 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
236 // Start with "%E-%--%--" and fill in the blanks
237 char[] dest = new char[9];
238 dest[0] = '%';
239 dest[1] = 'E';
240 dest[3] = '%';
241 dest[6] = '%';
242 dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
243 cp >>>= 4;
244 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
245 cp >>>= 2;
246 dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
247 cp >>>= 4;
248 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
249 cp >>>= 2;
250 dest[2] = UPPER_HEX_DIGITS[cp];
251 return dest;
252 } else if (cp <= 0x10ffff) {
253 char[] dest = new char[12];
254 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
255 // Start with "%F-%--%--%--" and fill in the blanks
256 dest[0] = '%';
257 dest[1] = 'F';
258 dest[3] = '%';
259 dest[6] = '%';
260 dest[9] = '%';
261 dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
262 cp >>>= 4;
263 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
264 cp >>>= 2;
265 dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
266 cp >>>= 4;
267 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
268 cp >>>= 2;
269 dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
270 cp >>>= 4;
271 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
272 cp >>>= 2;
273 dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
274 return dest;
275 } else {
276 // If this ever happens it is due to bug in UnicodeEscaper, not bad
277 // input.
278 throw new IllegalArgumentException("Invalid unicode character value " + cp);
279 }
280 }
281 }
282