/* Copyright (c) 2008 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 |
|
---|
16 |
|
---|
17 | package com.google.gdata.util.common.base;
|
---|
18 |
|
---|
19 | /**
|
---|
20 | * A {@code UnicodeEscaper} that escapes some set of Java characters using
|
---|
21 | * the URI percent encoding scheme. The set of safe characters (those which
|
---|
22 | * remain unescaped) can be specified on construction.
|
---|
23 | *
|
---|
24 | * <p>For details on escaping URIs for use in web pages, see section 2.4 of
|
---|
25 | * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
|
---|
26 | *
|
---|
27 | * <p>In most cases this class should not need to be used directly. If you
|
---|
28 | * have no special requirements for escaping your URIs, you should use either
|
---|
29 | * {@link CharEscapers#uriEscaper()} or
|
---|
30 | * {@link CharEscapers#uriEscaper(boolean)}.
|
---|
31 | *
|
---|
32 | * <p>When encoding a String, the following rules apply:
|
---|
33 | * <ul>
|
---|
34 | * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
|
---|
35 | * through "9" remain the same.
|
---|
36 | * <li>Any additionally specified safe characters remain the same.
|
---|
37 | * <li>If {@code plusForSpace} was specified, the space character " " is
|
---|
38 | * converted into a plus sign "+".
|
---|
39 | * <li>All other characters are converted into one or more bytes using UTF-8
|
---|
40 | * encoding and each byte is then represented by the 3-character string
|
---|
41 | * "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation
|
---|
42 | * of the byte value.
|
---|
43 | * </ul>
|
---|
44 | *
|
---|
45 | * <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
|
---|
46 | * "~", "*", "'", "(" and ")". It goes on to state:
|
---|
47 | *
|
---|
48 | * <p><i>Unreserved characters can be escaped without changing the semantics
|
---|
49 | * of the URI, but this should not be done unless the URI is being used
|
---|
50 | * in a context that does not allow the unescaped character to appear.</i>
|
---|
51 | *
|
---|
52 | * <p>For performance reasons the only currently supported character encoding of
|
---|
53 | * this class is UTF-8.
|
---|
54 | *
|
---|
55 | * <p><b>Note</b>: This escaper produces uppercase hexidecimal sequences. From
|
---|
56 | * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
|
---|
57 | * <i>"URI producers and normalizers should use uppercase hexadecimal digits
|
---|
58 | * for all percent-encodings."</i>
|
---|
59 | *
|
---|
60 | *
|
---|
61 | */
|
---|
62 | public class PercentEscaper extends UnicodeEscaper {
|
---|
63 | /**
|
---|
64 | * A string of safe characters that mimics the behavior of
|
---|
65 | * {@link java.net.URLEncoder}.
|
---|
66 | *
|
---|
67 | */
|
---|
68 | public static final String SAFECHARS_URLENCODER = "-_.*";
|
---|
69 |
|
---|
70 | /**
|
---|
71 | * A string of characters that do not need to be encoded when used in URI
|
---|
72 | * path segments, as specified in RFC 3986. Note that some of these
|
---|
73 | * characters do need to be escaped when used in other parts of the URI.
|
---|
74 | */
|
---|
75 | public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
|
---|
76 |
|
---|
77 | /**
|
---|
78 | * A string of characters that do not need to be encoded when used in URI
|
---|
79 | * query strings, as specified in RFC 3986. Note that some of these
|
---|
80 | * characters do need to be escaped when used in other parts of the URI.
|
---|
81 | */
|
---|
82 | public static final String SAFEQUERYSTRINGCHARS_URLENCODER
|
---|
83 | = "-_.!~*'()@:$,;/?:";
|
---|
84 |
|
---|
85 | // In some uri escapers spaces are escaped to '+'
|
---|
86 | private static final char[] URI_ESCAPED_SPACE = { '+' };
|
---|
87 |
|
---|
88 | private static final char[] UPPER_HEX_DIGITS =
|
---|
89 | "0123456789ABCDEF".toCharArray();
|
---|
90 |
|
---|
91 | /**
|
---|
92 | * If true we should convert space to the {@code +} character.
|
---|
93 | */
|
---|
94 | private final boolean plusForSpace;
|
---|
95 |
|
---|
96 | /**
|
---|
97 | * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
|
---|
98 | * true then {@code c} should remain unmodified in the output. If
|
---|
99 | * {@code c > safeOctets.length} then it should be escaped.
|
---|
100 | */
|
---|
101 | private final boolean[] safeOctets;
|
---|
102 |
|
---|
103 | /**
|
---|
104 | * Constructs a URI escaper with the specified safe characters and optional
|
---|
105 | * handling of the space character.
|
---|
106 | *
|
---|
107 | * @param safeChars a non null string specifying additional safe characters
|
---|
108 | * for this escaper (the ranges 0..9, a..z and A..Z are always safe and
|
---|
109 | * should not be specified here)
|
---|
110 | * @param plusForSpace true if ASCII space should be escaped to {@code +}
|
---|
111 | * rather than {@code %20}
|
---|
112 | * @throws IllegalArgumentException if any of the parameters were invalid
|
---|
113 | */
|
---|
114 | public PercentEscaper(String safeChars, boolean plusForSpace) {
|
---|
115 | // Avoid any misunderstandings about the behavior of this escaper
|
---|
116 | if (safeChars.matches(".*[0-9A-Za-z].*")) {
|
---|
117 | throw new IllegalArgumentException(
|
---|
118 | "Alphanumeric characters are always 'safe' and should not be " +
|
---|
119 | "explicitly specified");
|
---|
120 | }
|
---|
121 | // Avoid ambiguous parameters. Safe characters are never modified so if
|
---|
122 | // space is a safe character then setting plusForSpace is meaningless.
|
---|
123 | if (plusForSpace && safeChars.contains(" ")) {
|
---|
124 | throw new IllegalArgumentException(
|
---|
125 | "plusForSpace cannot be specified when space is a 'safe' character");
|
---|
126 | }
|
---|
127 | if (safeChars.contains("%")) {
|
---|
128 | throw new IllegalArgumentException(
|
---|
129 | "The '%' character cannot be specified as 'safe'");
|
---|
130 | }
|
---|
131 | this.plusForSpace = plusForSpace;
|
---|
132 | this.safeOctets = createSafeOctets(safeChars);
|
---|
133 | }
|
---|
134 |
|
---|
135 | /**
|
---|
136 | * Creates a boolean[] with entries corresponding to the character values
|
---|
137 | * for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
|
---|
138 | * is as small as is required to hold the given character information.
|
---|
139 | */
|
---|
140 | private static boolean[] createSafeOctets(String safeChars) {
|
---|
141 | int maxChar = 'z';
|
---|
142 | char[] safeCharArray = safeChars.toCharArray();
|
---|
143 | for (char c : safeCharArray) {
|
---|
144 | maxChar = Math.max(c, maxChar);
|
---|
145 | }
|
---|
146 | boolean[] octets = new boolean[maxChar + 1];
|
---|
147 | for (int c = '0'; c <= '9'; c++) {
|
---|
148 | octets[c] = true;
|
---|
149 | }
|
---|
150 | for (int c = 'A'; c <= 'Z'; c++) {
|
---|
151 | octets[c] = true;
|
---|
152 | }
|
---|
153 | for (int c = 'a'; c <= 'z'; c++) {
|
---|
154 | octets[c] = true;
|
---|
155 | }
|
---|
156 | for (char c : safeCharArray) {
|
---|
157 | octets[c] = true;
|
---|
158 | }
|
---|
159 | return octets;
|
---|
160 | }
|
---|
161 |
|
---|
162 | /*
|
---|
163 | * Overridden for performance. For unescaped strings this improved the
|
---|
164 | * performance of the uri escaper from ~760ns to ~400ns as measured by
|
---|
165 | * {@link CharEscapersBenchmark}.
|
---|
166 | */
|
---|
167 | @Override
|
---|
168 | protected int nextEscapeIndex(CharSequence csq, int index, int end) {
|
---|
169 | for (; index < end; index++) {
|
---|
170 | char c = csq.charAt(index);
|
---|
171 | if (c >= safeOctets.length || !safeOctets[c]) {
|
---|
172 | break;
|
---|
173 | }
|
---|
174 | }
|
---|
175 | return index;
|
---|
176 | }
|
---|
177 |
|
---|
178 | /*
|
---|
179 | * Overridden for performance. For unescaped strings this improved the
|
---|
180 | * performance of the uri escaper from ~400ns to ~170ns as measured by
|
---|
181 | * {@link CharEscapersBenchmark}.
|
---|
182 | */
|
---|
183 | @Override
|
---|
184 | public String escape(String s) {
|
---|
185 | int slen = s.length();
|
---|
186 | for (int index = 0; index < slen; index++) {
|
---|
187 | char c = s.charAt(index);
|
---|
188 | if (c >= safeOctets.length || !safeOctets[c]) {
|
---|
189 | return escapeSlow(s, index);
|
---|
190 | }
|
---|
191 | }
|
---|
192 | return s;
|
---|
193 | }
|
---|
194 |
|
---|
195 | /**
|
---|
196 | * Escapes the given Unicode code point in UTF-8.
|
---|
197 | */
|
---|
198 | @Override
|
---|
199 | protected char[] escape(int cp) {
|
---|
200 | // We should never get negative values here but if we do it will throw an
|
---|
201 | // IndexOutOfBoundsException, so at least it will get spotted.
|
---|
202 | if (cp < safeOctets.length && safeOctets[cp]) {
|
---|
203 | return null;
|
---|
204 | } else if (cp == ' ' && plusForSpace) {
|
---|
205 | return URI_ESCAPED_SPACE;
|
---|
206 | } else if (cp <= 0x7F) {
|
---|
207 | // Single byte UTF-8 characters
|
---|
208 | // Start with "%--" and fill in the blanks
|
---|
209 | char[] dest = new char[3];
|
---|
210 | dest[0] = '%';
|
---|
211 | dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
|
---|
212 | dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
|
---|
213 | return dest;
|
---|
214 | } else if (cp <= 0x7ff) {
|
---|
215 | // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
|
---|
216 | // Start with "%--%--" and fill in the blanks
|
---|
217 | char[] dest = new char[6];
|
---|
218 | dest[0] = '%';
|
---|
219 | dest[3] = '%';
|
---|
220 | dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
|
---|
221 | cp >>>= 4;
|
---|
222 | dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
---|
223 | cp >>>= 2;
|
---|
224 | dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
|
---|
225 | cp >>>= 4;
|
---|
226 | dest[1] = UPPER_HEX_DIGITS[0xC | cp];
|
---|
227 | return dest;
|
---|
228 | } else if (cp <= 0xffff) {
|
---|
229 | // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
|
---|
230 | // Start with "%E-%--%--" and fill in the blanks
|
---|
231 | char[] dest = new char[9];
|
---|
232 | dest[0] = '%';
|
---|
233 | dest[1] = 'E';
|
---|
234 | dest[3] = '%';
|
---|
235 | dest[6] = '%';
|
---|
236 | dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
|
---|
237 | cp >>>= 4;
|
---|
238 | dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
---|
239 | cp >>>= 2;
|
---|
240 | dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
|
---|
241 | cp >>>= 4;
|
---|
242 | dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
---|
243 | cp >>>= 2;
|
---|
244 | dest[2] = UPPER_HEX_DIGITS[cp];
|
---|
245 | return dest;
|
---|
246 | } else if (cp <= 0x10ffff) {
|
---|
247 | char[] dest = new char[12];
|
---|
248 | // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
|
---|
249 | // Start with "%F-%--%--%--" and fill in the blanks
|
---|
250 | dest[0] = '%';
|
---|
251 | dest[1] = 'F';
|
---|
252 | dest[3] = '%';
|
---|
253 | dest[6] = '%';
|
---|
254 | dest[9] = '%';
|
---|
255 | dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
|
---|
256 | cp >>>= 4;
|
---|
257 | dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
---|
258 | cp >>>= 2;
|
---|
259 | dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
|
---|
260 | cp >>>= 4;
|
---|
261 | dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
---|
262 | cp >>>= 2;
|
---|
263 | dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
|
---|
264 | cp >>>= 4;
|
---|
265 | dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
|
---|
266 | cp >>>= 2;
|
---|
267 | dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
|
---|
268 | return dest;
|
---|
269 | } else {
|
---|
270 | // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
|
---|
271 | throw new IllegalArgumentException(
|
---|
272 | "Invalid unicode character value " + cp);
|
---|
273 | }
|
---|
274 | }
|
---|
275 | }
|
---|