Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: josm/trunk/src/com/google/gdata/util/common/base/PercentEscaper.java@ 15121

Last change on this file since 15121 was 4231, checked in by stoecker, 14 years ago
add signpost and metadata extractor code to repository directly
File size: 9.7 KB

Line
1	/* Copyright (c) 2008 Google Inc.
2	*
3	* Licensed under the Apache License, Version 2.0 (the "License");
4	* you may not use this file except in compliance with the License.
5	* You may obtain a copy of the License at
6	*
7	* http://www.apache.org/licenses/LICENSE-2.0
8	*
9	* Unless required by applicable law or agreed to in writing, software
10	* distributed under the License is distributed on an "AS IS" BASIS,
11	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	* See the License for the specific language governing permissions and
13	* limitations under the License.
14	*/
15
16
17	package com.google.gdata.util.common.base;
18
19	/**
20	* A {@code UnicodeEscaper} that escapes some set of Java characters using
21	* the URI percent encoding scheme. The set of safe characters (those which
22	* remain unescaped) can be specified on construction.
23	*
24	* <p>For details on escaping URIs for use in web pages, see section 2.4 of
25	* <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
26	*
27	* <p>In most cases this class should not need to be used directly. If you
28	* have no special requirements for escaping your URIs, you should use either
29	* {@link CharEscapers#uriEscaper()} or
30	* {@link CharEscapers#uriEscaper(boolean)}.
31	*
32	* <p>When encoding a String, the following rules apply:
33	* <ul>
34	* <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
35	* through "9" remain the same.
36	* <li>Any additionally specified safe characters remain the same.
37	* <li>If {@code plusForSpace} was specified, the space character " " is
38	* converted into a plus sign "+".
39	* <li>All other characters are converted into one or more bytes using UTF-8
40	* encoding and each byte is then represented by the 3-character string
41	* "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation
42	* of the byte value.
43	* </ul>
44	*
45	* <p>RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
46	* "~", "*", "'", "(" and ")". It goes on to state:
47	*
48	* <p><i>Unreserved characters can be escaped without changing the semantics
49	* of the URI, but this should not be done unless the URI is being used
50	* in a context that does not allow the unescaped character to appear.</i>
51	*
52	* <p>For performance reasons the only currently supported character encoding of
53	* this class is UTF-8.
54	*
55	* <p><b>Note</b>: This escaper produces uppercase hexadecimal sequences. From
56	* <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
57	* <i>"URI producers and normalizers should use uppercase hexadecimal digits
58	* for all percent-encodings."</i>
59	*
60	*
61	*/
62	public class PercentEscaper extends UnicodeEscaper {
63	/**
64	* A string of safe characters that mimics the behavior of
65	* {@link java.net.URLEncoder}.
66	*
67	*/
68	public static final String SAFECHARS_URLENCODER = "-_.*";
69
70	/**
71	* A string of characters that do not need to be encoded when used in URI
72	* path segments, as specified in RFC 3986. Note that some of these
73	* characters do need to be escaped when used in other parts of the URI.
74	*/
75	public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
76
77	/**
78	* A string of characters that do not need to be encoded when used in URI
79	* query strings, as specified in RFC 3986. Note that some of these
80	* characters do need to be escaped when used in other parts of the URI.
81	*/
82	public static final String SAFEQUERYSTRINGCHARS_URLENCODER
83	= "-_.!~*'()@:$,;/?:";
84
85	// In some uri escapers spaces are escaped to '+'
86	private static final char[] URI_ESCAPED_SPACE = { '+' };
87
88	private static final char[] UPPER_HEX_DIGITS =
89	"0123456789ABCDEF".toCharArray();
90
91	/**
92	* If true we should convert space to the {@code +} character.
93	*/
94	private final boolean plusForSpace;
95
96	/**
97	* An array of flags where for any {@code char c} if {@code safeOctets[c]} is
98	* true then {@code c} should remain unmodified in the output. If
99	* {@code c > safeOctets.length} then it should be escaped.
100	*/
101	private final boolean[] safeOctets;
102
103	/**
104	* Constructs a URI escaper with the specified safe characters and optional
105	* handling of the space character.
106	*
107	* @param safeChars a non null string specifying additional safe characters
108	* for this escaper (the ranges 0..9, a..z and A..Z are always safe and
109	* should not be specified here)
110	* @param plusForSpace true if ASCII space should be escaped to {@code +}
111	* rather than {@code %20}
112	* @throws IllegalArgumentException if any of the parameters were invalid
113	*/
114	public PercentEscaper(String safeChars, boolean plusForSpace) {
115	// Avoid any misunderstandings about the behavior of this escaper
116	if (safeChars.matches(".[0-9A-Za-z].")) {
117	throw new IllegalArgumentException(
118	"Alphanumeric characters are always 'safe' and should not be " +
119	"explicitly specified");
120	}
121	// Avoid ambiguous parameters. Safe characters are never modified so if
122	// space is a safe character then setting plusForSpace is meaningless.
123	if (plusForSpace && safeChars.contains(" ")) {
124	throw new IllegalArgumentException(
125	"plusForSpace cannot be specified when space is a 'safe' character");
126	}
127	if (safeChars.contains("%")) {
128	throw new IllegalArgumentException(
129	"The '%' character cannot be specified as 'safe'");
130	}
131	this.plusForSpace = plusForSpace;
132	this.safeOctets = createSafeOctets(safeChars);
133	}
134
135	/**
136	* Creates a boolean[] with entries corresponding to the character values
137	* for 0-9, A-Z, a-z and those specified in safeChars set to true. The array
138	* is as small as is required to hold the given character information.
139	*/
140	private static boolean[] createSafeOctets(String safeChars) {
141	int maxChar = 'z';
142	char[] safeCharArray = safeChars.toCharArray();
143	for (char c : safeCharArray) {
144	maxChar = Math.max(c, maxChar);
145	}
146	boolean[] octets = new boolean[maxChar + 1];
147	for (int c = '0'; c <= '9'; c++) {
148	octets[c] = true;
149	}
150	for (int c = 'A'; c <= 'Z'; c++) {
151	octets[c] = true;
152	}
153	for (int c = 'a'; c <= 'z'; c++) {
154	octets[c] = true;
155	}
156	for (char c : safeCharArray) {
157	octets[c] = true;
158	}
159	return octets;
160	}
161
162	/*
163	* Overridden for performance. For unescaped strings this improved the
164	* performance of the uri escaper from ~760ns to ~400ns as measured by
165	* {@link CharEscapersBenchmark}.
166	*/
167	@Override
168	protected int nextEscapeIndex(CharSequence csq, int index, int end) {
169	for (; index < end; index++) {
170	char c = csq.charAt(index);
171	if (c >= safeOctets.length \|\| !safeOctets[c]) {
172	break;
173	}
174	}
175	return index;
176	}
177
178	/*
179	* Overridden for performance. For unescaped strings this improved the
180	* performance of the uri escaper from ~400ns to ~170ns as measured by
181	* {@link CharEscapersBenchmark}.
182	*/
183	@Override
184	public String escape(String s) {
185	int slen = s.length();
186	for (int index = 0; index < slen; index++) {
187	char c = s.charAt(index);
188	if (c >= safeOctets.length \|\| !safeOctets[c]) {
189	return escapeSlow(s, index);
190	}
191	}
192	return s;
193	}
194
195	/**
196	* Escapes the given Unicode code point in UTF-8.
197	*/
198	@Override
199	protected char[] escape(int cp) {
200	// We should never get negative values here but if we do it will throw an
201	// IndexOutOfBoundsException, so at least it will get spotted.
202	if (cp < safeOctets.length && safeOctets[cp]) {
203	return null;
204	} else if (cp == ' ' && plusForSpace) {
205	return URI_ESCAPED_SPACE;
206	} else if (cp <= 0x7F) {
207	// Single byte UTF-8 characters
208	// Start with "%--" and fill in the blanks
209	char[] dest = new char[3];
210	dest[0] = '%';
211	dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
212	dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
213	return dest;
214	} else if (cp <= 0x7ff) {
215	// Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
216	// Start with "%--%--" and fill in the blanks
217	char[] dest = new char[6];
218	dest[0] = '%';
219	dest[3] = '%';
220	dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
221	cp >>>= 4;
222	dest[4] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
223	cp >>>= 2;
224	dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
225	cp >>>= 4;
226	dest[1] = UPPER_HEX_DIGITS[0xC \| cp];
227	return dest;
228	} else if (cp <= 0xffff) {
229	// Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
230	// Start with "%E-%--%--" and fill in the blanks
231	char[] dest = new char[9];
232	dest[0] = '%';
233	dest[1] = 'E';
234	dest[3] = '%';
235	dest[6] = '%';
236	dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
237	cp >>>= 4;
238	dest[7] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
239	cp >>>= 2;
240	dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
241	cp >>>= 4;
242	dest[4] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
243	cp >>>= 2;
244	dest[2] = UPPER_HEX_DIGITS[cp];
245	return dest;
246	} else if (cp <= 0x10ffff) {
247	char[] dest = new char[12];
248	// Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
249	// Start with "%F-%--%--%--" and fill in the blanks
250	dest[0] = '%';
251	dest[1] = 'F';
252	dest[3] = '%';
253	dest[6] = '%';
254	dest[9] = '%';
255	dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
256	cp >>>= 4;
257	dest[10] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
258	cp >>>= 2;
259	dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
260	cp >>>= 4;
261	dest[7] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
262	cp >>>= 2;
263	dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
264	cp >>>= 4;
265	dest[4] = UPPER_HEX_DIGITS[0x8 \| (cp & 0x3)];
266	cp >>>= 2;
267	dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
268	return dest;
269	} else {
270	// If this ever happens it is due to bug in UnicodeEscaper, not bad input.
271	throw new IllegalArgumentException(
272	"Invalid unicode character value " + cp);
273	}
274	}
275	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: