Context Navigation

source: osm/applications/editors/josm/plugins/surveyor/src/org/dinopolis/util/io/Tokenizer.java@ 30737

Last change on this file since 30737 was 30737, checked in by donvip, 10 years ago
[josm_plugins] fix Java 7 / unused code warnings
Property svn:eol-style set to `native`
File size: 23.2 KB

Line
1	/***********************************************************************
2	* @(#)$RCSfile: Tokenizer.java,v $ $Revision: 1.6 $$Date: 2006/04/21 14:14:56 $
3	*
4	* Copyright (c) Christof Dallermassl
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU Lesser General Public License (LGPL)
8	* as published by the Free Software Foundation; either version 2.1 of
9	* the License, or (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU Lesser General Public License for more details.
15	*
16	* You should have received a copy of the GNU Lesser General Public
17	* License along with this program; if not, write to the
18	* Free Software Foundation, Inc.,
19	* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20	***********************************************************************/
21
22	package org.dinopolis.util.io;
23
24	import java.io.IOException;
25	import java.io.InputStream;
26	import java.io.InputStreamReader;
27	import java.io.PushbackReader;
28	import java.io.Reader;
29	import java.io.StringReader;
30	import java.util.ArrayList;
31	import java.util.Iterator;
32	import java.util.List;
33
34	//----------------------------------------------------------------------
35	/**
36
37	* This tokenizer merges the benefits of the java.lang.StringTokenizer
38	* class and the java.io.StreamTokenizer class. It provides a low
39	* level and a high level interface to the tokenizer. The low level
40	* interface consists of the method pair nextToken() and getWord(),
41	* where the first returns the type of token in the parsing process,
42	* and the latter returns the String element itself.
43	* <p>
44	* The high level interface consists of the methods hasNextLine() and
45	* nextLine(). They use the low level interface to parse the data line
46	* by line and create a list of strings from it.
47	* <p>
48	* It is unsure, if it is wise to mix the usage of the high and
49	* the low level interface. For normal usage, the high level interface
50	* should be more comfortable to use and does not provide any
51	* drawbacks.
52	* <p>
53
54	* An example for the high level interface:
55	* <pre>
56	* try
57	* {
58	* // simple example, tokenizing string, no escape, but quoted
59	* // works:
60	* System.out.println("example 1");
61	* Tokenizer tokenizer = new Tokenizer("text,,,\"another,text\"");
62	* List tokens;
63	* while(tokenizer.hasNextLine())
64	* {
65	* tokens = tokenizer.nextLine();
66	* System.out.println(tokens.get(0)); // prints 'text'
67	* System.out.println(tokens.get(1)); // prints ''
68	* System.out.println(tokens.get(2)); // prints ''
69	* System.out.println(tokens.get(3)); // prints 'another,text'
70	* }
71	*
72	* System.out.println("example 2");
73	* // simple example, tokenizing string, using escape char and
74	* // quoted strings:
75	* tokenizer = new Tokenizer("text,text with\\,comma,,\"another,text\"");
76	* tokenizer.respectEscapedCharacters(true);
77	* while(tokenizer.hasNextLine())
78	* {
79	* tokens = tokenizer.nextLine();
80	* System.out.println(tokens.get(0)); // prints 'text'
81	* System.out.println(tokens.get(1)); // prints 'text with,comma'
82	* System.out.println(tokens.get(2)); // prints ''
83	* System.out.println(tokens.get(3)); // prints 'another,text'
84	* }
85	* }
86	* catch(Exception ioe)
87	* {
88	* ioe.printStackTrace();
89	* }
90	* </pre>
91	* <p>
92	* The advantages compared to the StreamTokenizer class are: Unlike
93	* the StreamTokenizer, this Tokenizer class returns the delimiters as
94	* tokens and therefore may be used to tokenize e.g. comma separated
95	* files with empty fields (the StreamTokenizer handles multiple
96	* delimiters in a row like one delimiter).
97	* <p>
98	* The tokenizer respects quoted words, so the delimiter is ignored if
99	* inside quotes. And it may handle escaped characters (like an
100	* escaped quote character, or an escaped new line). So the line
101	* <code>eric,"he said, \"great!\""</code> returns <code>eric</code>
102	* and <code>he said, "great!"</code> as words.
103	* <p>
104	* Low level interface: The design of the Tokenizer allows to get
105	* empty columns as well as treat multiple delimiters in a row as one
106	* delimiter. For the first approach trigger the values on every
107	* DELIMITER and EOF token whereas for the second, trigger only on
108	* WORD tokens.
109	* <p>
110	* If one wants to be informed about empty words as well, use the
111	* Tokenizer like in the following code fragment:
112	* <pre>
113	* Tokenizer tokenizer = new Tokenizer("text,,,another text");
114	* String word = "";
115	* int token;
116	* while((token = tokenizer.nextToken()) != Tokenizer.EOF)
117	* {
118	* switch(token)
119	* {
120	* case Tokenizer.EOL:
121	* System.out.println("word: "+word);
122	* word = "";
123	* System.out.println("-------------");
124	* break;
125	* case Tokenizer.WORD:
126	* word = tokenizer.getWord();
127	* break;
128	* case Tokenizer.QUOTED_WORD:
129	* word = tokenizer.getWord() + " (quoted)";
130	* break;
131	* case Tokenizer.DELIMITER:
132	* System.out.println("word: "+word);
133	* word = "";
134	* break;
135	* default:
136	* System.err.println("Unknown Token: "+token);
137	* }
138	* }
139	* </pre>
140	* In this example, if the delimiter is set to a comma, a line like
141	* <code>column1,,,"column4,partofcolumn4"</code> would be treated correctly.
142	* <p>
143	* This tokenizer uses the LF character as end of line characters. It
144	* ignores any CR characters, so it can be used in windows
145	* environments as well.
146	*
147	* @author Christof Dallermassl
148	* @version $Revision: 1.6 $
149	*/
150
public class Tokenizer
{
  /** end of file token */
  public static final int EOF = -1;
  /** end of line token */
  public static final int EOL = 0;
  /** word token */
  public static final int WORD = 1;
  /** quoted word token */
  public static final int QUOTED_WORD = 2;
  /** delimiter token */
  public static final int DELIMITER = 3;
  /** error token (returned for a quoted word that is not closed before the end of the line) */
  public static final int ERROR = 4;
  /** not started token (value of the last token before nextToken() was called) */
  public static final int NOT_STARTED = 5;

  /** the reader to read from */
  protected PushbackReader reader_;
  /** the buffer to create the tokens */
  protected StringBuffer buffer_;
  /** all characters in this string are used as delimiters */
  protected String delimiters_ = ",";
  /** the escape character */
  protected int escapeChar_ = '\\';
  /** the quote character */
  protected int quoteChar_ = '"';

  /** if true, the character returned last by readNextChar() is treated as escaped */
  protected boolean escapeMode_ = false;

  /** if true, end of line is respected */
  protected boolean eolIsSignificant_ = true;
  /** if true, escape characters are respected */
  protected boolean respectEscapedChars_ = false;
  /** if true, quoted words are respected */
  protected boolean respectQuotedWords_ = true;

  /** line count */
  protected int lineCount_ = 1;

  /** end of file marker */
  protected boolean eofReached_ = false;

  /** the last token that was found */
  protected int lastToken_ = NOT_STARTED;

  /**
   * Set by hasNextLine() when the character it pushed back had been
   * read in escape mode. The escape character itself is already
   * consumed at that point, so the next readNextChar() call has to
   * restore the escape mode for the pushed back character.
   */
  private boolean pushedBackEscaped_ = false;

  //----------------------------------------------------------------------
  /**
   * Creates a tokenizer that reads from the given string. It uses the
   * comma as delimiter, does not respect escape characters but respects
   * quoted words.
   *
   * @param string the string to read from.
   */
  public Tokenizer(String string)
  {
    this(new StringReader(string));
  }

  //----------------------------------------------------------------------
  /**
   * Creates a tokenizer that reads from the given string. All
   * characters in the given delimiters string are used as
   * delimiter. The tokenizer does not respect escape characters but
   * respects quoted words.
   *
   * @param string the string to read from.
   * @param delimiters the delimiters to use.
   */
  public Tokenizer(String string, String delimiters)
  {
    this(new StringReader(string));
    setDelimiters(delimiters);
  }

  //----------------------------------------------------------------------
  /**
   * Creates a tokenizer that reads from the given stream. It uses the
   * comma as delimiter, does not respect escape characters but respects
   * quoted words.
   * <p>
   * The bytes are decoded by an InputStreamReader created without an
   * explicit charset. Use the {@link #Tokenizer(Reader)} constructor
   * if a specific encoding is needed.
   *
   * @param inStream the stream to read from.
   */
  public Tokenizer(InputStream inStream)
  {
    this(new InputStreamReader(inStream));
  }

  //----------------------------------------------------------------------
  /**
   * Creates a tokenizer that reads from the given reader. It uses the
   * comma as delimiter, does not respect escape characters but respects
   * quoted words.
   *
   * @param reader the reader to read from.
   */
  public Tokenizer(Reader reader)
  {
    reader_ = new PushbackReader(reader, 2);
    buffer_ = new StringBuffer();
  }

  //----------------------------------------------------------------------
  /**
   * Set the delimiter character. The default is the comma.
   *
   * @param delimiterChar the delimiter character.
   */
  public void setDelimiter(int delimiterChar)
  {
    delimiters_ = String.valueOf((char) delimiterChar);
  }

  //----------------------------------------------------------------------
  /**
   * Get the first delimiter character.
   *
   * @return the delimiter character.
   * @deprecated use the getDelimiters() method now
   */
  @Deprecated
  public int getDelimiter()
  {
    return(delimiters_.charAt(0));
  }

  //----------------------------------------------------------------------
  /**
   * Set the delimiter characters. All characters in the delimiters are
   * used as delimiter.
   *
   * @param delimiters the delimiter characters.
   */
  public void setDelimiters(String delimiters)
  {
    delimiters_ = delimiters;
  }

  //----------------------------------------------------------------------
  /**
   * Get the delimiter characters.
   *
   * @return a string holding all delimiter characters.
   */
  public String getDelimiters()
  {
    return(delimiters_);
  }

  //----------------------------------------------------------------------
  /**
   * Set the escape character. The default is the backslash.
   *
   * @param escapeChar the escape character.
   */
  public void setEscapeChar(int escapeChar)
  {
    escapeChar_ = escapeChar;
  }

  //----------------------------------------------------------------------
  /**
   * Get the escape character.
   *
   * @return the escape character.
   */
  public int getEscapeChar()
  {
    return(escapeChar_);
  }

  //----------------------------------------------------------------------
  /**
   * If escape characters should be respected, set the param to
   * <code>true</code>. The default is to ignore escape characters.
   *
   * @param respectEscaped If escape characters should be respected,
   * set the param to <code>true</code>.
   */
  public void respectEscapedCharacters(boolean respectEscaped)
  {
    respectEscapedChars_ = respectEscaped;
  }

  //----------------------------------------------------------------------
  /**
   * Returns <code>true</code>, if escape character is respected.
   *
   * @return <code>true</code>, if escape character is respected.
   */
  public boolean respectEscapedCharacters()
  {
    return(respectEscapedChars_);
  }

  //----------------------------------------------------------------------
  /**
   * Get the quote character.
   *
   * @return the quote character.
   */
  public int getQuoteChar()
  {
    return(quoteChar_);
  }

  //----------------------------------------------------------------------
  /**
   * Set the quote character. The default is the double quote.
   *
   * @param quoteChar the quote character.
   */
  public void setQuoteChar(int quoteChar)
  {
    quoteChar_ = quoteChar;
  }

  //----------------------------------------------------------------------
  /**
   * If quoted words should be respected, set the param to
   * <code>true</code>. The default is to respect quoted words.
   *
   * @param respectQuotes If quoted words should be respected,
   * set the param to <code>true</code>.
   */
  public void respectQuotedWords(boolean respectQuotes)
  {
    respectQuotedWords_ = respectQuotes;
  }

  //----------------------------------------------------------------------
  /**
   * Returns <code>true</code>, if quoted words are respected.
   *
   * @return <code>true</code>, if quoted words are respected.
   */
  public boolean respectQuotedWords()
  {
    return(respectQuotedWords_);
  }

  //----------------------------------------------------------------------
  /**
   * If set to <code>true</code> the end of line is signaled by the EOL
   * token. If set to <code>false</code> end of line is treated as a
   * normal delimiter. The default value is true;
   *
   * @param significant if the end of line is treated as a special token
   * or as a delimiter.
   */
  public void eolIsSignificant(boolean significant)
  {
    eolIsSignificant_ = significant;
  }

  //----------------------------------------------------------------------
  /**
   * Returns <code>true</code>, if in case of an end of line detected,
   * an EOL token is returned. If <code>false</code>, the end of line is
   * treated as a normal delimiter.
   *
   * @return <code>true</code>, if an EOL token is returned for an end
   * of line, <code>false</code> if a DELIMITER token is returned.
   */
  public boolean isEolSignificant()
  {
    return(eolIsSignificant_);
  }

  //----------------------------------------------------------------------
  /**
   * Returns the current line number of the reader.
   *
   * @return the current line number of the reader.
   */
  public int getLineNumber()
  {
    return(lineCount_);
  }

  //----------------------------------------------------------------------
  /**
   * Returns the value of the token. If the token was of the type WORD
   * or QUOTED_WORD, the word is returned. The value is the content of
   * the internal buffer, which is cleared at the start of every
   * nextToken() call.
   *
   * @return the value of the token.
   */
  public String getWord()
  {
    return(buffer_.toString());
  }

  //----------------------------------------------------------------------
  /**
   * Returns the last token that was returned from the nextToken() method.
   *
   * @return the last token, or NOT_STARTED if nextToken() was not
   * called yet.
   */
  public int getLastToken()
  {
    return(lastToken_);
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the given character is seen as a delimiter. This
   * method respects escape_mode, so an escaped delimiter character is
   * not reported as delimiter.
   *
   * @param character the character to check for delimiter
   * @return true, if the given character is seen as a delimiter.
   */
  protected boolean isDelimiter(int character)
  {
    // an escaped character is never a delimiter:
    if(escapeMode_)
      return(false);

    return(delimiters_.indexOf(character) >= 0);
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the given character is seen as a quote
   * character. This method respects escape_mode, so an escaped quote
   * character is not reported as quote. It always returns false if
   * quoted words are not respected.
   *
   * @param character the character to check for quote.
   * @return true, if the given character is seen as a quote character.
   */
  protected boolean isQuoteChar(int character)
  {
    if(!respectQuotedWords_ || escapeMode_)
      return(false);

    return(character == quoteChar_);
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the given character is seen as an escape
   * character. This method respects escape_mode, so an escaped escape
   * character is not reported as escape character. It always returns
   * false if escape characters are not respected.
   *
   * @param character the character to check for escape character.
   * @return true, if the given character is seen as an escape character.
   */
  protected boolean isEscapeChar(int character)
  {
    if(!respectEscapedChars_ || escapeMode_)
      return(false);

    return(character == escapeChar_);
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the given character is seen as an end of line
   * character (LF, CR, or the end of file value -1). In escape mode
   * the method always returns false, but still counts an escaped LF
   * as a new line.
   * <p>
   * Side effect: if the character is -1 (and the tokenizer is not in
   * escape mode), the end of file marker is set.
   *
   * @param character the character to check for end of line.
   * @return true, if the given character is seen as an end of line
   * character.
   */
  protected boolean isEndOfLine(int character)
  {
    if(escapeMode_)
    {
      if(character == '\n') // add line count, even if in escape mode!
        lineCount_++;
      return(false);
    }
    if(character == -1)
      eofReached_ = true;

    return((character == '\n') || (character == '\r') || (character == -1));
  }

  //----------------------------------------------------------------------
  /**
   * Closes the tokenizer (and the reader it uses internally).
   *
   * @exception IOException if an error occurred.
   */
  public void close()
    throws IOException
  {
    reader_.close();
  }

  //----------------------------------------------------------------------
  /**
   * Reads and returns the next character from the reader and checks for
   * the escape character. If an escape character is read, the escape
   * mode flag is set and the character following it is returned. A
   * newline (LF or CR LF) following the escape character is ignored
   * (line continuation). CR characters are skipped.
   *
   * @return the next character, or -1 if the end of the reader is
   * reached.
   * @exception IOException if an error occurred.
   */
  protected int readNextChar()
    throws IOException
  {
    if(pushedBackEscaped_)
    {
      // hasNextLine() pushed back an escaped character; deliver it
      // again with the escape mode it was originally read in:
      pushedBackEscaped_ = false;
      escapeMode_ = true;
      return(reader_.read());
    }

    int next_char = reader_.read();
    if(escapeMode_)
    {
      // escape mode only lasts for one character:
      escapeMode_ = false;
    }
    else if(isEscapeChar(next_char))
    {
      // ignore escape char itself:
      next_char = reader_.read();

      // CR is never significant; skipping it here makes an escaped
      // CR LF behave exactly like an escaped LF:
      while(next_char == '\r')
      {
        next_char = reader_.read();
      }

      // check for newline and ignore it:
      if(isEndOfLine(next_char))
      {
        if(next_char == '\n') // do not count the end of file as a line
          lineCount_++;
        next_char = reader_.read();
        // ignore CR:
        if(next_char == '\r')
        {
          next_char = readNextChar();
        }
      }
      // the end of file cannot be escaped; otherwise (char)-1 would
      // end up in the word buffer:
      escapeMode_ = (next_char != -1);
    }

    // ignore CR:
    if(next_char == '\r')
    {
      next_char = readNextChar();
    }
    return(next_char);
  }

  //----------------------------------------------------------------------
  /**
   * Returns the next token from the reader. The token's value may be
   * WORD, QUOTED_WORD, EOF, EOL, DELIMITER, or ERROR (a quoted word
   * that is not closed before the end of the line). In the case of
   * WORD or QUOTED_WORD the actual word can be obtained by the use of
   * the getWord method.
   *
   * @return the next token.
   * @exception IOException if an error occurred.
   */
  public int nextToken()
    throws IOException
  {
    buffer_.setLength(0);

    int next_char = readNextChar();

    // handle EOF (the marker is set by isEndOfLine when it sees -1):
    if(eofReached_)
    {
      lastToken_ = EOF;
      return(EOF);
    }

    // handle EOL; if eol is not significant, it acts as a delimiter.
    // A -1 that is seen here for the first time also takes this path
    // and is reported as end of line before EOF is returned:
    if(isEndOfLine(next_char))
    {
      lineCount_++;
      lastToken_ = eolIsSignificant_ ? EOL : DELIMITER;
      return(lastToken_);
    }

    // handle DELIMITER
    if(isDelimiter(next_char))
    {
      lastToken_ = DELIMITER;
      return(DELIMITER);
    }

    // handle quoted words:
    if(isQuoteChar(next_char))
    {
      while(true)
      {
        next_char = readNextChar();
        if(isEndOfLine(next_char))
        {
          // unterminated quote; the newline is consumed, so count it:
          if(next_char == '\n')
            lineCount_++;
          lastToken_ = ERROR;
          return(ERROR);
        }
        if(isQuoteChar(next_char))
        {
          lastToken_ = QUOTED_WORD;
          return(QUOTED_WORD);
        }

        // no special char, then append to buffer:
        buffer_.append((char) next_char);
      }
    }

    // handle 'normal' words:
    while(true)
    {
      buffer_.append((char) next_char);
      next_char = readNextChar();
      if(isDelimiter(next_char) || isEndOfLine(next_char))
      {
        // leave the delimiter/newline for the next call; -1 is not a
        // character and must not be pushed back (it would come back
        // as the character 0xFFFF):
        if(next_char != -1)
          reader_.unread(next_char);
        lastToken_ = WORD;
        return(WORD);
      }
    }
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the tokenizer can return another line. If the
   * tokenizer is at the start of a line, one character is read ahead
   * and pushed back again.
   *
   * @return true, if the tokenizer can return another line.
   * @exception IOException if an error occurred.
   */
  public boolean hasNextLine()
    throws IOException
  {
    if(lastToken_ == EOF)
      return(false);

    if((lastToken_ == EOL) || (lastToken_ == NOT_STARTED))
    {
      int next_char = readNextChar();
      if(next_char == -1)
        return(false);

      // readNextChar() already consumed a leading escape character,
      // so remember the escape mode for the pushed back character:
      pushedBackEscaped_ = escapeMode_;
      reader_.unread(next_char);
    }
    return(true);
  }

  //----------------------------------------------------------------------
  /**
   * Returns a list of elements (Strings) from the next line of the
   * tokenizer. If there are multiple delimiters without any values in
   * between, empty (zero length) strings are added to the list. They
   * may be removed by the use of the {@link
   * #removeZeroLengthElements(List)} method.
   * <p>
   * Tokens other than WORD, QUOTED_WORD, DELIMITER, EOL and EOF (e.g.
   * ERROR) are reported on System.err and skipped.
   *
   * @return a list of elements (Strings) from the next line of the
   * tokenizer.
   * @exception IOException if an error occurred.
   */
  public List<String> nextLine()
    throws IOException
  {
    List<String> list = new ArrayList<>();
    String word = "";
    while(true)
    {
      int token = nextToken();
      switch(token)
      {
        case Tokenizer.WORD:
        case Tokenizer.QUOTED_WORD:
          word = getWord();
          break;
        case Tokenizer.DELIMITER:
          list.add(word);
          word = "";
          break;
        case Tokenizer.EOL:
        case Tokenizer.EOF:
          // the line is complete, add the pending (maybe empty) word:
          list.add(word);
          return(list);
        default:
          System.err.println("Unknown Token: "+token);
      }
    }
  }

  //----------------------------------------------------------------------
  /**
   * This helper method removes all zero length elements from the given
   * list and returns it. The given list is not changed!
   *
   * @param list the list of String objects to remove the zero elements from.
   * @return a copy of the given list where all zero length elements are removed.
   */
  public static List<String> removeZeroLengthElements(List<String> list)
  {
    return removeZeroLengthElements(list, false);
  }

  //----------------------------------------------------------------------
  /**
   * This helper method trims all elements and removes all zero length
   * (length is taken after trimming leading and trailing spaces) elements from the given
   * list and returns it. This method copies the (trimmed and) non-zero elements to a
   * new list. The given list is not changed.
   *
   * @param list the list of String objects to remove the zero elements from.
   * @param trim if set to <code>true</code>, all leading and trailing spaces are removed from
   * the elements. This is done, before the length is compared to zero (and the element
   * may be removed if the length is zero). If set to <code>true</code>, elements
   * that only consist of spaces are removed as well!
   * @return a new list where all zero length elements are removed.
   */
  public static List<String> removeZeroLengthElements(List<String> list, boolean trim)
  {
    List<String> new_list = new ArrayList<>();
    for(String element : list)
    {
      String value = trim ? element.trim() : element;
      if(value.length() != 0)
        new_list.add(value);
    }
    return(new_list);
  }
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: