Context Navigation

Tokenizer.java@ 34591

Last change on this file since 34591 was 34591, checked in by donvip, 6 years ago
rename packages, fix warnings
Property svn:eol-style set to `native`
File size: 22.8 KB

Line
1	/***********************************************************************
2	* @(#)$RCSfile: Tokenizer.java,v $ $Revision: 1.6 $$Date: 2006/04/21 14:14:56 $
3	*
4	* Copyright (c) Christof Dallermassl
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU Lesser General Public License (LGPL)
8	* as published by the Free Software Foundation; either version 2.1 of
9	* the License, or (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU Lesser General Public License for more details.
15	*
16	* You should have received a copy of the GNU Lesser General Public
17	* License along with this program; if not, write to the
18	* Free Software Foundation, Inc.,
19	* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20	***********************************************************************/
21
22	package org.dinopolis.util.io;
23
24	import java.io.IOException;
25	import java.io.PushbackReader;
26	import java.io.Reader;
27	import java.io.StringReader;
28	import java.util.ArrayList;
29	import java.util.Iterator;
30	import java.util.List;
31
//----------------------------------------------------------------------
/**
 * This tokenizer merges the benefits of the java.lang.StringTokenizer
 * class and the java.io.StreamTokenizer class. It provides a low
 * level and a high level interface to the tokenizer. The low level
 * interface consists of the method pair nextToken() and getWord(),
 * where the first returns the type of token in the parsing process,
 * and the latter returns the String element itself.
 * <p>
 * The high level interface consists of the methods hasNextLine() and
 * nextLine(). They use the low level interface to parse the data line
 * by line and create a list of strings from it.
 * <p>
 * Mixing the high and the low level interface on the same instance is
 * not recommended. For normal usage, the high level interface is more
 * comfortable to use and has no drawbacks.
 * <p>
 * An example for the high level interface:
 * <pre>
 * try
 * {
 *   // simple example, tokenizing string, no escape, but quoted
 *   // works:
 *   System.out.println("example 1");
 *   Tokenizer tokenizer = new Tokenizer("text,,,\"another,text\"");
 *   List&lt;String&gt; tokens;
 *   while(tokenizer.hasNextLine())
 *   {
 *     tokens = tokenizer.nextLine();
 *     System.out.println(tokens.get(0)); // prints 'text'
 *     System.out.println(tokens.get(1)); // prints ''
 *     System.out.println(tokens.get(2)); // prints ''
 *     System.out.println(tokens.get(3)); // prints 'another,text'
 *   }
 *
 *   System.out.println("example 2");
 *   // simple example, tokenizing string, using escape char and
 *   // quoted strings:
 *   tokenizer = new Tokenizer("text,text with\\,comma,,\"another,text\"");
 *   tokenizer.respectEscapedCharacters(true);
 *   while(tokenizer.hasNextLine())
 *   {
 *     tokens = tokenizer.nextLine();
 *     System.out.println(tokens.get(0)); // prints 'text'
 *     System.out.println(tokens.get(1)); // prints 'text with,comma'
 *     System.out.println(tokens.get(2)); // prints ''
 *     System.out.println(tokens.get(3)); // prints 'another,text'
 *   }
 * }
 * catch(Exception ioe)
 * {
 *   ioe.printStackTrace();
 * }
 * </pre>
 * <p>
 * The advantages compared to the StreamTokenizer class are: Unlike
 * the StreamTokenizer, this Tokenizer class returns the delimiters as
 * tokens and therefore may be used to tokenize e.g. comma separated
 * files with empty fields (the StreamTokenizer handles multiple
 * delimiters in a row like one delimiter).
 * <p>
 * The tokenizer respects quoted words, so the delimiter is ignored if
 * inside quotes. And it may handle escaped characters (like an
 * escaped quote character, or an escaped new line). So the line
 * <code>eric,"he said, \"great!\""</code> returns <code>eric</code>
 * and <code>he said, "great!"</code> as words.
 * <p>
 * Low level interface: The design of the Tokenizer allows to get
 * empty columns as well as treat multiple delimiters in a row as one
 * delimiter. For the first approach trigger the values on every
 * DELIMITER, EOL and EOF token whereas for the second, trigger only on
 * WORD tokens.
 * <p>
 * If one wants to be informed about empty words as well, use the
 * Tokenizer like in the following code fragment:
 * <pre>
 * Tokenizer tokenizer = new Tokenizer("text,,,another text");
 * String word = "";
 * int token;
 * while((token = tokenizer.nextToken()) != Tokenizer.EOF)
 * {
 *   switch(token)
 *   {
 *     case Tokenizer.EOL:
 *       System.out.println("word: "+word);
 *       word = "";
 *       System.out.println("-------------");
 *       break;
 *     case Tokenizer.WORD:
 *       word = tokenizer.getWord();
 *       break;
 *     case Tokenizer.QUOTED_WORD:
 *       word = tokenizer.getWord() + " (quoted)";
 *       break;
 *     case Tokenizer.DELIMITER:
 *       System.out.println("word: "+word);
 *       word = "";
 *       break;
 *     default:
 *       System.err.println("Unknown Token: "+token);
 *   }
 * }
 * </pre>
 * In this example, if the delimiter is set to a comma, a line like
 * <code>column1,,,"column4,partofcolumn4"</code> would be treated correctly.
 * Note that an unquoted word that ends the input is directly followed
 * by the EOF token (no EOL token in between), so the loop above has to
 * handle a still pending word after it terminates.
 * <p>
 * This tokenizer uses the LF character as end of line character. It
 * ignores any CR characters, so it can be used in windows
 * environments as well.
 *
 * @author Christof Dallermassl
 * @version $Revision: 1.6 $
 */
public class Tokenizer
{
  /** end of file token */
  public static final int EOF = -1;
  /** end of line token */
  public static final int EOL = 0;
  /** word token */
  public static final int WORD = 1;
  /** quoted word token */
  public static final int QUOTED_WORD = 2;
  /** delimiter token */
  public static final int DELIMITER = 3;
  /** error token (a quoted word that was not closed before the end of the line/input) */
  public static final int ERROR = 4;
  /** not started token (no token was read so far) */
  public static final int NOT_STARTED = 5;

  /** the reader to read from */
  protected PushbackReader reader_;
  /** the buffer to create the tokens */
  protected StringBuffer buffer_;
  /** all characters in this string are used as delimiters */
  protected String delimiters_ = ",";
  /** the escape character */
  protected int escapeChar_ = '\\';
  /** the quote character */
  protected int quoteChar_ = '"';

  /** if true, the character returned last by readNextChar() is treated as escaped */
  protected boolean escapeMode_ = false;

  /** if true, end of line is respected */
  protected boolean eolIsSignificant_ = true;
  /** if true, escape characters are respected */
  protected boolean respectEscapedChars_ = false;
  /** if true, quoted words are respected */
  protected boolean respectQuotedWords_ = true;

  /** line count */
  protected int lineCount_ = 1;

  /** end of file marker */
  protected boolean eofReached_ = false;

  /** the last token that was found */
  protected int lastToken_ = NOT_STARTED;

  //----------------------------------------------------------------------
  /**
   * Creates a tokenizer that reads from the given string. It uses the
   * comma as delimiter, does not respect escape characters but respects
   * quoted words.
   *
   * @param string the string to read from.
   */
  public Tokenizer(String string)
  {
    this(new StringReader(string));
  }

  //----------------------------------------------------------------------
  /**
   * Creates a tokenizer that reads from the given string. All
   * characters in the given delimiters string are used as
   * delimiter. The tokenizer does not respect escape characters but
   * respects quoted words.
   *
   * @param string the string to read from.
   * @param delimiters the delimiters to use.
   */
  public Tokenizer(String string, String delimiters)
  {
    this(new StringReader(string));
    setDelimiters(delimiters);
  }

  //----------------------------------------------------------------------
  /**
   * Creates a tokenizer that reads from the given reader. It uses the
   * comma as delimiter, does not respect escape characters but respects
   * quoted words.
   *
   * @param reader the reader to read from.
   */
  public Tokenizer(Reader reader)
  {
    // at most one character is pushed back at any time
    reader_ = new PushbackReader(reader, 2);
    buffer_ = new StringBuffer();
  }

  //----------------------------------------------------------------------
  /**
   * Set the delimiter character. The default is the comma. Any
   * previously set delimiters are replaced.
   *
   * @param delimiterChar the delimiter character.
   */
  public void setDelimiter(int delimiterChar)
  {
    delimiters_ = String.valueOf((char) delimiterChar);
  }

  //----------------------------------------------------------------------
  /**
   * Get the first delimiter character.
   *
   * @return the delimiter character.
   * @deprecated use the getDelimiters() method now
   */
  @Deprecated
  public int getDelimiter()
  {
    return delimiters_.charAt(0);
  }

  //----------------------------------------------------------------------
  /**
   * Set the delimiter characters. All characters in the delimiters are
   * used as delimiter.
   *
   * @param delimiters the delimiter characters.
   */
  public void setDelimiters(String delimiters)
  {
    delimiters_ = delimiters;
  }

  //----------------------------------------------------------------------
  /**
   * Get the delimiter characters.
   *
   * @return a string holding all delimiter characters.
   */
  public String getDelimiters()
  {
    return delimiters_;
  }

  //----------------------------------------------------------------------
  /**
   * Set the escape character. The default is the backslash.
   *
   * @param escapeChar the escape character.
   */
  public void setEscapeChar(int escapeChar)
  {
    escapeChar_ = escapeChar;
  }

  //----------------------------------------------------------------------
  /**
   * Get the escape character.
   *
   * @return the escape character.
   */
  public int getEscapeChar()
  {
    return escapeChar_;
  }

  //----------------------------------------------------------------------
  /**
   * If escape characters should be respected, set the param to
   * <code>true</code>. The default is to ignore escape characters.
   *
   * @param respectEscaped If escape characters should be respected,
   * set the param to <code>true</code>.
   */
  public void respectEscapedCharacters(boolean respectEscaped)
  {
    respectEscapedChars_ = respectEscaped;
  }

  //----------------------------------------------------------------------
  /**
   * Returns <code>true</code>, if escape character is respected.
   *
   * @return <code>true</code>, if escape character is respected.
   */
  public boolean respectEscapedCharacters()
  {
    return respectEscapedChars_;
  }

  //----------------------------------------------------------------------
  /**
   * Get the quote character.
   *
   * @return the quote character.
   */
  public int getQuoteChar()
  {
    return quoteChar_;
  }

  //----------------------------------------------------------------------
  /**
   * Set the quote character. The default is the double quote.
   *
   * @param quoteChar the quote character.
   */
  public void setQuoteChar(int quoteChar)
  {
    quoteChar_ = quoteChar;
  }

  //----------------------------------------------------------------------
  /**
   * If quoted words should be respected, set the param to
   * <code>true</code>. The default is to respect quoted words.
   *
   * @param respectQuotes If quoted words should be respected,
   * set the param to <code>true</code>.
   */
  public void respectQuotedWords(boolean respectQuotes)
  {
    respectQuotedWords_ = respectQuotes;
  }

  //----------------------------------------------------------------------
  /**
   * Returns <code>true</code>, if quoted words are respected.
   *
   * @return <code>true</code>, if quoted words are respected.
   */
  public boolean respectQuotedWords()
  {
    return respectQuotedWords_;
  }

  //----------------------------------------------------------------------
  /**
   * If set to <code>true</code> the end of line is signaled by the EOL
   * token. If set to <code>false</code> end of line is treated as a
   * normal delimiter. The default value is true;
   *
   * @param significant if the end of line is treated as a special token
   * or as a delimiter.
   */
  public void eolIsSignificant(boolean significant)
  {
    eolIsSignificant_ = significant;
  }

  //----------------------------------------------------------------------
  /**
   * Returns <code>true</code>, if in case of an end of line detected,
   * an EOL token is returned. If <code>false</code>, the end of line is
   * treated as a normal delimiter.
   *
   * @return <code>true</code>, if in case of an end of line detected,
   * an EOL token is returned. If <code>false</code>, the end of line is
   * treated as a normal delimiter.
   */
  public boolean isEolSignificant()
  {
    return eolIsSignificant_;
  }

  //----------------------------------------------------------------------
  /**
   * Returns the current line number of the reader. Counting starts
   * at 1.
   *
   * @return the current line number of the reader.
   */
  public int getLineNumber()
  {
    return lineCount_;
  }

  //----------------------------------------------------------------------
  /**
   * Returns the value of the token. If the token was of the type WORD
   * or QUOTED_WORD, the word is returned. The value is only valid
   * until the next call of nextToken().
   *
   * @return the value of the token.
   */
  public String getWord()
  {
    return buffer_.toString();
  }

  //----------------------------------------------------------------------
  /**
   * Returns the last token that was returned from the nextToken() method.
   *
   * @return the last token, or NOT_STARTED if no token was read so far.
   */
  public int getLastToken()
  {
    return lastToken_;
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the given character is seen as a delimiter. This
   * method respects escape_mode, so if the escape character was found
   * before, it has to act accordingly (usually, return false, even if
   * the character is a delimiter).
   *
   * @param character the character to check for delimiter
   * @return true, if the given character is seen as a delimiter.
   */
  protected boolean isDelimiter(int character)
  {
    // an escaped character is never a delimiter:
    if (escapeMode_)
    {
      return false;
    }
    return delimiters_.indexOf(character) >= 0;
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the given character is seen as a quote
   * character. This method respects escape_mode, so if the escape
   * character was found before, it has to act accordingly (usually,
   * return false, even if the character is a quote character).
   *
   * @param character the character to check for quote.
   * @return true, if the given character is seen as a quote character.
   */
  protected boolean isQuoteChar(int character)
  {
    // quotes switched off or character escaped:
    if (!respectQuotedWords_ || escapeMode_)
    {
      return false;
    }
    return character == quoteChar_;
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the given character is seen as a escape
   * character. This method respects escape_mode, so if the escape
   * character was found before, it has to act accordingly (usually,
   * return false, even if the character is a escape character).
   *
   * @param character the character to check for escape character.
   * @return true, if the given character is seen as a escape character.
   */
  protected boolean isEscapeChar(int character)
  {
    // escapes switched off or character escaped itself:
    if (!respectEscapedChars_ || escapeMode_)
    {
      return false;
    }
    return character == escapeChar_;
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the given character is seen as a end of line
   * character (LF, CR or the end of the input). This method respects
   * escape_mode, so an escaped character is never an end of line.
   * <p>
   * This method has side effects: an escaped LF increases the line
   * count, and the end of input (-1) sets the end of file marker.
   *
   * @param character the character to check for end of line.
   * @return true, if the given character is seen as a end of line
   * character.
   */
  protected boolean isEndOfLine(int character)
  {
    if (escapeMode_)
    {
      if (character == '\n') // add line count, even if in escape mode!
      {
        lineCount_++;
      }
      return false;
    }
    if (character == -1)
    {
      eofReached_ = true;
    }
    return (character == '\n') || (character == '\r') || (character == -1);
  }

  //----------------------------------------------------------------------
  /**
   * Closes the tokenizer (and the reader it uses internally).
   *
   * @exception IOException if an error occurred.
   */
  public void close()
    throws IOException
  {
    reader_.close();
  }

  //----------------------------------------------------------------------
  /**
   * Reads and returns the next character from the reader and checks for
   * the escape character. If an escape character is read, the
   * character following it is returned and flagged as escaped (the
   * flag is cleared by the next call). A newline (LF or CR/LF)
   * following the escape character is ignored and only increases the
   * line count. CR characters are always skipped.
   * <p>
   * The end of the input is never flagged as escaped: if the input
   * ends within an escape sequence, the end of file marker is set and
   * -1 is returned.
   *
   * @return the next character, or -1 at the end of the input.
   * @exception IOException if an error occurred.
   */
  protected int readNextChar()
    throws IOException
  {
    int next_char = reader_.read();
    if (escapeMode_)
    {
      // the previously returned character was the escaped one,
      // this one is a normal character again:
      escapeMode_ = false;
    }
    else if (isEscapeChar(next_char))
    {
      // ignore escape char itself:
      next_char = reader_.read();

      // a CR in front of the newline belongs to the line ending:
      while (next_char == '\r')
      {
        next_char = reader_.read();
      }

      // check for newline and ignore it:
      if (next_char == '\n')
      {
        lineCount_++;
        next_char = reader_.read();
        // ignore CR:
        while (next_char == '\r')
        {
          next_char = reader_.read();
        }
      }

      if (next_char == -1)
      {
        // nothing left to escape; otherwise (char)-1 would end up in a word
        eofReached_ = true;
      }
      else
      {
        escapeMode_ = true;
      }
    }
    // ignore CR:
    if (next_char == '\r')
    {
      next_char = readNextChar();
    }
    return next_char;
  }

  //----------------------------------------------------------------------
  /**
   * Returns the next token from the reader. The token's value may be
   * WORD, QUOTED_WORD, EOF, EOL, DELIMITER or ERROR. In the case of
   * WORD or QUOTED_WORD the actual word can be obtained by the use of
   * the getWord method. ERROR is returned for a quoted word that is
   * not closed before the end of the line or the end of the input.
   * <p>
   * If the input ends directly after a delimiter, an end of line or a
   * quoted word, one EOL (or DELIMITER, if end of line is not
   * significant) token is returned before the EOF token.
   *
   * @return the next token.
   * @exception IOException if an error occurred.
   */
  public int nextToken()
    throws IOException
  {
    buffer_.setLength(0);

    int next_char = readNextChar();

    // handle EOF (detected by a previous token or a dangling escape):
    if (eofReached_)
    {
      lastToken_ = EOF;
      return EOF;
    }

    // handle EOL:
    if (isEndOfLine(next_char))
    {
      lineCount_++;
      lastToken_ = eolIsSignificant_ ? EOL : DELIMITER;
      return lastToken_;
    }

    // handle DELIMITER
    if (isDelimiter(next_char))
    {
      lastToken_ = DELIMITER;
      return DELIMITER;
    }

    // handle quoted words:
    if (isQuoteChar(next_char))
    {
      while (true)
      {
        next_char = readNextChar();
        if (isEndOfLine(next_char))
        {
          // quote was not closed on this line
          lastToken_ = ERROR;
          return ERROR;
        }
        if (isQuoteChar(next_char))
        {
          lastToken_ = QUOTED_WORD;
          return QUOTED_WORD;
        }
        // no special char, then append to buffer:
        buffer_.append((char) next_char);
      }
    }

    // handle 'normal' words:
    while (true)
    {
      buffer_.append((char) next_char);
      next_char = readNextChar();
      if (isDelimiter(next_char) || isEndOfLine(next_char))
      {
        // the terminating character is a token of its own; the end of
        // the input must not be pushed back (it would turn into the
        // character 0xFFFF), it is remembered in eofReached_ instead.
        if (next_char != -1)
        {
          reader_.unread(next_char);
        }
        lastToken_ = WORD;
        return WORD;
      }
    }
  }

  //----------------------------------------------------------------------
  /**
   * Returns true, if the tokenizer can return another line. The input
   * is only inspected if no token was read so far or if the last token
   * was an EOL; after any other token except EOF this method returns
   * true without looking at the input.
   *
   * @return true, if the tokenizer can return another line.
   * @exception IOException if an error occurred.
   */
  public boolean hasNextLine()
    throws IOException
  {
    if (lastToken_ == EOF)
    {
      return false;
    }

    if ((lastToken_ == EOL) || (lastToken_ == NOT_STARTED))
    {
      // Peek at the raw input (skipping CRs, which are ignored anyway).
      // readNextChar() must not be used here: it would consume an
      // escape character, so the escape would be lost when the
      // character is pushed back.
      int next_char = reader_.read();
      while (next_char == '\r')
      {
        next_char = reader_.read();
      }
      if (next_char == -1)
      {
        return false;
      }
      reader_.unread(next_char);
    }
    return true;
  }

  //----------------------------------------------------------------------
  /**
   * Returns a list of elements (Strings) from the next line of the
   * tokenizer. If there are multiple delimiters without any values in
   * between, empty (zero length) strings are added to the list. They
   * may be removed by the use of the {@link
   * #removeZeroLengthElements(List)} method.
   * <p>
   * Tokens other than WORD, QUOTED_WORD, DELIMITER, EOL and EOF (i.e.
   * ERROR tokens) are reported on <code>System.err</code> and skipped.
   *
   * @return a list of elements (Strings) from the next line of the
   * tokenizer.
   * @exception IOException if an error occurred.
   */
  public List<String> nextLine()
    throws IOException
  {
    List<String> list = new ArrayList<>();
    String word = "";
    while (true)
    {
      int token = nextToken();
      switch (token)
      {
        case Tokenizer.WORD:
        case Tokenizer.QUOTED_WORD:
          word = getWord();
          break;
        case Tokenizer.DELIMITER:
          list.add(word);
          word = "";
          break;
        case Tokenizer.EOL:
        case Tokenizer.EOF:
          list.add(word);
          return list;
        default:
          System.err.println("Unknown Token: "+token);
      }
    }
  }

  //----------------------------------------------------------------------
  /**
   * This helper method removes all zero length elements from the given
   * list and returns it. The given list is not changed!
   *
   * @param list the list of String objects to remove the zero elements from.
   * @return a copy of the given list where all zero length elements are removed.
   */
  public static List<String> removeZeroLengthElements(List<String> list)
  {
    return removeZeroLengthElements(list, false);
  }

  //----------------------------------------------------------------------
  /**
   * This helper method trims all elements and removes all zero length
   * (length is taken after trimming leading and trailing spaces) elements from the given
   * list and returns it. This method copies the (trimmed and) non-zero elements to a
   * new list; the given list is not changed.
   *
   * @param list the list of String objects to remove the zero elements from.
   * @param trim if set to <code>true</code>, all leading and trailing spaces are removed from
   * the elements. This is done, before the length is compared to zero (and the element
   * may be removed if the length is zero). If set to <code>true</code>, elements
   * that only consist of spaces are removed as well!
   * @return a new list where all zero length elements are removed.
   */
  public static List<String> removeZeroLengthElements(List<String> list, boolean trim)
  {
    List<String> new_list = new ArrayList<>();
    Iterator<String> iterator = list.iterator();
    while (iterator.hasNext())
    {
      String value = iterator.next();
      if (trim)
      {
        value = value.trim();
      }
      if (value.length() != 0)
      {
        new_list.add(value);
      }
    }
    return new_list;
  }
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: osm/applications/editors/josm/plugins/surveyor/src/org/dinopolis/util/io/Tokenizer.java@ 34591

Download in other formats: