source: osm/applications/editors/josm/plugins/surveyor/src/org/dinopolis/util/io/Tokenizer.java@ 34591

Last change on this file since 34591 was 34591, checked in by donvip, 6 years ago

rename packages, fix warnings

  • Property svn:eol-style set to native
File size: 22.8 KB
Line 
/***********************************************************************
 * @(#)$RCSfile: Tokenizer.java,v $ $Revision: 1.6 $$Date: 2006/04/21 14:14:56 $
 *
 * Copyright (c) Christof Dallermassl
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License (LGPL)
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ***********************************************************************/
21
22package org.dinopolis.util.io;
23
24import java.io.IOException;
25import java.io.PushbackReader;
26import java.io.Reader;
27import java.io.StringReader;
28import java.util.ArrayList;
29import java.util.Iterator;
30import java.util.List;
31
/**
 * This tokenizer merges the benefits of the java.util.StringTokenizer
 * class and the java.io.StreamTokenizer class. It provides a low
 * level and a high level interface.
 * <p>
 * The low level interface consists of the method pair
 * {@link #nextToken()} and {@link #getWord()}: the first returns the
 * type of the token that was read, the latter returns the text of a
 * WORD or QUOTED_WORD token.
 * <p>
 * The high level interface consists of the methods
 * {@link #hasNextLine()} and {@link #nextLine()}. They use the low
 * level interface to parse the data line by line and create a list of
 * strings from it. Mixing both interfaces on one tokenizer is not
 * recommended.
 * <p>
 * An example for the high level interface:
 * <pre>
 * try
 * {
 *   // tokenizing a string, no escape characters, but quoted words:
 *   Tokenizer tokenizer = new Tokenizer("text,,,\"another,text\"");
 *   List&lt;String&gt; tokens;
 *   while(tokenizer.hasNextLine())
 *   {
 *     tokens = tokenizer.nextLine();
 *     System.out.println(tokens.get(0)); // prints 'text'
 *     System.out.println(tokens.get(1)); // prints ''
 *     System.out.println(tokens.get(2)); // prints ''
 *     System.out.println(tokens.get(3)); // prints 'another,text'
 *   }
 *
 *   // tokenizing a string, using escape characters and quoted words:
 *   tokenizer = new Tokenizer("text,text with\\,comma,,\"another,text\"");
 *   tokenizer.respectEscapedCharacters(true);
 *   while(tokenizer.hasNextLine())
 *   {
 *     tokens = tokenizer.nextLine();
 *     System.out.println(tokens.get(0)); // prints 'text'
 *     System.out.println(tokens.get(1)); // prints 'text with,comma'
 *     System.out.println(tokens.get(2)); // prints ''
 *     System.out.println(tokens.get(3)); // prints 'another,text'
 *   }
 * }
 * catch(IOException ioe)
 * {
 *   ioe.printStackTrace();
 * }
 * </pre>
 * <p>
 * Unlike the StreamTokenizer, this class returns the delimiters as
 * tokens and therefore may be used to tokenize e.g. comma separated
 * files with empty fields (the StreamTokenizer handles multiple
 * delimiters in a row like one delimiter).
 * <p>
 * The tokenizer respects quoted words, so a delimiter is ignored if
 * it is inside quotes. If enabled, it also handles escaped characters
 * (like an escaped quote character, or an escaped new line). So the
 * line <code>eric,"he said, \"great!\""</code> returns
 * <code>eric</code> and <code>he said, "great!"</code> as words.
 * <p>
 * Low level interface: to get empty columns as well, react on every
 * DELIMITER, EOL and EOF token; to treat multiple delimiters in a row
 * as one delimiter, react only on WORD tokens. Example:
 * <pre>
 * Tokenizer tokenizer = new Tokenizer("text,,,another text");
 * String word = "";
 * int token;
 * while((token = tokenizer.nextToken()) != Tokenizer.EOF)
 * {
 *   switch(token)
 *   {
 *     case Tokenizer.EOL:
 *       System.out.println("word: "+word);
 *       word = "";
 *       System.out.println("-------------");
 *       break;
 *     case Tokenizer.WORD:
 *       word = tokenizer.getWord();
 *       break;
 *     case Tokenizer.QUOTED_WORD:
 *       word = tokenizer.getWord() + " (quoted)";
 *       break;
 *     case Tokenizer.DELIMITER:
 *       System.out.println("word: "+word);
 *       word = "";
 *       break;
 *     default:
 *       System.err.println("Unknown Token: "+token);
 *   }
 * }
 * // a word that is directly followed by the end of the input is not
 * // followed by an EOL token, so it has to be handled here:
 * System.out.println("word: "+word);
 * </pre>
 * <p>
 * This tokenizer uses the LF character as end of line character. It
 * ignores any CR characters, so it can be used in windows
 * environments as well.
 * <p>
 * Instances keep mutable parsing state and are not synchronized.
 *
 * @author Christof Dallermassl
 * @version $Revision: 1.6 $
 */
public class Tokenizer
{
  /** end of file token */
  public static final int EOF = -1;
  /** end of line token */
  public static final int EOL = 0;
  /** word token */
  public static final int WORD = 1;
  /** quoted word token */
  public static final int QUOTED_WORD = 2;
  /** delimiter token */
  public static final int DELIMITER = 3;
  /** error token (a quoted word was not closed before the end of the line or input) */
  public static final int ERROR = 4;
  /** value of the last token as long as nextToken() was not called */
  public static final int NOT_STARTED = 5;

  /** the reader to read from */
  protected PushbackReader reader_;
  /** the buffer to create the tokens */
  protected StringBuffer buffer_;
  /** all characters in this string are used as delimiters */
  protected String delimiters_ = ",";
  /** the escape character */
  protected int escapeChar_ = '\\';
  /** the quote character */
  protected int quoteChar_ = '"';

  /** if true, the character that was read last is an escaped character */
  protected boolean escapeMode_ = false;

  /** if true, end of line is respected */
  protected boolean eolIsSignificant_ = true;
  /** if true, escape characters are respected */
  protected boolean respectEscapedChars_ = false;
  /** if true, quoted words are respected */
  protected boolean respectQuotedWords_ = true;

  /** line count */
  protected int lineCount_ = 1;

  /** end of file marker */
  protected boolean eofReached_ = false;

  /** the last token that was found */
  protected int lastToken_ = NOT_STARTED;

  /**
   * Creates a tokenizer that reads from the given string. It uses the
   * comma as delimiter, does not respect escape characters but respects
   * quoted words.
   *
   * @param string the string to read from.
   */
  public Tokenizer(String string)
  {
    this(new StringReader(string));
  }

  /**
   * Creates a tokenizer that reads from the given string. All
   * characters in the given delimiters string are used as
   * delimiter. The tokenizer does not respect escape characters but
   * respects quoted words.
   *
   * @param string the string to read from.
   * @param delimiters the delimiters to use.
   */
  public Tokenizer(String string, String delimiters)
  {
    this(new StringReader(string));
    setDelimiters(delimiters);
  }

  /**
   * Creates a tokenizer that reads from the given reader. It uses the
   * comma as delimiter, does not respect escape characters but respects
   * quoted words.
   *
   * @param reader the reader to read from.
   */
  public Tokenizer(Reader reader)
  {
    // a pushback buffer is needed, as the character that terminates a
    // word is looked at and then handed back to the reader
    reader_ = new PushbackReader(reader, 2);
    buffer_ = new StringBuffer();
  }

  /**
   * Set the delimiter character. The default is the comma.
   *
   * @param delimiterChar the delimiter character.
   */
  public void setDelimiter(int delimiterChar)
  {
    delimiters_ = String.valueOf((char)delimiterChar);
  }

  /**
   * Get the first delimiter character.
   *
   * @return the delimiter character.
   * @deprecated use the getDelimiters() method now
   */
  @Deprecated
  public int getDelimiter()
  {
    return(delimiters_.charAt(0));
  }

  /**
   * Set the delimiter characters. All characters in the delimiters are
   * used as delimiter.
   *
   * @param delimiters the delimiter characters.
   */
  public void setDelimiters(String delimiters)
  {
    delimiters_ = delimiters;
  }

  /**
   * Get the delimiter characters.
   *
   * @return a string holding all delimiter characters.
   */
  public String getDelimiters()
  {
    return(delimiters_);
  }

  /**
   * Set the escape character. The default is the backslash.
   *
   * @param escapeChar the escape character.
   */
  public void setEscapeChar(int escapeChar)
  {
    escapeChar_ = escapeChar;
  }

  /**
   * Get the escape character.
   *
   * @return the escape character.
   */
  public int getEscapeChar()
  {
    return(escapeChar_);
  }

  /**
   * If escape characters should be respected, set the param to
   * <code>true</code>. The default is to ignore escape characters.
   *
   * @param respectEscaped If escape characters should be respected,
   * set the param to <code>true</code>.
   */
  public void respectEscapedCharacters(boolean respectEscaped)
  {
    respectEscapedChars_ = respectEscaped;
  }

  /**
   * Returns <code>true</code>, if escape character is respected.
   *
   * @return <code>true</code>, if escape character is respected.
   */
  public boolean respectEscapedCharacters()
  {
    return(respectEscapedChars_);
  }

  /**
   * Get the quote character.
   *
   * @return the quote character.
   */
  public int getQuoteChar()
  {
    return(quoteChar_);
  }

  /**
   * Set the quote character. The default is the double quote.
   *
   * @param quoteChar the quote character.
   */
  public void setQuoteChar(int quoteChar)
  {
    quoteChar_ = quoteChar;
  }

  /**
   * If quoted words should be respected, set the param to
   * <code>true</code>. The default is to respect quoted words.
   *
   * @param respectQuotes If quoted words should be respected,
   * set the param to <code>true</code>.
   */
  public void respectQuotedWords(boolean respectQuotes)
  {
    respectQuotedWords_ = respectQuotes;
  }

  /**
   * Returns <code>true</code>, if quoted words are respected.
   *
   * @return <code>true</code>, if quoted words are respected.
   */
  public boolean respectQuotedWords()
  {
    return(respectQuotedWords_);
  }

  /**
   * If set to <code>true</code> the end of line is signaled by the EOL
   * token. If set to <code>false</code> end of line is treated as a
   * normal delimiter. The default value is true.
   *
   * @param significant if the end of line is treated as a special token
   * or as a delimiter.
   */
  public void eolIsSignificant(boolean significant)
  {
    eolIsSignificant_ = significant;
  }

  /**
   * Returns <code>true</code>, if in case of an end of line detected,
   * an EOL token is returned. If <code>false</code>, the end of line is
   * treated as a normal delimiter.
   *
   * @return <code>true</code>, if an end of line results in an EOL token.
   */
  public boolean isEolSignificant()
  {
    return(eolIsSignificant_);
  }

  /**
   * Returns the current line number of the reader. Counting starts
   * at 1.
   *
   * @return the current line number of the reader.
   */
  public int getLineNumber()
  {
    return(lineCount_);
  }

  /**
   * Returns the value of the token. If the token was of the type WORD
   * or QUOTED_WORD, the word is returned. The value is only valid until
   * the next call of nextToken().
   *
   * @return the value of the token.
   */
  public String getWord()
  {
    return(buffer_.toString());
  }

  /**
   * Returns the last token that was returned from the nextToken() method.
   *
   * @return the last token, or NOT_STARTED if nextToken() was not
   * called yet.
   */
  public int getLastToken()
  {
    return(lastToken_);
  }

  /**
   * Returns true, if the given character is seen as a delimiter. This
   * method respects the escape mode: an escaped character is never a
   * delimiter.
   *
   * @param character the character to check for delimiter
   * @return true, if the given character is seen as a delimiter.
   */
  protected boolean isDelimiter(int character)
  {
    return(!escapeMode_ && (delimiters_.indexOf(character) >= 0));
  }

  /**
   * Returns true, if the given character is seen as a quote
   * character. This is never the case if quoted words are not
   * respected or if the character is escaped.
   *
   * @param character the character to check for quote.
   * @return true, if the given character is seen as a quote character.
   */
  protected boolean isQuoteChar(int character)
  {
    return(respectQuotedWords_ && !escapeMode_ && (character == quoteChar_));
  }

  /**
   * Returns true, if the given character is seen as an escape
   * character. This is never the case if escape characters are not
   * respected or if the character is escaped itself.
   *
   * @param character the character to check for escape character.
   * @return true, if the given character is seen as an escape character.
   */
  protected boolean isEscapeChar(int character)
  {
    return(respectEscapedChars_ && !escapeMode_ && (character == escapeChar_));
  }

  /**
   * Returns true, if the given character ends a line: LF, CR or the
   * end of the input (-1). An escaped character never ends a line.
   * <p>
   * This method has side effects: an escaped LF increments the line
   * count (the unescaped LF is counted by the caller), and an
   * unescaped -1 sets the end of file marker.
   *
   * @param character the character to check for end of line.
   * @return true, if the given character is seen as an end of line
   * character.
   */
  protected boolean isEndOfLine(int character)
  {
    if(escapeMode_)
    {
      if(character == '\n') // add line count, even if in escape mode!
        lineCount_++;
      return(false);
    }
    if(character == -1)
      eofReached_ = true;

    return((character == '\n') || (character == '\r') || (character == -1));
  }

  /**
   * Closes the tokenizer (and the reader it uses internally).
   *
   * @exception IOException if an error occurred.
   */
  public void close()
    throws IOException
  {
    reader_.close();
  }

  /**
   * Reads and returns the next character from the reader and checks for
   * the escape character. CR characters are always skipped. If an
   * escape character is read, it is dropped, the character following it
   * is returned and the escape mode is set for exactly this character.
   * A line break directly following the escape character is counted and
   * dropped, so the escape applies to the character after the line
   * break. If the input ends directly after the escape character, the
   * end of file marker is set and -1 is returned unescaped.
   *
   * @return the next character or -1 at the end of the input.
   * @exception IOException if an error occurred.
   */
  protected int readNextChar()
    throws IOException
  {
    // the escape mode only covers the one character that was returned
    // by the previous call, so every character is checked for being
    // the escape character again:
    escapeMode_ = false;

    int next_char = readIgnoringCarriageReturns();
    if(!isEscapeChar(next_char))
      return(next_char);

    // ignore the escape char itself and look at what it escapes:
    next_char = readIgnoringCarriageReturns();
    if(next_char == '\n')
    {
      // escaped line break (LF as well as CR LF): count and ignore it
      lineCount_++;
      next_char = readIgnoringCarriageReturns();
    }
    if(next_char == -1)
    {
      // nothing left to escape; must not be treated as a character
      eofReached_ = true;
      return(next_char);
    }
    escapeMode_ = true;
    return(next_char);
  }

  /**
   * Reads the next character from the reader without any escape
   * handling and skips all CR characters.
   *
   * @return the next character that is not a CR, or -1 at the end of
   * the input.
   * @exception IOException if an error occurred.
   */
  private int readIgnoringCarriageReturns()
    throws IOException
  {
    int next_char = reader_.read();
    while(next_char == '\r')
      next_char = reader_.read();
    return(next_char);
  }

  /**
   * Stores the given token as last token and returns it.
   *
   * @param token the token to store.
   * @return the given token.
   */
  private int remember(int token)
  {
    lastToken_ = token;
    return(token);
  }

  /**
   * Returns the next token from the reader. The token's value may be
   * WORD, QUOTED_WORD, EOF, EOL, DELIMITER, or ERROR (a quoted word that
   * is not closed before the end of the line or the input). In the case
   * of WORD or QUOTED_WORD the actual word can be obtained by the use of
   * the getWord method.
   * <p>
   * If the end of the input is found where a token would start, it is
   * reported as end of line once (EOL, or DELIMITER if the end of line
   * is not significant) and as EOF from the following call on. If the
   * end of the input terminates a word, the next token is EOF.
   *
   * @return the next token.
   * @exception IOException if an error occurred.
   */
  public int nextToken()
    throws IOException
  {
    buffer_.setLength(0);

    // handle EOF that was detected by a previous call:
    if(eofReached_)
      return(remember(EOF));

    int next_char = readNextChar();

    // handle EOF that was detected directly after an escape character:
    if(eofReached_)
      return(remember(EOF));

    // handle EOL:
    if(isEndOfLine(next_char))
    {
      lineCount_++;
      return(remember(eolIsSignificant_ ? EOL : DELIMITER));
    }

    // handle DELIMITER:
    if(isDelimiter(next_char))
      return(remember(DELIMITER));

    // handle quoted words:
    if(isQuoteChar(next_char))
      return(remember(readQuotedWord()));

    // handle 'normal' words:
    return(remember(readPlainWord(next_char)));
  }

  /**
   * Reads the rest of a quoted word (the opening quote was already
   * read) into the buffer. The closing quote is consumed.
   *
   * @return QUOTED_WORD, or ERROR if the line or the input ended before
   * the closing quote was found.
   * @exception IOException if an error occurred.
   */
  private int readQuotedWord()
    throws IOException
  {
    while(true)
    {
      int next_char = readNextChar();
      if(isEndOfLine(next_char))
      {
        // hand the line break back, so it is reported (and counted) as
        // end of line by the next call and the lines do not get merged
        if(next_char != -1)
          reader_.unread(next_char);
        return(ERROR);
      }
      if(isQuoteChar(next_char))
        return(QUOTED_WORD);

      // no special char, then append to buffer:
      buffer_.append((char)next_char);
    }
  }

  /**
   * Reads an unquoted word into the buffer. The character that
   * terminates the word (delimiter or line break) is handed back to the
   * reader.
   *
   * @param first_char the first character of the word (already read).
   * @return WORD
   * @exception IOException if an error occurred.
   */
  private int readPlainWord(int first_char)
    throws IOException
  {
    int next_char = first_char;
    do
    {
      buffer_.append((char)next_char);
      next_char = readNextChar();
    }
    while(!isDelimiter(next_char) && !isEndOfLine(next_char));

    // the end of the input is remembered in the end of file marker and
    // must not be pushed back as if it was a character
    if(next_char != -1)
      reader_.unread(next_char);
    return(WORD);
  }

  /**
   * Returns true, if the tokenizer can return another line.
   * <p>
   * At the start of a line, the next character is only peeked at
   * (skipping CR characters) without any escape handling, so an escape
   * sequence at the start of the line is left untouched for nextLine().
   *
   * @return true, if the tokenizer can return another line.
   * @exception IOException if an error occurred.
   */
  public boolean hasNextLine()
    throws IOException
  {
    if(lastToken_ == EOF)
      return(false);

    if((lastToken_ == EOL) || (lastToken_ == NOT_STARTED))
    {
      int next_char = readIgnoringCarriageReturns();
      if(next_char == -1)
        return(false);

      reader_.unread(next_char);
    }
    return(true);
  }

  /**
   * Returns a list of elements (Strings) from the next line of the
   * tokenizer. If there are multiple delimiters without any values in
   * between, empty (zero length) strings are added to the list. They
   * may be removed by the use of the {@link
   * #removeZeroLengthElements(List)} method.
   * <p>
   * The list always holds at least one element. ERROR tokens are
   * reported on System.err and otherwise ignored.
   *
   * @return a list of elements (Strings) from the next line of the
   * tokenizer.
   * @exception IOException if an error occurred.
   */
  public List<String> nextLine()
    throws IOException
  {
    List<String> list = new ArrayList<>();
    String word = "";
    while(true)
    {
      int token = nextToken();
      switch(token)
      {
        case Tokenizer.WORD:
        case Tokenizer.QUOTED_WORD:
          word = getWord();
          break;
        case Tokenizer.DELIMITER:
          list.add(word);
          word = "";
          break;
        case Tokenizer.EOL:
        case Tokenizer.EOF:
          list.add(word);
          return(list);
        default:
          System.err.println("Unknown Token: "+token);
      }
    }
  }

  /**
   * This helper method removes all zero length elements from the given
   * list and returns the result. The given list is not changed!
   *
   * @param list the list of String objects to remove the zero elements from.
   * @return a copy of the given list where all zero length elements are removed.
   */
  public static List<String> removeZeroLengthElements(List<String> list)
  {
    return removeZeroLengthElements(list, false);
  }

  /**
   * This helper method optionally trims all elements and removes all
   * zero length elements (length is taken after trimming) from the
   * given list. The remaining elements are copied to a new list, the
   * given list is not changed.
   *
   * @param list the list of String objects to remove the zero elements from.
   * @param trim if set to <code>true</code>, all leading and trailing
   * spaces are removed from the elements before the length is compared
   * to zero, so elements that only consist of spaces are removed as
   * well.
   * @return a new list holding the (trimmed) elements that are not of
   * zero length.
   */
  public static List<String> removeZeroLengthElements(List<String> list, boolean trim)
  {
    List<String> new_list = new ArrayList<>();
    for(String element : list)
    {
      String value = trim ? element.trim() : element;
      if(value.length() != 0)
        new_list.add(value);
    }
    return(new_list);
  }
}
Note: See TracBrowser for help on using the repository browser.