Context Navigation

source: osm/applications/editors/josm/plugins/surveyor/src/org/dinopolis/util/io/Tokenizer.java@ 13497

Last change on this file since 13497 was 13497, checked in by skela, 16 years ago
applications/editors/josm: Set svn:eol-style native on all *.java files in plugins. Normalize the eol-style in plugins/lakewalker/src/org/openstreetmap/josm/plugins/lakewalker/StringEnumConfigurer.java.
Property svn:eol-style set to `native`
File size: 27.4 KB

Rev	Line
[13497]	1	/***********************************************************************
	2	* @(#)$RCSfile: Tokenizer.java,v $ $Revision: 1.6 $$Date: 2006/04/21 14:14:56 $
	3	*
	4	* Copyright (c) Christof Dallermassl
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU Lesser General Public License (LGPL)
	8	* as published by the Free Software Foundation; either version 2.1 of
	9	* the License, or (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with this program; if not, write to the
	18	* Free Software Foundation, Inc.,
	19	* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	20	***********************************************************************/
	21
	22	package org.dinopolis.util.io;
	23
	24	import java.io.IOException;
	25	import java.io.InputStream;
	26	import java.io.InputStreamReader;
	27	import java.io.PushbackReader;
	28	import java.io.Reader;
	29	import java.io.StringReader;
	30	import java.util.ArrayList;
	31	import java.util.Iterator;
	32	import java.util.List;
	33
	34	//----------------------------------------------------------------------
	35	/**
	36
	37	* This tokenizer merges the benefits of the java.lang.StringTokenizer
	38	* class and the java.io.StreamTokenizer class. It provides a low
	39	* level and a high level interface to the tokenizer. The low level
	40	* interface consists of the method pair nextToken() and getWord(),
	41	* where the first returns the type of token in the parsing process,
	42	* and the latter returns the String element itself.
	43	* <p>
	44	* The high level interface consists of the methods hasNextLine() and
	45	* nextLine(). They use the low level interface to parse the data line
	46	* by line and create a list of strings from it.
	47	* <p>
	48	* It is unclear whether it is wise to mix the usage of the high and
	49	* the low level interface on the same tokenizer. For normal usage,
	50	* the high level interface should be more comfortable to use and
	51	* has no drawbacks.
	52	* <p>
	53
	54	* An example for the high level interface:
	55	* <pre>
	56	* try
	57	* {
	58	* // simple example, tokenizing string, no escape, but quoted
	59	* // works:
	60	* System.out.println("example 1");
	61	* Tokenizer tokenizer = new Tokenizer("text,,,\"another,text\"");
	62	* List tokens;
	63	* while(tokenizer.hasNextLine())
	64	* {
	65	* tokens = tokenizer.nextLine();
	66	* System.out.println(tokens.get(0)); // prints 'text'
	67	* System.out.println(tokens.get(1)); // prints ''
	68	* System.out.println(tokens.get(2)); // prints ''
	69	* System.out.println(tokens.get(3)); // prints 'another,text'
	70	* }
	71	*
	72	* System.out.println("example 2");
	73	* // simple example, tokenizing string, using escape char and
	74	* // quoted strings:
	75	* tokenizer = new Tokenizer("text,text with\\,comma,,\"another,text\"");
	76	* tokenizer.respectEscapedCharacters(true);
	77	* while(tokenizer.hasNextLine())
	78	* {
	79	* tokens = tokenizer.nextLine();
	80	* System.out.println(tokens.get(0)); // prints 'text'
	81	* System.out.println(tokens.get(1)); // prints 'text with,comma'
	82	* System.out.println(tokens.get(2)); // prints ''
	83	* System.out.println(tokens.get(3)); // prints 'another,text'
	84	* }
	85	* }
	86	* catch(Exception ioe)
	87	* {
	88	* ioe.printStackTrace();
	89	* }
	90	* </pre>
	91	* <p>
	92	* The advantages compared to the StreamTokenizer class are: Unlike
	93	* the StreamTokenizer, this Tokenizer class returns the delimiters as
	94	* tokens and therefore may be used to tokenize e.g. comma separated
	95	* files with empty fields (the StreamTokenizer handles multiple
	96	* delimiters in a row like one delimiter).
	97	* <p>
	98	* The tokenizer respects quoted words, so the delimiter is ignored if
	99	* inside quotes. And it may handle escaped characters (like an
	100	* escaped quote character, or an escaped new line). So the line
	101	* <code>eric,"he said, \"great!\""</code> returns <code>eric</code>
	102	* and <code>he said, "great!"</code> as words.
	103	* <p>
	104	* Low level interface: The design of the Tokenizer allows to get
	105	* empty columns as well as treat multiple delimiters in a row as one
	106	* delimiter. For the first approach trigger the values on every
	107	* DELIMITER and EOF token whereas for the second, trigger only on
	108	* WORD tokens.
	109	* <p>
	110	* If one wants to be informed about empty words as well, use the
	111	* Tokenizer like in the following code fragment:
	112	* <pre>
	113	* Tokenizer tokenizer = new Tokenizer("text,,,another text");
	114	* String word = "";
	115	* int token;
	116	* while((token = tokenizer.nextToken()) != Tokenizer.EOF)
	117	* {
	118	* switch(token)
	119	* {
	120	* case Tokenizer.EOL:
	121	* System.out.println("word: "+word);
	122	* word = "";
	123	* System.out.println("-------------");
	124	* break;
	125	* case Tokenizer.WORD:
	126	* word = tokenizer.getWord();
	127	* break;
	128	* case Tokenizer.QUOTED_WORD:
	129	* word = tokenizer.getWord() + " (quoted)";
	130	* break;
	131	* case Tokenizer.DELIMITER:
	132	* System.out.println("word: "+word);
	133	* word = "";
	134	* break;
	135	* default:
	136	* System.err.println("Unknown Token: "+token);
	137	* }
	138	* }
	139	* </pre>
	140	* In this example, if the delimiter is set to a comma, a line like
	141	* <code>column1,,,"column4,partofcolumn4"</code> would be treated correctly.
	142	* <p>
	143	* This tokenizer uses the LF character as the end of line character. It
	144	* ignores any CR characters, so it can be used in windows
	145	* environments as well.
	146	*
	147	* @author Christof Dallermassl
	148	* @version $Revision: 1.6 $
	149	*/
	150
	151	public class Tokenizer
	152	{
	153	/** the reader to read from */
	154	protected PushbackReader reader_;
	155	/** the buffer to create the tokens */
	156	protected StringBuffer buffer_;
	157	/** all characters in this string are used as delimiters */
	158	protected String delimiters_ = ",";
	159	/** the escape character */
	160	protected int escapeChar_ = '\\';
	161	/** the quote character */
	162	protected int quoteChar_ = '"';
	163
	164	/** if true, characters are treated as escaped */
	165	protected boolean escapeMode_ = false;
	166
	167	/** if true, end of line is respected */
	168	protected boolean eolIsSignificant_ = true;
	169	/** if true, escape characters are respected */
	170	protected boolean respectEscapedChars_ = false;
	171	/** if true, quoted words are respected */
	172	protected boolean respectQuotedWords_ = true;
	173
	174	/** line count */
	175	protected int lineCount_ = 1;
	176
	177	/** end of file marker */
	178	protected boolean eofReached_ = false;
	179
	180	/** the last token that was found */
	181	protected int lastToken_ = NOT_STARTED;
	182
	183	/** end of file token */
	184	public static final int EOF = -1;
	185	/** end of line token */
	186	public static final int EOL = 0;
	187	/** word token */
	188	public static final int WORD = 1;
	189	/** quoted word token */
	190	public static final int QUOTED_WORD = 2;
	191	/** delimiter token */
	192	public static final int DELIMITER = 3;
	193	/** error token */
	194	public static final int ERROR = 4;
	195	/** not started token */
	196	public static final int NOT_STARTED = 5;
	197
	198
	199	//----------------------------------------------------------------------
	200	/**
	201	* Creates a tokenizer that reads from the given string. It uses the
	202	* comma as delimiter, does not respect escape characters but respects
	203	* quoted words.
	204	*
	205	* @param string the string to read from.
	206	*/
	207	public Tokenizer(String string)
	208	{
	209	this(new StringReader(string));
	210	}
	211
	212	//----------------------------------------------------------------------
	213	/**
	214	* Creates a tokenizer that reads from the given string. All
	215	* characters in the given delimiters string are used as
	216	* delimiter. The tokenizer does not respect escape characters but
	217	* respects quoted words.
	218	*
	219	* @param string the string to read from.
	220	* @param delimiters the delimiters to use.
	221	*/
	222	public Tokenizer(String string, String delimiters)
	223	{
	224	this(new StringReader(string));
	225	setDelimiters(delimiters);
	226	}
	227
	228	//----------------------------------------------------------------------
	229	/**
	230	* Creates a tokenizer that reads from the given string. It uses the
	231	* comma as delimiter, does not respect escape characters but respects
	232	* quoted words.
	233	*
	234	* @param inStream the stream to read from.
	235	*/
	236	public Tokenizer(InputStream inStream)
	237	{
	238	this(new InputStreamReader(inStream));
	239	}
	240
	241	//----------------------------------------------------------------------
	242	/**
	243	* Creates a tokenizer that reads from the given reader. It uses the
	244	* comma as delimiter, does not respect escape characters but respects
	245	* quoted words.
	246	*
	247	* @param reader the reader to read from.
	248	*/
	249	public Tokenizer(Reader reader)
	250	{
	251	reader_ = new PushbackReader(reader,2);
	252	buffer_ = new StringBuffer();
	253	}
	254
	255	//----------------------------------------------------------------------
	256	/**
	257	* Set the delimiter character. The default is the comma.
	258	*
	259	* @param delimiterChar the delimiter character.
	260	*/
	261	public void setDelimiter(int delimiterChar)
	262	{
	263	delimiters_ = new String(new char[]{(char)delimiterChar});
	264	}
	265
	266	//----------------------------------------------------------------------
	267	/**
	268	* Get the first delimiter character.
	269	*
	270	* @return the delimiter character.
	271	* @deprecated use the getDelimiters() method now
	272	*/
	273	public int getDelimiter()
	274	{
	275	return(delimiters_.charAt(0));
	276	}
	277
	278	//----------------------------------------------------------------------
	279	/**
	280	* Set the delimiter characters. All characters in the delimiters are
	281	* used as delimiter.
	282	*
	283	* @param delimiters the delimiter characters.
	284	*/
	285	public void setDelimiters(String delimiters)
	286	{
	287	delimiters_ = delimiters;
	288	}
	289
	290	//----------------------------------------------------------------------
	291	/**
	292	* Get the delimiter character.
	293	*
	294	* @return the delimiter character.
	295	*/
	296	public String getDelimiters()
	297	{
	298	return(delimiters_);
	299	}
	300
	301	//----------------------------------------------------------------------
	302	/**
	303	* Set the escape character. The default is the backslash.
	304	*
	305	* @param escapeChar the escape character.
	306	*/
	307	public void setEscapeChar(int escapeChar)
	308	{
	309	escapeChar_ = escapeChar;
	310	}
	311
	312	//----------------------------------------------------------------------
	313	/**
	314	* Get the escape character.
	315	*
	316	* @return the escape character.
	317	*/
	318	public int getEscapeChar()
	319	{
	320	return(escapeChar_);
	321	}
	322
	323	//----------------------------------------------------------------------
	324	/**
	325	* If escape characters should be respected, set the param to
	326	* <code>true</code>. The default is to ignore escape characters.
	327	*
	328	* @param respectEscaped If escape characters should be respected,
	329	* set the param to <code>true</code>.
	330	*/
	331	public void respectEscapedCharacters(boolean respectEscaped)
	332	{
	333	respectEscapedChars_ = respectEscaped;
	334	}
	335
	336	//----------------------------------------------------------------------
	337	/**
	338	* Returns <code>true</code>, if escape character is respected.
	339	*
	340	* @return <code>true</code>, if escape character is respected.
	341	*/
	342	public boolean respectEscapedCharacters()
	343	{
	344	return(respectEscapedChars_);
	345	}
	346
	347	//----------------------------------------------------------------------
	348	/**
	349	* Get the quote character.
	350	*
	351	* @return the quote character.
	352	*/
	353	public int getQuoteChar()
	354	{
	355	return (quoteChar_);
	356	}
	357
	358	//----------------------------------------------------------------------
	359	/**
	360	* Set the quote character. The default is the double quote.
	361	*
	362	* @param quoteChar the quote character.
	363	*/
	364	public void setQuoteChar(int quoteChar)
	365	{
	366	quoteChar_ = quoteChar;
	367	}
	368
	369	//----------------------------------------------------------------------
	370	/**
	371	* If quoted words should be respected, set the param to
	372	* <code>true</code>. The default is to respect quoted words.
	373	*
	374	* @param respectQuotes If quoted words should be respected,
	375	* set the param to <code>true</code>.
	376	*/
	377	public void respectQuotedWords(boolean respectQuotes)
	378	{
	379	respectQuotedWords_ = respectQuotes;
	380	}
	381
	382	//----------------------------------------------------------------------
	383	/**
	384	* Returns <code>true</code>, if quoted words are respected.
	385	*
	386	* @return <code>true</code>, if quoted words are respected.
	387	*/
	388	public boolean respectQuotedWords()
	389	{
	390	return(respectQuotedWords_);
	391	}
	392
	393	//----------------------------------------------------------------------
	394	/**
	395	* If set to <code>true</code> the end of line is signaled by the EOL
	396	* token. If set to <code>false</code> end of line is treated as a
	397	* normal delimiter. The default value is true;
	398	*
	399	* @param significant if the end of line is treated as a special token
	400	* or as a delimiter.
	401	*/
	402	public void eolIsSignificant(boolean significant)
	403	{
	404	eolIsSignificant_ = significant;
	405	}
	406
	407	//----------------------------------------------------------------------
	408	/**
	409	* Returns <code>true</code>, if in case of an end of line detected,
	410	* an EOL token is returned. If <code>false</code>, the end of line is
	411	* treated as a normal delimiter.
	412	*
	413	* @return <code>true</code>, if in case of an end of line detected,
	414	* an EOL token is returned. If <code>false</code>, the end of line is
	415	* treated as a normal delimiter.
	416	*/
	417	public boolean isEolSignificant()
	418	{
	419	return(eolIsSignificant_);
	420	}
	421
	422
	423	//----------------------------------------------------------------------
	424	/**
	425	* Returns the current line number of the reader.
	426	*
	427	* @return the current line number of the reader.
	428	*/
	429	public int getLineNumber()
	430	{
	431	return(lineCount_);
	432	}
	433
	434	//----------------------------------------------------------------------
	435	/**
	436	* Returns the value of the token. If the token was of the type WORD,
	437	* the word is returned.
	438	*
	439	* @return the value of the token.
	440	*/
	441	public String getWord()
	442	{
	443	return(buffer_.toString());
	444	}
	445
	446	//----------------------------------------------------------------------
	447	/**
	448	* Returns the last token that was returned from the nextToken() method.
	449	*
	450	* @return the last token.
	451	*/
	452	public int getLastToken()
	453	{
	454	return(lastToken_);
	455	}
	456
	457	//----------------------------------------------------------------------
	458	/**
	459	* Returns true, if the given character is seen as a delimiter. This
	460	* method respects escape_mode, so if the escape character was found
	461	* before, it has to act accordingly (usually, return false, even if
	462	* the character is a delimiter).
	463	*
	464	* @param character the character to check for delimiter
	465	* @return true, if the given character is seen as a delimiter.
	466	*/
	467	protected boolean isDelimiter(int character)
	468	{
	469	// check for escape mode:
	470	if(escapeMode_)
	471	return(false);
	472
	473	return(delimiters_.indexOf(character) >= 0);
	474	}
	475
	476	//----------------------------------------------------------------------
	477	/**
	478	* Returns true, if the given character is seen as a quote
	479	* character. This method respects escape_mode, so if the escape
	480	* character was found before, it has to act accordingly (usually,
	481	* return false, even if the character is a quote character).
	482	*
	483	* @param character the character to check for quote.
	484	* @return true, if the given character is seen as a quote character.
	485	*/
	486	protected boolean isQuoteChar(int character)
	487	{
	488	if(!respectQuotedWords_)
	489	return(false);
	490
	491	// check for escape mode:
	492	if(escapeMode_)
	493	return(false);
	494
	495	return(character == quoteChar_);
	496	}
	497
	498	//----------------------------------------------------------------------
	499	/**
	500	* Returns true, if the given character is seen as a escape
	501	* character. This method respects escape_mode, so if the escape
	502	* character was found before, it has to act accordingly (usually,
	503	* return false, even if the character is a escape character).
	504	* @param character the character to check for escape character.
	505	* @return true, if the given character is seen as a escape character.
	506	*/
	507	protected boolean isEscapeChar(int character)
	508	{
	509	if(!respectEscapedChars_)
	510	return(false);
	511
	512	// check for escape mode:
	513	if(escapeMode_)
	514	return(false);
	515
	516	return(character == escapeChar_);
	517	}
	518
	519	//----------------------------------------------------------------------
	520	/**
	521	* Returns true, if the given character is seen as a end of line
	522	* character. This method respects end of line_mode, so if the end of
	523	* line character was found before, it has to act accordingly
	524	* (usually, return false, even if the character is a end of line
	525	* character).
	526	* @param character the character to check for end of line.
	527	* @return true, if the given character is seen as a end of line
	528	* character.
	529	*/
	530	protected boolean isEndOfLine(int character)
	531	{
	532	// check for escape mode:
	533	if(escapeMode_)
	534	{
	535	if(character == '\n') // add line count, even if in escape mode!
	536	lineCount_++;
	537	return(false);
	538	}
	539	if(character == -1)
	540	eofReached_ = true;
	541
	542	return((character=='\n') \|\| (character=='\r') \|\| (character == -1));
	543	}
	544
	545	//----------------------------------------------------------------------
	546	/**
	547	* Closes the tokenizer (and the reader is uses internally).
	548	*
	549	* @exception IOException if an error occured.
	550	*/
	551	public void close()
	552	throws IOException
	553	{
	554	reader_.close();
	555	}
	556
	557	//----------------------------------------------------------------------
	558	/**
	559	* Reads and returns the next character from the reader and checks for
	560	* the escape character. If an escape character is read, a flag is set
	561	* and the next character is read. A newline following the escape
	562	* character is ignored.
	563	*
	564	* @return the next character.
	565	* @exception IOException if an error occured.
	566	*/
	567	protected int readNextChar()
	568	throws IOException
	569	{
	570	int next_char = reader_.read();
	571	if(escapeMode_)
	572	{
	573	escapeMode_ = false;
	574	}
	575	else
	576	{
	577	if(isEscapeChar(next_char))
	578	{
	579	// ignore escape char itself:
	580	next_char = reader_.read();
	581
	582	// check for newline and ignore it:
	583	if(isEndOfLine(next_char))
	584	{
	585	lineCount_++;
	586	next_char = reader_.read();
	587	// ignore CR:
	588	if(next_char == '\r')
	589	{
	590	next_char = readNextChar();
	591	}
	592	}
	593	escapeMode_ = true;
	594	}
	595	}
	596	// ignore CR:
	597	if(next_char == '\r')
	598	{
	599	next_char = readNextChar();
	600	}
	601	return(next_char);
	602	}
	603
	604	//----------------------------------------------------------------------
	605	/**
	606	* Returns the next token from the reader. The token's value may be
	607	* WORD, QUOTED_WORD, EOF, EOL, or DELIMITER. In the case or WORD or
	608	* QUOTED_WORD the actual word can be obtained by the use of the
	609	* getWord method.
	610	*
	611	* @return the next token.
	612	* @exception IOException if an error occured.
	613	*/
	614	public int nextToken()
	615	throws IOException
	616	{
	617	buffer_.setLength(0);
	618
	619	int next_char;
	620	next_char = readNextChar();
	621
	622	// handle EOF:
	623	if(eofReached_)
	624	{
	625	lastToken_ = EOF;
	626	return(EOF);
	627	}
	628
	629	// handle EOL:
	630	if(isEndOfLine(next_char))
	631	{
	632	lineCount_++;
	633	if(eolIsSignificant_)
	634	{
	635	lastToken_ = EOL;
	636	return(EOL);
	637	}
	638	else
	639	{
	640	lastToken_ = DELIMITER;
	641	return(DELIMITER);
	642	}
	643	}
	644
	645	// handle DELIMITER
	646	if(isDelimiter(next_char))
	647	{
	648	lastToken_ = DELIMITER;
	649	return(DELIMITER);
	650	}
	651
	652	// handle quoted words:
	653	if(isQuoteChar(next_char))
	654	{
	655	while(true)
	656	{
	657	next_char = readNextChar();
	658	if(isEndOfLine(next_char))
	659	{
	660	lastToken_ = ERROR;
	661	return(ERROR);
	662	}
	663	else
	664	{
	665	if(isQuoteChar(next_char))
	666	{
	667	lastToken_ = QUOTED_WORD;
	668	return(QUOTED_WORD);
	669	}
	670
	671	// no special char, then append to buffer:
	672	buffer_.append((char)next_char);
	673	}
	674	}
	675	}
	676
	677	// handle 'normal' words:
	678	while(true)
	679	{
	680	buffer_.append((char)next_char);
	681	next_char = readNextChar();
	682	if(isDelimiter(next_char) \|\| isEndOfLine(next_char))
	683	{
	684	reader_.unread(next_char);
	685	lastToken_ = WORD;
	686	return(WORD);
	687	}
	688	}
	689	}
	690
	691	//----------------------------------------------------------------------
	692	/**
	693	* Returns true, if the tokenizer can return another line.
	694	*
	695	* @return true, if the tokenizer can return another line.
	696	* @exception IOException if an error occured.
	697	*/
	698	public boolean hasNextLine()
	699	throws IOException
	700	{
	701	if(lastToken_ == EOF)
	702	return(false);
	703
	704	if((lastToken_ == EOL) \|\| (lastToken_ == NOT_STARTED))
	705	{
	706	int next_char = readNextChar();
	707	if(next_char == -1)
	708	return(false);
	709
	710	reader_.unread(next_char);
	711	}
	712	return(true);
	713	}
	714
	715
	716	//----------------------------------------------------------------------
	717	/**
	718	* Returns a list of elements (Strings) from the next line of the
	719	* tokenizer. If there are multiple delimiters without any values in
	720	* between, empty (zero length) strings are added to the list. They
	721	* may be removed by the use of the {@link
	722	* #removeZeroLengthElements(List)} method.
	723	*
	724	* @return a list of elements (Strings) from the next line of the
	725	* tokenizer.
	726	* @exception IOException if an error occured.
	727	*/
	728	public List<String> nextLine()
	729	throws IOException
	730	{
	731	int token = nextToken();
	732	List<String> list = new ArrayList<String>();
	733	String word = "";
	734	// while(token != Tokenizer.EOF)
	735	while(true)
	736	{
	737	switch(token)
	738	{
	739	case Tokenizer.WORD:
	740	word = getWord();
	741	break;
	742	case Tokenizer.QUOTED_WORD:
	743	word = getWord();
	744	break;
	745	case Tokenizer.DELIMITER:
	746	list.add(word);
	747	word = "";
	748	break;
	749	case Tokenizer.EOL:
	750	case Tokenizer.EOF:
	751	list.add(word);
	752	return(list);
	753	default:
	754	System.err.println("Unknown Token: "+token);
	755	}
	756	token = nextToken();
	757	}
	758	// return(list);
	759	}
	760
	761	//----------------------------------------------------------------------
	762	/**
	763	* This helper method removes all zero length elements from the given
	764	* list and returns it. The given list is not changed!
	765	*
	766	* @param list the list of String objects to remove the zero elements from.
	767	* @return a copy of the given list where all zero length elements are removed.
	768	*/
	769	public static List<String> removeZeroLengthElements(List<String> list)
	770	{
	771	return removeZeroLengthElements(list, false);
	772	}
	773
	774	//----------------------------------------------------------------------
	775	/**
	776	* This helper method trims all elements and removes all zero length
	777	* (length is taken after trimming leading and trailing spaces) elements from the given
	778	* list and returns it. This method copies the (trimmed and) non-zero elements to a
	779	* new list.
	780	*
	781	* @param list the list of String objects to remove the zero elements from.
	782	* @param trim if set to <code>true</code>, all leading and trailing spaces are removed from
	783	* the elements. This is done, before the length is compared to zero (and the element
	784	* may be removed if the length is zero). If set to <code>true</code>, elements
	785	* that only consist of spaces are removed as well!
	786	* @return the list where all zero length elements are remove.
	787	*/
	788	public static List<String> removeZeroLengthElements(List<String> list, boolean trim)
	789	{
	790	Iterator<String> iterator = list.iterator();
	791	String value;
	792	List<String> new_list = new ArrayList<String>();
	793	while(iterator.hasNext())
	794	{
	795	value = iterator.next();
	796	if (trim)
	797	value = value.trim();
	798	if(value.length() != 0)
	799	new_list.add(value);
	800	}
	801	return(new_list);
	802	}
	803
	804	// /**
	805	// * Demonstrates the low level interface.
	806	// * @param args command line arguments.
	807	// */
	808	// protected static void testLowLevel(String[] args)
	809	// {
	810	// try
	811	// {
	812	// String filename;
	813	// if(args.length > 0)
	814	// filename = args[0];
	815	// else
	816	// filename = "/filer/cdaller/tmp/test.csv";
	817	//
	818	// Tokenizer tokenizer = new Tokenizer(new BufferedReader(new FileReader(filename)));
	819	//// Tokenizer tokenizer = new Tokenizer("column1,\"quoted column2\",column3\\, with quoted comma");
	820	// tokenizer.setDelimiter(',');
	821	//// tokenizer.eolIsSignificant(false);
	822	// tokenizer.respectEscapedCharacters(true);
	823	// tokenizer.respectQuotedWords(true);
	824	//
	825	// int token;
	826	// while((token = tokenizer.nextToken()) != Tokenizer.EOF)
	827	// {
	828	// switch(token)
	829	// {
	830	// case Tokenizer.EOL:
	831	// System.out.println("------------- ");
	832	// break;
	833	// case Tokenizer.WORD:
	834	// System.out.println("line" +tokenizer.getLineNumber() +" word: "+tokenizer.getWord());
	835	// break;
	836	// case Tokenizer.QUOTED_WORD:
	837	// System.out.println("line" +tokenizer.getLineNumber() +" quoted word: "+tokenizer.getWord());
	838	// break;
	839	// case Tokenizer.DELIMITER:
	840	// System.out.println("delimiter");
	841	// break;
	842	// default:
	843	// System.err.println("Unknown Token: "+token);
	844	// }
	845	// }
	846	// tokenizer.close();
	847	// }
	848	// catch(Exception ioe)
	849	// {
	850	// ioe.printStackTrace();
	851	// }
	852	// }
	853	//
	854	//
	855	// /**
	856	// * Demonstration of the high level interface.
	857	// * @param args command line arguments.
	858	// */
	859	// protected static void testHighLevel(String[] args)
	860	// {
	861	// try
	862	// {
	863	// String filename;
	864	// if(args.length > 0)
	865	// filename = args[0];
	866	// else
	867	// filename = "/filer/cdaller/tmp/test.csv";
	868	//
	869	// Tokenizer tokenizer = new Tokenizer(new BufferedReader(new FileReader(filename)));
	870	//// Tokenizer tokenizer = new Tokenizer("column1,\"quoted column2\",column3\\, with quoted comma");
	871	// tokenizer.setDelimiter(',');
	872	//// tokenizer.eolIsSignificant(false);
	873	// tokenizer.respectEscapedCharacters(true);
	874	// tokenizer.respectQuotedWords(true);
	875	//
	876	// List list;
	877	// while(tokenizer.hasNextLine())
	878	// {
	879	// list = tokenizer.nextLine();
	880	// System.out.println("List: "+list);
	881	// System.out.println("List w/o zero length elements: "+removeZeroLengthElements(list));
	882	// System.out.println("--");
	883	// }
	884	//
	885	// }
	886	// catch(Exception ioe)
	887	// {
	888	// ioe.printStackTrace();
	889	// }
	890	// }
	891	//
	892	// /**
	893	// * Demo code for the high level interface.
	894	// */
	895	// protected static void testHighLevelExample()
	896	// {
	897	// try
	898	// {
	899	// // simple example, tokenizing string, no escape, but quoted
	900	// // works:
	901	// System.out.println("example 1");
	902	// Tokenizer tokenizer = new Tokenizer("text,,,\"another,text\"");
	903	// List tokens;
	904	// while(tokenizer.hasNextLine())
	905	// {
	906	// tokens = tokenizer.nextLine();
	907	// System.out.println(tokens.get(0)); // prints 'text'
	908	// System.out.println(tokens.get(1)); // prints ''
	909	// System.out.println(tokens.get(2)); // prints ''
	910	// System.out.println(tokens.get(3)); // prints 'another,text'
	911	// }
	912	//
	913	// System.out.println("example 2");
	914	// // simple example, tokenizing string, using escape char and
	915	// // quoted strings:
	916	// tokenizer = new Tokenizer("text,text with\\,comma,,\"another,text\"");
	917	// tokenizer.respectEscapedCharacters(true);
	918	// while(tokenizer.hasNextLine())
	919	// {
	920	// tokens = tokenizer.nextLine();
	921	// System.out.println(tokens.get(0)); // prints 'text'
	922	// System.out.println(tokens.get(1)); // prints 'text with, comma'
	923	// System.out.println(tokens.get(2)); // prints ''
	924	// System.out.println(tokens.get(3)); // prints 'another,text'
	925	// }
	926	// }
	927	// catch(Exception ioe)
	928	// {
	929	// ioe.printStackTrace();
	930	// }
	931	// }
	932	//
	933	// public static void main(String[] args)
	934	// {
	935	//// testLowLevel(args);
	936	//// testHighLevel(args);
	937	//// testGeonetUTF8(args);
	938	// testHighLevelExample();
	939	// }
	940	}
	941
	942

Note: See TracBrowser for help on using the repository browser.

Download in other formats: