001 /* CollationElementIterator.java -- Walks through collation elements
002 Copyright (C) 1998, 1999, 2001, 2002, 2003, 2004 Free Software Foundation
003
004 This file is part of GNU Classpath.
005
006 GNU Classpath is free software; you can redistribute it and/or modify
007 it under the terms of the GNU General Public License as published by
008 the Free Software Foundation; either version 2, or (at your option)
009 any later version.
010
011 GNU Classpath is distributed in the hope that it will be useful, but
012 WITHOUT ANY WARRANTY; without even the implied warranty of
013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 General Public License for more details.
015
016 You should have received a copy of the GNU General Public License
017 along with GNU Classpath; see the file COPYING. If not, write to the
018 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019 02110-1301 USA.
020
021 Linking this library statically or dynamically with other modules is
022 making a combined work based on this library. Thus, the terms and
023 conditions of the GNU General Public License cover the whole
024 combination.
025
026 As a special exception, the copyright holders of this library give you
027 permission to link this library with independent modules to produce an
028 executable, regardless of the license terms of these independent
029 modules, and to copy and distribute the resulting executable under
030 terms of your choice, provided that you also meet, for each linked
031 independent module, the terms and conditions of the license of that
032 module. An independent module is a module which is not derived from
033 or based on this library. If you modify this library, you may extend
034 this exception to your version of the library, but you are not
035 obligated to do so. If you do not wish to do so, delete this
036 exception statement from your version. */
037
038
039 package java.text;
040
041 import gnu.java.lang.CPStringBuilder;
042
043 import java.util.ArrayList;
044
045 /* Written using "Java Class Libraries", 2nd edition, plus online
046 * API docs for JDK 1.2 from http://www.javasoft.com.
047 * Status: Believed complete and correct to JDK 1.1.
048 */
049
050 /**
051 * This class walks through the character collation elements of a
052 * <code>String</code> as defined by the collation rules in an instance of
053 * <code>RuleBasedCollator</code>. There is no public constructor for
054 * this class. An instance is created by calling the
055 * <code>getCollationElementIterator</code> method on
056 * <code>RuleBasedCollator</code>.
057 *
058 * @author Aaron M. Renn (arenn@urbanophile.com)
059 * @author Tom Tromey (tromey@cygnus.com)
060 * @author Guilhem Lavaux (guilhem.lavaux@free.fr)
061 */
062 public final class CollationElementIterator
063 {
064 /**
065 * This is a constant value that is returned to indicate that the end of
066 * the string was encountered.
067 */
068 public static final int NULLORDER = -1;
069
070 /**
071 * This is the RuleBasedCollator this object was created from.
072 */
073 RuleBasedCollator collator;
074
075 /**
076 * This is the String that is being iterated over.
077 */
078 CharacterIterator text;
079
080 /**
081 * This is the index into the collation decomposition where we are currently scanning.
082 */
083 int index;
084
085 /**
086 * This is the index into the String where we are currently scanning.
087 */
088 int textIndex;
089
090 /**
091 * Array containing the collation decomposition of the
092 * text given to the constructor.
093 */
094 private RuleBasedCollator.CollationElement[] text_decomposition;
095
096 /**
097 * Array containing the index of the specified block.
098 */
099 private int[] text_indexes;
100
101 /**
102 * This method initializes a new instance of <code>CollationElementIterator</code>
103 * to iterate over the specified <code>String</code> using the rules in the
104 * specified <code>RuleBasedCollator</code>.
105 *
106 * @param collator The <code>RuleBasedCollation</code> used for calculating collation values
107 * @param text The <code>String</code> to iterate over.
108 */
109 CollationElementIterator(RuleBasedCollator collator, String text)
110 {
111 this.collator = collator;
112
113 setText (text);
114 }
115
116 /**
117 * This method initializes a new instance of <code>CollationElementIterator</code>
118 * to iterate over the specified <code>String</code> using the rules in the
119 * specified <code>RuleBasedCollator</code>.
120 *
121 * @param collator The <code>RuleBasedCollation</code> used for calculating collation values
122 * @param text The character iterator to iterate over.
123 */
124 CollationElementIterator(RuleBasedCollator collator, CharacterIterator text)
125 {
126 this.collator = collator;
127
128 setText (text);
129 }
130
131 RuleBasedCollator.CollationElement nextBlock()
132 {
133 if (index >= text_decomposition.length)
134 return null;
135
136 RuleBasedCollator.CollationElement e = text_decomposition[index];
137
138 textIndex = text_indexes[index+1];
139
140 index++;
141
142 return e;
143 }
144
145 RuleBasedCollator.CollationElement previousBlock()
146 {
147 if (index == 0)
148 return null;
149
150 index--;
151 RuleBasedCollator.CollationElement e = text_decomposition[index];
152
153 textIndex = text_indexes[index+1];
154
155 return e;
156 }
157
158 /**
159 * This method returns the collation ordering value of the next character sequence
160 * in the string (it may be an extended character following collation rules).
161 * This method will return <code>NULLORDER</code> if the
162 * end of the string was reached.
163 *
164 * @return The collation ordering value.
165 */
166 public int next()
167 {
168 RuleBasedCollator.CollationElement e = nextBlock();
169
170 if (e == null)
171 return NULLORDER;
172
173 return e.getValue();
174 }
175
176 /**
177 * This method returns the collation ordering value of the previous character
178 * in the string. This method will return <code>NULLORDER</code> if the
179 * beginning of the string was reached.
180 *
181 * @return The collation ordering value.
182 */
183 public int previous()
184 {
185 RuleBasedCollator.CollationElement e = previousBlock();
186
187 if (e == null)
188 return NULLORDER;
189
190 return e.getValue();
191 }
192
193 /**
194 * This method returns the primary order value for the given collation
195 * value.
196 *
197 * @param order The collation value returned from <code>next()</code> or
198 * <code>previous()</code>.
199 *
200 * @return The primary order value of the specified collation value. This is
201 * the high 16 bits.
202 */
203 public static int primaryOrder(int order)
204 {
205 // From the JDK 1.2 spec.
206 return order >>> 16;
207 }
208
209 /**
210 * This method resets the internal position pointer to read from the
211 * beginning of the <code>String</code> again.
212 */
213 public void reset()
214 {
215 index = 0;
216 textIndex = 0;
217 }
218
219 /**
220 * This method returns the secondary order value for the given collation
221 * value.
222 *
223 * @param order The collation value returned from <code>next()</code> or
224 * <code>previous()</code>.
225 *
226 * @return The secondary order value of the specified collation value. This
227 * is the bits 8-15.
228 */
229 public static short secondaryOrder(int order)
230 {
231 // From the JDK 1.2 spec.
232 return (short) ((order >>> 8) & 255);
233 }
234
235 /**
236 * This method returns the tertiary order value for the given collation
237 * value.
238 *
239 * @param order The collation value returned from <code>next()</code> or
240 * <code>previous()</code>.
241 *
242 * @return The tertiary order value of the specified collation value. This
243 * is the low eight bits.
244 */
245 public static short tertiaryOrder(int order)
246 {
247 // From the JDK 1.2 spec.
248 return (short) (order & 255);
249 }
250
251 /**
252 * This method sets the <code>String</code> that it is iterating over
253 * to the specified <code>String</code>.
254 *
255 * @param text The new <code>String</code> to iterate over.
256 *
257 * @since 1.2
258 */
259 public void setText(String text)
260 {
261 int idx = 0;
262 int idx_idx = 0;
263 int alreadyExpanded = 0;
264 int idxToMove = 0;
265
266 this.text = new StringCharacterIterator(text);
267 this.index = 0;
268
269 String work_text = text.intern();
270
271 ArrayList a_element = new ArrayList();
272 ArrayList a_idx = new ArrayList();
273
274 // Build element collection ordered as they come in "text".
275 while (idx < work_text.length())
276 {
277 String key, key_old;
278
279 Object object = null;
280 int p = 1;
281
282 // IMPROVE: use a TreeMap with a prefix-ordering rule.
283 key_old = key = null;
284 do
285 {
286 if (object != null)
287 key_old = key;
288 key = work_text.substring (idx, idx+p);
289 object = collator.prefix_tree.get (key);
290 if (object != null && idx < alreadyExpanded)
291 {
292 RuleBasedCollator.CollationElement prefix = (RuleBasedCollator.CollationElement)object;
293 if (prefix.expansion != null &&
294 prefix.expansion.startsWith(work_text.substring(0, idx)))
295 {
296 object = null;
297 key = key_old;
298 }
299 }
300 p++;
301 }
302 while (idx+p <= work_text.length());
303
304 if (object == null)
305 key = key_old;
306
307 RuleBasedCollator.CollationElement prefix =
308 (RuleBasedCollator.CollationElement) collator.prefix_tree.get (key);
309
310 /*
311 * First case: There is no such sequence in the database.
312 * We will have to build one from the context.
313 */
314 if (prefix == null)
315 {
316 /*
317 * We are dealing with sequences in an expansion. They
318 * are treated as accented characters (tertiary order).
319 */
320 if (alreadyExpanded > 0)
321 {
322 RuleBasedCollator.CollationElement e =
323 collator.getDefaultAccentedElement (work_text.charAt (idx));
324
325 a_element.add (e);
326 a_idx.add (new Integer(idx_idx));
327 idx++;
328 alreadyExpanded--;
329 if (alreadyExpanded == 0)
330 {
331 /* There is not any characters left in the expansion set.
332 * We can increase the pointer in the source string.
333 */
334 idx_idx += idxToMove;
335 idxToMove = 0;
336 }
337 else
338 idx_idx++;
339 }
340 else
341 {
342 /* This is a normal character. */
343 RuleBasedCollator.CollationElement e =
344 collator.getDefaultElement (work_text.charAt (idx));
345 Integer i_ref = new Integer(idx_idx);
346
347 /* Don't forget to mark it as a special sequence so the
348 * string can be ordered.
349 */
350 a_element.add (RuleBasedCollator.SPECIAL_UNKNOWN_SEQ);
351 a_idx.add (i_ref);
352 a_element.add (e);
353 a_idx.add (i_ref);
354 idx_idx++;
355 idx++;
356 }
357 continue;
358 }
359
360 /*
361 * Second case: Here we have found a matching sequence.
362 * Here we have an expansion string prepend it to the "work text" and
363 * add the corresponding sorting element. We must also mark
364 */
365 if (prefix.expansion != null)
366 {
367 work_text = prefix.expansion
368 + work_text.substring (idx+prefix.key.length());
369 idx = 0;
370 a_element.add (prefix);
371 a_idx.add (new Integer(idx_idx));
372 if (alreadyExpanded == 0)
373 idxToMove = prefix.key.length();
374 alreadyExpanded += prefix.expansion.length()-prefix.key.length();
375 }
376 else
377 {
378 /* Third case: the simplest. We have got the prefix and it
379 * has not to be expanded.
380 */
381 a_element.add (prefix);
382 a_idx.add (new Integer(idx_idx));
383 idx += prefix.key.length();
384 /* If the sequence is in an expansion, we must decrease the
385 * counter.
386 */
387 if (alreadyExpanded > 0)
388 {
389 alreadyExpanded -= prefix.key.length();
390 if (alreadyExpanded == 0)
391 {
392 idx_idx += idxToMove;
393 idxToMove = 0;
394 }
395 }
396 else
397 idx_idx += prefix.key.length();
398 }
399 }
400
401 text_decomposition = (RuleBasedCollator.CollationElement[])
402 a_element.toArray(new RuleBasedCollator.CollationElement[a_element.size()]);
403 text_indexes = new int[a_idx.size()+1];
404 for (int i = 0; i < a_idx.size(); i++)
405 {
406 text_indexes[i] = ((Integer)a_idx.get(i)).intValue();
407 }
408 text_indexes[a_idx.size()] = text.length();
409 }
410
411 /**
412 * This method sets the <code>String</code> that it is iterating over
413 * to the <code>String</code> represented by the specified
414 * <code>CharacterIterator</code>.
415 *
416 * @param source The <code>CharacterIterator</code> containing the new
417 * <code>String</code> to iterate over.
418 */
419 public void setText(CharacterIterator source)
420 {
421 CPStringBuilder expand = new CPStringBuilder();
422
423 // For now assume we read from the beginning of the string.
424 for (char c = source.first();
425 c != CharacterIterator.DONE;
426 c = source.next())
427 expand.append(c);
428
429 setText(expand.toString());
430 }
431
432 /**
433 * This method returns the current offset into the <code>String</code>
434 * that is being iterated over.
435 *
436 * @return The iteration index position.
437 *
438 * @since 1.2
439 */
440 public int getOffset()
441 {
442 return textIndex;
443 }
444
445 /**
446 * This method sets the iteration index position into the current
447 * <code>String</code> to the specified value. This value must not
448 * be negative and must not be greater than the last index position
449 * in the <code>String</code>.
450 *
451 * @param offset The new iteration index position.
452 *
453 * @exception IllegalArgumentException If the new offset is not valid.
454 */
455 public void setOffset(int offset)
456 {
457 if (offset < 0)
458 throw new IllegalArgumentException("Negative offset: " + offset);
459
460 if (offset > (text.getEndIndex() - 1))
461 throw new IllegalArgumentException("Offset too large: " + offset);
462
463 for (index = 0; index < text_decomposition.length; index++)
464 {
465 if (offset <= text_indexes[index])
466 break;
467 }
468 /*
469 * As text_indexes[0] == 0, we should not have to take care whether index is
470 * greater than 0. It is always.
471 */
472 if (text_indexes[index] == offset)
473 textIndex = offset;
474 else
475 textIndex = text_indexes[index-1];
476 }
477
478 /**
479 * This method returns the maximum length of any expansion sequence that
480 * ends with the specified collation order value. (Whatever that means).
481 *
482 * @param value The collation order value
483 *
484 * @return The maximum length of an expansion sequence.
485 */
486 public int getMaxExpansion(int value)
487 {
488 return 1;
489 }
490 }