001 /* Matcher.java -- Instance of a regular expression applied to a char sequence.
002 Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc.
003
004 This file is part of GNU Classpath.
005
006 GNU Classpath is free software; you can redistribute it and/or modify
007 it under the terms of the GNU General Public License as published by
008 the Free Software Foundation; either version 2, or (at your option)
009 any later version.
010
011 GNU Classpath is distributed in the hope that it will be useful, but
012 WITHOUT ANY WARRANTY; without even the implied warranty of
013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 General Public License for more details.
015
016 You should have received a copy of the GNU General Public License
017 along with GNU Classpath; see the file COPYING. If not, write to the
018 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019 02110-1301 USA.
020
021 Linking this library statically or dynamically with other modules is
022 making a combined work based on this library. Thus, the terms and
023 conditions of the GNU General Public License cover the whole
024 combination.
025
026 As a special exception, the copyright holders of this library give you
027 permission to link this library with independent modules to produce an
028 executable, regardless of the license terms of these independent
029 modules, and to copy and distribute the resulting executable under
030 terms of your choice, provided that you also meet, for each linked
031 independent module, the terms and conditions of the license of that
032 module. An independent module is a module which is not derived from
033 or based on this library. If you modify this library, you may extend
034 this exception to your version of the library, but you are not
035 obligated to do so. If you do not wish to do so, delete this
036 exception statement from your version. */
037
038
039 package java.util.regex;
040
041 import gnu.java.lang.CPStringBuilder;
042
043 import gnu.java.util.regex.CharIndexed;
044 import gnu.java.util.regex.RE;
045 import gnu.java.util.regex.REMatch;
046
047 /**
048 * Instance of a regular expression applied to a char sequence.
049 *
050 * @since 1.4
051 */
052 public final class Matcher implements MatchResult
053 {
054 private Pattern pattern;
055 private CharSequence input;
056 // We use CharIndexed as an input object to the getMatch method in order
057 // that /\G/ (the end of the previous match) may work. The information
058 // of the previous match is stored in the CharIndexed object.
059 private CharIndexed inputCharIndexed;
060 private int position;
061 private int appendPosition;
062 private REMatch match;
063
064 /**
065 * The start of the region of the input on which to match.
066 */
067 private int regionStart;
068
069 /**
070 * The end of the region of the input on which to match.
071 */
072 private int regionEnd;
073
074 /**
075 * True if the match process should look beyond the
076 * region marked by regionStart to regionEnd when
077 * performing lookAhead, lookBehind and boundary
078 * matching.
079 */
080 private boolean transparentBounds;
081
082 /**
083 * The flags that affect the anchoring bounds.
084 * If {@link #hasAnchoringBounds()} is {@code true},
085 * the match process will honour the
086 * anchoring bounds: ^, \A, \Z, \z and $. If
087 * {@link #hasAnchoringBounds()} is {@code false},
088 * the anchors are ignored and appropriate flags,
089 * stored in this variable, are used to provide this
090 * behaviour.
091 */
092 private int anchoringBounds;
093
094 Matcher(Pattern pattern, CharSequence input)
095 {
096 this.pattern = pattern;
097 this.input = input;
098 this.inputCharIndexed = RE.makeCharIndexed(input, 0);
099 regionStart = 0;
100 regionEnd = input.length();
101 transparentBounds = false;
102 anchoringBounds = 0;
103 }
104
105 /**
106 * @param sb The target string buffer
107 * @param replacement The replacement string
108 *
109 * @exception IllegalStateException If no match has yet been attempted,
110 * or if the previous match operation failed
111 * @exception IndexOutOfBoundsException If the replacement string refers
112 * to a capturing group that does not exist in the pattern
113 */
114 public Matcher appendReplacement (StringBuffer sb, String replacement)
115 throws IllegalStateException
116 {
117 assertMatchOp();
118 sb.append(input.subSequence(appendPosition,
119 match.getStartIndex()).toString());
120 sb.append(RE.getReplacement(replacement, match,
121 RE.REG_REPLACE_USE_BACKSLASHESCAPE));
122 appendPosition = match.getEndIndex();
123 return this;
124 }
125
126 /**
127 * @param sb The target string buffer
128 */
129 public StringBuffer appendTail (StringBuffer sb)
130 {
131 sb.append(input.subSequence(appendPosition, input.length()).toString());
132 return sb;
133 }
134
135 /**
136 * @exception IllegalStateException If no match has yet been attempted,
137 * or if the previous match operation failed
138 */
139 public int end ()
140 throws IllegalStateException
141 {
142 assertMatchOp();
143 return match.getEndIndex();
144 }
145
146 /**
147 * @param group The index of a capturing group in this matcher's pattern
148 *
149 * @exception IllegalStateException If no match has yet been attempted,
150 * or if the previous match operation failed
151 * @exception IndexOutOfBoundsException If the replacement string refers
152 * to a capturing group that does not exist in the pattern
153 */
154 public int end (int group)
155 throws IllegalStateException
156 {
157 assertMatchOp();
158 return match.getEndIndex(group);
159 }
160
161 public boolean find ()
162 {
163 boolean first = (match == null);
164 if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
165 match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds);
166 else
167 match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
168 position, anchoringBounds);
169 if (match != null)
170 {
171 int endIndex = match.getEndIndex();
172 // Are we stuck at the same position?
173 if (!first && endIndex == position)
174 {
175 match = null;
176 // Not at the end of the input yet?
177 if (position < input.length() - 1)
178 {
179 position++;
180 return find(position);
181 }
182 else
183 return false;
184 }
185 position = endIndex;
186 return true;
187 }
188 return false;
189 }
190
191 /**
192 * @param start The index to start the new pattern matching
193 *
194 * @exception IndexOutOfBoundsException If the replacement string refers
195 * to a capturing group that does not exist in the pattern
196 */
197 public boolean find (int start)
198 {
199 if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
200 match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds);
201 else
202 match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
203 start, anchoringBounds);
204 if (match != null)
205 {
206 position = match.getEndIndex();
207 return true;
208 }
209 return false;
210 }
211
212 /**
213 * @exception IllegalStateException If no match has yet been attempted,
214 * or if the previous match operation failed
215 */
216 public String group ()
217 {
218 assertMatchOp();
219 return match.toString();
220 }
221
222 /**
223 * @param group The index of a capturing group in this matcher's pattern
224 *
225 * @exception IllegalStateException If no match has yet been attempted,
226 * or if the previous match operation failed
227 * @exception IndexOutOfBoundsException If the replacement string refers
228 * to a capturing group that does not exist in the pattern
229 */
230 public String group (int group)
231 throws IllegalStateException
232 {
233 assertMatchOp();
234 return match.toString(group);
235 }
236
237 /**
238 * @param replacement The replacement string
239 */
240 public String replaceFirst (String replacement)
241 {
242 reset();
243 // Semantics might not quite match
244 return pattern.getRE().substitute(input, replacement, position,
245 RE.REG_REPLACE_USE_BACKSLASHESCAPE);
246 }
247
248 /**
249 * @param replacement The replacement string
250 */
251 public String replaceAll (String replacement)
252 {
253 reset();
254 return pattern.getRE().substituteAll(input, replacement, position,
255 RE.REG_REPLACE_USE_BACKSLASHESCAPE);
256 }
257
258 public int groupCount ()
259 {
260 return pattern.getRE().getNumSubs();
261 }
262
263 public boolean lookingAt ()
264 {
265 if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
266 match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
267 anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
268 else
269 match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
270 anchoringBounds|RE.REG_FIX_STARTING_POSITION);
271 if (match != null)
272 {
273 if (match.getStartIndex() == 0)
274 {
275 position = match.getEndIndex();
276 return true;
277 }
278 match = null;
279 }
280 return false;
281 }
282
283 /**
284 * Attempts to match the entire input sequence against the pattern.
285 *
286 * If the match succeeds then more information can be obtained via the
287 * start, end, and group methods.
288 *
289 * @see #start()
290 * @see #end()
291 * @see #group()
292 */
293 public boolean matches ()
294 {
295 if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
296 match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
297 anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
298 else
299 match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
300 anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION);
301 if (match != null)
302 {
303 if (match.getStartIndex() == 0)
304 {
305 position = match.getEndIndex();
306 if (position == input.length())
307 return true;
308 }
309 match = null;
310 }
311 return false;
312 }
313
314 /**
315 * Returns the Pattern that is interpreted by this Matcher
316 */
317 public Pattern pattern ()
318 {
319 return pattern;
320 }
321
322 /**
323 * Resets the internal state of the matcher, including
324 * resetting the region to its default state of encompassing
325 * the whole input. The state of {@link #hasTransparentBounds()}
326 * and {@link #hasAnchoringBounds()} are unaffected.
327 *
328 * @return a reference to this matcher.
329 * @see #regionStart()
330 * @see #regionEnd()
331 * @see #hasTransparentBounds()
332 * @see #hasAnchoringBounds()
333 */
334 public Matcher reset ()
335 {
336 position = 0;
337 match = null;
338 regionStart = 0;
339 regionEnd = input.length();
340 appendPosition = 0;
341 return this;
342 }
343
344 /**
345 * Resets the internal state of the matcher, including
346 * resetting the region to its default state of encompassing
347 * the whole input. The state of {@link #hasTransparentBounds()}
348 * and {@link #hasAnchoringBounds()} are unaffected.
349 *
350 * @param input The new input character sequence.
351 * @return a reference to this matcher.
352 * @see #regionStart()
353 * @see #regionEnd()
354 * @see #hasTransparentBounds()
355 * @see #hasAnchoringBounds()
356 */
357 public Matcher reset (CharSequence input)
358 {
359 this.input = input;
360 this.inputCharIndexed = RE.makeCharIndexed(input, 0);
361 return reset();
362 }
363
364 /**
365 * @return the index of a capturing group in this matcher's pattern
366 *
367 * @exception IllegalStateException If no match has yet been attempted,
368 * or if the previous match operation failed
369 */
370 public int start ()
371 throws IllegalStateException
372 {
373 assertMatchOp();
374 return match.getStartIndex();
375 }
376
377 /**
378 * @param group The index of a capturing group in this matcher's pattern
379 *
380 * @exception IllegalStateException If no match has yet been attempted,
381 * or if the previous match operation failed
382 * @exception IndexOutOfBoundsException If the replacement string refers
383 * to a capturing group that does not exist in the pattern
384 */
385 public int start (int group)
386 throws IllegalStateException
387 {
388 assertMatchOp();
389 return match.getStartIndex(group);
390 }
391
392 /**
393 * @return True if and only if the matcher hit the end of input.
394 * @since 1.5
395 */
396 public boolean hitEnd()
397 {
398 return inputCharIndexed.hitEnd();
399 }
400
401 /**
402 * @return A string expression of this matcher.
403 */
404 public String toString()
405 {
406 CPStringBuilder sb = new CPStringBuilder();
407 sb.append(this.getClass().getName())
408 .append("[pattern=").append(pattern.pattern())
409 .append(" region=").append(regionStart).append(",").append(regionEnd)
410 .append(" anchoringBounds=").append(anchoringBounds == 0)
411 .append(" transparentBounds=").append(transparentBounds)
412 .append(" lastmatch=").append(match == null ? "" : match.toString())
413 .append("]");
414 return sb.toString();
415 }
416
417 private void assertMatchOp()
418 {
419 if (match == null) throw new IllegalStateException();
420 }
421
422 /**
423 * <p>
424 * Defines the region of the input on which to match.
425 * By default, the {@link Matcher} attempts to match
426 * the whole string (from 0 to the length of the input),
427 * but a region between {@code start} (inclusive) and
428 * {@code end} (exclusive) on which to match may instead
429 * be defined using this method.
430 * </p>
431 * <p>
432 * The behaviour of region matching is further affected
433 * by the use of transparent or opaque bounds (see
434 * {@link #useTransparentBounds(boolean)}) and whether or not
435 * anchors ({@code ^} and {@code $}) are in use
436 * (see {@link #useAnchoringBounds(boolean)}). With transparent
437 * bounds, the matcher is aware of input outside the bounds
438 * set by this method, whereas, with opaque bounds (the default)
439 * only the input within the bounds is used. The use of
440 * anchors are affected by this setting; with transparent
441 * bounds, anchors will match the beginning of the real input,
442 * while with opaque bounds they match the beginning of the
443 * region. {@link #useAnchoringBounds(boolean)} can be used
444 * to turn on or off the matching of anchors.
445 * </p>
446 *
447 * @param start the start of the region (inclusive).
448 * @param end the end of the region (exclusive).
449 * @return a reference to this matcher.
450 * @throws IndexOutOfBoundsException if either {@code start} or
451 * {@code end} are less than zero,
452 * if either {@code start} or
453 * {@code end} are greater than the
454 * length of the input, or if
455 * {@code start} is greater than
456 * {@code end}.
457 * @see #regionStart()
458 * @see #regionEnd()
459 * @see #hasTransparentBounds()
460 * @see #useTransparentBounds(boolean)
461 * @see #hasAnchoringBounds()
462 * @see #useAnchoringBounds(boolean)
463 * @since 1.5
464 */
465 public Matcher region(int start, int end)
466 {
467 int length = input.length();
468 if (start < 0)
469 throw new IndexOutOfBoundsException("The start position was less than zero.");
470 if (start >= length)
471 throw new IndexOutOfBoundsException("The start position is after the end of the input.");
472 if (end < 0)
473 throw new IndexOutOfBoundsException("The end position was less than zero.");
474 if (end > length)
475 throw new IndexOutOfBoundsException("The end position is after the end of the input.");
476 if (start > end)
477 throw new IndexOutOfBoundsException("The start position is after the end position.");
478 reset();
479 regionStart = start;
480 regionEnd = end;
481 return this;
482 }
483
484 /**
485 * The start of the region on which to perform matches (inclusive).
486 *
487 * @return the start index of the region.
488 * @see #region(int,int)
489 * #see #regionEnd()
490 * @since 1.5
491 */
492 public int regionStart()
493 {
494 return regionStart;
495 }
496
497 /**
498 * The end of the region on which to perform matches (exclusive).
499 *
500 * @return the end index of the region.
501 * @see #region(int,int)
502 * @see #regionStart()
503 * @since 1.5
504 */
505 public int regionEnd()
506 {
507 return regionEnd;
508 }
509
510 /**
511 * Returns true if the bounds of the region marked by
512 * {@link #regionStart()} and {@link #regionEnd()} are
513 * transparent. When these bounds are transparent, the
514 * matching process can look beyond them to perform
515 * lookahead, lookbehind and boundary matching operations.
516 * By default, the bounds are opaque.
517 *
518 * @return true if the bounds of the matching region are
519 * transparent.
520 * @see #useTransparentBounds(boolean)
521 * @see #region(int,int)
522 * @see #regionStart()
523 * @see #regionEnd()
524 * @since 1.5
525 */
526 public boolean hasTransparentBounds()
527 {
528 return transparentBounds;
529 }
530
531 /**
532 * Sets the transparency of the bounds of the region
533 * marked by {@link #regionStart()} and {@link #regionEnd()}.
534 * A value of {@code true} makes the bounds transparent,
535 * so the matcher can see beyond them to perform lookahead,
536 * lookbehind and boundary matching operations. A value
537 * of {@code false} (the default) makes the bounds opaque,
538 * restricting the match to the input region denoted
539 * by {@link #regionStart()} and {@link #regionEnd()}.
540 *
541 * @param transparent true if the bounds should be transparent.
542 * @return a reference to this matcher.
543 * @see #hasTransparentBounds()
544 * @see #region(int,int)
545 * @see #regionStart()
546 * @see #regionEnd()
547 * @since 1.5
548 */
549 public Matcher useTransparentBounds(boolean transparent)
550 {
551 transparentBounds = transparent;
552 return this;
553 }
554
555 /**
556 * Returns true if the matcher will honour the use of
557 * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z},
558 * {@code \z} and {@code $}. By default, the anchors
559 * are used. Note that the effect of the anchors is
560 * also affected by {@link #hasTransparentBounds()}.
561 *
562 * @return true if the matcher will attempt to match
563 * the anchoring bounds.
564 * @see #useAnchoringBounds(boolean)
565 * @see #hasTransparentBounds()
566 * @since 1.5
567 */
568 public boolean hasAnchoringBounds()
569 {
570 return anchoringBounds == 0;
571 }
572
573 /**
574 * Enables or disables the use of the anchoring bounds:
575 * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and
576 * {@code $}. By default, their use is enabled. When
577 * disabled, the matcher will not attempt to match
578 * the anchors.
579 *
580 * @param useAnchors true if anchoring bounds should be used.
581 * @return a reference to this matcher.
582 * @since 1.5
583 * @see #hasAnchoringBounds()
584 */
585 public Matcher useAnchoringBounds(boolean useAnchors)
586 {
587 if (useAnchors)
588 anchoringBounds = 0;
589 else
590 anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL;
591 return this;
592 }
593
594 /**
595 * Returns a read-only snapshot of the current state of
596 * the {@link Matcher} as a {@link MatchResult}. Any
597 * subsequent changes to this instance are not reflected
598 * in the returned {@link MatchResult}.
599 *
600 * @return a {@link MatchResult} instance representing the
601 * current state of the {@link Matcher}.
602 */
603 public MatchResult toMatchResult()
604 {
605 Matcher snapshot = new Matcher(pattern, input);
606 if (match != null)
607 snapshot.match = (REMatch) match.clone();
608 return snapshot;
609 }
610
611 }