00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018
00019
00020
00045 #include "unicode/utypes.h"
00046
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/utext.h"
00052 #include "unicode/parseerr.h"
00053
00054 #include "unicode/uregex.h"
00055
00056 U_NAMESPACE_BEGIN
00057
00058
00059
00060
// Forward declarations of types that this header refers to only through
// pointers, references or friend declarations.
class   RegexMatcher;
class   RegexPattern;
class   UVector;
class   UVector32;
class   UVector64;
class   UnicodeSet;
struct  REStackFrame;
struct  Regex8BitSet;
class   RuleBasedBreakIterator;
class   RegexCImpl;
00071
00072
00073
00074
/*
 * RegexPatternDump(pat)
 *
 * When REGEX_DEBUG is defined this is a real function taking a
 * const RegexPattern pointer. Otherwise any earlier macro of that name is
 * discarded and RegexPatternDump(pat) is defined as a macro with an empty
 * expansion, so the argument expression is never evaluated.
 */
#ifdef REGEX_DEBUG
U_INTERNAL void U_EXPORT2
    RegexPatternDump(const RegexPattern *pat);
#else
    #undef RegexPatternDump
    #define RegexPatternDump(pat)
#endif
00086
00087
00088
00100 class U_I18N_API RegexPattern: public UObject {
00101 public:
00102
00110 RegexPattern();
00111
00118 RegexPattern(const RegexPattern &source);
00119
00125 virtual ~RegexPattern();
00126
00135 UBool operator==(const RegexPattern& that) const;
00136
00145 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);};
00146
00152 RegexPattern &operator =(const RegexPattern &source);
00153
00161 virtual RegexPattern *clone() const;
00162
00163
00188 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex,
00189 UParseError &pe,
00190 UErrorCode &status);
00191
00192
00219 static RegexPattern * U_EXPORT2 compile( UText *regex,
00220 UParseError &pe,
00221 UErrorCode &status);
00222
00247 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex,
00248 uint32_t flags,
00249 UParseError &pe,
00250 UErrorCode &status);
00251
00252
00279 static RegexPattern * U_EXPORT2 compile( UText *regex,
00280 uint32_t flags,
00281 UParseError &pe,
00282 UErrorCode &status);
00283
00284
00307 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex,
00308 uint32_t flags,
00309 UErrorCode &status);
00310
00311
00336 static RegexPattern * U_EXPORT2 compile( UText *regex,
00337 uint32_t flags,
00338 UErrorCode &status);
00339
00340
00346 virtual uint32_t flags() const;
00347
00365 virtual RegexMatcher *matcher(const UnicodeString &input,
00366 UErrorCode &status) const;
00367
00368
00373 enum PatternIsUTextFlag { PATTERN_IS_UTEXT };
00374
00394 virtual RegexMatcher *matcher(UText *input,
00395 PatternIsUTextFlag flag,
00396 UErrorCode &status) const;
00397
00398 private:
00412 RegexMatcher *matcher(const UChar *input,
00413 UErrorCode &status) const;
00414 public:
00415
00416
00428 virtual RegexMatcher *matcher(UErrorCode &status) const;
00429
00430
00445 static UBool U_EXPORT2 matches(const UnicodeString ®ex,
00446 const UnicodeString &input,
00447 UParseError &pe,
00448 UErrorCode &status);
00449
00450
00465 static UBool U_EXPORT2 matches(UText *regex,
00466 UText *input,
00467 UParseError &pe,
00468 UErrorCode &status);
00469
00470
00479 virtual UnicodeString pattern() const;
00480
00481
00492 virtual UText *patternText(UErrorCode &status) const;
00493
00494
00520 virtual int32_t split(const UnicodeString &input,
00521 UnicodeString dest[],
00522 int32_t destCapacity,
00523 UErrorCode &status) const;
00524
00525
00551 virtual int32_t split(UText *input,
00552 UText *dest[],
00553 int32_t destCapacity,
00554 UErrorCode &status) const;
00555
00556
00562 virtual UClassID getDynamicClassID() const;
00563
00569 static UClassID U_EXPORT2 getStaticClassID();
00570
00571 private:
00572
00573
00574
00575 UText *fPattern;
00576 UnicodeString *fPatternString;
00577 uint32_t fFlags;
00578
00579 UVector64 *fCompiledPat;
00580 UnicodeString fLiteralText;
00581
00582
00583 UVector *fSets;
00584 Regex8BitSet *fSets8;
00585
00586
00587 UErrorCode fDeferredStatus;
00588
00589
00590 int32_t fMinMatchLen;
00591
00592
00593
00594
00595 int32_t fFrameSize;
00596
00597
00598 int32_t fDataSize;
00599
00600
00601
00602 UVector32 *fGroupMap;
00603
00604
00605 int32_t fMaxCaptureDigits;
00606
00607 UnicodeSet **fStaticSets;
00608
00609
00610 Regex8BitSet *fStaticSets8;
00611
00612
00613 int32_t fStartType;
00614 int32_t fInitialStringIdx;
00615 int32_t fInitialStringLen;
00616 UnicodeSet *fInitialChars;
00617 UChar32 fInitialChar;
00618 Regex8BitSet *fInitialChars8;
00619 UBool fNeedsAltInput;
00620
00621 friend class RegexCompile;
00622 friend class RegexMatcher;
00623 friend class RegexCImpl;
00624
00625
00626
00627
00628 void init();
00629 void zap();
00630 #ifdef REGEX_DEBUG
00631 void dumpOp(int32_t index) const;
00632 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00633 #endif
00634
00635 };
00636
00637
00638
00648 class U_I18N_API RegexMatcher: public UObject {
00649 public:
00650
00665 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status);
00666
00682 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
00683
00705 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
00706 uint32_t flags, UErrorCode &status);
00707
00729 RegexMatcher(UText *regexp, UText *input,
00730 uint32_t flags, UErrorCode &status);
00731
00732 private:
00746 RegexMatcher(const UnicodeString ®exp, const UChar *input,
00747 uint32_t flags, UErrorCode &status);
00748 public:
00749
00750
00756 virtual ~RegexMatcher();
00757
00758
00765 virtual UBool matches(UErrorCode &status);
00766
00767
00778 virtual UBool matches(int64_t startIndex, UErrorCode &status);
00779
00780
00794 virtual UBool lookingAt(UErrorCode &status);
00795
00796
00810 virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
00811
00812
00825 virtual UBool find();
00826
00827
00837 virtual UBool find(int64_t start, UErrorCode &status);
00838
00839
00849 virtual UnicodeString group(UErrorCode &status) const;
00850
00851
00864 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00865
00866
00872 virtual int32_t groupCount() const;
00873
00874
00889 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
00890
00894 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
00895
00911 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
00912
00913
00921 virtual int32_t start(UErrorCode &status) const;
00922
00926 virtual int64_t start64(UErrorCode &status) const;
00927
00928
00942 virtual int32_t start(int32_t group, UErrorCode &status) const;
00943
00947 virtual int64_t start64(int32_t group, UErrorCode &status) const;
00948
00949
00962 virtual int32_t end(UErrorCode &status) const;
00963
00967 virtual int64_t end64(UErrorCode &status) const;
00968
00969
00986 virtual int32_t end(int32_t group, UErrorCode &status) const;
00987
00991 virtual int64_t end64(int32_t group, UErrorCode &status) const;
00992
00993
01002 virtual RegexMatcher &reset();
01003
01004
01020 virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
01021
01022
01040 virtual RegexMatcher &reset(const UnicodeString &input);
01041
01042
01056 virtual RegexMatcher &reset(UText *input);
01057
01058 private:
01072 RegexMatcher &reset(const UChar *input);
01073 public:
01074
01082 virtual const UnicodeString &input() const;
01083
01092 virtual UText *inputText() const;
01093
01103 virtual UText *getInput(UText *dest, UErrorCode &status) const;
01104
01105
01124 virtual RegexMatcher ®ion(int64_t start, int64_t limit, UErrorCode &status);
01125
01135 virtual RegexMatcher ®ion(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
01136
01145 virtual int32_t regionStart() const;
01146
01150 virtual int64_t regionStart64() const;
01151
01152
01161 virtual int32_t regionEnd() const;
01162
01166 virtual int64_t regionEnd64() const;
01167
01176 virtual UBool hasTransparentBounds() const;
01177
01196 virtual RegexMatcher &useTransparentBounds(UBool b);
01197
01198
01206 virtual UBool hasAnchoringBounds() const;
01207
01208
01221 virtual RegexMatcher &useAnchoringBounds(UBool b);
01222
01223
01236 virtual UBool hitEnd() const;
01237
01247 virtual UBool requireEnd() const;
01248
01249
01255 virtual const RegexPattern &pattern() const;
01256
01257
01274 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
01275
01276
01297 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
01298
01299
01320 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
01321
01322
01347 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
01348
01349
01377 virtual RegexMatcher &appendReplacement(UnicodeString &dest,
01378 const UnicodeString &replacement, UErrorCode &status);
01379
01380
01408 virtual RegexMatcher &appendReplacement(UText *dest,
01409 UText *replacement, UErrorCode &status);
01410
01411
01422 virtual UnicodeString &appendTail(UnicodeString &dest);
01423
01424
01437 virtual UText *appendTail(UText *dest, UErrorCode &status);
01438
01439
01463 virtual int32_t split(const UnicodeString &input,
01464 UnicodeString dest[],
01465 int32_t destCapacity,
01466 UErrorCode &status);
01467
01468
01492 virtual int32_t split(UText *input,
01493 UText *dest[],
01494 int32_t destCapacity,
01495 UErrorCode &status);
01496
01518 virtual void setTimeLimit(int32_t limit, UErrorCode &status);
01519
01526 virtual int32_t getTimeLimit() const;
01527
01549 virtual void setStackLimit(int32_t limit, UErrorCode &status);
01550
01558 virtual int32_t getStackLimit() const;
01559
01560
01574 virtual void setMatchCallback(URegexMatchCallback *callback,
01575 const void *context,
01576 UErrorCode &status);
01577
01578
01589 virtual void getMatchCallback(URegexMatchCallback *&callback,
01590 const void *&context,
01591 UErrorCode &status);
01592
01593
01607 virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
01608 const void *context,
01609 UErrorCode &status);
01610
01611
01622 virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
01623 const void *&context,
01624 UErrorCode &status);
01625
01626
01632 void setTrace(UBool state);
01633
01634
01640 static UClassID U_EXPORT2 getStaticClassID();
01641
01647 virtual UClassID getDynamicClassID() const;
01648
01649 private:
01650
01651
01652 RegexMatcher();
01653 RegexMatcher(const RegexPattern *pat);
01654 RegexMatcher(const RegexMatcher &other);
01655 RegexMatcher &operator =(const RegexMatcher &rhs);
01656 void init(UErrorCode &status);
01657 void init2(UText *t, UErrorCode &e);
01658
01659 friend class RegexPattern;
01660 friend class RegexCImpl;
01661 public:
01663 void resetPreserveRegion();
01664 private:
01665
01666
01667
01668
01669
01670 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
01671 inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
01672 UBool isWordBoundary(int64_t pos);
01673 UBool isUWordBoundary(int64_t pos);
01674 REStackFrame *resetStack();
01675 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
01676 void IncrementTime(UErrorCode &status);
01677 UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status);
01678
01679 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
01680
01681 UBool findUsingChunk();
01682 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01683 UBool isChunkWordBoundary(int32_t pos);
01684
01685 const RegexPattern *fPattern;
01686 RegexPattern *fPatternOwned;
01687
01688
01689 const UnicodeString *fInput;
01690 UText *fInputText;
01691 UText *fAltInputText;
01692
01693 int64_t fInputLength;
01694 int32_t fFrameSize;
01695
01696 int64_t fRegionStart;
01697 int64_t fRegionLimit;
01698
01699 int64_t fAnchorStart;
01700 int64_t fAnchorLimit;
01701
01702 int64_t fLookStart;
01703 int64_t fLookLimit;
01704
01705
01706 int64_t fActiveStart;
01707 int64_t fActiveLimit;
01708
01709
01710
01711 UBool fTransparentBounds;
01712 UBool fAnchoringBounds;
01713
01714 UBool fMatch;
01715 int64_t fMatchStart;
01716 int64_t fMatchEnd;
01717
01718
01719 int64_t fLastMatchEnd;
01720
01721 int64_t fAppendPosition;
01722
01723
01724
01725 UBool fHitEnd;
01726 UBool fRequireEnd;
01727
01728
01729 UVector64 *fStack;
01730 REStackFrame *fFrame;
01731
01732
01733
01734 int64_t *fData;
01735 int64_t fSmallData[8];
01736
01737 int32_t fTimeLimit;
01738
01739
01740 int32_t fTime;
01741 int32_t fTickCounter;
01742
01743
01744
01745
01746 int32_t fStackLimit;
01747
01748
01749 URegexMatchCallback *fCallbackFn;
01750
01751 const void *fCallbackContext;
01752
01753 URegexFindProgressCallback *fFindProgressCallbackFn;
01754
01755 const void *fFindProgressCallbackContext;
01756
01757
01758 UBool fInputUniStrMaybeMutable;
01759
01760 UBool fTraceDebug;
01761
01762 UErrorCode fDeferredStatus;
01763
01764
01765 RuleBasedBreakIterator *fWordBreakItr;
01766
01767
01768 };
01769
01770 U_NAMESPACE_END
01771 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
01772 #endif