ICU 55.1  55.1
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
56 // Forward declarations: the declarations in this header refer to these types
// only through pointers, references, or friend declarations.
57 
58 struct UHashtable;
59 
61 
62 struct Regex8BitSet;
63 class RegexCImpl;
64 class RegexMatcher;
65 class RegexPattern;
66 struct REStackFrame;
67 class RuleBasedBreakIterator;
68 class UnicodeSet;
69 class UVector;
70 class UVector32;
71 class UVector64;
72 
73 
// RegexPattern: represents a compiled regular expression. The static compile()
// overloads return a RegexPattern*, and the matcher() overloads return a
// RegexMatcher*. The private section holds the original pattern text, the
// flags it was compiled with, and the compiled p-code.
85 class U_I18N_API RegexPattern U_FINAL : public UObject {
86 public:
87 
95  RegexPattern();
96 
103  RegexPattern(const RegexPattern &source);
104 
110  virtual ~RegexPattern();
111 
120  UBool operator==(const RegexPattern& that) const;
121 
130  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
131 
137  RegexPattern &operator =(const RegexPattern &source);
138 
146  virtual RegexPattern *clone() const;
147 
148 
173  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
174  UParseError &pe,
175  UErrorCode &status);
176 
203  static RegexPattern * U_EXPORT2 compile( UText *regex,
204  UParseError &pe,
205  UErrorCode &status);
206 
231  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
232  uint32_t flags,
233  UParseError &pe,
234  UErrorCode &status);
235 
262  static RegexPattern * U_EXPORT2 compile( UText *regex,
263  uint32_t flags,
264  UParseError &pe,
265  UErrorCode &status);
266 
289  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
290  uint32_t flags,
291  UErrorCode &status);
292 
317  static RegexPattern * U_EXPORT2 compile( UText *regex,
318  uint32_t flags,
319  UErrorCode &status);
320 
326  virtual uint32_t flags() const;
327 
345  virtual RegexMatcher *matcher(const UnicodeString &input,
346  UErrorCode &status) const;
347 
348 private:
 // NOTE(review): this raw UChar* overload is declared private and is followed
 // directly by `public:`; presumably it exists so that callers cannot pass a
 // bare UChar* as matcher input - confirm against the implementation.
361  RegexMatcher *matcher(const UChar *input,
362  UErrorCode &status) const;
363 public:
364 
365 
377  virtual RegexMatcher *matcher(UErrorCode &status) const;
378 
379 
394  static UBool U_EXPORT2 matches(const UnicodeString &regex,
395  const UnicodeString &input,
396  UParseError &pe,
397  UErrorCode &status);
398 
413  static UBool U_EXPORT2 matches(UText *regex,
414  UText *input,
415  UParseError &pe,
416  UErrorCode &status);
417 
426  virtual UnicodeString pattern() const;
427 
428 
439  virtual UText *patternText(UErrorCode &status) const;
440 
441 
455  virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
456 
457 
474  virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
475 
476 
515  virtual int32_t split(const UnicodeString &input,
516  UnicodeString dest[],
517  int32_t destCapacity,
518  UErrorCode &status) const;
519 
520 
559  virtual int32_t split(UText *input,
560  UText *dest[],
561  int32_t destCapacity,
562  UErrorCode &status) const;
563 
564 
570  virtual UClassID getDynamicClassID() const;
571 
577  static UClassID U_EXPORT2 getStaticClassID();
578 
579 private:
580  //
581  // Implementation Data
582  //
583  UText *fPattern; // The original pattern string.
584  UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
585  uint32_t fFlags; // The flags used when compiling the pattern.
586  //
587  UVector64 *fCompiledPat; // The compiled pattern p-code.
588  UnicodeString fLiteralText; // Any literal string data from the pattern,
589  // after un-escaping, for use during the match.
590 
591  UVector *fSets; // Any UnicodeSets referenced from the pattern.
592  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
593 
594 
595  UErrorCode fDeferredStatus; // status if some prior error has left this
596  // RegexPattern in an unusable state.
597 
598  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
599  // >= this value. For some patterns, this calculated
600  // value may be less than the true shortest
601  // possible match.
602 
603  int32_t fFrameSize; // Size of a state stack frame in the
604  // execution engine.
605 
606  int32_t fDataSize; // The size of the data needed by the pattern that
607  // does not go on the state stack, but has just
608  // a single copy per matcher.
609 
610  UVector32 *fGroupMap; // Map from capture group number to position of
611  // the group's variables in the matcher stack frame.
612 
613  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
614  // regex character classes, e.g. Word.
615 
616  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
617  // sets for predefined regex classes.
618 
619  int32_t fStartType; // Info on how a match must start.
620  int32_t fInitialStringIdx; //
621  int32_t fInitialStringLen;
622  UnicodeSet *fInitialChars;
623  UChar32 fInitialChar;
624  Regex8BitSet *fInitialChars8;
625  UBool fNeedsAltInput;
626 
627  UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
628 
629  friend class RegexCompile;
630  friend class RegexMatcher;
631  friend class RegexCImpl;
632 
633  //
634  // Implementation Methods
635  //
636  void init(); // Common initialization, for use by constructors.
637  void zap(); // Common cleanup
638 
639  void dumpOp(int32_t index) const;
640 
641  public:
 // dumpPattern() is only declared when U_HIDE_INTERNAL_API is not defined.
642 #ifndef U_HIDE_INTERNAL_API
643 
647  void dumpPattern() const;
648 #endif /* U_HIDE_INTERNAL_API */
649 };
650 
651 
652 
// RegexMatcher: bundles together a regular expression pattern (fPattern) and
// the input text it is applied to (fInputText), and records the state of the
// most recent match attempt (fMatch, fMatchStart, fMatchEnd, ...).
662 class U_I18N_API RegexMatcher U_FINAL : public UObject {
663 public:
664 
679  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
680 
696  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
697 
719  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
720  uint32_t flags, UErrorCode &status);
721 
743  RegexMatcher(UText *regexp, UText *input,
744  uint32_t flags, UErrorCode &status);
745 
746 private:
 // NOTE(review): this raw UChar* constructor is declared private and is
 // followed directly by `public:`; presumably it exists so that callers
 // cannot pass a bare UChar* as input - confirm against the implementation.
759  RegexMatcher(const UnicodeString &regexp, const UChar *input,
760  uint32_t flags, UErrorCode &status);
761 public:
762 
763 
769  virtual ~RegexMatcher();
770 
771 
778  virtual UBool matches(UErrorCode &status);
779 
780 
791  virtual UBool matches(int64_t startIndex, UErrorCode &status);
792 
793 
807  virtual UBool lookingAt(UErrorCode &status);
808 
809 
823  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
824 
825 
838  virtual UBool find();
839 
840 
854  virtual UBool find(UErrorCode &status);
855 
865  virtual UBool find(int64_t start, UErrorCode &status);
866 
867 
877  virtual UnicodeString group(UErrorCode &status) const;
878 
879 
892  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
893 
899  virtual int32_t groupCount() const;
900 
901 
916  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
917 
933  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
934 
942  virtual int32_t start(UErrorCode &status) const;
943 
951  virtual int64_t start64(UErrorCode &status) const;
952 
953 
967  virtual int32_t start(int32_t group, UErrorCode &status) const;
968 
982  virtual int64_t start64(int32_t group, UErrorCode &status) const;
983 
997  virtual int32_t end(UErrorCode &status) const;
998 
1012  virtual int64_t end64(UErrorCode &status) const;
1013 
1014 
1032  virtual int32_t end(int32_t group, UErrorCode &status) const;
1033 
1051  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1052 
1061  virtual RegexMatcher &reset();
1062 
1063 
1079  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1080 
1081 
1099  virtual RegexMatcher &reset(const UnicodeString &input);
1100 
1101 
1115  virtual RegexMatcher &reset(UText *input);
1116 
1117 
1142  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1143 
1144 private:
 // NOTE(review): private raw UChar* overload of reset(); presumably declared
 // so that callers cannot pass a bare UChar* as input - confirm.
1157  RegexMatcher &reset(const UChar *input);
1158 public:
1159 
1167  virtual const UnicodeString &input() const;
1168 
1177  virtual UText *inputText() const;
1178 
1189  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1190 
1191 
1210  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1211 
1223  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1224 
1233  virtual int32_t regionStart() const;
1234 
1243  virtual int64_t regionStart64() const;
1244 
1245 
1254  virtual int32_t regionEnd() const;
1255 
1264  virtual int64_t regionEnd64() const;
1265 
1274  virtual UBool hasTransparentBounds() const;
1275 
1294  virtual RegexMatcher &useTransparentBounds(UBool b);
1295 
1296 
1304  virtual UBool hasAnchoringBounds() const;
1305 
1306 
1319  virtual RegexMatcher &useAnchoringBounds(UBool b);
1320 
1321 
1334  virtual UBool hitEnd() const;
1335 
1345  virtual UBool requireEnd() const;
1346 
1347 
1353  virtual const RegexPattern &pattern() const;
1354 
1355 
1372  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1373 
1374 
1395  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1396 
1397 
1418  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1419 
1420 
1445  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1446 
1447 
1475  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1476  const UnicodeString &replacement, UErrorCode &status);
1477 
1478 
1506  virtual RegexMatcher &appendReplacement(UText *dest,
1507  UText *replacement, UErrorCode &status);
1508 
1509 
1520  virtual UnicodeString &appendTail(UnicodeString &dest);
1521 
1522 
1536  virtual UText *appendTail(UText *dest, UErrorCode &status);
1537 
1538 
1562  virtual int32_t split(const UnicodeString &input,
1563  UnicodeString dest[],
1564  int32_t destCapacity,
1565  UErrorCode &status);
1566 
1567 
1591  virtual int32_t split(UText *input,
1592  UText *dest[],
1593  int32_t destCapacity,
1594  UErrorCode &status);
1595 
1617  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1618 
1625  virtual int32_t getTimeLimit() const;
1626 
1648  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1649 
1657  virtual int32_t getStackLimit() const;
1658 
1659 
1673  virtual void setMatchCallback(URegexMatchCallback *callback,
1674  const void *context,
1675  UErrorCode &status);
1676 
1677 
1688  virtual void getMatchCallback(URegexMatchCallback *&callback,
1689  const void *&context,
1690  UErrorCode &status);
1691 
1692 
1706  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1707  const void *context,
1708  UErrorCode &status);
1709 
1710 
1721  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1722  const void *&context,
1723  UErrorCode &status);
1724 
 // setTrace() is only declared when U_HIDE_INTERNAL_API is not defined.
1725 #ifndef U_HIDE_INTERNAL_API
1726 
1731  void setTrace(UBool state);
1732 #endif /* U_HIDE_INTERNAL_API */
1733 
1739  static UClassID U_EXPORT2 getStaticClassID();
1740 
1746  virtual UClassID getDynamicClassID() const;
1747 
1748 private:
1749  // Constructors and other object boilerplate are private.
1750  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1751  RegexMatcher(); // default constructor not implemented
1752  RegexMatcher(const RegexPattern *pat);
1753  RegexMatcher(const RegexMatcher &other);
1754  RegexMatcher &operator =(const RegexMatcher &rhs);
1755  void init(UErrorCode &status); // Common initialization
1756  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1757 
1758  friend class RegexPattern;
1759  friend class RegexCImpl;
1760 public:
1761 #ifndef U_HIDE_INTERNAL_API
1762 
1763  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1764 #endif /* U_HIDE_INTERNAL_API */
1765 private:
1766 
1767  //
1768  // MatchAt This is the internal interface to the match engine itself.
1769  // Match status comes back in matcher member variables.
1770  //
1771  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1772  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1773  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1774  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1775  REStackFrame *resetStack();
1776  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1777  void IncrementTime(UErrorCode &status);
1778 
1779  // Call user find callback function, if set. Return TRUE if operation should be interrupted.
1780  inline UBool findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
1781 
1782  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1783 
1784  UBool findUsingChunk(UErrorCode &status);
1785  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1786  UBool isChunkWordBoundary(int32_t pos);
1787 
1788  const RegexPattern *fPattern;
1789  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1790  // should delete it when through.
1791 
1792  const UnicodeString *fInput; // The string being matched. Only used for input()
1793  UText *fInputText; // The text being matched. Is never NULL.
1794  UText *fAltInputText; // A shallow copy of the text being matched.
1795  // Only created if the pattern contains backreferences.
1796  int64_t fInputLength; // Full length of the input text.
1797  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1798 
1799  int64_t fRegionStart; // Start of the input region, default = 0.
1800  int64_t fRegionLimit; // End of input region, default to input.length.
1801 
1802  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1803  int64_t fAnchorLimit; // See useAnchoringBounds
1804 
1805  int64_t fLookStart; // Region bounds for look-ahead/behind and
1806  int64_t fLookLimit; // other boundary tests. See
1807  // useTransparentBounds
1808 
1809  int64_t fActiveStart; // Currently active bounds for matching.
1810  int64_t fActiveLimit; // Usually is the same as region, but
1811  // is changed to fLookStart/Limit when
1812  // entering look around regions.
1813 
1814  UBool fTransparentBounds; // True if using transparent bounds.
1815  UBool fAnchoringBounds; // True if using anchoring bounds.
1816 
1817  UBool fMatch; // True if the last attempted match was successful.
1818  int64_t fMatchStart; // Position of the start of the most recent match
1819  int64_t fMatchEnd; // First position after the end of the most recent match
1820  // Zero if no previous match, even when a region
1821  // is active.
1822  int64_t fLastMatchEnd; // First position after the end of the previous match,
1823  // or -1 if there was no previous match.
1824  int64_t fAppendPosition; // First position after the end of the previous
1825  // appendReplacement(). As described by the
1826  // JavaDoc for Java Matcher, where it is called
1827  // "append position"
1828  UBool fHitEnd; // True if the last match touched the end of input.
1829  UBool fRequireEnd; // True if the last match required end-of-input
1830  // (matched $ or Z)
1831 
1832  UVector64 *fStack;
1833  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1834  // which will contain the capture group results.
1835  // NOT valid while match engine is running.
1836 
1837  int64_t *fData; // Data area for use by the compiled pattern.
1838  int64_t fSmallData[8]; // Use this for data if it's enough.
1839 
1840  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1841  // match engine run. Zero for unlimited.
1842 
1843  int32_t fTime; // Match time, accumulates while matching.
1844  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1845  // Kept separately from fTime to keep as much
1846  // code as possible out of the inline
1847  // StateSave function.
1848 
1849  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1850  // stack, in bytes. Zero for unlimited.
1851 
1852  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback function.
1853  // NULL if there is no callback.
1854  const void *fCallbackContext; // User Context ptr for callback function.
1855 
1856  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback function.
1857  // NULL if there is no callback.
1858  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1859 
1860 
1861  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1862 
1863  UBool fTraceDebug; // Set true for debug tracing of match engine.
1864 
1865  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1866  // reported, or that permanently disables this matcher.
1867 
1868  RuleBasedBreakIterator *fWordBreakItr;
1869 };
1870 
1872 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1873 #endif
virtual UClassID getDynamicClassID() const
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:91
Class RegexPattern represents a compiled regular expression.
Definition: regex.h:85
UBool URegexFindProgressCallback(const void *context, int64_t matchIndex)
Function pointer for a regular expression find callback function.
Definition: uregex.h:1573
C API: Abstract Unicode Text API.
class RegexMatcher bundles together a regular expression pattern and input text to which the expressi...
Definition: regex.h:662
#define U_I18N_API
Set to export library symbols from inside the i18n library, and to import them from outside...
Definition: utypes.h:358
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
C API: Regular Expressions.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:332
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:276
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:312
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
UBool URegexMatchCallback(const void *context, int32_t steps)
Function pointer for a regular expression matching callback function.
Definition: uregex.h:1499
C API: Parse Error Information.
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
UBool operator!=(const RegexPattern &that) const
Comparison operator.
Definition: regex.h:130
UText struct.
Definition: utext.h:1343
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:65
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:56
Basic definitions for ICU, for both C and C++ APIs.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:245
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:221
int8_t UBool
The ICU boolean type.
Definition: umachine.h:234