git.fiddlerwoaroof.com
Browse code

reformat

Ed Langley authored on 10/03/2019 19:49:01
Showing 19 changed files
... ...
@@ -1,26 +1,26 @@
1 1
 <?xml version="1.0" encoding="UTF-8"?>
2 2
 
3 3
 <schema version="1.0" xmlns="http://www.apple.com/metadata"
4
-	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
5
-	xsi:schemaLocation="http://www.apple.com/metadata file:///System/Library/Frameworks/CoreServices.framework/Frameworks/Metadata.framework/Resources/MetadataSchema.xsd">
6
-
7
-	<attributes>
8
-		<attribute name="org_lisp_definitions" multivalued="true" type="CFString"/>
9
-		<attribute name="org_lisp_defuns" multivalued="true" type="CFString"/>
10
-		<attribute name="org_lisp_defmethods" multivalued="true" type="CFString"/>
4
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
5
+    xsi:schemaLocation="http://www.apple.com/metadata file:///System/Library/Frameworks/CoreServices.framework/Frameworks/Metadata.framework/Resources/MetadataSchema.xsd">
6
+    
7
+    <attributes>
8
+        <attribute name="org_lisp_definitions" multivalued="true" type="CFString"/>
9
+        <attribute name="org_lisp_defuns" multivalued="true" type="CFString"/>
10
+        <attribute name="org_lisp_defmethods" multivalued="true" type="CFString"/>
11 11
         <attribute name="org_lisp_defpackages" multivalued="true" type="CFString"/>
12
-		<attribute name="org_lisp_defgenerics" multivalued="true" type="CFString"/>
13
-		<attribute name="org_lisp_defmacros" multivalued="true" type="CFString"/>
14
-		<attribute name="org_lisp_defvars" multivalued="true" type="CFString"/>
15
-		<attribute name="org_lisp_defclasses" multivalued="true" type="CFString"/>
16
-		<attribute name="org_lisp_defstructs" multivalued="true" type="CFString"/>
17
-	</attributes>
12
+        <attribute name="org_lisp_defgenerics" multivalued="true" type="CFString"/>
13
+        <attribute name="org_lisp_defmacros" multivalued="true" type="CFString"/>
14
+        <attribute name="org_lisp_defvars" multivalued="true" type="CFString"/>
15
+        <attribute name="org_lisp_defclasses" multivalued="true" type="CFString"/>
16
+        <attribute name="org_lisp_defstructs" multivalued="true" type="CFString"/>
17
+    </attributes>
18 18
     
19
-	<types>
20
-		<type name="org.lisp.lisp-source">
21
-			<allattrs>kMDItemTextContent org_lisp_definitions org_lisp_defuns org_lisp_defmethods org_lisp_defgenerics org_lisp_defmacros org_lisp_defvars org_lisp_defclasses org_lisp_defstructs</allattrs>
19
+    <types>
20
+        <type name="org.lisp.lisp-source">
21
+            <allattrs>kMDItemTextContent org_lisp_definitions org_lisp_defuns org_lisp_defmethods org_lisp_defgenerics org_lisp_defmacros org_lisp_defvars org_lisp_defclasses org_lisp_defstructs</allattrs>
22 22
             <displayattrs>org_lisp_defpackages org_lisp_defclasses org_lisp_defgenerics org_lisp_defuns org_lisp_defmacros</displayattrs>
23
-		</type>
24
-	</types>
23
+        </type>
24
+    </types>
25 25
 </schema>
26 26
 
... ...
@@ -16,213 +16,213 @@
16 16
 @class AGRegex, NSArray, NSString;
17 17
 
18 18
 /*!
19
-@enum Options 
20
-Options defined for -initWithPattern:options:. Two or more options can be combined with the bitwise OR operator.
21
-@constant AGRegexCaseInsensitive Matching is case insensitive. Equivalent to /i in Perl.
22
-@constant AGRegexDotAll Dot metacharacter matches any character including newline. Equivalent to /s in Perl.
23
-@constant AGRegexExtended Allow whitespace and comments in the pattern. Equivalent to /x in Perl.
24
-@constant AGRegexLazy Makes greedy quantifiers lazy and lazy quantifiers greedy. No equivalent in Perl.
25
-@constant AGRegexMultiline Caret and dollar anchors match at newline. Equivalent to /m in Perl.
26
-*/
19
+ @enum Options 
20
+ Options defined for -initWithPattern:options:. Two or more options can be combined with the bitwise OR operator.
21
+ @constant AGRegexCaseInsensitive Matching is case insensitive. Equivalent to /i in Perl.
22
+ @constant AGRegexDotAll Dot metacharacter matches any character including newline. Equivalent to /s in Perl.
23
+ @constant AGRegexExtended Allow whitespace and comments in the pattern. Equivalent to /x in Perl.
24
+ @constant AGRegexLazy Makes greedy quantifiers lazy and lazy quantifiers greedy. No equivalent in Perl.
25
+ @constant AGRegexMultiline Caret and dollar anchors match at newline. Equivalent to /m in Perl.
26
+ */
27 27
 enum {
28
-	AGRegexCaseInsensitive = 1,
29
-	AGRegexDotAll = 2,
30
-	AGRegexExtended = 4,
31
-	AGRegexLazy = 8,
32
-	AGRegexMultiline = 16
28
+    AGRegexCaseInsensitive = 1,
29
+    AGRegexDotAll = 2,
30
+    AGRegexExtended = 4,
31
+    AGRegexLazy = 8,
32
+    AGRegexMultiline = 16
33 33
 };
34 34
 
35 35
 /*!
36
-@class AGRegexMatch
37
-@abstract A single occurence of a regular expression.
38
-@discussion An AGRegexMatch represents a single occurence of a regular expression within the target string. The range of each subpattern within the target string is returned by -range, -rangeAtIndex:, or -rangeNamed:. The part of the target string that matched each subpattern is returned by -group, -groupAtIndex:, or -groupNamed:.
39
-*/
36
+ @class AGRegexMatch
37
+ @abstract A single occurrence of a regular expression.
38
+ @discussion An AGRegexMatch represents a single occurrence of a regular expression within the target string. The range of each subpattern within the target string is returned by -range, -rangeAtIndex:, or -rangeNamed:. The part of the target string that matched each subpattern is returned by -group, -groupAtIndex:, or -groupNamed:.
39
+ */
40 40
 @interface AGRegexMatch : NSObject {
41
-	AGRegex *regex;
42
-	NSString *string;
43
-	int *matchv;
44
-	int count;
41
+    AGRegex *regex;
42
+    NSString *string;
43
+    int *matchv;
44
+    int count;
45 45
 }
46 46
 
47 47
 /*!
48
-@method count
49
-The number of capturing subpatterns, including the pattern itself. */
48
+ @method count
49
+ The number of capturing subpatterns, including the pattern itself. */
50 50
 - (int)count;
51 51
 
52 52
 /*!
53
-@method group
54
-Returns the part of the target string that matched the pattern. */
53
+ @method group
54
+ Returns the part of the target string that matched the pattern. */
55 55
 - (NSString *)group;
56 56
 
57 57
 /*!
58
-@method groupAtIndex:
59
-Returns the part of the target string that matched the subpattern at the given index or nil if it wasn't matched. The subpatterns are indexed in order of their opening parentheses, 0 is the entire pattern, 1 is the first capturing subpattern, and so on. */
58
+ @method groupAtIndex:
59
+ Returns the part of the target string that matched the subpattern at the given index or nil if it wasn't matched. The subpatterns are indexed in order of their opening parentheses, 0 is the entire pattern, 1 is the first capturing subpattern, and so on. */
60 60
 - (NSString *)groupAtIndex:(int)idx;
61 61
 
62 62
 /*!
63
-@method groupNamed:
64
-Returns the part of the target string that matched the subpattern of the given name or nil if it wasn't matched. */
63
+ @method groupNamed:
64
+ Returns the part of the target string that matched the subpattern of the given name or nil if it wasn't matched. */
65 65
 - (NSString *)groupNamed:(NSString *)name;
66 66
 
67 67
 /*!
68
-@method range
69
-Returns the range of the target string that matched the pattern. */
68
+ @method range
69
+ Returns the range of the target string that matched the pattern. */
70 70
 - (NSRange)range;
71 71
 
72 72
 /*!
73
-@method rangeAtIndex:
74
-Returns the range of the target string that matched the subpattern at the given index or {NSNotFound, 0} if it wasn't matched. The subpatterns are indexed in order of their opening parentheses, 0 is the entire pattern, 1 is the first capturing subpattern, and so on. */
73
+ @method rangeAtIndex:
74
+ Returns the range of the target string that matched the subpattern at the given index or {NSNotFound, 0} if it wasn't matched. The subpatterns are indexed in order of their opening parentheses, 0 is the entire pattern, 1 is the first capturing subpattern, and so on. */
75 75
 - (NSRange)rangeAtIndex:(int)idx;
76 76
 
77 77
 /*!
78
-@method rangeNamed:
79
-Returns the range of the target string that matched the subpattern of the given name or {NSNotFound, 0} if it wasn't matched. */
78
+ @method rangeNamed:
79
+ Returns the range of the target string that matched the subpattern of the given name or {NSNotFound, 0} if it wasn't matched. */
80 80
 - (NSRange)rangeNamed:(NSString *)name;
81 81
 
82 82
 /*!
83
-@method string
84
-Returns the target string. */
83
+ @method string
84
+ Returns the target string. */
85 85
 - (NSString *)string;
86 86
 
87 87
 @end
88 88
 
89 89
 /*!
90
-@class AGRegex
91
-@abstract An Perl-compatible regular expression class.
92
-@discussion An AGRegex is created with -initWithPattern: or -initWithPattern:options: or the corresponding class methods +regexWithPattern: or +regexWithPattern:options:. These take a regular expression pattern string and the bitwise OR of zero or more option flags. For example:
93
-
94
-<code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [[AGRegex alloc] initWithPattern:&#64;"(paran|andr)oid" options:AGRegexCaseInsensitive];</code>
95
-
96
-Matching is done with -findInString: or -findInString:range: which look for the first occurrence of the pattern in the target string and return an AGRegexMatch or nil if the pattern was not found.
97
-
98
-<code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegexMatch *match = [regex findInString:&#64;"paranoid android"];</code>
99
-    
100
-A match object returns a captured subpattern by -group, -groupAtIndex:, or -groupNamed:, or the range of a captured subpattern by -range, -rangeAtIndex:, or -rangeNamed:. The subpatterns are indexed in order of their opening parentheses, 0 is the entire pattern, 1 is the first capturing subpattern, and so on. -count returns the total number of subpatterns, including the pattern itself. The following prints the result of our last match case:
101
-
102
-<code>&nbsp;&nbsp;&nbsp;&nbsp;for (i = 0; i &lt; [match count]; i++)<br />
103
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NSLog(&#64;"%d %&#64; %&#64;", i, NSStringFromRange([match rangeAtIndex:i]), [match groupAtIndex:i]);</code>
104
-
105
-<code>&nbsp;&nbsp;&nbsp;&nbsp;0 {0, 8} paranoid<br />
106
-&nbsp;&nbsp;&nbsp;&nbsp;1 {0, 5} paran</code>
107
-
108
-If any of the subpatterns didn't match, -groupAtIndex: will  return nil, and -rangeAtIndex: will return {NSNotFound, 0}. For example, if we change our original pattern to "(?:(paran)|(andr))oid" we will get the following output:
109
-
110
-<code>&nbsp;&nbsp;&nbsp;&nbsp;0 {0, 8} paranoid<br />
111
-&nbsp;&nbsp;&nbsp;&nbsp;1 {0, 5} paran<br />
112
-&nbsp;&nbsp;&nbsp;&nbsp;2 {2147483647, 0} (null)</code>
113
-
114
--findAllInString: and -findAllInString:range: return an NSArray of all non-overlapping occurrences of the pattern in the target string. -findEnumeratorInString: and -findEnumeratorInString:range: return an NSEnumerator for all non-overlapping occurrences of the pattern in the target string. For example,
115
-
116
-<code>&nbsp;&nbsp;&nbsp;&nbsp;NSArray *all = [regex findAllInString:&#64;"paranoid android"];</code>
117
-
118
-The first object in the returned array is the match case for "paranoid" and the second object is the match case for "android".
119
-
120
-AGRegex provides the methods -replaceWithString:inString: and -replaceWithString:inString:limit: to perform substitution on strings.
121
-
122
-<code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"remote"];<br />
123
-&nbsp;&nbsp;&nbsp;&nbsp;NSString *result = [regex replaceWithString:&#64;"complete" inString:&#64;"remote control"]; // result is "complete control"</code>
124
-
125
-Captured subpatterns can be interpolated into the replacement string using the syntax $x or ${x} where x is the index or name of the subpattern. $0 and $& both refer to the entire pattern. Additionally, the case modifier sequences \U...\E, \L...\E, \u, and \l are allowed in the replacement string. All other escape sequences are handled literally.
126
-
127
-<code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"[usr]"];<br />
128
-&nbsp;&nbsp;&nbsp;&nbsp;NSString *result = [regex replaceWithString:&#64;"\\u$&amp;." inString:&#64;"Back in the ussr"]; // result is "Back in the U.S.S.R."</code>
129
-
130
-Note that you have to escape a backslash to get it into an NSString literal. 
131
-
132
-Named subpatterns may also be used in the pattern and replacement strings, like in Python. 
133
-
134
-<code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"(?P&lt;who&gt;\\w+) is a (?P&lt;what&gt;\\w+)"];<br />
135
-&nbsp;&nbsp;&nbsp;&nbsp;NSString *result = [regex replaceWithString:&#64;"Jackie is a $what, $who is a runt" inString:&#64;"Judy is a punk"]); // result is "Jackie is a punk, Judy is a runt"</code>
136
-
137
-Finally, AGRegex provides -splitString: and -splitString:limit: which return an NSArray created by splitting the target string at each occurrence of the pattern. For example:
138
-
139
-<code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"ea?"];<br />
140
-&nbsp;&nbsp;&nbsp;&nbsp;NSArray *result = [regex splitString:&#64;"Repeater"]; // result is "R", "p", "t", "r"</code>
141
-
142
-If there are captured subpatterns, they are returned in the array. 
143
-
144
-<code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"e(a)?"];<br />
145
-&nbsp;&nbsp;&nbsp;&nbsp;NSArray *result = [regex splitString:&#64;"Repeater"]; // result is "R", "p", "a", "t", "r"</code>
146
-
147
-In Perl, this would return "R", undef, "p", "a", "t", undef, "r". Unfortunately, there is no convenient way to represent this in an NSArray. (NSNull could be used in place of undef, but then all members of the array couldn't be expected to be NSStrings.)
148
-*/
90
+ @class AGRegex
91
+ @abstract A Perl-compatible regular expression class.
92
+ @discussion An AGRegex is created with -initWithPattern: or -initWithPattern:options: or the corresponding class methods +regexWithPattern: or +regexWithPattern:options:. These take a regular expression pattern string and the bitwise OR of zero or more option flags. For example:
93
+ 
94
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [[AGRegex alloc] initWithPattern:&#64;"(paran|andr)oid" options:AGRegexCaseInsensitive];</code>
95
+ 
96
+ Matching is done with -findInString: or -findInString:range: which look for the first occurrence of the pattern in the target string and return an AGRegexMatch or nil if the pattern was not found.
97
+ 
98
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegexMatch *match = [regex findInString:&#64;"paranoid android"];</code>
99
+ 
100
+ A match object returns a captured subpattern by -group, -groupAtIndex:, or -groupNamed:, or the range of a captured subpattern by -range, -rangeAtIndex:, or -rangeNamed:. The subpatterns are indexed in order of their opening parentheses, 0 is the entire pattern, 1 is the first capturing subpattern, and so on. -count returns the total number of subpatterns, including the pattern itself. The following prints the result of our last match case:
101
+ 
102
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;for (i = 0; i &lt; [match count]; i++)<br />
103
+ &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;NSLog(&#64;"%d %&#64; %&#64;", i, NSStringFromRange([match rangeAtIndex:i]), [match groupAtIndex:i]);</code>
104
+ 
105
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;0 {0, 8} paranoid<br />
106
+ &nbsp;&nbsp;&nbsp;&nbsp;1 {0, 5} paran</code>
107
+ 
108
+ If any of the subpatterns didn't match, -groupAtIndex: will  return nil, and -rangeAtIndex: will return {NSNotFound, 0}. For example, if we change our original pattern to "(?:(paran)|(andr))oid" we will get the following output:
109
+ 
110
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;0 {0, 8} paranoid<br />
111
+ &nbsp;&nbsp;&nbsp;&nbsp;1 {0, 5} paran<br />
112
+ &nbsp;&nbsp;&nbsp;&nbsp;2 {2147483647, 0} (null)</code>
113
+ 
114
+ -findAllInString: and -findAllInString:range: return an NSArray of all non-overlapping occurrences of the pattern in the target string. -findEnumeratorInString: and -findEnumeratorInString:range: return an NSEnumerator for all non-overlapping occurrences of the pattern in the target string. For example,
115
+ 
116
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;NSArray *all = [regex findAllInString:&#64;"paranoid android"];</code>
117
+ 
118
+ The first object in the returned array is the match case for "paranoid" and the second object is the match case for "android".
119
+ 
120
+ AGRegex provides the methods -replaceWithString:inString: and -replaceWithString:inString:limit: to perform substitution on strings.
121
+ 
122
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"remote"];<br />
123
+ &nbsp;&nbsp;&nbsp;&nbsp;NSString *result = [regex replaceWithString:&#64;"complete" inString:&#64;"remote control"]; // result is "complete control"</code>
124
+ 
125
+ Captured subpatterns can be interpolated into the replacement string using the syntax $x or ${x} where x is the index or name of the subpattern. $0 and $& both refer to the entire pattern. Additionally, the case modifier sequences \U...\E, \L...\E, \u, and \l are allowed in the replacement string. All other escape sequences are handled literally.
126
+ 
127
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"[usr]"];<br />
128
+ &nbsp;&nbsp;&nbsp;&nbsp;NSString *result = [regex replaceWithString:&#64;"\\u$&amp;." inString:&#64;"Back in the ussr"]; // result is "Back in the U.S.S.R."</code>
129
+ 
130
+ Note that you have to escape a backslash to get it into an NSString literal. 
131
+ 
132
+ Named subpatterns may also be used in the pattern and replacement strings, like in Python. 
133
+ 
134
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"(?P&lt;who&gt;\\w+) is a (?P&lt;what&gt;\\w+)"];<br />
135
+ &nbsp;&nbsp;&nbsp;&nbsp;NSString *result = [regex replaceWithString:&#64;"Jackie is a $what, $who is a runt" inString:&#64;"Judy is a punk"]; // result is "Jackie is a punk, Judy is a runt"</code>
136
+ 
137
+ Finally, AGRegex provides -splitString: and -splitString:limit: which return an NSArray created by splitting the target string at each occurrence of the pattern. For example:
138
+ 
139
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"ea?"];<br />
140
+ &nbsp;&nbsp;&nbsp;&nbsp;NSArray *result = [regex splitString:&#64;"Repeater"]; // result is "R", "p", "t", "r"</code>
141
+ 
142
+ If there are captured subpatterns, they are returned in the array. 
143
+ 
144
+ <code>&nbsp;&nbsp;&nbsp;&nbsp;AGRegex *regex = [AGRegex regexWithPattern:&#64;"e(a)?"];<br />
145
+ &nbsp;&nbsp;&nbsp;&nbsp;NSArray *result = [regex splitString:&#64;"Repeater"]; // result is "R", "p", "a", "t", "r"</code>
146
+ 
147
+ In Perl, this would return "R", undef, "p", "a", "t", undef, "r". Unfortunately, there is no convenient way to represent this in an NSArray. (NSNull could be used in place of undef, but then all members of the array couldn't be expected to be NSStrings.)
148
+ */
149 149
 @interface AGRegex : NSObject {
150
-	void *regex;
151
-	void *extra;
152
-	int groupCount;
150
+    void *regex;
151
+    void *extra;
152
+    int groupCount;
153 153
 }
154 154
 
155 155
 /*!
156
-@method regexWithPattern:
157
-Creates a new regex using the given pattern string. Returns nil if the pattern string is invalid. */
156
+ @method regexWithPattern:
157
+ Creates a new regex using the given pattern string. Returns nil if the pattern string is invalid. */
158 158
 + (id)regexWithPattern:(NSString *)pat;
159 159
 
160 160
 /*!
161
-@method regexWithPattern:options:
162
-Creates a new regex using the given pattern string and option flags. Returns nil if the pattern string is invalid. */
161
+ @method regexWithPattern:options:
162
+ Creates a new regex using the given pattern string and option flags. Returns nil if the pattern string is invalid. */
163 163
 + (id)regexWithPattern:(NSString *)pat options:(int)opts;
164 164
 
165 165
 
166 166
 /*!
167
-@method initWithPattern:
168
-Initializes the regex using the given pattern string. Returns nil if the pattern string is invalid. */
167
+ @method initWithPattern:
168
+ Initializes the regex using the given pattern string. Returns nil if the pattern string is invalid. */
169 169
 - (id)initWithPattern:(NSString *)pat;
170 170
 
171 171
 /*!
172
-@method initWithPattern:options:
173
-Initializes the regex using the given pattern string and option flags. Returns nil if the pattern string is invalid. */
172
+ @method initWithPattern:options:
173
+ Initializes the regex using the given pattern string and option flags. Returns nil if the pattern string is invalid. */
174 174
 - (id)initWithPattern:(NSString *)pat options:(int)opts;
175 175
 
176 176
 /*!
177
-@method findInString:
178
-Calls findInString:range: using the full range of the target string. */
177
+ @method findInString:
178
+ Calls findInString:range: using the full range of the target string. */
179 179
 - (AGRegexMatch *)findInString:(NSString *)str;
180 180
 
181 181
 /*!
182
-@method findInString:range:
183
-Returns an AGRegexMatch for the first occurrence of the regex in the given range of the target string or nil if none is found. */
182
+ @method findInString:range:
183
+ Returns an AGRegexMatch for the first occurrence of the regex in the given range of the target string or nil if none is found. */
184 184
 - (AGRegexMatch *)findInString:(NSString *)str range:(NSRange)r;
185 185
 
186 186
 /*!
187
-@method findAllInString:
188
-Calls findAllInString:range: using the full range of the target string. */
187
+ @method findAllInString:
188
+ Calls findAllInString:range: using the full range of the target string. */
189 189
 - (NSArray *)findAllInString:(NSString *)str;
190 190
 
191 191
 /*!
192
-@method findAllInString:range:
193
-Returns an array of all non-overlapping occurrences of the regex in the given range of the target string. The members of the array are AGRegexMatches. */
192
+ @method findAllInString:range:
193
+ Returns an array of all non-overlapping occurrences of the regex in the given range of the target string. The members of the array are AGRegexMatches. */
194 194
 - (NSArray *)findAllInString:(NSString *)str range:(NSRange)r;
195 195
 
196 196
 /*!
197
-@method findEnumeratorInString:
198
-Calls findEnumeratorInString:range: using the full range of the target string. */
197
+ @method findEnumeratorInString:
198
+ Calls findEnumeratorInString:range: using the full range of the target string. */
199 199
 - (NSEnumerator *)findEnumeratorInString:(NSString *)str;
200 200
 
201 201
 /*!
202
-@method findEnumeratorInString:range:
203
-Returns an enumerator for all non-overlapping occurrences of the regex in the given range of the target string. The objects returned by the enumerator are AGRegexMatches. */
202
+ @method findEnumeratorInString:range:
203
+ Returns an enumerator for all non-overlapping occurrences of the regex in the given range of the target string. The objects returned by the enumerator are AGRegexMatches. */
204 204
 - (NSEnumerator *)findEnumeratorInString:(NSString *)str range:(NSRange)r;
205 205
 
206 206
 /*!
207
-@method replaceWithString:inString:
208
-Calls replaceWithString:inString:limit: with no limit. */
207
+ @method replaceWithString:inString:
208
+ Calls replaceWithString:inString:limit: with no limit. */
209 209
 - (NSString *)replaceWithString:(NSString *)rep inString:(NSString *)str;
210 210
 
211 211
 /*!
212
-@method replaceWithString:inString:limit:
213
-Returns the string created by replacing occurrences of the regex in the target string with the replacement string. If the limit is positive, no more than that many replacements will be made.
214
-
215
-Captured subpatterns can be interpolated into the replacement string using the syntax $x or ${x} where x is the index or name of the subpattern. $0 and $&amp; both refer to the entire pattern. Additionally, the case modifier sequences \U...\E, \L...\E, \u, and \l are allowed in the replacement string. All other escape sequences are handled literally. */
212
+ @method replaceWithString:inString:limit:
213
+ Returns the string created by replacing occurrences of the regex in the target string with the replacement string. If the limit is positive, no more than that many replacements will be made.
214
+ 
215
+ Captured subpatterns can be interpolated into the replacement string using the syntax $x or ${x} where x is the index or name of the subpattern. $0 and $&amp; both refer to the entire pattern. Additionally, the case modifier sequences \U...\E, \L...\E, \u, and \l are allowed in the replacement string. All other escape sequences are handled literally. */
216 216
 - (NSString *)replaceWithString:(NSString *)rep inString:(NSString *)str limit:(int)limit;
217 217
 
218 218
 /*!
219
-@method splitString:
220
-Call splitString:limit: with no limit. */
219
+ @method splitString:
220
+ Calls splitString:limit: with no limit. */
221 221
 - (NSArray *)splitString:(NSString *)str;
222 222
 
223 223
 /*!
224
-@method splitString:limit:
225
-Returns an array of strings created by splitting the target string at each occurrence of the pattern. If the limit is positive, no more than that many splits will be made. If there are captured subpatterns, they are returned in the array.  */
224
+ @method splitString:limit:
225
+ Returns an array of strings created by splitting the target string at each occurrence of the pattern. If the limit is positive, no more than that many splits will be made. If there are captured subpatterns, they are returned in the array.  */
226 226
 - (NSArray *)splitString:(NSString *)str limit:(int)lim;
227 227
 
228
-@end
229 228
\ No newline at end of file
229
+@end
... ...
@@ -31,22 +31,22 @@
31 31
 
32 32
 // information about a case modifier
33 33
 typedef struct {
34
-	unsigned location;
35
-	char type;
34
+    unsigned location;
35
+    char type;
36 36
 } case_modifier_t;
37 37
 
38 38
 #ifdef SUPPORT_UTF8
39 39
 // count the number of UTF-8 characters in a string
40 40
 // there is probably a better way to do this but this works for now
41 41
 static int utf8charcount(const char *str, int len) {
42
-	int chars, pos;
43
-	unsigned char c;
44
-	for (pos = chars = 0; pos < len; pos++) {
45
-		c = str[pos];
46
-		if (c <= 0x7f || (0xc0 <= c && c <= 0xfd))
47
-			chars++;
48
-	}
49
-	return chars;
42
+    int chars, pos;
43
+    unsigned char c;
44
+    for (pos = chars = 0; pos < len; pos++) {
45
+        c = str[pos];
46
+        if (c <= 0x7f || (0xc0 <= c && c <= 0xfd))
47
+            chars++;
48
+    }
49
+    return chars;
50 50
 }
51 51
 #else
52 52
 #define utf8charcount(str, len) (len)
... ...
@@ -75,11 +75,11 @@ static int utf8charcount(const char *str, int len) {
75 75
 static AGRegex *backrefPattern;
76 76
 
77 77
 + (void)initialize {
78
-	static BOOL initialized = NO;
79
-	if (initialized) return;
80
-	initialized = YES;
81
-	[super initialize];
82
-	backrefPattern = [[AGRegex alloc] initWithPattern:BACKREF_PATTERN];
78
+    static BOOL initialized = NO;
79
+    if (initialized) return;
80
+    initialized = YES;
81
+    [super initialize];
82
+    backrefPattern = [[AGRegex alloc] initWithPattern:BACKREF_PATTERN];
83 83
 }
84 84
 
85 85
 + (id)regexWithPattern:(NSString *)pat { return [[[self alloc] initWithPattern:pat] autorelease]; }
... ...
@@ -87,87 +87,87 @@ static AGRegex *backrefPattern;
87 87
 + (id)regexWithPattern:(NSString *)pat options:(int)opts { return [[[self alloc] initWithPattern:pat options:opts ] autorelease]; }
88 88
 
89 89
 - (id)init {
90
-	return [self initWithPattern:@""];
90
+    return [self initWithPattern:@""];
91 91
 }
92 92
 
93 93
 - (id)initWithPattern:(NSString *)pat {
94
-	return [self initWithPattern:pat options:0];
94
+    return [self initWithPattern:pat options:0];
95 95
 }
96 96
 
97 97
 - (id)initWithPattern:(NSString *)pat options:(int)opts {
98
-	if (self = [super init]) {
99
-		const char *emsg;
100
-		int eloc, copts = 0;
101
-		if (opts & AGRegexCaseInsensitive)	copts |= PCRE_CASELESS;
102
-		if (opts & AGRegexDotAll)			copts |= PCRE_DOTALL;
103
-		if (opts & AGRegexExtended)			copts |= PCRE_EXTENDED;
104
-		if (opts & AGRegexLazy)				copts |= PCRE_UNGREEDY;
105
-		if (opts & AGRegexMultiline)		copts |= PCRE_MULTILINE;
98
+    if (self = [super init]) {
99
+        const char *emsg;
100
+        int eloc, copts = 0;
101
+        if (opts & AGRegexCaseInsensitive)	copts |= PCRE_CASELESS;
102
+        if (opts & AGRegexDotAll)			copts |= PCRE_DOTALL;
103
+        if (opts & AGRegexExtended)			copts |= PCRE_EXTENDED;
104
+        if (opts & AGRegexLazy)				copts |= PCRE_UNGREEDY;
105
+        if (opts & AGRegexMultiline)		copts |= PCRE_MULTILINE;
106 106
 #ifdef SUPPORT_UTF8
107
-		copts |= PCRE_UTF8;
107
+        copts |= PCRE_UTF8;
108 108
 #else
109
-		// check for valid ASCII string
110
-		if (![pat canBeConvertedToEncoding:NSASCIIStringEncoding]) {
111
-			[self release];
112
-			return nil;
113
-		}
109
+        // check for valid ASCII string
110
+        if (![pat canBeConvertedToEncoding:NSASCIIStringEncoding]) {
111
+            [self release];
112
+            return nil;
113
+        }
114 114
 #endif
115
-		if (!(regex = pcre_compile([pat UTF8String], copts, &emsg, &eloc, NULL))) {
116
-			[self release];
117
-			return nil;
118
-		}
119
-		if (pcre_fullinfo(regex, NULL, PCRE_INFO_CAPTURECOUNT, &groupCount)) {
120
-			[self release];
121
-			return nil;
122
-		}
123
-		groupCount++;
124
-	}
125
-	return self;
115
+        if (!(regex = pcre_compile([pat UTF8String], copts, &emsg, &eloc, NULL))) {
116
+            [self release];
117
+            return nil;
118
+        }
119
+        if (pcre_fullinfo(regex, NULL, PCRE_INFO_CAPTURECOUNT, &groupCount)) {
120
+            [self release];
121
+            return nil;
122
+        }
123
+        groupCount++;
124
+    }
125
+    return self;
126 126
 }
127 127
 
128 128
 - (void)dealloc {
129
-	pcre_free(regex);
130
-	pcre_free(extra);
131
-	[super dealloc];
129
+    pcre_free(regex);
130
+    pcre_free(extra);
131
+    [super dealloc];
132 132
 }
133 133
 
134 134
 - (AGRegexMatch *)findInString:(NSString *)str {
135
-	return [self findInString:str range:NSMakeRange(0, [str length])];
135
+    return [self findInString:str range:NSMakeRange(0, [str length])];
136 136
 }
137 137
 
138 138
 - (AGRegexMatch *)findInString:(NSString *)str range:(NSRange)range {
139
-	int error, length, options, *matchv;
140
-	length = [str length];
141
-	options = 0;
139
+    int error, length, options, *matchv;
140
+    length = [str length];
141
+    options = 0;
142 142
 #ifndef SUPPORT_UTF8
143
-	// check for valid ASCII string
144
-	if (![str canBeConvertedToEncoding:NSASCIIStringEncoding])
145
-		[NSException raise:@"%@ is not a valid ASCII string, build with UTF-8 support", str];
143
+    // check for valid ASCII string
144
+    if (![str canBeConvertedToEncoding:NSASCIIStringEncoding])
145
+        [NSException raise:@"%@ is not a valid ASCII string, build with UTF-8 support", str];
146 146
 #endif
147
-	// sanity check range
148
-	if (range.location + range.length > length)
149
-		[NSException raise:NSRangeException format:@"range %@ out of bounds", NSStringFromRange(range)];
150
-	// don't match $ anchor if range is before end of string
151
-	if (range.location + range.length < length)
152
-		options |= PCRE_NOTEOL;
153
-	// allocate match vector
154
-	NSAssert1(matchv = malloc(sizeof(int) * groupCount * 3), @"couldn't allocate match vector for %d items", groupCount * 3);
155
-	// convert character range to byte range
156
-	range.length = strlen([[str substringWithRange:range] UTF8String]);
157
-	range.location = strlen([[str substringToIndex:range.location] UTF8String]);
158
-	// try match
159
-	if ((error = pcre_exec(regex, extra, [str UTF8String], range.location + range.length, range.location, options, matchv, groupCount * 3)) == PCRE_ERROR_NOMATCH) {
160
-		free(matchv);
161
-		return nil;
162
-	}
163
-	// should not get any error besides PCRE_ERROR_NOMATCH
164
-	NSAssert1(error > 0, @"unexpected error pcre_exec(): %d", error);
165
-	// return the match, match object takes ownership of matchv
166
-	return [[[AGRegexMatch alloc] initWithRegex:self string:str vector:matchv count:groupCount] autorelease];
147
+    // sanity check range
148
+    if (range.location + range.length > length)
149
+        [NSException raise:NSRangeException format:@"range %@ out of bounds", NSStringFromRange(range)];
150
+    // don't match $ anchor if range is before end of string
151
+    if (range.location + range.length < length)
152
+        options |= PCRE_NOTEOL;
153
+    // allocate match vector
154
+    NSAssert1(matchv = malloc(sizeof(int) * groupCount * 3), @"couldn't allocate match vector for %d items", groupCount * 3);
155
+    // convert character range to byte range
156
+    range.length = strlen([[str substringWithRange:range] UTF8String]);
157
+    range.location = strlen([[str substringToIndex:range.location] UTF8String]);
158
+    // try match
159
+    if ((error = pcre_exec(regex, extra, [str UTF8String], range.location + range.length, range.location, options, matchv, groupCount * 3)) == PCRE_ERROR_NOMATCH) {
160
+        free(matchv);
161
+        return nil;
162
+    }
163
+    // should not get any error besides PCRE_ERROR_NOMATCH
164
+    NSAssert1(error > 0, @"unexpected error pcre_exec(): %d", error);
165
+    // return the match, match object takes ownership of matchv
166
+    return [[[AGRegexMatch alloc] initWithRegex:self string:str vector:matchv count:groupCount] autorelease];
167 167
 }
168 168
 
169 169
 - (NSArray *)findAllInString:(NSString *)str {
170
-	return [self findAllInString:str range:NSMakeRange(0, [str length])];
170
+    return [self findAllInString:str range:NSMakeRange(0, [str length])];
171 171
 }
172 172
 
173 173
 - (NSArray *)findAllInString:(NSString *)str range:(NSRange)range {
... ...
@@ -175,7 +175,7 @@ static AGRegex *backrefPattern;
175 175
 }
176 176
 
177 177
 - (NSEnumerator *)findEnumeratorInString:(NSString *)str {
178
-	return [self findEnumeratorInString:str range:NSMakeRange(0, [str length])];
178
+    return [self findEnumeratorInString:str range:NSMakeRange(0, [str length])];
179 179
 }
180 180
 
181 181
 - (NSEnumerator *)findEnumeratorInString:(NSString *)str range:(NSRange)r {
... ...
@@ -183,165 +183,165 @@ static AGRegex *backrefPattern;
183 183
 }
184 184
 
185 185
 - (NSString *)replaceWithString:(NSString *)rep inString:(NSString *)str {
186
-	return [self replaceWithString:rep inString:str limit:0];
186
+    return [self replaceWithString:rep inString:str limit:0];
187 187
 }
188 188
 
189 189
 - (NSString *)replaceWithString:(NSString *)rep inString:(NSString *)str limit:(int)lim {
190
-	NSMutableString *repBuffer, *result = [NSMutableString string];
191
-	AGRegexMatch *match, *backref;
192
-	NSArray *allMatches, *allBackrefs;
193
-	NSRange remainRange, matchRange, backrefRemainRange, backrefMatchRange;
194
-	case_modifier_t *caseModVector;
195
-	int i, j, k, l, length, repLength, allCount, allBackrefsCount, caseModIdx;
196
-	// set remaining range to full range of receiver
197
-	length = [str length];
198
-	remainRange = NSMakeRange(0, length);
199
-	// find all matches of pattern
200
-	allMatches = [self findAllInString:str];
201
-	allCount = [allMatches count];
202
-	// find all backrefs/escapes in replacement string
203
-	allBackrefs = [backrefPattern findAllInString:rep];
204
-	allBackrefsCount = [allBackrefs count];
205
-	repLength = [rep length];
206
-	// create case mod list
207
-	caseModVector = malloc(sizeof(case_modifier_t) * allCount * allBackrefsCount);
208
-	NSAssert1(caseModVector, @"couldn't allocate memory for %d case modifiers", allCount * allBackrefsCount);
209
-	// while limit is not reached and there are more matches to replace
210
-	for (i = 0; (lim < 1 || i < lim) && i < allCount; i++) {
211
-		// get the the next match
212
-		match = [allMatches objectAtIndex:i];
213
-		// build the replacement string
214
-		repBuffer = [NSMutableString string];
215
-		backrefRemainRange = NSMakeRange(0, repLength);
216
-		caseModIdx = 0;
217
-		for (j = 0; j < allBackrefsCount; j++) {
218
-			// get the next backref
219
-			backref = [allBackrefs objectAtIndex:j];
220
-			backrefMatchRange = [backref range];
221
-			// append the part before the backref
222
-			[repBuffer appendString:[rep substringWithRange:NSMakeRange(backrefRemainRange.location, backrefMatchRange.location - backrefRemainRange.location)]];
223
-			// interpret backref
224
-			if (IS_BACKREF(backref)) {
225
-				NSString *captured;
226
-				int idx;
227
-				if (IS_NAMED_BACKREF(backref)) {
228
-					NSString *backrefName = BACKREF_NAME(backref);
229
-					while ((idx = pcre_get_stringnumber(regex, [backrefName UTF8String])) == PCRE_ERROR_NOSUBSTRING && !BACKREF_IS_PARENTHESIZED(backref)) {
230
-						if (backrefMatchRange.length < 3) // need at least one letter
231
-							[NSException raise:NSInvalidArgumentException format:@"no backreference named %@ in pattern", backrefName];
232
-						backrefName = [backrefName substringToIndex:[backrefName length] - 1];
233
-						backrefMatchRange.length--;
234
-					}
235
-				}
236
-				else {
237
-					idx = BACKREF_INDEX(backref);
238
-					// in the case of multiple digits after $, chop it down to the highest valid index
239
-					while (idx >= [match count] && !BACKREF_IS_PARENTHESIZED(backref)) {
240
-						if (backrefMatchRange.length < 3) // need at least one digit
241
-							[NSException raise:NSInvalidArgumentException format:@"no such backreference %d in pattern", idx];
242
-						idx /= 10;
243
-						backrefMatchRange.length--;
244
-					}
245
-				}
246
-				// append the captured subpattern to ther replacement string
247
-				captured = [match groupAtIndex:idx];
248
-				[repBuffer appendString:captured ? captured : @""];
249
-			// handle case modifier
250
-			} else if (IS_CASE_MODIFIER(backref)) {
251
-				case_modifier_t caseMod;
252
-				caseMod.location = [repBuffer length];
253
-				caseMod.type = [CASE_MODIFIER_STRING(backref) UTF8String][0];
254
-				caseModVector[caseModIdx] = caseMod;
255
-				caseModIdx++;
256
-			// handle literal escape
257
-			} else {
258
-				NSAssert1(IS_LITERAL_ESCAPE(backref), @"%@ isn't a backref, case modifier, or literal escape!", backref);
259
-				[repBuffer appendString:LITERAL_ESCAPE_STRING(backref)];
260
-			}
261
-			// set the remaining range to the part after the match
262
-			backrefRemainRange.location = backrefMatchRange.location + backrefMatchRange.length;
263
-			backrefRemainRange.length = repLength - backrefRemainRange.location;
264
-		}
265
-		// append the remaining replacement string to repBuffer
266
-		[repBuffer appendString:[rep substringWithRange:backrefRemainRange]];
267
-		// interpret case modifiers
268
-		for (k = 0; k < caseModIdx; k++) {
269
-			NSRange caseModRange;
270
-			char caseModType = caseModVector[k].type;
271
-			switch (caseModType) {
272
-			case 'u':
273
-			case 'l':
274
-				caseModRange = NSMakeRange(caseModVector[k].location, 1);
275
-				break;
276
-			case 'U':
277
-			case 'L':
278
-				// assume case modifier applies to rest of string unless we find a terminator
279
-				caseModRange = NSMakeRange(caseModVector[k].location, [repBuffer length] - caseModVector[k].location);
280
-				for (l = k + 1; l < caseModIdx; l++)
281
-					if (caseModVector[l].type == 'E') {
282
-						caseModRange = NSMakeRange(caseModVector[k].location, caseModVector[l].location - caseModVector[k].location);
283
-						break;
284
-					}
285
-				break;
286
-			case 'E':
287
-				break;
288
-			}
289
-			if (caseModRange.location + caseModRange.length > [repBuffer length])
290
-				continue;
291
-			if (caseModType == 'u' || caseModType == 'U')
292
-				[repBuffer replaceCharactersInRange:caseModRange withString:[[repBuffer substringWithRange:caseModRange] uppercaseString]];
293
-			else if (caseModType == 'l' || caseModType == 'L')
294
-				[repBuffer replaceCharactersInRange:caseModRange withString:[[repBuffer substringWithRange:caseModRange] lowercaseString]];
295
-		}
296
-		// append the part of the target string before the match
297
-		matchRange = [match range];
298
-		[result appendString:[str substringWithRange:NSMakeRange(remainRange.location, matchRange.location - remainRange.location)]];
299
-		// append repBuffer
300
-		[result appendString:repBuffer];
301
-		// set the remaining range to the part after the match
302
-		remainRange.location = matchRange.location + matchRange.length;
303
-		remainRange.length = length - remainRange.location;
304
-	}
305
-	free(caseModVector);
306
-	// append the remaining string
307
-	[result appendString:[str substringWithRange:remainRange]];
308
-	return result;
190
+    NSMutableString *repBuffer, *result = [NSMutableString string];
191
+    AGRegexMatch *match, *backref;
192
+    NSArray *allMatches, *allBackrefs;
193
+    NSRange remainRange, matchRange, backrefRemainRange, backrefMatchRange;
194
+    case_modifier_t *caseModVector;
195
+    int i, j, k, l, length, repLength, allCount, allBackrefsCount, caseModIdx;
196
+    // set remaining range to full range of receiver
197
+    length = [str length];
198
+    remainRange = NSMakeRange(0, length);
199
+    // find all matches of pattern
200
+    allMatches = [self findAllInString:str];
201
+    allCount = [allMatches count];
202
+    // find all backrefs/escapes in replacement string
203
+    allBackrefs = [backrefPattern findAllInString:rep];
204
+    allBackrefsCount = [allBackrefs count];
205
+    repLength = [rep length];
206
+    // create case mod list
207
+    caseModVector = malloc(sizeof(case_modifier_t) * allCount * allBackrefsCount);
208
+    NSAssert1(caseModVector, @"couldn't allocate memory for %d case modifiers", allCount * allBackrefsCount);
209
+    // while limit is not reached and there are more matches to replace
210
+    for (i = 0; (lim < 1 || i < lim) && i < allCount; i++) {
211
+        // get the the next match
212
+        match = [allMatches objectAtIndex:i];
213
+        // build the replacement string
214
+        repBuffer = [NSMutableString string];
215
+        backrefRemainRange = NSMakeRange(0, repLength);
216
+        caseModIdx = 0;
217
+        for (j = 0; j < allBackrefsCount; j++) {
218
+            // get the next backref
219
+            backref = [allBackrefs objectAtIndex:j];
220
+            backrefMatchRange = [backref range];
221
+            // append the part before the backref
222
+            [repBuffer appendString:[rep substringWithRange:NSMakeRange(backrefRemainRange.location, backrefMatchRange.location - backrefRemainRange.location)]];
223
+            // interpret backref
224
+            if (IS_BACKREF(backref)) {
225
+                NSString *captured;
226
+                int idx;
227
+                if (IS_NAMED_BACKREF(backref)) {
228
+                    NSString *backrefName = BACKREF_NAME(backref);
229
+                    while ((idx = pcre_get_stringnumber(regex, [backrefName UTF8String])) == PCRE_ERROR_NOSUBSTRING && !BACKREF_IS_PARENTHESIZED(backref)) {
230
+                        if (backrefMatchRange.length < 3) // need at least one letter
231
+                            [NSException raise:NSInvalidArgumentException format:@"no backreference named %@ in pattern", backrefName];
232
+                        backrefName = [backrefName substringToIndex:[backrefName length] - 1];
233
+                        backrefMatchRange.length--;
234
+                    }
235
+                }
236
+                else {
237
+                    idx = BACKREF_INDEX(backref);
238
+                    // in the case of multiple digits after $, chop it down to the highest valid index
239
+                    while (idx >= [match count] && !BACKREF_IS_PARENTHESIZED(backref)) {
240
+                        if (backrefMatchRange.length < 3) // need at least one digit
241
+                            [NSException raise:NSInvalidArgumentException format:@"no such backreference %d in pattern", idx];
242
+                        idx /= 10;
243
+                        backrefMatchRange.length--;
244
+                    }
245
+                }
246
+                // append the captured subpattern to ther replacement string
247
+                captured = [match groupAtIndex:idx];
248
+                [repBuffer appendString:captured ? captured : @""];
249
+                // handle case modifier
250
+            } else if (IS_CASE_MODIFIER(backref)) {
251
+                case_modifier_t caseMod;
252
+                caseMod.location = [repBuffer length];
253
+                caseMod.type = [CASE_MODIFIER_STRING(backref) UTF8String][0];
254
+                caseModVector[caseModIdx] = caseMod;
255
+                caseModIdx++;
256
+                // handle literal escape
257
+            } else {
258
+                NSAssert1(IS_LITERAL_ESCAPE(backref), @"%@ isn't a backref, case modifier, or literal escape!", backref);
259
+                [repBuffer appendString:LITERAL_ESCAPE_STRING(backref)];
260
+            }
261
+            // set the remaining range to the part after the match
262
+            backrefRemainRange.location = backrefMatchRange.location + backrefMatchRange.length;
263
+            backrefRemainRange.length = repLength - backrefRemainRange.location;
264
+        }
265
+        // append the remaining replacement string to repBuffer
266
+        [repBuffer appendString:[rep substringWithRange:backrefRemainRange]];
267
+        // interpret case modifiers
268
+        for (k = 0; k < caseModIdx; k++) {
269
+            NSRange caseModRange;
270
+            char caseModType = caseModVector[k].type;
271
+            switch (caseModType) {
272
+                case 'u':
273
+                case 'l':
274
+                    caseModRange = NSMakeRange(caseModVector[k].location, 1);
275
+                    break;
276
+                case 'U':
277
+                case 'L':
278
+                    // assume case modifier applies to rest of string unless we find a terminator
279
+                    caseModRange = NSMakeRange(caseModVector[k].location, [repBuffer length] - caseModVector[k].location);
280
+                    for (l = k + 1; l < caseModIdx; l++)
281
+                        if (caseModVector[l].type == 'E') {
282
+                            caseModRange = NSMakeRange(caseModVector[k].location, caseModVector[l].location - caseModVector[k].location);
283
+                            break;
284
+                        }
285
+                    break;
286
+                case 'E':
287
+                    break;
288
+            }
289
+            if (caseModRange.location + caseModRange.length > [repBuffer length])
290
+                continue;
291
+            if (caseModType == 'u' || caseModType == 'U')
292
+                [repBuffer replaceCharactersInRange:caseModRange withString:[[repBuffer substringWithRange:caseModRange] uppercaseString]];
293
+            else if (caseModType == 'l' || caseModType == 'L')
294
+                [repBuffer replaceCharactersInRange:caseModRange withString:[[repBuffer substringWithRange:caseModRange] lowercaseString]];
295
+        }
296
+        // append the part of the target string before the match
297
+        matchRange = [match range];
298
+        [result appendString:[str substringWithRange:NSMakeRange(remainRange.location, matchRange.location - remainRange.location)]];
299
+        // append repBuffer
300
+        [result appendString:repBuffer];
301
+        // set the remaining range to the part after the match
302
+        remainRange.location = matchRange.location + matchRange.length;
303
+        remainRange.length = length - remainRange.location;
304
+    }
305
+    free(caseModVector);
306
+    // append the remaining string
307
+    [result appendString:[str substringWithRange:remainRange]];
308
+    return result;
309 309
 }
310 310
 
311 311
 - (NSArray *)splitString:(NSString *)str {
312
-	return [self splitString:str limit:0];
312
+    return [self splitString:str limit:0];
313 313
 }
314 314
 
315 315
 - (NSArray *)splitString:(NSString *)str limit:(int)lim {
316
-	NSMutableArray *result = [NSMutableArray array];
317
-	AGRegexMatch *match;
318
-	NSArray *allMatches;
319
-	NSString *group;
320
-	NSRange remainRange, matchRange;
321
-	int i, j, count, allCount, length = [str length];
322
-	// find all matches
323
-	allMatches = [self findAllInString:str]; 
324
-	allCount = [allMatches count];
325
-	remainRange = NSMakeRange(0, length);
326
-	// while limit is not reached and there are more matches
327
-	for (i = 0; (lim < 1 || i < lim) && i < allCount; i++) {
328
-		// get next match
329
-		match = [allMatches objectAtIndex:i];
330
-		matchRange = [match range];
331
-		// add substring from last split to this split
332
-		[result addObject:[str substringWithRange:NSMakeRange(remainRange.location, matchRange.location - remainRange.location)]];
333
-		// add captured subpatterns if any
334
-		count = [match count];
335
-		for (j = 1; j < count; j++)
316
+    NSMutableArray *result = [NSMutableArray array];
317
+    AGRegexMatch *match;
318
+    NSArray *allMatches;
319
+    NSString *group;
320
+    NSRange remainRange, matchRange;
321
+    int i, j, count, allCount, length = [str length];
322
+    // find all matches
323
+    allMatches = [self findAllInString:str]; 
324
+    allCount = [allMatches count];
325
+    remainRange = NSMakeRange(0, length);
326
+    // while limit is not reached and there are more matches
327
+    for (i = 0; (lim < 1 || i < lim) && i < allCount; i++) {
328
+        // get next match
329
+        match = [allMatches objectAtIndex:i];
330
+        matchRange = [match range];
331
+        // add substring from last split to this split
332
+        [result addObject:[str substringWithRange:NSMakeRange(remainRange.location, matchRange.location - remainRange.location)]];
333
+        // add captured subpatterns if any
334
+        count = [match count];
335
+        for (j = 1; j < count; j++)
336 336
             if ((group = [match groupAtIndex:j]))
337
-				[result addObject:group];
338
-		// set remaining range to the part after the split
339
-		remainRange.location = matchRange.location + matchRange.length;
340
-		remainRange.length = length - remainRange.location;
341
-	}
342
-	// add rest of the string
343
-	[result addObject:[str substringWithRange:remainRange]];
344
-	return result;
337
+                [result addObject:group];
338
+        // set remaining range to the part after the split
339
+        remainRange.location = matchRange.location + matchRange.length;
340
+        remainRange.length = length - remainRange.location;
341
+    }
342
+    // add rest of the string
343
+    [result addObject:[str substringWithRange:remainRange]];
344
+    return result;
345 345
 }
346 346
 
347 347
 - (const pcre *)pcre { return regex; }
... ...
@@ -352,76 +352,76 @@ static AGRegex *backrefPattern;
352 352
 
353 353
 // takes ownership of the passed match vector, free on dealloc
354 354
 - (id)initWithRegex:(AGRegex *)re string:(NSString *)str vector:(int *)mv count:(int)c {
355
-	if (self = [super init]) {
356
-		regex = [re retain];
357
-		string = [str copy]; // really only copies if the string is mutable, immutable strings are just retained
358
-		matchv = mv;
359
-		count = c;
360
-	}
361
-	return self;
355
+    if (self = [super init]) {
356
+        regex = [re retain];
357
+        string = [str copy]; // really only copies if the string is mutable, immutable strings are just retained
358
+        matchv = mv;
359
+        count = c;
360
+    }
361
+    return self;
362 362
 }
363 363
 
364 364
 - (void)dealloc {
365
-	free(matchv);
366
-	[regex release];
367
-	[string release];
368
-	[super dealloc];
365
+    free(matchv);
366
+    [regex release];
367
+    [string release];
368
+    [super dealloc];
369 369
 }
370 370
 
371 371
 - (int)count {
372
-	return count;
372
+    return count;
373 373
 }
374 374
 
375 375
 - (NSString *)group {
376
-	return [self groupAtIndex:0];
376
+    return [self groupAtIndex:0];
377 377
 }
378 378
 
379 379
 - (NSString *)groupAtIndex:(int)idx {
380
-	NSRange r = [self rangeAtIndex:idx];
381
-	return r.location == NSNotFound ? nil : [string substringWithRange:r];
380
+    NSRange r = [self rangeAtIndex:idx];
381
+    return r.location == NSNotFound ? nil : [string substringWithRange:r];
382 382
 }
383 383
 
384 384
 - (NSString *)groupNamed:(NSString *)name {
385
-	int idx = pcre_get_stringnumber([regex pcre], [name UTF8String]);
386
-	if (idx == PCRE_ERROR_NOSUBSTRING)
387
-		[NSException raise:NSInvalidArgumentException format:@"no group named %@", name];
388
-	return [self groupAtIndex:idx];
385
+    int idx = pcre_get_stringnumber([regex pcre], [name UTF8String]);
386
+    if (idx == PCRE_ERROR_NOSUBSTRING)
387
+        [NSException raise:NSInvalidArgumentException format:@"no group named %@", name];
388
+    return [self groupAtIndex:idx];
389 389
 }
390 390
 
391 391
 - (NSRange)range {
392
-	return [self rangeAtIndex:0];
392
+    return [self rangeAtIndex:0];
393 393
 }
394 394
 
395 395
 - (NSRange)rangeAtIndex:(int)idx {
396
-	int start, end;
397
-	if (idx >= count)
398
-		[NSException raise:NSRangeException format:@"index %d out of bounds", idx];
399
-	start = matchv[2 * idx];
400
-	end = matchv[2 * idx + 1];
401
-	if (start < 0)
402
-		return NSMakeRange(NSNotFound, 0);
403
-	// convert byte locations to character locations
404
-	return NSMakeRange(utf8charcount([string UTF8String], start), utf8charcount([string UTF8String] + start, end - start));
396
+    int start, end;
397
+    if (idx >= count)
398
+        [NSException raise:NSRangeException format:@"index %d out of bounds", idx];
399
+    start = matchv[2 * idx];
400
+    end = matchv[2 * idx + 1];
401
+    if (start < 0)
402
+        return NSMakeRange(NSNotFound, 0);
403
+    // convert byte locations to character locations
404
+    return NSMakeRange(utf8charcount([string UTF8String], start), utf8charcount([string UTF8String] + start, end - start));
405 405
 }
406 406
 
407 407
 - (NSRange)rangeNamed:(NSString *)name {
408
-	int idx = pcre_get_stringnumber([regex pcre], [name UTF8String]);
409
-	if (idx == PCRE_ERROR_NOSUBSTRING)
410
-		[NSException raise:NSInvalidArgumentException format:@"no group named %@", name];
411
-	return [self rangeAtIndex:idx];
408
+    int idx = pcre_get_stringnumber([regex pcre], [name UTF8String]);
409
+    if (idx == PCRE_ERROR_NOSUBSTRING)
410
+        [NSException raise:NSInvalidArgumentException format:@"no group named %@", name];
411
+    return [self rangeAtIndex:idx];
412 412
 }
413 413
 
414 414
 - (NSString *)string {
415
-	return string;
415
+    return string;
416 416
 }
417 417
 
418 418
 - (NSString *)description {
419
-	NSMutableString *desc = [NSMutableString stringWithFormat:@"%@ {\n", [super description]];
420
-	int i;
421
-	for (i = 0; i < count; i++)
422
-		[desc appendFormat:@"\t%d %@ %@\n", i, NSStringFromRange([self rangeAtIndex:i]), [self groupAtIndex:i]];
423
-	[desc appendString:@"}"];
424
-	return desc;
419
+    NSMutableString *desc = [NSMutableString stringWithFormat:@"%@ {\n", [super description]];
420
+    int i;
421
+    for (i = 0; i < count; i++)
422
+        [desc appendFormat:@"\t%d %@ %@\n", i, NSStringFromRange([self rangeAtIndex:i]), [self groupAtIndex:i]];
423
+    [desc appendString:@"}"];
424
+    return desc;
425 425
 }
426 426
 
427 427
 @end
... ...
@@ -1,183 +1,183 @@
1 1
 /*************************************************
2
-*      Perl-Compatible Regular Expressions       *
3
-*************************************************/
2
+ *      Perl-Compatible Regular Expressions       *
3
+ *************************************************/
4 4
 
5 5
 /* This file is automatically written by the dftables auxiliary 
6
-program. If you edit it by hand, you might like to edit the Makefile to 
7
-prevent its ever being regenerated.
8
-
9
-This file is #included in the compilation of pcre.c to build the default
10
-character tables which are used when no tables are passed to the compile
11
-function. */
6
+ program. If you edit it by hand, you might like to edit the Makefile to 
7
+ prevent its ever being regenerated.
8
+ 
9
+ This file is #included in the compilation of pcre.c to build the default
10
+ character tables which are used when no tables are passed to the compile
11
+ function. */
12 12
 
13 13
 static unsigned char pcre_default_tables[] = {
14
-
15
-/* This table is a lower casing table. */
16
-
14
+    
15
+    /* This table is a lower casing table. */
16
+    
17 17
     0,  1,  2,  3,  4,  5,  6,  7,
18 18
     8,  9, 10, 11, 12, 13, 14, 15,
19
-   16, 17, 18, 19, 20, 21, 22, 23,
20
-   24, 25, 26, 27, 28, 29, 30, 31,
21
-   32, 33, 34, 35, 36, 37, 38, 39,
22
-   40, 41, 42, 43, 44, 45, 46, 47,
23
-   48, 49, 50, 51, 52, 53, 54, 55,
24
-   56, 57, 58, 59, 60, 61, 62, 63,
25
-   64, 97, 98, 99,100,101,102,103,
26
-  104,105,106,107,108,109,110,111,
27
-  112,113,114,115,116,117,118,119,
28
-  120,121,122, 91, 92, 93, 94, 95,
29
-   96, 97, 98, 99,100,101,102,103,
30
-  104,105,106,107,108,109,110,111,
31
-  112,113,114,115,116,117,118,119,
32
-  120,121,122,123,124,125,126,127,
33
-  128,129,130,131,132,133,134,135,
34
-  136,137,138,139,140,141,142,143,
35
-  144,145,146,147,148,149,150,151,
36
-  152,153,154,155,156,157,158,159,
37
-  160,161,162,163,164,165,166,167,
38
-  168,169,170,171,172,173,174,175,
39
-  176,177,178,179,180,181,182,183,
40
-  184,185,186,187,188,189,190,191,
41
-  192,193,194,195,196,197,198,199,
42
-  200,201,202,203,204,205,206,207,
43
-  208,209,210,211,212,213,214,215,
44
-  216,217,218,219,220,221,222,223,
45
-  224,225,226,227,228,229,230,231,
46
-  232,233,234,235,236,237,238,239,
47
-  240,241,242,243,244,245,246,247,
48
-  248,249,250,251,252,253,254,255,
49
-
50
-/* This table is a case flipping table. */
51
-
19
+    16, 17, 18, 19, 20, 21, 22, 23,
20
+    24, 25, 26, 27, 28, 29, 30, 31,
21
+    32, 33, 34, 35, 36, 37, 38, 39,
22
+    40, 41, 42, 43, 44, 45, 46, 47,
23
+    48, 49, 50, 51, 52, 53, 54, 55,
24
+    56, 57, 58, 59, 60, 61, 62, 63,
25
+    64, 97, 98, 99,100,101,102,103,
26
+    104,105,106,107,108,109,110,111,
27
+    112,113,114,115,116,117,118,119,
28
+    120,121,122, 91, 92, 93, 94, 95,
29
+    96, 97, 98, 99,100,101,102,103,
30
+    104,105,106,107,108,109,110,111,
31
+    112,113,114,115,116,117,118,119,
32
+    120,121,122,123,124,125,126,127,
33
+    128,129,130,131,132,133,134,135,
34
+    136,137,138,139,140,141,142,143,
35
+    144,145,146,147,148,149,150,151,
36
+    152,153,154,155,156,157,158,159,
37
+    160,161,162,163,164,165,166,167,
38
+    168,169,170,171,172,173,174,175,
39
+    176,177,178,179,180,181,182,183,
40
+    184,185,186,187,188,189,190,191,
41
+    192,193,194,195,196,197,198,199,
42
+    200,201,202,203,204,205,206,207,
43
+    208,209,210,211,212,213,214,215,
44
+    216,217,218,219,220,221,222,223,
45
+    224,225,226,227,228,229,230,231,
46
+    232,233,234,235,236,237,238,239,
47
+    240,241,242,243,244,245,246,247,
48
+    248,249,250,251,252,253,254,255,
49
+    
50
+    /* This table is a case flipping table. */
51
+    
52 52
     0,  1,  2,  3,  4,  5,  6,  7,
53 53
     8,  9, 10, 11, 12, 13, 14, 15,
54
-   16, 17, 18, 19, 20, 21, 22, 23,
55
-   24, 25, 26, 27, 28, 29, 30, 31,
56
-   32, 33, 34, 35, 36, 37, 38, 39,
57
-   40, 41, 42, 43, 44, 45, 46, 47,
58
-   48, 49, 50, 51, 52, 53, 54, 55,
59
-   56, 57, 58, 59, 60, 61, 62, 63,
60
-   64, 97, 98, 99,100,101,102,103,
61
-  104,105,106,107,108,109,110,111,
62
-  112,113,114,115,116,117,118,119,
63
-  120,121,122, 91, 92, 93, 94, 95,
64
-   96, 65, 66, 67, 68, 69, 70, 71,
65
-   72, 73, 74, 75, 76, 77, 78, 79,
66
-   80, 81, 82, 83, 84, 85, 86, 87,
67
-   88, 89, 90,123,124,125,126,127,
68
-  128,129,130,131,132,133,134,135,
69
-  136,137,138,139,140,141,142,143,
70
-  144,145,146,147,148,149,150,151,
71
-  152,153,154,155,156,157,158,159,
72
-  160,161,162,163,164,165,166,167,
73
-  168,169,170,171,172,173,174,175,
74
-  176,177,178,179,180,181,182,183,
75
-  184,185,186,187,188,189,190,191,
76
-  192,193,194,195,196,197,198,199,
77
-  200,201,202,203,204,205,206,207,
78
-  208,209,210,211,212,213,214,215,
79
-  216,217,218,219,220,221,222,223,
80
-  224,225,226,227,228,229,230,231,
81
-  232,233,234,235,236,237,238,239,
82
-  240,241,242,243,244,245,246,247,
83
-  248,249,250,251,252,253,254,255,
84
-
85
-/* This table contains bit maps for various character classes.
86
-Each map is 32 bytes long and the bits run from the least
87
-significant end of each byte. The classes that have their own
88
-maps are: space, xdigit, digit, upper, lower, word, graph
89
-print, punct, and cntrl. Other classes are built from combinations. */
90
-
91
-  0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
92
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
93
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
94
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
95
-
96
-  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
97
-  0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
98
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
99
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
100
-
101
-  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
102
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
103
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
104
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
105
-
106
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
107
-  0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
108
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
109
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
110
-
111
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
112
-  0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
113
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
114
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
115
-
116
-  0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
117
-  0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
118
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
119
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
120
-
121
-  0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff,
122
-  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
123
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
124
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
125
-
126
-  0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,
127
-  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
128
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
129
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
130
-
131
-  0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc,
132
-  0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
133
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
134
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
135
-
136
-  0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,
137
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
138
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
139
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
140
-
141
-/* This table identifies various classes of character by individual bits:
142
-  0x01   white space character
143
-  0x02   letter
144
-  0x04   decimal digit
145
-  0x08   hexadecimal digit
146
-  0x10   alphanumeric or '_'
147
-  0x80   regular expression metacharacter or binary zero
148
-*/
149
-
150
-  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
151
-  0x00,0x01,0x01,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
152
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
153
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
154
-  0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /*    - '  */
155
-  0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
156
-  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
157
-  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
158
-  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
159
-  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
160
-  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
161
-  0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
162
-  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
163
-  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
164
-  0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
165
-  0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
166
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
167
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
168
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
169
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
170
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
171
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
172
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
173
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
174
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
175
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
176
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
177
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
178
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
179
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
180
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
181
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
54
+    16, 17, 18, 19, 20, 21, 22, 23,
55
+    24, 25, 26, 27, 28, 29, 30, 31,
56
+    32, 33, 34, 35, 36, 37, 38, 39,
57
+    40, 41, 42, 43, 44, 45, 46, 47,
58
+    48, 49, 50, 51, 52, 53, 54, 55,
59
+    56, 57, 58, 59, 60, 61, 62, 63,
60
+    64, 97, 98, 99,100,101,102,103,
61
+    104,105,106,107,108,109,110,111,
62
+    112,113,114,115,116,117,118,119,
63
+    120,121,122, 91, 92, 93, 94, 95,
64
+    96, 65, 66, 67, 68, 69, 70, 71,
65
+    72, 73, 74, 75, 76, 77, 78, 79,
66
+    80, 81, 82, 83, 84, 85, 86, 87,
67
+    88, 89, 90,123,124,125,126,127,
68
+    128,129,130,131,132,133,134,135,
69
+    136,137,138,139,140,141,142,143,
70
+    144,145,146,147,148,149,150,151,
71
+    152,153,154,155,156,157,158,159,
72
+    160,161,162,163,164,165,166,167,
73
+    168,169,170,171,172,173,174,175,
74
+    176,177,178,179,180,181,182,183,
75
+    184,185,186,187,188,189,190,191,
76
+    192,193,194,195,196,197,198,199,
77
+    200,201,202,203,204,205,206,207,
78
+    208,209,210,211,212,213,214,215,
79
+    216,217,218,219,220,221,222,223,
80
+    224,225,226,227,228,229,230,231,
81
+    232,233,234,235,236,237,238,239,
82
+    240,241,242,243,244,245,246,247,
83
+    248,249,250,251,252,253,254,255,
84
+    
85
+    /* This table contains bit maps for various character classes.
86
+     Each map is 32 bytes long and the bits run from the least
87
+     significant end of each byte. The classes that have their own
88
+     maps are: space, xdigit, digit, upper, lower, word, graph
89
+     print, punct, and cntrl. Other classes are built from combinations. */
90
+    
91
+    0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
92
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
93
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
94
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
95
+    
96
+    0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
97
+    0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
98
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
99
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
100
+    
101
+    0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
102
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
103
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
104
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
105
+    
106
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
107
+    0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
108
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
109
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
110
+    
111
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
112
+    0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
113
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
114
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
115
+    
116
+    0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
117
+    0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
118
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
119
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
120
+    
121
+    0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff,
122
+    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
123
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
124
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
125
+    
126
+    0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,
127
+    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
128
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
129
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
130
+    
131
+    0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc,
132
+    0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
133
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
134
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
135
+    
136
+    0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,
137
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
138
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
139
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
140
+    
141
+    /* This table identifies various classes of character by individual bits:
142
+     0x01   white space character
143
+     0x02   letter
144
+     0x04   decimal digit
145
+     0x08   hexadecimal digit
146
+     0x10   alphanumeric or '_'
147
+     0x80   regular expression metacharacter or binary zero
148
+     */
149
+    
150
+    0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
151
+    0x00,0x01,0x01,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
152
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
153
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
154
+    0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /*    - '  */
155
+    0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /*  ( - /  */
156
+    0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
157
+    0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /*  8 - ?  */
158
+    0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  @ - G  */
159
+    0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  H - O  */
160
+    0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  P - W  */
161
+    0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /*  X - _  */
162
+    0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  ` - g  */
163
+    0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  h - o  */
164
+    0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  p - w  */
165
+    0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /*  x -127 */
166
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
167
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
168
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
169
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
170
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
171
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
172
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
173
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
174
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
175
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
176
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
177
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
178
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
179
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
180
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
181
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
182 182
 
183 183
 /* End of chartables.c */
... ...
@@ -1,16 +1,16 @@
1 1
 /* config.h.  Generated by configure.  */
2 2
 
3 3
 /* On Unix systems config.in is converted by configure into config.h. PCRE is
4
-written in Standard C, but there are a few non-standard things it can cope
5
-with, allowing it to run on SunOS4 and other "close to standard" systems.
6
-
7
-On a non-Unix system you should just copy this file into config.h, and set up
8
-the macros the way you need them. You should normally change the definitions of
9
-HAVE_STRERROR and HAVE_MEMMOVE to 1. Unfortunately, because of the way autoconf
10
-works, these cannot be made the defaults. If your system has bcopy() and not
11
-memmove(), change the definition of HAVE_BCOPY instead of HAVE_MEMMOVE. If your
12
-system has neither bcopy() nor memmove(), leave them both as 0; an emulation
13
-function will be used. */
4
+ written in Standard C, but there are a few non-standard things it can cope
5
+ with, allowing it to run on SunOS4 and other "close to standard" systems.
6
+ 
7
+ On a non-Unix system you should just copy this file into config.h, and set up
8
+ the macros the way you need them. You should normally change the definitions of
9
+ HAVE_STRERROR and HAVE_MEMMOVE to 1. Unfortunately, because of the way autoconf
10
+ works, these cannot be made the defaults. If your system has bcopy() and not
11
+ memmove(), change the definition of HAVE_BCOPY instead of HAVE_MEMMOVE. If your
12
+ system has neither bcopy() nor memmove(), leave them both as 0; an emulation
13
+ function will be used. */
14 14
 
15 15
 /* Define to empty if the keyword does not work. */
16 16
 
... ...
@@ -21,48 +21,48 @@ function will be used. */
21 21
 /* #undef size_t */
22 22
 
23 23
 /* The following two definitions are mainly for the benefit of SunOS4, which
24
-doesn't have the strerror() or memmove() functions that should be present in
25
-all Standard C libraries. The macros HAVE_STRERROR and HAVE_MEMMOVE should
26
-normally be defined with the value 1 for other systems, but unfortunately we
27
-can't make this the default because "configure" files generated by autoconf
28
-will only change 0 to 1; they won't change 1 to 0 if the functions are not
29
-found. */
24
+ doesn't have the strerror() or memmove() functions that should be present in
25
+ all Standard C libraries. The macros HAVE_STRERROR and HAVE_MEMMOVE should
26
+ normally be defined with the value 1 for other systems, but unfortunately we
27
+ can't make this the default because "configure" files generated by autoconf
28
+ will only change 0 to 1; they won't change 1 to 0 if the functions are not
29
+ found. */
30 30
 
31 31
 #define HAVE_STRERROR 1
32 32
 #define HAVE_MEMMOVE 1
33 33
 
34 34
 /* There are some non-Unix systems that don't even have bcopy(). If this macro
35
-is false, an emulation is used. If HAVE_MEMMOVE is set to 1, the value of
36
-HAVE_BCOPY is not relevant. */
35
+ is false, an emulation is used. If HAVE_MEMMOVE is set to 1, the value of
36
+ HAVE_BCOPY is not relevant. */
37 37
 
38 38
 #define HAVE_BCOPY 1
39 39
 
40 40
 /* The value of NEWLINE determines the newline character. The default is to
41
-leave it up to the compiler, but some sites want to force a particular value.
42
-On Unix systems, "configure" can be used to override this default. */
41
+ leave it up to the compiler, but some sites want to force a particular value.
42
+ On Unix systems, "configure" can be used to override this default. */
43 43
 
44 44
 #ifndef NEWLINE
45 45
 #define NEWLINE '\n'
46 46
 #endif
47 47
 
48 48
 /* The value of LINK_SIZE determines the number of bytes used to store
49
-links as offsets within the compiled regex. The default is 2, which allows for
50
-compiled patterns up to 64K long. This covers the vast majority of cases.
51
-However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows for
52
-longer patterns in extreme cases. On Unix systems, "configure" can be used to
53
-override this default. */
49
+ links as offsets within the compiled regex. The default is 2, which allows for
50
+ compiled patterns up to 64K long. This covers the vast majority of cases.
51
+ However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows for
52
+ longer patterns in extreme cases. On Unix systems, "configure" can be used to
53
+ override this default. */
54 54
 
55 55
 #ifndef LINK_SIZE
56 56
 #define LINK_SIZE   2
57 57
 #endif
58 58
 
59 59
 /* The value of MATCH_LIMIT determines the default number of times the match()
60
-function can be called during a single execution of pcre_exec(). (There is a
61
-runtime method of setting a different limit.) The limit exists in order to
62
-catch runaway regular expressions that take for ever to determine that they do
63
-not match. The default is set very large so that it does not accidentally catch
64
-legitimate cases. On Unix systems, "configure" can be used to override this
65
-default default. */
60
+ function can be called during a single execution of pcre_exec(). (There is a
61
+ runtime method of setting a different limit.) The limit exists in order to
62
+ catch runaway regular expressions that take for ever to determine that they do
63
+ not match. The default is set very large so that it does not accidentally catch
64
+ legitimate cases. On Unix systems, "configure" can be used to override this
65
+ default default. */
66 66
 
67 67
 #ifndef MATCH_LIMIT
68 68
 #define MATCH_LIMIT 10000000
... ...
@@ -1,349 +1,349 @@
1 1
 /*************************************************
2
-*      Perl-Compatible Regular Expressions       *
3
-*************************************************/
2
+ *      Perl-Compatible Regular Expressions       *
3
+ *************************************************/
4 4
 
5 5
 /*
6
-This is a library of functions to support regular expressions whose syntax
7
-and semantics are as close as possible to those of the Perl 5 language. See
8
-the file Tech.Notes for some information on the internals.
9
-
10
-Written by: Philip Hazel <ph10@cam.ac.uk>
11
-
12
-           Copyright (c) 1997-2003 University of Cambridge
13
-
14
-Permission is granted to anyone to use this software for any purpose on any
15
-computer system, and to redistribute it freely, subject to the following
16
-restrictions:
17
-
18
-1. This software is distributed in the hope that it will be useful,
19
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
20
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
21
-
22
-2. The origin of this software must not be misrepresented, either by
23
-   explicit claim or by omission.
24
-
25
-3. Altered versions must be plainly marked as such, and must not be
26
-   misrepresented as being the original software.
27
-
28
-4. If PCRE is embedded in any software that is released under the GNU
29
-   General Purpose Licence (GPL), then the terms of that licence shall
30
-   supersede any condition above with which it is incompatible.
31
-*/
6
+ This is a library of functions to support regular expressions whose syntax
7
+ and semantics are as close as possible to those of the Perl 5 language. See
8
+ the file Tech.Notes for some information on the internals.
9
+ 
10
+ Written by: Philip Hazel <ph10@cam.ac.uk>
11
+ 
12
+ Copyright (c) 1997-2003 University of Cambridge
13
+ 
14
+ -----------------------------------------------------------------------------
15
+ Permission is granted to anyone to use this software for any purpose on any
16
+ computer system, and to redistribute it freely, subject to the following
17
+ restrictions:
18
+ 
19
+ 1. This software is distributed in the hope that it will be useful,
20
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
+ 
23
+ 2. The origin of this software must not be misrepresented, either by
24
+ explicit claim or by omission.
25
+ 
26
+ 3. Altered versions must be plainly marked as such, and must not be
27
+ misrepresented as being the original software.
28
+ 
29
+ 4. If PCRE is embedded in any software that is released under the GNU
30
+ General Purpose Licence (GPL), then the terms of that licence shall
31
+ supersede any condition above with which it is incompatible.
32
+ -----------------------------------------------------------------------------
33
+ */
32 34
 
33 35
 /* This module contains some convenience functions for extracting substrings
34
-from the subject string after a regex match has succeeded. The original idea
35
-for these functions came from Scott Wimer <scottw@cgibuilder.com>. */
36
+ from the subject string after a regex match has succeeded. The original idea
37
+ for these functions came from Scott Wimer <scottw@cgibuilder.com>. */
36 38
 
37 39
 
38 40
 /* Include the internals header, which itself includes Standard C headers plus
39
-the external pcre header. */
41
+ the external pcre header. */
40 42
 
41 43
 #include "internal.h"
42 44
 
43 45
 
44 46
 /*************************************************
45
-*           Find number for named string         *
46
-*************************************************/
47
+ *           Find number for named string         *
48
+ *************************************************/
47 49
 
48 50
 /* This function is used by the two extraction functions below, as well
49
-as being generally available.
50
-
51
-Arguments:
52
-  code        the compiled regex
53
-  stringname  the name whose number is required
54
-
55
-Returns:      the number of the named parentheses, or a negative number
56
-                (PCRE_ERROR_NOSUBSTRING) if not found
57
-*/
51
+ as being generally available.
52
+ 
53
+ Arguments:
54
+ code        the compiled regex
55
+ stringname  the name whose number is required
56
+ 
57
+ Returns:      the number of the named parentheses, or a negative number
58
+ (PCRE_ERROR_NOSUBSTRING) if not found
59
+ */
58 60
 
59 61
 int
60 62
 pcre_get_stringnumber(const pcre *code, const char *stringname)
61 63
 {
62
-int rc;
63
-int entrysize;
64
-int top, bot;
65
-uschar *nametable;
66
-
67
-if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
68
-  return rc;
69
-if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
70
-
71
-if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
72
-  return rc;
73
-if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
74
-  return rc;
75
-
76
-bot = 0;
77
-while (top > bot)
78
-  {
79
-  int mid = (top + bot) / 2;
80
-  uschar *entry = nametable + entrysize*mid;
81
-  int c = strcmp(stringname, (char *)(entry + 2));
82
-  if (c == 0) return (entry[0] << 8) + entry[1];
83
-  if (c > 0) bot = mid + 1; else top = mid;
84
-  }
85
-
86
-return PCRE_ERROR_NOSUBSTRING;
64
+    int rc;
65
+    int entrysize;
66
+    int top, bot;
67
+    uschar *nametable;
68
+    
69
+    if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0)
70
+        return rc;
71
+    if (top <= 0) return PCRE_ERROR_NOSUBSTRING;
72
+    
73
+    if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0)
74
+        return rc;
75
+    if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0)
76
+        return rc;
77
+    
78
+    bot = 0;
79
+    while (top > bot)
80
+    {
81
+        int mid = (top + bot) / 2;
82
+        uschar *entry = nametable + entrysize*mid;
83
+        int c = strcmp(stringname, (char *)(entry + 2));
84
+        if (c == 0) return (entry[0] << 8) + entry[1];
85
+        if (c > 0) bot = mid + 1; else top = mid;
86
+    }
87
+    
88
+    return PCRE_ERROR_NOSUBSTRING;
87 89
 }
88 90
 
89 91
 
90 92
 
91 93
 /*************************************************
92
-*      Copy captured string to given buffer      *
93
-*************************************************/
94
+ *      Copy captured string to given buffer      *
95
+ *************************************************/
94 96
 
95 97
 /* This function copies a single captured substring into a given buffer.
96
-Note that we use memcpy() rather than strncpy() in case there are binary zeros
97
-in the string.
98
-
99
-Arguments:
100
-  subject        the subject string that was matched
101
-  ovector        pointer to the offsets table
102
-  stringcount    the number of substrings that were captured
103
-                   (i.e. the yield of the pcre_exec call, unless
104
-                   that was zero, in which case it should be 1/3
105
-                   of the offset table size)
106
-  stringnumber   the number of the required substring
107
-  buffer         where to put the substring
108
-  size           the size of the buffer
109
-
110
-Returns:         if successful:
111
-                   the length of the copied string, not including the zero
112
-                   that is put on the end; can be zero
113
-                 if not successful:
114
-                   PCRE_ERROR_NOMEMORY (-6) buffer too small
115
-                   PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
116
-*/
98
+ Note that we use memcpy() rather than strncpy() in case there are binary zeros
99
+ in the string.
100
+ 
101
+ Arguments:
102
+ subject        the subject string that was matched
103
+ ovector        pointer to the offsets table
104
+ stringcount    the number of substrings that were captured
105
+ (i.e. the yield of the pcre_exec call, unless
106
+ that was zero, in which case it should be 1/3
107
+ of the offset table size)
108
+ stringnumber   the number of the required substring
109
+ buffer         where to put the substring
110
+ size           the size of the buffer
111
+ 
112
+ Returns:         if successful:
113
+ the length of the copied string, not including the zero
114
+ that is put on the end; can be zero
115
+ if not successful:
116
+ PCRE_ERROR_NOMEMORY (-6) buffer too small
117
+ PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
118
+ */
117 119
 
118 120
 int
119 121
 pcre_copy_substring(const char *subject, int *ovector, int stringcount,
120
-  int stringnumber, char *buffer, int size)
122
+                    int stringnumber, char *buffer, int size)
121 123
 {
122
-int yield;
123
-if (stringnumber < 0 || stringnumber >= stringcount)
124
-  return PCRE_ERROR_NOSUBSTRING;
125
-stringnumber *= 2;
126
-yield = ovector[stringnumber+1] - ovector[stringnumber];
127
-if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
128
-memcpy(buffer, subject + ovector[stringnumber], yield);
129
-buffer[yield] = 0;
130
-return yield;
124
+    int yield;
125
+    if (stringnumber < 0 || stringnumber >= stringcount)
126
+        return PCRE_ERROR_NOSUBSTRING;
127
+    stringnumber *= 2;
128
+    yield = ovector[stringnumber+1] - ovector[stringnumber];
129
+    if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
130
+    memcpy(buffer, subject + ovector[stringnumber], yield);
131
+    buffer[yield] = 0;
132
+    return yield;
131 133
 }
132 134
 
133 135
 
134 136
 
135 137
 /*************************************************
136
-*   Copy named captured string to given buffer   *
137
-*************************************************/
138
+ *   Copy named captured string to given buffer   *
139
+ *************************************************/
138 140
 
139 141
 /* This function copies a single captured substring into a given buffer,
140
-identifying it by name.
141
-
142
-Arguments:
143
-  code           the compiled regex
144
-  subject        the subject string that was matched
145
-  ovector        pointer to the offsets table
146
-  stringcount    the number of substrings that were captured
147
-                   (i.e. the yield of the pcre_exec call, unless
148
-                   that was zero, in which case it should be 1/3
149
-                   of the offset table size)
150
-  stringname     the name of the required substring
151
-  buffer         where to put the substring
152
-  size           the size of the buffer
153
-
154
-Returns:         if successful:
155
-                   the length of the copied string, not including the zero
156
-                   that is put on the end; can be zero
157
-                 if not successful:
158
-                   PCRE_ERROR_NOMEMORY (-6) buffer too small
159
-                   PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
160
-*/
142
+ identifying it by name.
143
+ 
144
+ Arguments:
145
+ code           the compiled regex
146
+ subject        the subject string that was matched
147
+ ovector        pointer to the offsets table
148
+ stringcount    the number of substrings that were captured
149
+ (i.e. the yield of the pcre_exec call, unless
150
+ that was zero, in which case it should be 1/3
151
+ of the offset table size)
152
+ stringname     the name of the required substring
153
+ buffer         where to put the substring
154
+ size           the size of the buffer
155
+ 
156
+ Returns:         if successful:
157
+ the length of the copied string, not including the zero
158
+ that is put on the end; can be zero
159
+ if not successful:
160
+ PCRE_ERROR_NOMEMORY (-6) buffer too small
161
+ PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
162
+ */
161 163
 
162 164
 int
163 165
 pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector,
164
-  int stringcount, const char *stringname, char *buffer, int size)
166
+                          int stringcount, const char *stringname, char *buffer, int size)
165 167
 {
166
-int n = pcre_get_stringnumber(code, stringname);
167
-if (n <= 0) return n;
168
-return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size);
168
+    int n = pcre_get_stringnumber(code, stringname);
169
+    if (n <= 0) return n;
170
+    return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size);
169 171
 }
170 172
 
171 173
 
172 174
 
173 175
 /*************************************************
174
-*      Copy all captured strings to new store    *
175
-*************************************************/
176
+ *      Copy all captured strings to new store    *
177
+ *************************************************/
176 178
 
177 179
 /* This function gets one chunk of store and builds a list of pointers and all
178
-of the captured substrings in it. A NULL pointer is put on the end of the list.
179
-
180
-Arguments:
181
-  subject        the subject string that was matched
182
-  ovector        pointer to the offsets table
183
-  stringcount    the number of substrings that were captured
184
-                   (i.e. the yield of the pcre_exec call, unless
185
-                   that was zero, in which case it should be 1/3
186
-                   of the offset table size)
187
-  listptr        set to point to the list of pointers
188
-
189
-Returns:         if successful: 0
190
-                 if not successful:
191
-                   PCRE_ERROR_NOMEMORY (-6) failed to get store
192
-*/
180
+ of the captured substrings in it. A NULL pointer is put on the end of the list.
181
+ 
182
+ Arguments:
183
+ subject        the subject string that was matched
184
+ ovector        pointer to the offsets table
185
+ stringcount    the number of substrings that were captured
186
+ (i.e. the yield of the pcre_exec call, unless
187
+ that was zero, in which case it should be 1/3
188
+ of the offset table size)
189
+ listptr        set to point to the list of pointers
190
+ 
191
+ Returns:         if successful: 0
192
+ if not successful:
193
+ PCRE_ERROR_NOMEMORY (-6) failed to get store
194
+ */
193 195
 
194 196
 int
195 197
 pcre_get_substring_list(const char *subject, int *ovector, int stringcount,
196
-  const char ***listptr)
198
+                        const char ***listptr)
197 199
 {
198
-int i;
199
-int size = sizeof(char *);
200
-int double_count = stringcount * 2;
201
-char **stringlist;
202
-char *p;
203
-
204
-for (i = 0; i < double_count; i += 2)
205
-  size += sizeof(char *) + ovector[i+1] - ovector[i] + 1;
206
-
207
-stringlist = (char **)(pcre_malloc)(size);
208
-if (stringlist == NULL) return PCRE_ERROR_NOMEMORY;
209
-
210
-*listptr = (const char **)stringlist;
211
-p = (char *)(stringlist + stringcount + 1);
212
-
213
-for (i = 0; i < double_count; i += 2)
214
-  {
215
-  int len = ovector[i+1] - ovector[i];
216
-  memcpy(p, subject + ovector[i], len);
217
-  *stringlist++ = p;
218
-  p += len;
219
-  *p++ = 0;
220
-  }
221
-
222
-*stringlist = NULL;
223
-return 0;
200
+    int i;
201
+    int size = sizeof(char *);
202
+    int double_count = stringcount * 2;
203
+    char **stringlist;
204
+    char *p;
205
+    
206
+    for (i = 0; i < double_count; i += 2)
207
+        size += sizeof(char *) + ovector[i+1] - ovector[i] + 1;
208
+    
209
+    stringlist = (char **)(pcre_malloc)(size);
210
+    if (stringlist == NULL) return PCRE_ERROR_NOMEMORY;
211
+    
212
+    *listptr = (const char **)stringlist;
213
+    p = (char *)(stringlist + stringcount + 1);
214
+    
215
+    for (i = 0; i < double_count; i += 2)
216
+    {
217
+        int len = ovector[i+1] - ovector[i];
218
+        memcpy(p, subject + ovector[i], len);
219
+        *stringlist++ = p;
220
+        p += len;
221
+        *p++ = 0;
222
+    }
223
+    
224
+    *stringlist = NULL;
225
+    return 0;
224 226
 }
225 227
 
226 228
 
227 229
 
228 230
 /*************************************************
229
-*   Free store obtained by get_substring_list    *
230
-*************************************************/
231
+ *   Free store obtained by get_substring_list    *
232
+ *************************************************/
231 233
 
232 234
 /* This function exists for the benefit of people calling PCRE from non-C
233
-programs that can call its functions, but not free() or (pcre_free)() directly.
234
-
235
-Argument:   the result of a previous pcre_get_substring_list()
236
-Returns:    nothing
237
-*/
235
+ programs that can call its functions, but not free() or (pcre_free)() directly.
236
+ 
237
+ Argument:   the result of a previous pcre_get_substring_list()
238
+ Returns:    nothing
239
+ */
238 240
 
239 241
 void
240 242
 pcre_free_substring_list(const char **pointer)
241 243
 {
242
-(pcre_free)((void *)pointer);
244
+    (pcre_free)((void *)pointer);
243 245
 }
244 246
 
245 247
 
246 248
 
247 249
 /*************************************************
248
-*      Copy captured string to new store         *
249
-*************************************************/
250
+ *      Copy captured string to new store         *
251
+ *************************************************/
250 252
 
251 253
 /* This function copies a single captured substring into a piece of new
252
-store
253
-
254
-Arguments:
255
-  subject        the subject string that was matched
256
-  ovector        pointer to the offsets table
257
-  stringcount    the number of substrings that were captured
258
-                   (i.e. the yield of the pcre_exec call, unless
259
-                   that was zero, in which case it should be 1/3
260
-                   of the offset table size)
261
-  stringnumber   the number of the required substring
262
-  stringptr      where to put a pointer to the substring
263
-
264
-Returns:         if successful:
265
-                   the length of the string, not including the zero that
266
-                   is put on the end; can be zero
267
-                 if not successful:
268
-                   PCRE_ERROR_NOMEMORY (-6) failed to get store
269
-                   PCRE_ERROR_NOSUBSTRING (-7) substring not present
270
-*/
254
+ store
255
+ 
256
+ Arguments:
257
+ subject        the subject string that was matched
258
+ ovector        pointer to the offsets table
259
+ stringcount    the number of substrings that were captured
260
+ (i.e. the yield of the pcre_exec call, unless
261
+ that was zero, in which case it should be 1/3
262
+ of the offset table size)
263
+ stringnumber   the number of the required substring
264
+ stringptr      where to put a pointer to the substring
265
+ 
266
+ Returns:         if successful:
267
+ the length of the string, not including the zero that
268
+ is put on the end; can be zero
269
+ if not successful:
270
+ PCRE_ERROR_NOMEMORY (-6) failed to get store
271
+ PCRE_ERROR_NOSUBSTRING (-7) substring not present
272
+ */
271 273
 
272 274
 int
273 275
 pcre_get_substring(const char *subject, int *ovector, int stringcount,
274
-  int stringnumber, const char **stringptr)
276
+                   int stringnumber, const char **stringptr)
275 277
 {
276
-int yield;
277
-char *substring;
278
-if (stringnumber < 0 || stringnumber >= stringcount)
279
-  return PCRE_ERROR_NOSUBSTRING;
280
-stringnumber *= 2;
281
-yield = ovector[stringnumber+1] - ovector[stringnumber];
282
-substring = (char *)(pcre_malloc)(yield + 1);
283
-if (substring == NULL) return PCRE_ERROR_NOMEMORY;
284
-memcpy(substring, subject + ovector[stringnumber], yield);
285
-substring[yield] = 0;
286
-*stringptr = substring;
287
-return yield;
278
+    int yield;
279
+    char *substring;
280
+    if (stringnumber < 0 || stringnumber >= stringcount)
281
+        return PCRE_ERROR_NOSUBSTRING;
282
+    stringnumber *= 2;
283
+    yield = ovector[stringnumber+1] - ovector[stringnumber];
284
+    substring = (char *)(pcre_malloc)(yield + 1);
285
+    if (substring == NULL) return PCRE_ERROR_NOMEMORY;
286
+    memcpy(substring, subject + ovector[stringnumber], yield);
287
+    substring[yield] = 0;
288
+    *stringptr = substring;
289
+    return yield;
288 290
 }
289 291
 
290 292
 
291 293
 
292 294
 /*************************************************
293
-*   Copy named captured string to new store      *
294
-*************************************************/
295
+ *   Copy named captured string to new store      *
296
+ *************************************************/
295 297
 
296 298
 /* This function copies a single captured substring, identified by name, into
297
-new store.
298
-
299
-Arguments:
300
-  code           the compiled regex
301
-  subject        the subject string that was matched
302
-  ovector        pointer to the offsets table
303
-  stringcount    the number of substrings that were captured
304
-                   (i.e. the yield of the pcre_exec call, unless
305
-                   that was zero, in which case it should be 1/3
306
-                   of the offset table size)
307
-  stringname     the name of the required substring
308
-  stringptr      where to put the pointer
309
-
310
-Returns:         if successful:
311
-                   the length of the copied string, not including the zero
312
-                   that is put on the end; can be zero
313
-                 if not successful:
314
-                   PCRE_ERROR_NOMEMORY (-6) couldn't get memory
315
-                   PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
316
-*/
299
+ new store.
300
+ 
301
+ Arguments:
302
+ code           the compiled regex
303
+ subject        the subject string that was matched
304
+ ovector        pointer to the offsets table
305
+ stringcount    the number of substrings that were captured
306
+ (i.e. the yield of the pcre_exec call, unless
307
+ that was zero, in which case it should be 1/3
308
+ of the offset table size)
309
+ stringname     the name of the required substring
310
+ stringptr      where to put the pointer
311
+ 
312
+ Returns:         if successful:
313
+ the length of the copied string, not including the zero
314
+ that is put on the end; can be zero
315
+ if not successful:
316
+ PCRE_ERROR_NOMEMORY (-6) couldn't get memory
317
+ PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
318
+ */
317 319
 
318 320
 int
319 321
 pcre_get_named_substring(const pcre *code, const char *subject, int *ovector,
320
-  int stringcount, const char *stringname, const char **stringptr)
322
+                         int stringcount, const char *stringname, const char **stringptr)
321 323
 {
322
-int n = pcre_get_stringnumber(code, stringname);
323
-if (n <= 0) return n;
324
-return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
324
+    int n = pcre_get_stringnumber(code, stringname);
325
+    if (n <= 0) return n;
326
+    return pcre_get_substring(subject, ovector, stringcount, n, stringptr);
325 327
 }
326 328
 
327 329
 
328 330
 
329 331
 
330 332
 /*************************************************
331
-*       Free store obtained by get_substring     *
332
-*************************************************/
333
+ *       Free store obtained by get_substring     *
334
+ *************************************************/
333 335
 
334 336
 /* This function exists for the benefit of people calling PCRE from non-C
335
-programs that can call its functions, but not free() or (pcre_free)() directly.
336
-
337
-Argument:   the result of a previous pcre_get_substring()
338
-Returns:    nothing
339
-*/
337
+ programs that can call its functions, but not free() or (pcre_free)() directly.
338
+ 
339
+ Argument:   the result of a previous pcre_get_substring()
340
+ Returns:    nothing
341
+ */
340 342
 
341 343
 void
342 344
 pcre_free_substring(const char *pointer)
343 345
 {
344
-(pcre_free)((void *)pointer);
346
+    (pcre_free)((void *)pointer);
345 347
 }
346 348
 
347 349
 /* End of get.c */
... ...
@@ -1,47 +1,47 @@
1 1
 /*************************************************
2
-*      Perl-Compatible Regular Expressions       *
3
-*************************************************/
2
+ *      Perl-Compatible Regular Expressions       *
3
+ *************************************************/
4 4
 
5 5
 
6 6
 /* This is a library of functions to support regular expressions whose syntax
7
-and semantics are as close as possible to those of the Perl 5 language. See
8
-the file Tech.Notes for some information on the internals.
9
-
10
-Written by: Philip Hazel <ph10@cam.ac.uk>
11
-
12
-           Copyright (c) 1997-2003 University of Cambridge
13
-
14
-Permission is granted to anyone to use this software for any purpose on any
15
-computer system, and to redistribute it freely, subject to the following
16
-restrictions:
17
-
18
-1. This software is distributed in the hope that it will be useful,
19
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
20
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
21
-
22
-2. The origin of this software must not be misrepresented, either by
23
-   explicit claim or by omission.
24
-
25
-3. Altered versions must be plainly marked as such, and must not be
26
-   misrepresented as being the original software.
27
-
28
-4. If PCRE is embedded in any software that is released under the GNU
29
-   General Purpose Licence (GPL), then the terms of that licence shall
30
-   supersede any condition above with which it is incompatible.
31
-*/
7
+ and semantics are as close as possible to those of the Perl 5 language. See
8
+ the file Tech.Notes for some information on the internals.
9
+ 
10
+ Written by: Philip Hazel <ph10@cam.ac.uk>
11
+ 
12
+ Copyright (c) 1997-2003 University of Cambridge
13
+ 
14
+ -----------------------------------------------------------------------------
15
+ Permission is granted to anyone to use this software for any purpose on any
16
+ computer system, and to redistribute it freely, subject to the following
17
+ restrictions:
18
+ 
19
+ 1. This software is distributed in the hope that it will be useful,
20
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
+ 
23
+ 2. The origin of this software must not be misrepresented, either by
24
+ explicit claim or by omission.
25
+ 
26
+ 3. Altered versions must be plainly marked as such, and must not be
27
+ misrepresented as being the original software.
28
+ 
29
+ 4. If PCRE is embedded in any software that is released under the GNU
30
+ General Purpose Licence (GPL), then the terms of that licence shall
31
+ supersede any condition above with which it is incompatible.
32
+ -----------------------------------------------------------------------------
33
+ */
32 34
 
33 35
 /* This header contains definitions that are shared between the different
34
-modules, but which are not relevant to the outside. */
36
+ modules, but which are not relevant to the outside. */
35 37
 
36 38
 /* Get the definitions provided by running "configure" */
37 39
 
38 40
 #include "config.h"
39 41
 
40 42
 /* When compiling for use with the Virtual Pascal compiler, these functions
41
-need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
42
-option on the command line. */
43
+ need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
44
+ option on the command line. */
43 45
 
44 46
 #ifdef VPCOMPAT
45 47
 #define strncmp(s1,s2,m) _strncmp(s1,s2,m)
... ...
@@ -51,11 +51,11 @@ option on the command line. */
51 51
 #else  /* VPCOMPAT */
52 52
 
53 53
 /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
54
-define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
55
-is set. Otherwise, include an emulating function for those systems that have
56
-neither (there some non-Unix environments where this is the case). This assumes
57
-that all calls to memmove are moving strings upwards in store, which is the
58
-case in PCRE. */
54
+ define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
55
+ is set. Otherwise, include an emulating function for those systems that have
56
+ neither (there are some non-Unix environments where this is the case). This assumes
57
+ that all calls to memmove are moving strings upwards in store, which is the
58
+ case in PCRE. */
59 59
 
60 60
 #if ! HAVE_MEMMOVE
61 61
 #undef  memmove        /* some systems may have a macro */
... ...
@@ -65,10 +65,10 @@ case in PCRE. */
65 65
 void *
66 66
 pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
67 67
 {
68
-int i;
69
-dest += n;
70
-src += n;
71
-for (i = 0; i < n; ++i) *(--dest) =  *(--src);
68
+    int i;
69
+    dest += n;
70
+    src += n;
71
+    for (i = 0; i < n; ++i) *(--dest) =  *(--src);
72 72
 }
73 73
 #define memmove(a, b, c) pcre_memmove(a, b, c)
74 74
 #endif   /* not HAVE_BCOPY */
... ...
@@ -77,25 +77,25 @@ for (i = 0; i < n; ++i) *(--dest) =  *(--src);
77 77
 
78 78
 
79 79
 /* PCRE keeps offsets in its compiled code as 2-byte quantities by default.
80
-These are used, for example, to link from the start of a subpattern to its
81
-alternatives and its end. The use of 2 bytes per offset limits the size of the
82
-compiled regex to around 64K, which is big enough for almost everybody.
83
-However, I received a request for an even bigger limit. For this reason, and
84
-also to make the code easier to maintain, the storing and loading of offsets
85
-from the byte string is now handled by the macros that are defined here.
86
-
87
-The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
88
-the config.h file, but can be overridden by using -D on the command line. This
89
-is automated on Unix systems via the "configure" command. */
80
+ These are used, for example, to link from the start of a subpattern to its
81
+ alternatives and its end. The use of 2 bytes per offset limits the size of the
82
+ compiled regex to around 64K, which is big enough for almost everybody.
83
+ However, I received a request for an even bigger limit. For this reason, and
84
+ also to make the code easier to maintain, the storing and loading of offsets
85
+ from the byte string is now handled by the macros that are defined here.
86
+ 
87
+ The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
88
+ the config.h file, but can be overridden by using -D on the command line. This
89
+ is automated on Unix systems via the "configure" command. */
90 90
 
91 91
 #if LINK_SIZE == 2
92 92
 
93 93
 #define PUT(a,n,d)   \
94
-  (a[n] = (d) >> 8), \
95
-  (a[(n)+1] = (d) & 255)
94
+(a[n] = (d) >> 8), \
95
+(a[(n)+1] = (d) & 255)
96 96
 
97 97
 #define GET(a,n) \
98
-  (((a)[n] << 8) | (a)[(n)+1])
98
+(((a)[n] << 8) | (a)[(n)+1])
99 99
 
100 100
 #define MAX_PATTERN_SIZE (1 << 16)
101 101
 
... ...
@@ -103,12 +103,12 @@ is automated on Unix systems via the "configure" command. */
103 103
 #elif LINK_SIZE == 3
104 104
 
105 105
 #define PUT(a,n,d)       \
106
-  (a[n] = (d) >> 16),    \
107
-  (a[(n)+1] = (d) >> 8), \
108
-  (a[(n)+2] = (d) & 255)
106
+(a[n] = (d) >> 16),    \
107
+(a[(n)+1] = (d) >> 8), \
108
+(a[(n)+2] = (d) & 255)
109 109
 
110 110
 #define GET(a,n) \
111
-  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
111
+(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
112 112
 
113 113
 #define MAX_PATTERN_SIZE (1 << 24)
114 114
 
... ...
@@ -116,13 +116,13 @@ is automated on Unix systems via the "configure" command. */
116 116
 #elif LINK_SIZE == 4
117 117
 
118 118
 #define PUT(a,n,d)        \
119
-  (a[n] = (d) >> 24),     \
120
-  (a[(n)+1] = (d) >> 16), \
121
-  (a[(n)+2] = (d) >> 8),  \
122
-  (a[(n)+3] = (d) & 255)
119
+(a[n] = (d) >> 24),     \
120
+(a[(n)+1] = (d) >> 16), \
121
+(a[(n)+2] = (d) >> 8),  \
122
+(a[(n)+3] = (d) & 255)
123 123
 
124 124
 #define GET(a,n) \
125
-  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
125
+(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
126 126
 
127 127
 #define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
128 128
 
... ...
@@ -138,15 +138,15 @@ is automated on Unix systems via the "configure" command. */
138 138
 
139 139
 
140 140
 /* PCRE uses some other 2-byte quantities that do not change when the size of
141
-offsets changes. There are used for repeat counts and for other things such as
142
-capturing parenthesis numbers in back references. */
141
+ offsets changes. These are used for repeat counts and for other things such as
142
+ capturing parenthesis numbers in back references. */
143 143
 
144 144
 #define PUT2(a,n,d)   \
145
-  a[n] = (d) >> 8; \
146
-  a[(n)+1] = (d) & 255
145
+a[n] = (d) >> 8; \
146
+a[(n)+1] = (d) & 255
147 147
 
148 148
 #define GET2(a,n) \
149
-  (((a)[n] << 8) | (a)[(n)+1])
149
+(((a)[n] << 8) | (a)[(n)+1])
150 150
 
151 151
 #define PUT2INC(a,n,d)  PUT2(a,n,d), a += 2
152 152
 
... ...
@@ -167,7 +167,7 @@ capturing parenthesis numbers in back references. */
167 167
 #include "pcre.h"
168 168
 
169 169
 /* In case there is no definition of offsetof() provided - though any proper
170
-Standard C system should have one. */
170
+ Standard C system should have one. */
171 171
 
172 172
 #ifndef offsetof
173 173
 #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
... ...
@@ -178,10 +178,10 @@ Standard C system should have one. */
178 178
 #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
179 179
 
180 180
 /* Private options flags start at the most significant end of the four bytes,
181
-but skip the top bit so we can use ints for convenience without getting tangled
182
-with negative values. The public options defined in pcre.h start at the least
183
-significant end. Make sure they don't overlap, though now that we have expanded
184
-to four bytes there is plenty of space. */
181
+ but skip the top bit so we can use ints for convenience without getting tangled
182
+ with negative values. The public options defined in pcre.h start at the least
183
+ significant end. Make sure they don't overlap, though now that we have expanded
184
+ to four bytes there is plenty of space. */
185 185
 
186 186
 #define PCRE_FIRSTSET      0x40000000  /* first_byte is set */
187 187
 #define PCRE_REQCHSET      0x20000000  /* req_byte is set */
... ...
@@ -193,15 +193,15 @@ to four bytes there is plenty of space. */
193 193
 #define PCRE_STUDY_MAPPED   0x01     /* a map of starting chars exists */
194 194
 
195 195
 /* Masks for identifying the public options which are permitted at compile
196
-time, run time or study time, respectively. */
196
+ time, run time or study time, respectively. */
197 197
 
198 198
 #define PUBLIC_OPTIONS \
199
-  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
200
-   PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
201
-   PCRE_NO_AUTO_CAPTURE)
199
+(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
200
+PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
201
+PCRE_NO_AUTO_CAPTURE)
202 202
 
203 203
 #define PUBLIC_EXEC_OPTIONS \
204
-  (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY)
204
+(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY)
205 205
 
206 206
 #define PUBLIC_STUDY_OPTIONS 0   /* None defined */
207 207
 
... ...
@@ -215,7 +215,7 @@ time, run time or study time, respectively. */
215 215
 #define REQ_NONE  (-1)
216 216
 
217 217
 /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
218
-variable-length repeat, or a anything other than literal characters. */
218
+ variable-length repeat, or anything other than literal characters. */
219 219
 
220 220
 #define REQ_CASELESS 0x0100    /* indicates caselessness */
221 221
 #define REQ_VARY     0x0200    /* reqbyte followed non-literal item */
... ...
@@ -228,8 +228,8 @@ typedef int BOOL;
228 228
 #define TRUE    1
229 229
 
230 230
 /* Escape items that are just an encoding of a particular data value. Note that
231
-ESC_n is defined as yet another macro, which is set in config.h to either \n
232
-(the default) or \r (which some people want). */
231
+ ESC_n is defined as yet another macro, which is set in config.h to either \n
232
+ (the default) or \r (which some people want). */
233 233
 
234 234
 #ifndef ESC_e
235 235
 #define ESC_e 27
... ...
@@ -248,28 +248,28 @@ ESC_n is defined as yet another macro, which is set in config.h to either \n
248 248
 #endif
249 249
 
250 250
 /* We can't officially use ESC_t because it is a POSIX reserved identifier
251
-(presumably because of all the others like size_t). */
251
+ (presumably because of all the others like size_t). */
252 252
 
253 253
 #ifndef ESC_tee
254 254
 #define ESC_tee '\t'
255 255
 #endif
256 256
 
257 257
 /* These are escaped items that aren't just an encoding of a particular data
258
-value such as \n. They must have non-zero values, as check_escape() returns
259
-their negation. Also, they must appear in the same order as in the opcode
260
-definitions below, up to ESC_z. There's a dummy for OP_ANY because it
261
-corresponds to "." rather than an escape sequence. The final one must be
262
-ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two
263
-tests in the code for an escape greater than ESC_b and less than ESC_Z to
264
-detect the types that may be repeated. These are the types that consume a
265
-character. If any new escapes are put in between that don't consume a
266
-character, that code will have to change. */
258
+ value such as \n. They must have non-zero values, as check_escape() returns
259
+ their negation. Also, they must appear in the same order as in the opcode
260
+ definitions below, up to ESC_z. There's a dummy for OP_ANY because it
261
+ corresponds to "." rather than an escape sequence. The final one must be
262
+ ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
263
+ tests in the code for an escape greater than ESC_b and less than ESC_Z to
264
+ detect the types that may be repeated. These are the types that consume a
265
+ character. If any new escapes are put in between that don't consume a
266
+ character, that code will have to change. */
267 267
 
268 268
 enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
269
-       ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF };
269
+    ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF };
270 270
 
271 271
 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
272
-contain UTF-8 characters with values greater than 255. */
272
+ contain UTF-8 characters with values greater than 255. */
273 273
 
274 274
 #define XCL_NOT    0x01    /* Flag: this is a negative class */
275 275
 #define XCL_MAP    0x02    /* Flag: a 32-byte map is present */
... ...
@@ -280,197 +280,197 @@ contain UTF-8 characters with values greater than 255. */
280 280
 
281 281
 
282 282
 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
283
-that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
284
-OP_EOD must correspond in order to the list of escapes immediately above.
285
-Note that whenever this list is updated, the two macro definitions that follow
286
-must also be updated to match. */
283
+ that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
284
+ OP_EOD must correspond in order to the list of escapes immediately above.
285
+ Note that whenever this list is updated, the two macro definitions that follow
286
+ must also be updated to match. */
287 287
 
288 288
 enum {
289
-  OP_END,            /* 0 End of pattern */
290
-
291
-  /* Values corresponding to backslashed metacharacters */
292
-
293
-  OP_SOD,            /* 1 Start of data: \A */
294
-  OP_SOM,            /* 2 Start of match (subject + offset): \G */
295
-  OP_NOT_WORD_BOUNDARY,  /*  3 \B */
296
-  OP_WORD_BOUNDARY,      /*  4 \b */
297
-  OP_NOT_DIGIT,          /*  5 \D */
298
-  OP_DIGIT,              /*  6 \d */
299
-  OP_NOT_WHITESPACE,     /*  7 \S */
300
-  OP_WHITESPACE,         /*  8 \s */
301
-  OP_NOT_WORDCHAR,       /*  9 \W */
302
-  OP_WORDCHAR,           /* 10 \w */
303
-  OP_ANY,            /* 11 Match any character */
304
-  OP_ANYBYTE,        /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
305
-  OP_EODN,           /* 13 End of data or \n at end of data: \Z. */
306
-  OP_EOD,            /* 14 End of data: \z */
307
-
308
-  OP_OPT,            /* 15 Set runtime options */
309
-  OP_CIRC,           /* 16 Start of line - varies with multiline switch */
310
-  OP_DOLL,           /* 17 End of line - varies with multiline switch */
311
-  OP_CHARS,          /* 18 Match string of characters */
312
-  OP_NOT,            /* 19 Match anything but the following char */
313
-
314
-  OP_STAR,           /* 20 The maximizing and minimizing versions of */
315
-  OP_MINSTAR,        /* 21 all these opcodes must come in pairs, with */
316
-  OP_PLUS,           /* 22 the minimizing one second. */
317
-  OP_MINPLUS,        /* 23 This first set applies to single characters */
318
-  OP_QUERY,          /* 24 */
319
-  OP_MINQUERY,       /* 25 */
320
-  OP_UPTO,           /* 26 From 0 to n matches */
321
-  OP_MINUPTO,        /* 27 */
322
-  OP_EXACT,          /* 28 Exactly n matches */
323
-
324
-  OP_NOTSTAR,        /* 29 The maximizing and minimizing versions of */
325
-  OP_NOTMINSTAR,     /* 30 all these opcodes must come in pairs, with */
326
-  OP_NOTPLUS,        /* 31 the minimizing one second. */
327
-  OP_NOTMINPLUS,     /* 32 This set applies to "not" single characters */
328
-  OP_NOTQUERY,       /* 33 */
329
-  OP_NOTMINQUERY,    /* 34 */
330
-  OP_NOTUPTO,        /* 35 From 0 to n matches */
331
-  OP_NOTMINUPTO,     /* 36 */
332
-  OP_NOTEXACT,       /* 37 Exactly n matches */
333
-
334
-  OP_TYPESTAR,       /* 38 The maximizing and minimizing versions of */
335
-  OP_TYPEMINSTAR,    /* 39 all these opcodes must come in pairs, with */
336
-  OP_TYPEPLUS,       /* 40 the minimizing one second. These codes must */
337
-  OP_TYPEMINPLUS,    /* 41 be in exactly the same order as those above. */
338
-  OP_TYPEQUERY,      /* 42 This set applies to character types such as \d */
339
-  OP_TYPEMINQUERY,   /* 43 */
340
-  OP_TYPEUPTO,       /* 44 From 0 to n matches */
341
-  OP_TYPEMINUPTO,    /* 45 */
342
-  OP_TYPEEXACT,      /* 46 Exactly n matches */
343
-
344
-  OP_CRSTAR,         /* 47 The maximizing and minimizing versions of */
345
-  OP_CRMINSTAR,      /* 48 all these opcodes must come in pairs, with */
346
-  OP_CRPLUS,         /* 49 the minimizing one second. These codes must */
347
-  OP_CRMINPLUS,      /* 50 be in exactly the same order as those above. */
348
-  OP_CRQUERY,        /* 51 These are for character classes and back refs */
349
-  OP_CRMINQUERY,     /* 52 */
350
-  OP_CRRANGE,        /* 53 These are different to the three seta above. */
351
-  OP_CRMINRANGE,     /* 54 */
352
-
353
-  OP_CLASS,          /* 55 Match a character class, chars < 256 only */
354
-  OP_NCLASS,         /* 56 Same, but the bitmap was created from a negative
355
-                           class - the difference is relevant only when a UTF-8
356
-                           character > 255 is encountered. */
357
-
358
-  OP_XCLASS,         /* 56 Extended class for handling UTF-8 chars within the
359
-                           class. This does both positive and negative. */
360
-
361
-  OP_REF,            /* 57 Match a back reference */
362
-  OP_RECURSE,        /* 58 Match a numbered subpattern (possibly recursive) */
363
-  OP_CALLOUT,        /* 59 Call out to external function if provided */
364
-
365
-  OP_ALT,            /* 60 Start of alternation */
366
-  OP_KET,            /* 61 End of group that doesn't have an unbounded repeat */
367
-  OP_KETRMAX,        /* 62 These two must remain together and in this */
368
-  OP_KETRMIN,        /* 63 order. They are for groups the repeat for ever. */
369
-
370
-  /* The assertions must come before ONCE and COND */
371
-
372
-  OP_ASSERT,         /* 64 Positive lookahead */
373
-  OP_ASSERT_NOT,     /* 65 Negative lookahead */
374
-  OP_ASSERTBACK,     /* 66 Positive lookbehind */
375
-  OP_ASSERTBACK_NOT, /* 67 Negative lookbehind */
376
-  OP_REVERSE,        /* 68 Move pointer back - used in lookbehind assertions */
377
-
378
-  /* ONCE and COND must come after the assertions, with ONCE first, as there's
379
-  a test for >= ONCE for a subpattern that isn't an assertion. */
380
-
381
-  OP_ONCE,           /* 69 Once matched, don't back up into the subpattern */
382
-  OP_COND,           /* 70 Conditional group */
383
-  OP_CREF,           /* 71 Used to hold an extraction string number (cond ref) */
384
-
385
-  OP_BRAZERO,        /* 72 These two must remain together and in this */
386
-  OP_BRAMINZERO,     /* 73 order. */
387
-
388
-  OP_BRANUMBER,      /* 74 Used for extracting brackets whose number is greater
389
-                           than can fit into an opcode. */
390
-
391
-  OP_BRA             /* 75 This and greater values are used for brackets that
392
-                           extract substrings up to a basic limit. After that,
393
-                           use is made of OP_BRANUMBER. */
289
+    OP_END,            /* 0 End of pattern */
290
+    
291
+    /* Values corresponding to backslashed metacharacters */
292
+    
293
+    OP_SOD,            /* 1 Start of data: \A */
294
+    OP_SOM,            /* 2 Start of match (subject + offset): \G */
295
+    OP_NOT_WORD_BOUNDARY,  /*  3 \B */
296
+    OP_WORD_BOUNDARY,      /*  4 \b */
297
+    OP_NOT_DIGIT,          /*  5 \D */
298
+    OP_DIGIT,              /*  6 \d */
299
+    OP_NOT_WHITESPACE,     /*  7 \S */
300
+    OP_WHITESPACE,         /*  8 \s */
301
+    OP_NOT_WORDCHAR,       /*  9 \W */
302
+    OP_WORDCHAR,           /* 10 \w */
303
+    OP_ANY,            /* 11 Match any character */
304
+    OP_ANYBYTE,        /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
305
+    OP_EODN,           /* 13 End of data or \n at end of data: \Z. */
306
+    OP_EOD,            /* 14 End of data: \z */
307
+    
308
+    OP_OPT,            /* 15 Set runtime options */
309
+    OP_CIRC,           /* 16 Start of line - varies with multiline switch */
310
+    OP_DOLL,           /* 17 End of line - varies with multiline switch */
311
+    OP_CHARS,          /* 18 Match string of characters */
312
+    OP_NOT,            /* 19 Match anything but the following char */
313
+    
314
+    OP_STAR,           /* 20 The maximizing and minimizing versions of */
315
+    OP_MINSTAR,        /* 21 all these opcodes must come in pairs, with */
316
+    OP_PLUS,           /* 22 the minimizing one second. */
317
+    OP_MINPLUS,        /* 23 This first set applies to single characters */
318
+    OP_QUERY,          /* 24 */
319
+    OP_MINQUERY,       /* 25 */
320
+    OP_UPTO,           /* 26 From 0 to n matches */
321
+    OP_MINUPTO,        /* 27 */
322
+    OP_EXACT,          /* 28 Exactly n matches */
323
+    
324
+    OP_NOTSTAR,        /* 29 The maximizing and minimizing versions of */
325
+    OP_NOTMINSTAR,     /* 30 all these opcodes must come in pairs, with */
326
+    OP_NOTPLUS,        /* 31 the minimizing one second. */
327
+    OP_NOTMINPLUS,     /* 32 This set applies to "not" single characters */
328
+    OP_NOTQUERY,       /* 33 */
329
+    OP_NOTMINQUERY,    /* 34 */
330
+    OP_NOTUPTO,        /* 35 From 0 to n matches */
331
+    OP_NOTMINUPTO,     /* 36 */
332
+    OP_NOTEXACT,       /* 37 Exactly n matches */
333
+    
334
+    OP_TYPESTAR,       /* 38 The maximizing and minimizing versions of */
335
+    OP_TYPEMINSTAR,    /* 39 all these opcodes must come in pairs, with */
336
+    OP_TYPEPLUS,       /* 40 the minimizing one second. These codes must */
337
+    OP_TYPEMINPLUS,    /* 41 be in exactly the same order as those above. */
338
+    OP_TYPEQUERY,      /* 42 This set applies to character types such as \d */
339
+    OP_TYPEMINQUERY,   /* 43 */
340
+    OP_TYPEUPTO,       /* 44 From 0 to n matches */
341
+    OP_TYPEMINUPTO,    /* 45 */
342
+    OP_TYPEEXACT,      /* 46 Exactly n matches */
343
+    
344
+    OP_CRSTAR,         /* 47 The maximizing and minimizing versions of */
345
+    OP_CRMINSTAR,      /* 48 all these opcodes must come in pairs, with */
346
+    OP_CRPLUS,         /* 49 the minimizing one second. These codes must */
347
+    OP_CRMINPLUS,      /* 50 be in exactly the same order as those above. */
348
+    OP_CRQUERY,        /* 51 These are for character classes and back refs */
349
+    OP_CRMINQUERY,     /* 52 */
350
+    OP_CRRANGE,        /* 53 These are different to the three sets above. */
351
+    OP_CRMINRANGE,     /* 54 */
352
+    
353
+    OP_CLASS,          /* 55 Match a character class, chars < 256 only */
354
+    OP_NCLASS,         /* 56 Same, but the bitmap was created from a negative
355
+                        class - the difference is relevant only when a UTF-8
356
+                        character > 255 is encountered. */
357
+    
358
+    OP_XCLASS,         /* 56 Extended class for handling UTF-8 chars within the
359
+                        class. This does both positive and negative. */
360
+    
361
+    OP_REF,            /* 57 Match a back reference */
362
+    OP_RECURSE,        /* 58 Match a numbered subpattern (possibly recursive) */
363
+    OP_CALLOUT,        /* 59 Call out to external function if provided */
364
+    
365
+    OP_ALT,            /* 60 Start of alternation */
366
+    OP_KET,            /* 61 End of group that doesn't have an unbounded repeat */
367
+    OP_KETRMAX,        /* 62 These two must remain together and in this */
368
+    OP_KETRMIN,        /* 63 order. They are for groups that repeat for ever. */
369
+    
370
+    /* The assertions must come before ONCE and COND */
371
+    
372
+    OP_ASSERT,         /* 64 Positive lookahead */
373
+    OP_ASSERT_NOT,     /* 65 Negative lookahead */
374
+    OP_ASSERTBACK,     /* 66 Positive lookbehind */
375
+    OP_ASSERTBACK_NOT, /* 67 Negative lookbehind */
376
+    OP_REVERSE,        /* 68 Move pointer back - used in lookbehind assertions */
377
+    
378
+    /* ONCE and COND must come after the assertions, with ONCE first, as there's
379
+     a test for >= ONCE for a subpattern that isn't an assertion. */
380
+    
381
+    OP_ONCE,           /* 69 Once matched, don't back up into the subpattern */
382
+    OP_COND,           /* 70 Conditional group */
383
+    OP_CREF,           /* 71 Used to hold an extraction string number (cond ref) */
384
+    
385
+    OP_BRAZERO,        /* 72 These two must remain together and in this */
386
+    OP_BRAMINZERO,     /* 73 order. */
387
+    
388
+    OP_BRANUMBER,      /* 74 Used for extracting brackets whose number is greater
389
+                        than can fit into an opcode. */
390
+    
391
+    OP_BRA             /* 75 This and greater values are used for brackets that
392
+                        extract substrings up to a basic limit. After that,
393
+                        use is made of OP_BRANUMBER. */
394 394
 };
395 395
 
396 396
 /* WARNING: There is an implicit assumption in study.c that all opcodes are
397
-less than 128 in value. This makes handling UTF-8 character sequences easier.
398
-*/
397
+ less than 128 in value. This makes handling UTF-8 character sequences easier.
398
+ */
399 399
 
400 400
 
401 401
 /* This macro defines textual names for all the opcodes. These are used only
402
-for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The
403
-macro is referenced only in printint.c. */
402
+ for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The
403
+ macro is referenced only in printint.c. */
404 404
 
405 405
 #define OP_NAME_LIST \
406
-  "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                \
407
-  "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z",     \
408
-  "Opt", "^", "$", "chars", "not",                                \
409
-  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
410
-  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
411
-  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
412
-  "*", "*?", "+", "+?", "?", "??", "{", "{",                      \
413
-  "class", "nclass", "xclass", "Ref", "Recurse", "Callout",       \
414
-  "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",     \
415
-  "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
416
-  "Brazero", "Braminzero", "Branumber", "Bra"
406
+"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                \
407
+"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z",     \
408
+"Opt", "^", "$", "chars", "not",                                \
409
+"*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
410
+"*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
411
+"*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
412
+"*", "*?", "+", "+?", "?", "??", "{", "{",                      \
413
+"class", "nclass", "xclass", "Ref", "Recurse", "Callout",       \
414
+"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",     \
415
+"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
416
+"Brazero", "Braminzero", "Branumber", "Bra"
417 417
 
418 418
 
419 419
 /* This macro defines the length of fixed length operations in the compiled
420
-regex. The lengths are used when searching for specific things, and also in the
421
-debugging printing of a compiled regex. We use a macro so that it can be
422
-incorporated both into pcre.c and pcretest.c without being publicly exposed.
423
-
424
-As things have been extended, some of these are no longer fixed lenths, but are
425
-minima instead. For example, the length of a single-character repeat may vary
426
-in UTF-8 mode. The code that uses this table must know about such things. */
420
+ regex. The lengths are used when searching for specific things, and also in the
421
+ debugging printing of a compiled regex. We use a macro so that it can be
422
+ incorporated both into pcre.c and pcretest.c without being publicly exposed.
423
+ 
424
+ As things have been extended, some of these are no longer fixed lengths, but are
425
+ minima instead. For example, the length of a single-character repeat may vary
426
+ in UTF-8 mode. The code that uses this table must know about such things. */
427 427
 
428 428
 #define OP_LENGTHS \
429
-  1,                             /* End                                    */ \
430
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \
431
-  1, 1, 1, 1, 2, 1, 1,           /* Any, Anybyte, \Z, \z, Opt, ^, $        */ \
432
-  2,                             /* Chars - the minimum length             */ \
433
-  2,                             /* not                                    */ \
434
-  /* Positive single-char repeats                                          */ \
435
-  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??      ** These are  */ \
436
-  4, 4, 4,                       /* upto, minupto, exact     ** minima     */ \
437
-  /* Negative single-char repeats                                          */ \
438
-  2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ??                */ \
439
-  4, 4, 4,                       /* NOT upto, minupto, exact               */ \
440
-  /* Positive type repeats                                                 */ \
441
-  2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ??               */ \
442
-  4, 4, 4,                       /* Type upto, minupto, exact              */ \
443
-  /* Character class & ref repeats                                         */ \
444
-  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */ \
445
-  5, 5,                          /* CRRANGE, CRMINRANGE                    */ \
446
- 33,                             /* CLASS                                  */ \
447
- 33,                             /* NCLASS                                 */ \
448
-  0,                             /* XCLASS - variable length               */ \
449
-  3,                             /* REF                                    */ \
450
-  1+LINK_SIZE,                   /* RECURSE                                */ \
451
-  2,                             /* CALLOUT                                */ \
452
-  1+LINK_SIZE,                   /* Alt                                    */ \
453
-  1+LINK_SIZE,                   /* Ket                                    */ \
454
-  1+LINK_SIZE,                   /* KetRmax                                */ \
455
-  1+LINK_SIZE,                   /* KetRmin                                */ \
456
-  1+LINK_SIZE,                   /* Assert                                 */ \
457
-  1+LINK_SIZE,                   /* Assert not                             */ \
458
-  1+LINK_SIZE,                   /* Assert behind                          */ \
459
-  1+LINK_SIZE,                   /* Assert behind not                      */ \
460
-  1+LINK_SIZE,                   /* Reverse                                */ \
461
-  1+LINK_SIZE,                   /* Once                                   */ \
462
-  1+LINK_SIZE,                   /* COND                                   */ \
463
-  3,                             /* CREF                                   */ \
464
-  1, 1,                          /* BRAZERO, BRAMINZERO                    */ \
465
-  3,                             /* BRANUMBER                              */ \
466
-  1+LINK_SIZE                    /* BRA                                    */ \
429
+1,                             /* End                                    */ \
430
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \
431
+1, 1, 1, 1, 2, 1, 1,           /* Any, Anybyte, \Z, \z, Opt, ^, $        */ \
432
+2,                             /* Chars - the minimum length             */ \
433
+2,                             /* not                                    */ \
434
+/* Positive single-char repeats                                          */ \
435
+2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??      ** These are  */ \
436
+4, 4, 4,                       /* upto, minupto, exact     ** minima     */ \
437
+/* Negative single-char repeats                                          */ \
438
+2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ??                */ \
439
+4, 4, 4,                       /* NOT upto, minupto, exact               */ \
440
+/* Positive type repeats                                                 */ \
441
+2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ??               */ \
442
+4, 4, 4,                       /* Type upto, minupto, exact              */ \
443
+/* Character class & ref repeats                                         */ \
444
+1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */ \
445
+5, 5,                          /* CRRANGE, CRMINRANGE                    */ \
446
+33,                             /* CLASS                                  */ \
447
+33,                             /* NCLASS                                 */ \
448
+0,                             /* XCLASS - variable length               */ \
449
+3,                             /* REF                                    */ \
450
+1+LINK_SIZE,                   /* RECURSE                                */ \
451
+2,                             /* CALLOUT                                */ \
452
+1+LINK_SIZE,                   /* Alt                                    */ \
453
+1+LINK_SIZE,                   /* Ket                                    */ \
454
+1+LINK_SIZE,                   /* KetRmax                                */ \
455
+1+LINK_SIZE,                   /* KetRmin                                */ \
456
+1+LINK_SIZE,                   /* Assert                                 */ \
457
+1+LINK_SIZE,                   /* Assert not                             */ \
458
+1+LINK_SIZE,                   /* Assert behind                          */ \
459
+1+LINK_SIZE,                   /* Assert behind not                      */ \
460
+1+LINK_SIZE,                   /* Reverse                                */ \
461
+1+LINK_SIZE,                   /* Once                                   */ \
462
+1+LINK_SIZE,                   /* COND                                   */ \
463
+3,                             /* CREF                                   */ \
464
+1, 1,                          /* BRAZERO, BRAMINZERO                    */ \
465
+3,                             /* BRANUMBER                              */ \
466
+1+LINK_SIZE                    /* BRA                                    */ \
467 467
 
468 468
 
469 469
 /* The highest extraction number before we have to start using additional
470
-bytes. (Originally PCRE didn't have support for extraction counts highter than
471
-this number.) The value is limited by the number of opcodes left after OP_BRA,
472
-i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
473
-opcodes. */
470
+ bytes. (Originally PCRE didn't have support for extraction counts higher than
471
+ this number.) The value is limited by the number of opcodes left after OP_BRA,
472
+ i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
473
+ opcodes. */
474 474
 
475 475
 #define EXTRACT_BASIC_MAX  150
476 476
 
... ...
@@ -479,9 +479,9 @@ opcodes. */
479 479
 #define CREF_RECURSE  0xffff
480 480
 
481 481
 /* The texts of compile-time error messages are defined as macros here so that
482
-they can be accessed by the POSIX wrapper and converted into error codes.  Yes,
483
-I could have used error codes in the first place, but didn't feel like changing
484
-just to accommodate the POSIX wrapper. */
482
+ they can be accessed by the POSIX wrapper and converted into error codes.  Yes,
483
+ I could have used error codes in the first place, but didn't feel like changing
484
+ just to accommodate the POSIX wrapper. */
485 485
 
486 486
 #define ERR1  "\\ at end of pattern"
487 487
 #define ERR2  "\\c at end of pattern"
... ...
@@ -528,102 +528,102 @@ just to accommodate the POSIX wrapper. */
528 528
 #define ERR43 "two named groups have the same name"
529 529
 
530 530
 /* All character handling must be done as unsigned characters. Otherwise there
531
-are problems with top-bit-set characters and functions such as isspace().
532
-However, we leave the interface to the outside world as char *, because that
533
-should make things easier for callers. We define a short type for unsigned char
534
-to save lots of typing. I tried "uchar", but it causes problems on Digital
535
-Unix, where it is defined in sys/types, so use "uschar" instead. */
531
+ are problems with top-bit-set characters and functions such as isspace().
532
+ However, we leave the interface to the outside world as char *, because that
533
+ should make things easier for callers. We define a short type for unsigned char
534
+ to save lots of typing. I tried "uchar", but it causes problems on Digital
535
+ Unix, where it is defined in sys/types, so use "uschar" instead. */
536 536
 
537 537
 typedef unsigned char uschar;
538 538
 
539 539
 /* The real format of the start of the pcre block; the index of names and the
540
-code vector run on as long as necessary after the end. */
540
+ code vector run on as long as necessary after the end. */
541 541
 
542 542
 typedef struct real_pcre {
543
-  unsigned long int magic_number;
544
-  size_t size;                        /* Total that was malloced */
545
-  const unsigned char *tables;        /* Pointer to tables */
546
-  unsigned long int options;
547
-  unsigned short int top_bracket;
548
-  unsigned short int top_backref;
549
-  unsigned short int first_byte;
550
-  unsigned short int req_byte;
551
-  unsigned short int name_entry_size; /* Size of any name items; 0 => none */
552
-  unsigned short int name_count;      /* Number of name items */
543
+    unsigned long int magic_number;
544
+    size_t size;                        /* Total that was malloced */
545
+    const unsigned char *tables;        /* Pointer to tables */
546
+    unsigned long int options;
547
+    unsigned short int top_bracket;
548
+    unsigned short int top_backref;
549
+    unsigned short int first_byte;
550
+    unsigned short int req_byte;
551
+    unsigned short int name_entry_size; /* Size of any name items; 0 => none */
552
+    unsigned short int name_count;      /* Number of name items */
553 553
 } real_pcre;
554 554
 
555 555
 /* The format of the block used to store data from pcre_study(). */
556 556
 
557 557
 typedef struct pcre_study_data {
558
-  size_t size;                        /* Total that was malloced */
559
-  uschar options;
560
-  uschar start_bits[32];
558
+    size_t size;                        /* Total that was malloced */
559
+    uschar options;
560
+    uschar start_bits[32];
561 561
 } pcre_study_data;
562 562
 
563 563
 /* Structure for passing "static" information around between the functions
564
-doing the compiling, so that they are thread-safe. */
564
+ doing the compiling, so that they are thread-safe. */
565 565
 
566 566
 typedef struct compile_data {
567
-  const uschar *lcc;            /* Points to lower casing table */
568
-  const uschar *fcc;            /* Points to case-flipping table */
569
-  const uschar *cbits;          /* Points to character type table */
570
-  const uschar *ctypes;         /* Points to table of type maps */
571
-  const uschar *start_code;     /* The start of the compiled code */
572
-  uschar *name_table;           /* The name/number table */
573
-  int  names_found;             /* Number of entries so far */
574
-  int  name_entry_size;         /* Size of each entry */
575
-  int  top_backref;             /* Maximum back reference */
576
-  unsigned int backref_map;     /* Bitmap of low back refs */
577
-  int  req_varyopt;             /* "After variable item" flag for reqbyte */
567
+    const uschar *lcc;            /* Points to lower casing table */
568
+    const uschar *fcc;            /* Points to case-flipping table */
569
+    const uschar *cbits;          /* Points to character type table */
570
+    const uschar *ctypes;         /* Points to table of type maps */
571
+    const uschar *start_code;     /* The start of the compiled code */
572
+    uschar *name_table;           /* The name/number table */
573
+    int  names_found;             /* Number of entries so far */
574
+    int  name_entry_size;         /* Size of each entry */
575
+    int  top_backref;             /* Maximum back reference */
576
+    unsigned int backref_map;     /* Bitmap of low back refs */
577
+    int  req_varyopt;             /* "After variable item" flag for reqbyte */
578 578
 } compile_data;
579 579
 
580 580
 /* Structure for maintaining a chain of pointers to the currently incomplete
581
-branches, for testing for left recursion. */
581
+ branches, for testing for left recursion. */
582 582
 
583 583
 typedef struct branch_chain {
584
-  struct branch_chain *outer;
585
-  uschar *current;
584
+    struct branch_chain *outer;
585
+    uschar *current;
586 586
 } branch_chain;
587 587
 
588 588
 /* Structure for items in a linked list that represents an explicit recursive
589
-call within the pattern. */
589
+ call within the pattern. */
590 590
 
591 591
 typedef struct recursion_info {
592
-  struct recursion_info *prev;  /* Previous recursion record (or NULL) */
593
-  int group_num;                /* Number of group that was called */
594
-  const uschar *after_call;     /* "Return value": points after the call in the expr */
595
-  const uschar *save_start;     /* Old value of md->start_match */
596
-  int *offset_save;             /* Pointer to start of saved offsets */
597
-  int saved_max;                /* Number of saved offsets */
592
+    struct recursion_info *prev;  /* Previous recursion record (or NULL) */
593
+    int group_num;                /* Number of group that was called */
594
+    const uschar *after_call;     /* "Return value": points after the call in the expr */
595
+    const uschar *save_start;     /* Old value of md->start_match */
596
+    int *offset_save;             /* Pointer to start of saved offsets */
597
+    int saved_max;                /* Number of saved offsets */
598 598
 } recursion_info;
599 599
 
600 600
 /* Structure for passing "static" information around between the functions
601
-doing the matching, so that they are thread-safe. */
601
+ doing the matching, so that they are thread-safe. */
602 602
 
603 603
 typedef struct match_data {
604
-  unsigned long int match_call_count; /* As it says */
605
-  unsigned long int match_limit;/* As it says */
606
-  int   *offset_vector;         /* Offset vector */
607
-  int    offset_end;            /* One past the end */
608
-  int    offset_max;            /* The maximum usable for return data */
609
-  const uschar *lcc;            /* Points to lower casing table */
610
-  const uschar *ctypes;         /* Points to table of type maps */
611
-  BOOL   offset_overflow;       /* Set if too many extractions */
612
-  BOOL   notbol;                /* NOTBOL flag */
613
-  BOOL   noteol;                /* NOTEOL flag */
614
-  BOOL   utf8;                  /* UTF8 flag */
615
-  BOOL   endonly;               /* Dollar not before final \n */
616
-  BOOL   notempty;              /* Empty string match not wanted */
617
-  const uschar *start_code;     /* For use when recursing */
618
-  const uschar *start_subject;  /* Start of the subject string */
619
-  const uschar *end_subject;    /* End of the subject string */
620
-  const uschar *start_match;    /* Start of this match attempt */
621
-  const uschar *end_match_ptr;  /* Subject position at end match */
622
-  int    end_offset_top;        /* Highwater mark at end of match */
623
-  int    capture_last;          /* Most recent capture number */
624
-  int    start_offset;          /* The start offset value */
625
-  recursion_info *recursive;    /* Linked list of recursion data */
626
-  void  *callout_data;          /* To pass back to callouts */
604
+    unsigned long int match_call_count; /* As it says */
605
+    unsigned long int match_limit;/* As it says */
606
+    int   *offset_vector;         /* Offset vector */
607
+    int    offset_end;            /* One past the end */
608
+    int    offset_max;            /* The maximum usable for return data */
609
+    const uschar *lcc;            /* Points to lower casing table */
610
+    const uschar *ctypes;         /* Points to table of type maps */
611
+    BOOL   offset_overflow;       /* Set if too many extractions */
612
+    BOOL   notbol;                /* NOTBOL flag */
613
+    BOOL   noteol;                /* NOTEOL flag */
614
+    BOOL   utf8;                  /* UTF8 flag */
615
+    BOOL   endonly;               /* Dollar not before final \n */
616
+    BOOL   notempty;              /* Empty string match not wanted */
617
+    const uschar *start_code;     /* For use when recursing */
618
+    const uschar *start_subject;  /* Start of the subject string */
619
+    const uschar *end_subject;    /* End of the subject string */
620
+    const uschar *start_match;    /* Start of this match attempt */
621
+    const uschar *end_match_ptr;  /* Subject position at end match */
622
+    int    end_offset_top;        /* Highwater mark at end of match */
623
+    int    capture_last;          /* Most recent capture number */
624
+    int    start_offset;          /* The start offset value */
625
+    recursion_info *recursive;    /* Linked list of recursion data */
626
+    void  *callout_data;          /* To pass back to callouts */
627 627
 } match_data;
628 628
 
629 629
 /* Bit definitions for entries in the pcre_ctypes table. */
... ...
@@ -636,7 +636,7 @@ typedef struct match_data {
636 636
 #define ctype_meta    0x80   /* regexp meta char or zero (end pattern) */
637 637
 
638 638
 /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
639
-of bits for a class map. Some classes are built by combining these tables. */
639
+ of bits for a class map. Some classes are built by combining these tables. */
640 640
 
641 641
 #define cbit_space     0      /* [:space:] or \s */
642 642
 #define cbit_xdigit   32      /* [:xdigit:] */
... ...
@@ -651,7 +651,7 @@ of bits for a class map. Some classes are built by combining these tables. */
651 651
 #define cbit_length  320      /* Length of the cbits table */
652 652
 
653 653
 /* Offsets of the various tables from the base tables pointer, and
654
-total length. */
654
+ total length. */
655 655
 
656 656
 #define lcc_offset      0
657 657
 #define fcc_offset    256
... ...
@@ -1,42 +1,42 @@
1 1
 /*************************************************
2
-*      Perl-Compatible Regular Expressions       *
3
-*************************************************/
2
+ *      Perl-Compatible Regular Expressions       *
3
+ *************************************************/
4 4
 
5 5
 /*
6
-PCRE is a library of functions to support regular expressions whose syntax
7
-and semantics are as close as possible to those of the Perl 5 language.
8
-
9
-Written by: Philip Hazel <ph10@cam.ac.uk>
10
-
11
-           Copyright (c) 1997-2003 University of Cambridge
12
-
13
-Permission is granted to anyone to use this software for any purpose on any
14
-computer system, and to redistribute it freely, subject to the following
15
-restrictions:
16
-
17
-1. This software is distributed in the hope that it will be useful,
18
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
19
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
20
-
21
-2. The origin of this software must not be misrepresented, either by
22
-   explicit claim or by omission.
23
-
24
-3. Altered versions must be plainly marked as such, and must not be
25
-   misrepresented as being the original software.
26
-
27
-4. If PCRE is embedded in any software that is released under the GNU
28
-   General Purpose Licence (GPL), then the terms of that licence shall
29
-   supersede any condition above with which it is incompatible.
30
-
31
-See the file Tech.Notes for some information on the internals.
32
-*/
6
+ PCRE is a library of functions to support regular expressions whose syntax
7
+ and semantics are as close as possible to those of the Perl 5 language.
8
+ 
9
+ Written by: Philip Hazel <ph10@cam.ac.uk>
10
+ 
11
+ Copyright (c) 1997-2003 University of Cambridge
12
+ 
13
+ -----------------------------------------------------------------------------
14
+ Permission is granted to anyone to use this software for any purpose on any
15
+ computer system, and to redistribute it freely, subject to the following
16
+ restrictions:
17
+ 
18
+ 1. This software is distributed in the hope that it will be useful,
19
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
21
+ 
22
+ 2. The origin of this software must not be misrepresented, either by
23
+ explicit claim or by omission.
24
+ 
25
+ 3. Altered versions must be plainly marked as such, and must not be
26
+ misrepresented as being the original software.
27
+ 
28
+ 4. If PCRE is embedded in any software that is released under the GNU
29
+ General Purpose Licence (GPL), then the terms of that licence shall
30
+ supersede any condition above with which it is incompatible.
31
+ -----------------------------------------------------------------------------
32
+ 
33
+ See the file Tech.Notes for some information on the internals.
34
+ */
33 35
 
34 36
 
35 37
 /* This file is compiled on its own as part of the PCRE library. However,
36
-it is also included in the compilation of dftables.c, in which case the macro
37
-DFTABLES is defined. */
38
+ it is also included in the compilation of dftables.c, in which case the macro
39
+ DFTABLES is defined. */
38 40
 
39 41
 #ifndef DFTABLES
40 42
 #include "internal.h"
... ...
@@ -45,92 +45,92 @@ DFTABLES is defined. */
45 45
 
46 46
 
47 47
 /*************************************************
48
-*           Create PCRE character tables         *
49
-*************************************************/
48
+ *           Create PCRE character tables         *
49
+ *************************************************/
50 50
 
51 51
 /* This function builds a set of character tables for use by PCRE and returns
52
-a pointer to them. They are build using the ctype functions, and consequently
53
-their contents will depend upon the current locale setting. When compiled as
54
-part of the library, the store is obtained via pcre_malloc(), but when compiled
55
-inside dftables, use malloc().
56
-
57
-Arguments:   none
58
-Returns:     pointer to the contiguous block of data
59
-*/
52
+ a pointer to them. They are built using the ctype functions, and consequently
53
+ their contents will depend upon the current locale setting. When compiled as
54
+ part of the library, the store is obtained via pcre_malloc(), but when compiled
55
+ inside dftables, use malloc().
56
+ 
57
+ Arguments:   none
58
+ Returns:     pointer to the contiguous block of data
59
+ */
60 60
 
61 61
 const unsigned char *
62 62
 pcre_maketables(void)
63 63
 {
64
-unsigned char *yield, *p;
65
-int i;
66
-
64
+    unsigned char *yield, *p;
65
+    int i;
66
+    
67 67
 #ifndef DFTABLES
68
-yield = (unsigned char*)(pcre_malloc)(tables_length);
68
+    yield = (unsigned char*)(pcre_malloc)(tables_length);
69 69
 #else
70
-yield = (unsigned char*)malloc(tables_length);
70
+    yield = (unsigned char*)malloc(tables_length);
71 71
 #endif
72
-
73
-if (yield == NULL) return NULL;
74
-p = yield;
75
-
76
-/* First comes the lower casing table */
77
-
78
-for (i = 0; i < 256; i++) *p++ = tolower(i);
79
-
80
-/* Next the case-flipping table */
81
-
82
-for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
83
-
84
-/* Then the character class tables. Don't try to be clever and save effort
85
-on exclusive ones - in some locales things may be different. Note that the
86
-table for "space" includes everything "isspace" gives, including VT in the
87
-default locale. This makes it work for the POSIX class [:space:]. */
88
-
89
-memset(p, 0, cbit_length);
90
-for (i = 0; i < 256; i++)
91
-  {
92
-  if (isdigit(i))
72
+    
73
+    if (yield == NULL) return NULL;
74
+    p = yield;
75
+    
76
+    /* First comes the lower casing table */
77
+    
78
+    for (i = 0; i < 256; i++) *p++ = tolower(i);
79
+    
80
+    /* Next the case-flipping table */
81
+    
82
+    for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
83
+    
84
+    /* Then the character class tables. Don't try to be clever and save effort
85
+     on exclusive ones - in some locales things may be different. Note that the
86
+     table for "space" includes everything "isspace" gives, including VT in the
87
+     default locale. This makes it work for the POSIX class [:space:]. */
88
+    
89
+    memset(p, 0, cbit_length);
90
+    for (i = 0; i < 256; i++)
93 91
     {
94
-    p[cbit_digit  + i/8] |= 1 << (i&7);
95
-    p[cbit_word   + i/8] |= 1 << (i&7);
92
+        if (isdigit(i))
93
+        {
94
+            p[cbit_digit  + i/8] |= 1 << (i&7);
95
+            p[cbit_word   + i/8] |= 1 << (i&7);
96
+        }
97
+        if (isupper(i))
98
+        {
99
+            p[cbit_upper  + i/8] |= 1 << (i&7);
100
+            p[cbit_word   + i/8] |= 1 << (i&7);
101
+        }
102
+        if (islower(i))
103
+        {
104
+            p[cbit_lower  + i/8] |= 1 << (i&7);
105
+            p[cbit_word   + i/8] |= 1 << (i&7);
106
+        }
107
+        if (i == '_')   p[cbit_word   + i/8] |= 1 << (i&7);
108
+        if (isspace(i)) p[cbit_space  + i/8] |= 1 << (i&7);
109
+        if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7);
110
+        if (isgraph(i)) p[cbit_graph  + i/8] |= 1 << (i&7);
111
+        if (isprint(i)) p[cbit_print  + i/8] |= 1 << (i&7);
112
+        if (ispunct(i)) p[cbit_punct  + i/8] |= 1 << (i&7);
113
+        if (iscntrl(i)) p[cbit_cntrl  + i/8] |= 1 << (i&7);
96 114
     }
97
-  if (isupper(i))
115
+    p += cbit_length;
116
+    
117
+    /* Finally, the character type table. In this, we exclude VT from the white
118
+     space chars, because Perl doesn't recognize it as such for \s and for comments
119
+     within regexes. */
120
+    
121
+    for (i = 0; i < 256; i++)
98 122
     {
99
-    p[cbit_upper  + i/8] |= 1 << (i&7);
100
-    p[cbit_word   + i/8] |= 1 << (i&7);
123
+        int x = 0;
124
+        if (i != 0x0b && isspace(i)) x += ctype_space;
125
+        if (isalpha(i)) x += ctype_letter;
126
+        if (isdigit(i)) x += ctype_digit;
127
+        if (isxdigit(i)) x += ctype_xdigit;
128
+        if (isalnum(i) || i == '_') x += ctype_word;
129
+        if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta;
130
+        *p++ = x;
101 131
     }
102
-  if (islower(i))
103
-    {
104
-    p[cbit_lower  + i/8] |= 1 << (i&7);
105
-    p[cbit_word   + i/8] |= 1 << (i&7);
106
-    }
107
-  if (i == '_')   p[cbit_word   + i/8] |= 1 << (i&7);
108
-  if (isspace(i)) p[cbit_space  + i/8] |= 1 << (i&7);
109
-  if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7);
110
-  if (isgraph(i)) p[cbit_graph  + i/8] |= 1 << (i&7);
111
-  if (isprint(i)) p[cbit_print  + i/8] |= 1 << (i&7);
112
-  if (ispunct(i)) p[cbit_punct  + i/8] |= 1 << (i&7);
113
-  if (iscntrl(i)) p[cbit_cntrl  + i/8] |= 1 << (i&7);
114
-  }
115
-p += cbit_length;
116
-
117
-/* Finally, the character type table. In this, we exclude VT from the white
118
-space chars, because Perl doesn't recognize it as such for \s and for comments
119
-within regexes. */
120
-
121
-for (i = 0; i < 256; i++)
122
-  {
123
-  int x = 0;
124
-  if (i != 0x0b && isspace(i)) x += ctype_space;
125
-  if (isalpha(i)) x += ctype_letter;
126
-  if (isdigit(i)) x += ctype_digit;
127
-  if (isxdigit(i)) x += ctype_xdigit;
128
-  if (isalnum(i) || i == '_') x += ctype_word;
129
-  if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta;
130
-  *p++ = x;
131
-  }
132
-
133
-return yield;
132
+    
133
+    return yield;
134 134
 }
135 135
 
136 136
 /* End of maketables.c */
... ...
@@ -1,44 +1,44 @@
1 1
 /*************************************************
2
-*      Perl-Compatible Regular Expressions       *
3
-*************************************************/
2
+ *      Perl-Compatible Regular Expressions       *
3
+ *************************************************/
4 4
 
5 5
 /*
6
-This is a library of functions to support regular expressions whose syntax
7
-and semantics are as close as possible to those of the Perl 5 language. See
8
-the file Tech.Notes for some information on the internals.
9
-
10
-Written by: Philip Hazel <ph10@cam.ac.uk>
11
-
12
-           Copyright (c) 1997-2003 University of Cambridge
13
-
14
-Permission is granted to anyone to use this software for any purpose on any
15
-computer system, and to redistribute it freely, subject to the following
16
-restrictions:
17
-
18
-1. This software is distributed in the hope that it will be useful,
19
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
20
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
21
-
22
-2. The origin of this software must not be misrepresented, either by
23
-   explicit claim or by omission.
24
-
25
-3. Altered versions must be plainly marked as such, and must not be
26
-   misrepresented as being the original software.
27
-
28
-4. If PCRE is embedded in any software that is released under the GNU
29
-   General Purpose Licence (GPL), then the terms of that licence shall
30
-   supersede any condition above with which it is incompatible.
31
-*/
6
+ This is a library of functions to support regular expressions whose syntax
7
+ and semantics are as close as possible to those of the Perl 5 language. See
8
+ the file Tech.Notes for some information on the internals.
9
+ 
10
+ Written by: Philip Hazel <ph10@cam.ac.uk>
11
+ 
12
+ Copyright (c) 1997-2003 University of Cambridge
13
+ 
14
+ -----------------------------------------------------------------------------
15
+ Permission is granted to anyone to use this software for any purpose on any
16
+ computer system, and to redistribute it freely, subject to the following
17
+ restrictions:
18
+ 
19
+ 1. This software is distributed in the hope that it will be useful,
20
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
+ 
23
+ 2. The origin of this software must not be misrepresented, either by
24
+ explicit claim or by omission.
25
+ 
26
+ 3. Altered versions must be plainly marked as such, and must not be
27
+ misrepresented as being the original software.
28
+ 
29
+ 4. If PCRE is embedded in any software that is released under the GNU
30
+ General Purpose Licence (GPL), then the terms of that licence shall
31
+ supersede any condition above with which it is incompatible.
32
+ -----------------------------------------------------------------------------
33
+ */
32 34
 
33 35
 /* Define DEBUG to get debugging output on stdout. */
34 36
 
35 37
 /* #define DEBUG */
36 38
 
37 39
 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
38
-inline, and there are *still* stupid compilers about that don't like indented
39
-pre-processor statements. I suppose it's only been 10 years... */
40
+ inline, and there are *still* stupid compilers about that don't like indented
41
+ pre-processor statements. I suppose it's only been 10 years... */
40 42
 
41 43
 #ifdef DEBUG
42 44
 #define DPRINTF(p) printf p
... ...
@@ -47,7 +47,7 @@ pre-processor statements. I suppose it's only been 10 years... */
47 47
 #endif
48 48
 
49 49
 /* Include the internals header, which itself includes Standard C headers plus
50
-the external pcre header. */
50
+ the external pcre header. */
51 51
 
52 52
 #include "internal.h"
53 53
 
... ...
@@ -60,37 +60,37 @@ the external pcre header. */
60 60
 
61 61
 
62 62
 /* Maximum number of items on the nested bracket stacks at compile time. This
63
-applies to the nesting of all kinds of parentheses. It does not limit
64
-un-nested, non-capturing parentheses. This number can be made bigger if
65
-necessary - it is used to dimension one int and one unsigned char vector at
66
-compile time. */
63
+ applies to the nesting of all kinds of parentheses. It does not limit
64
+ un-nested, non-capturing parentheses. This number can be made bigger if
65
+ necessary - it is used to dimension one int and one unsigned char vector at
66
+ compile time. */
67 67
 
68 68
 #define BRASTACK_SIZE 200
69 69
 
70 70
 
71 71
 /* Maximum number of ints of offset to save on the stack for recursive calls.
72
-If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73
-because the offset vector is always a multiple of 3 long. */
72
+ If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73
+ because the offset vector is always a multiple of 3 long. */
74 74
 
75 75
 #define REC_STACK_SAVE_MAX 30
76 76
 
77 77
 
78 78
 /* The number of bytes in a literal character string above which we can't add
79
-any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80
-could be 255 when UTF-8 support is excluded, but that means that some of the
81
-test output would be different, which just complicates things.) */
79
+ any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80
+ could be 255 when UTF-8 support is excluded, but that means that some of the
81
+ test output would be different, which just complicates things.) */
82 82
 
83 83
 #define MAXLIT 250
84 84
 
85 85
 
86 86
 /* The maximum remaining length of subject we are prepared to search for a
87
-req_byte match. */
87
+ req_byte match. */
88 88
 
89 89
 #define REQ_BYTE_MAX 1000
90 90
 
91 91
 
92 92
 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93
-the definition is next to the definition of the opcodes in internal.h. */
93
+ the definition is next to the definition of the opcodes in internal.h. */
94 94
 
95 95
 static uschar OP_lengths[] = { OP_LENGTHS };
96 96
 
... ...
@@ -100,121 +100,121 @@ static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100 100
 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101 101
 
102 102
 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103
-are simple data values; negative values are for special things like \d and so
104
-on. Zero means further processing is needed (for things like \x), or the escape
105
-is invalid. */
103
+ are simple data values; negative values are for special things like \d and so
104
+ on. Zero means further processing is needed (for things like \x), or the escape
105
+ is invalid. */
106 106
 
107 107
 static const short int escapes[] = {
108 108
     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
109 109
     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
110
-  '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
110
+    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
111 111
     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
112 112
     0, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
113 113
     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
114
-  '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
114
+    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
115 115
     0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
116 116
     0,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
117 117
     0,      0, -ESC_z                                            /* x - z */
118 118
 };
119 119
 
120 120
 /* Tables of names of POSIX character classes and their lengths. The list is
121
-terminated by a zero length entry. The first three must be alpha, upper, lower,
122
-as this is assumed for handling case independence. */
121
+ terminated by a zero length entry. The first three must be alpha, upper, lower,
122
+ as this is assumed for handling case independence. */
123 123
 
124 124
 static const char *posix_names[] = {
125
-  "alpha", "lower", "upper",
126
-  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
127
-  "print", "punct", "space", "word",  "xdigit" };
125
+    "alpha", "lower", "upper",
126
+    "alnum", "ascii", "blank", "cntrl", "digit", "graph",
127
+    "print", "punct", "space", "word",  "xdigit" };
128 128
 
129 129
 static const uschar posix_name_lengths[] = {
130
-  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
130
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
131 131
 
132 132
 /* Table of class bit maps for each POSIX class; up to three may be combined
133
-to form the class. The table for [:blank:] is dynamically modified to remove
134
-the vertical space characters. */
133
+ to form the class. The table for [:blank:] is dynamically modified to remove
134
+ the vertical space characters. */
135 135
 
136 136
 static const int posix_class_maps[] = {
137
-  cbit_lower, cbit_upper, -1,             /* alpha */
138
-  cbit_lower, -1,         -1,             /* lower */
139
-  cbit_upper, -1,         -1,             /* upper */
140
-  cbit_digit, cbit_lower, cbit_upper,     /* alnum */
141
-  cbit_print, cbit_cntrl, -1,             /* ascii */
142
-  cbit_space, -1,         -1,             /* blank - a GNU extension */
143
-  cbit_cntrl, -1,         -1,             /* cntrl */
144
-  cbit_digit, -1,         -1,             /* digit */
145
-  cbit_graph, -1,         -1,             /* graph */
146
-  cbit_print, -1,         -1,             /* print */
147
-  cbit_punct, -1,         -1,             /* punct */
148
-  cbit_space, -1,         -1,             /* space */
149
-  cbit_word,  -1,         -1,             /* word - a Perl extension */
150
-  cbit_xdigit,-1,         -1              /* xdigit */
137
+    cbit_lower, cbit_upper, -1,             /* alpha */
138
+    cbit_lower, -1,         -1,             /* lower */
139
+    cbit_upper, -1,         -1,             /* upper */
140
+    cbit_digit, cbit_lower, cbit_upper,     /* alnum */
141
+    cbit_print, cbit_cntrl, -1,             /* ascii */
142
+    cbit_space, -1,         -1,             /* blank - a GNU extension */
143
+    cbit_cntrl, -1,         -1,             /* cntrl */
144
+    cbit_digit, -1,         -1,             /* digit */
145
+    cbit_graph, -1,         -1,             /* graph */
146
+    cbit_print, -1,         -1,             /* print */
147
+    cbit_punct, -1,         -1,             /* punct */
148
+    cbit_space, -1,         -1,             /* space */
149
+    cbit_word,  -1,         -1,             /* word - a Perl extension */
150
+    cbit_xdigit,-1,         -1              /* xdigit */
151 151
 };
152 152
 
153 153
 /* Table to identify ASCII digits and hex digits. This is used when compiling
154
-patterns. Note that the tables in chartables are dependent on the locale, and
155
-may mark arbitrary characters as digits - but the PCRE compiling code expects
156
-to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
157
-a private table here. It costs 256 bytes, but it is a lot faster than doing
158
-character value tests (at least in some simple cases I timed), and in some
159
-applications one wants PCRE to compile efficiently as well as match
160
-efficiently.
161
-
162
-For convenience, we use the same bit definitions as in chartables:
163
-
164
-  0x04   decimal digit
165
-  0x08   hexadecimal digit
166
-
167
-Then we can use ctype_digit and ctype_xdigit in the code. */
154
+ patterns. Note that the tables in chartables are dependent on the locale, and
155
+ may mark arbitrary characters as digits - but the PCRE compiling code expects
156
+ to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
157
+ a private table here. It costs 256 bytes, but it is a lot faster than doing
158
+ character value tests (at least in some simple cases I timed), and in some
159
+ applications one wants PCRE to compile efficiently as well as match
160
+ efficiently.
161
+ 
162
+ For convenience, we use the same bit definitions as in chartables:
163
+ 
164
+ 0x04   decimal digit
165
+ 0x08   hexadecimal digit
166
+ 
167
+ Then we can use ctype_digit and ctype_xdigit in the code. */
168 168
 
169 169
 static const unsigned char digitab[] =
170
-  {
171
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
172
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
173
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
174
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
175
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
176
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
177
-  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
178
-  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
179
-  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
180
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
181
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
182
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
183
-  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
184
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
185
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
186
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
187
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
188
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
189
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
190
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
191
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
192
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
193
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
194
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
195
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
196
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
197
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
198
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
199
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
200
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
201
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
202
-  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
170
+{
171
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
172
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
173
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
174
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
175
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
176
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
177
+    0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
178
+    0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
179
+    0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
180
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
181
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
182
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
183
+    0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
184
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
185
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
186
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
187
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
188
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
189
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
190
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
191
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
192
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
193
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
194
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
195
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
196
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
197
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
198
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
199
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
200
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
201
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
202
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
203 203
 
204 204
 /* Definition to allow mutual recursion */
205 205
 
206 206
 static BOOL
207
-  compile_regex(int, int, int *, uschar **, const uschar **, const char **,
208
-    BOOL, int, int *, int *, branch_chain *, compile_data *);
207
+compile_regex(int, int, int *, uschar **, const uschar **, const char **,
208
+              BOOL, int, int *, int *, branch_chain *, compile_data *);
209 209
 
210 210
 /* Structure for building a chain of data that actually lives on the
211
-stack, for holding the values of the subject pointer at the start of each
212
-subpattern, so as to detect when an empty string has been matched by a
213
-subpattern - to break infinite loops. */
211
+ stack, for holding the values of the subject pointer at the start of each
212
+ subpattern, so as to detect when an empty string has been matched by a
213
+ subpattern - to break infinite loops. */
214 214
 
215 215
 typedef struct eptrblock {
216
-  struct eptrblock *prev;
217
-  const uschar *saved_eptr;
216
+    struct eptrblock *prev;
217
+    const uschar *saved_eptr;
218 218
 } eptrblock;
219 219
 
220 220
 /* Flag bits for the match() function */
... ...
@@ -223,7 +223,7 @@ typedef struct eptrblock {
223 223
 #define match_isgroup      0x02    /* Set if start of bracketed group */
224 224
 
225 225
 /* Non-error returns from the match() function. Error returns are externally
226
-defined PCRE_ERROR_xxx codes, which are all negative. */
226
+ defined PCRE_ERROR_xxx codes, which are all negative. */
227 227
 
228 228
 #define MATCH_MATCH        1
229 229
 #define MATCH_NOMATCH      0
... ...
@@ -231,14 +231,14 @@ defined PCRE_ERROR_xxx codes, which are all negative. */
231 231
 
232 232
 
233 233
 /*************************************************
234
-*               Global variables                 *
235
-*************************************************/
234
+ *               Global variables                 *
235
+ *************************************************/
236 236
 
237 237
 /* PCRE is thread-clean and doesn't use any global variables in the normal
238
-sense. However, it calls memory allocation and free functions via the two
239
-indirections below, and it can optionally do callouts. These values can be
240
-changed by the caller, but are shared between all threads. However, when
241
-compiling for Virtual Pascal, things are done differently (see pcre.in). */
238
+ sense. However, it calls memory allocation and free functions via the two
239
+ indirections below, and it can optionally do callouts. These values can be
240
+ changed by the caller, but are shared between all threads. However, when
241
+ compiling for Virtual Pascal, things are done differently (see pcre.in). */
242 242
 
243 243
 #ifndef VPCOMPAT
244 244
 void *(*pcre_malloc)(size_t) = malloc;
... ...
@@ -248,12 +248,12 @@ int   (*pcre_callout)(pcre_callout_block *) = NULL;
248 248
 
249 249
 
250 250
 /*************************************************
251
-*    Macros and tables for character handling    *
252
-*************************************************/
251
+ *    Macros and tables for character handling    *
252
+ *************************************************/
253 253
 
254 254
 /* When UTF-8 encoding is being used, a character is no longer just a single
255
-byte. The macros for character handling generate simple sequences when used in
256
-byte-mode, and more complicated ones for UTF-8 characters. */
255
+ byte. The macros for character handling generate simple sequences when used in
256
+ byte-mode, and more complicated ones for UTF-8 characters. */
257 257
 
258 258
 #ifndef SUPPORT_UTF8
259 259
 #define GETCHAR(c, eptr) c = *eptr;
... ...
@@ -265,77 +265,77 @@ byte-mode, and more complicated ones for UTF-8 characters. */
265 265
 #else   /* SUPPORT_UTF8 */
266 266
 
267 267
 /* Get the next UTF-8 character, not advancing the pointer. This is called when
268
-we know we are in UTF-8 mode. */
268
+ we know we are in UTF-8 mode. */
269 269
 
270 270
 #define GETCHAR(c, eptr) \
271
-  c = *eptr; \
272
-  if ((c & 0xc0) == 0xc0) \
273
-    { \
274
-    int gcii; \
275
-    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
276
-    int gcss = 6*gcaa; \
277
-    c = (c & utf8_table3[gcaa]) << gcss; \
278
-    for (gcii = 1; gcii <= gcaa; gcii++) \
279
-      { \
280
-      gcss -= 6; \
281
-      c |= (eptr[gcii] & 0x3f) << gcss; \
282
-      } \
283
-    }
271
+c = *eptr; \
272
+if ((c & 0xc0) == 0xc0) \
273
+{ \
274
+int gcii; \
275
+int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
276
+int gcss = 6*gcaa; \
277
+c = (c & utf8_table3[gcaa]) << gcss; \
278
+for (gcii = 1; gcii <= gcaa; gcii++) \
279
+{ \
280
+gcss -= 6; \
281
+c |= (eptr[gcii] & 0x3f) << gcss; \
282
+} \
283
+}
284 284
 
285 285
 /* Get the next UTF-8 character, advancing the pointer. This is called when we
286
-know we are in UTF-8 mode. */
286
+ know we are in UTF-8 mode. */
287 287
 
288 288
 #define GETCHARINC(c, eptr) \
289
-  c = *eptr++; \
290
-  if ((c & 0xc0) == 0xc0) \
291
-    { \
292
-    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
293
-    int gcss = 6*gcaa; \
294
-    c = (c & utf8_table3[gcaa]) << gcss; \
295
-    while (gcaa-- > 0) \
296
-      { \
297
-      gcss -= 6; \
298
-      c |= (*eptr++ & 0x3f) << gcss; \
299
-      } \
300
-    }
289
+c = *eptr++; \
290
+if ((c & 0xc0) == 0xc0) \
291
+{ \
292
+int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
293
+int gcss = 6*gcaa; \
294
+c = (c & utf8_table3[gcaa]) << gcss; \
295
+while (gcaa-- > 0) \
296
+{ \
297
+gcss -= 6; \
298
+c |= (*eptr++ & 0x3f) << gcss; \
299
+} \
300
+}
301 301
 
302 302
 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
303 303
 
304 304
 #define GETCHARINCTEST(c, eptr) \
305
-  c = *eptr++; \
306
-  if (md->utf8 && (c & 0xc0) == 0xc0) \
307
-    { \
308
-    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
309
-    int gcss = 6*gcaa; \
310
-    c = (c & utf8_table3[gcaa]) << gcss; \
311
-    while (gcaa-- > 0) \
312
-      { \
313
-      gcss -= 6; \
314
-      c |= (*eptr++ & 0x3f) << gcss; \
315
-      } \
316
-    }
305
+c = *eptr++; \
306
+if (md->utf8 && (c & 0xc0) == 0xc0) \
307
+{ \
308
+int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
309
+int gcss = 6*gcaa; \
310
+c = (c & utf8_table3[gcaa]) << gcss; \
311
+while (gcaa-- > 0) \
312
+{ \
313
+gcss -= 6; \
314
+c |= (*eptr++ & 0x3f) << gcss; \
315
+} \
316
+}
317 317
 
318 318
 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
319
-if there are extra bytes. This is called when we know we are in UTF-8 mode. */
319
+ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
320 320
 
321 321
 #define GETCHARLEN(c, eptr, len) \
322
-  c = *eptr; \
323
-  if ((c & 0xc0) == 0xc0) \
324
-    { \
325
-    int gcii; \
326
-    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
327
-    int gcss = 6*gcaa; \
328
-    c = (c & utf8_table3[gcaa]) << gcss; \
329
-    for (gcii = 1; gcii <= gcaa; gcii++) \
330
-      { \
331
-      gcss -= 6; \
332
-      c |= (eptr[gcii] & 0x3f) << gcss; \
333
-      } \
334
-    len += gcaa; \
335
-    }
322
+c = *eptr; \
323
+if ((c & 0xc0) == 0xc0) \
324
+{ \
325
+int gcii; \
326
+int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
327
+int gcss = 6*gcaa; \
328
+c = (c & utf8_table3[gcaa]) << gcss; \
329
+for (gcii = 1; gcii <= gcaa; gcii++) \
330
+{ \
331
+gcss -= 6; \
332
+c |= (eptr[gcii] & 0x3f) << gcss; \
333
+} \
334
+len += gcaa; \
335
+}
336 336
 
337 337
 /* If the pointer is not at the start of a character, move it back until
338
-it is. Called only in UTF-8 mode. */
338
+ it is. Called only in UTF-8 mode. */
339 339
 
340 340
 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
341 341
 
... ...
@@ -344,14 +344,14 @@ it is. Called only in UTF-8 mode. */
344 344
 
345 345
 
346 346
 /*************************************************
347
-*             Default character tables           *
348
-*************************************************/
347
+ *             Default character tables           *
348
+ *************************************************/
349 349
 
350 350
 /* A default set of character tables is included in the PCRE binary. Its source
351
-is built by the maketables auxiliary program, which uses the default C ctypes
352
-functions, and put in the file chartables.c. These tables are used by PCRE
353
-whenever the caller of pcre_compile() does not provide an alternate set of
354
-tables. */
351
+ is built by the maketables auxiliary program, which uses the default C ctypes
352
+ functions, and put in the file chartables.c. These tables are used by PCRE
353
+ whenever the caller of pcre_compile() does not provide an alternate set of
354
+ tables. */
355 355
 
356 356
 #include "chartables.c"
357 357
 
... ...
@@ -359,71 +359,71 @@ tables. */
359 359
 
360 360
 #ifdef SUPPORT_UTF8
361 361
 /*************************************************
362
-*           Tables for UTF-8 support             *
363
-*************************************************/
362
+ *           Tables for UTF-8 support             *
363
+ *************************************************/
364 364
 
365 365
 /* These are the breakpoints for different numbers of bytes in a UTF-8
366
-character. */
366
+ character. */
367 367
 
368 368
 static const int utf8_table1[] =
369
-  { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
369
+{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
370 370
 
371 371
 /* These are the indicator bits and the mask for the data bits to set in the
372
-first byte of a character, indexed by the number of additional bytes. */
372
+ first byte of a character, indexed by the number of additional bytes. */
373 373
 
374 374
 static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
375 375
 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
376 376
 
377 377
 /* Table of the number of extra characters, indexed by the first character
378
-masked with 0x3f. The highest number for a valid UTF-8 character is in fact
379
-0x3d. */
378
+ masked with 0x3f. The highest number for a valid UTF-8 character is in fact
379
+ 0x3d. */
380 380
 
381 381
 static const uschar utf8_table4[] = {
382
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
383
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
384
-  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
385
-  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
382
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
383
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
384
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
385
+    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
386 386
 
387 387
 
388 388
 /*************************************************
389
-*       Convert character value to UTF-8         *
390
-*************************************************/
389
+ *       Convert character value to UTF-8         *
390
+ *************************************************/
391 391
 
392 392
 /* This function takes an integer value in the range 0 - 0x7fffffff
393
-and encodes it as a UTF-8 character in 0 to 6 bytes.
394
-
395
-Arguments:
396
-  cvalue     the character value
397
-  buffer     pointer to buffer for result - at least 6 bytes long
398
-
399
-Returns:     number of characters placed in the buffer
400
-*/
393
+ and encodes it as a UTF-8 character in 0 to 6 bytes.
394
+ 
395
+ Arguments:
396
+ cvalue     the character value
397
+ buffer     pointer to buffer for result - at least 6 bytes long
398
+ 
399
+ Returns:     number of characters placed in the buffer
400
+ */
401 401
 
402 402
 static int
403 403
 ord2utf8(int cvalue, uschar *buffer)
404 404
 {
405
-register int i, j;
406
-for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
407
-  if (cvalue <= utf8_table1[i]) break;
408
-buffer += i;
409
-for (j = i; j > 0; j--)
410
- {
411
- *buffer-- = 0x80 | (cvalue & 0x3f);
412
- cvalue >>= 6;
413
- }
414
-*buffer = utf8_table2[i] | cvalue;
415
-return i + 1;
405
+    register int i, j;
406
+    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
407
+        if (cvalue <= utf8_table1[i]) break;
408
+    buffer += i;
409
+    for (j = i; j > 0; j--)
410
+    {
411
+        *buffer-- = 0x80 | (cvalue & 0x3f);
412
+        cvalue >>= 6;
413
+    }
414
+    *buffer = utf8_table2[i] | cvalue;
415
+    return i + 1;
416 416
 }
417 417
 #endif
418 418
 
419 419
 
420 420
 
421 421
 /*************************************************
422
-*         Print compiled regex                   *
423
-*************************************************/
422
+ *         Print compiled regex                   *
423
+ *************************************************/
424 424
 
425 425
 /* The code for doing this is held in a separate file that is also included in
426
-pcretest.c. It defines a function called print_internals(). */
426
+ pcretest.c. It defines a function called print_internals(). */
427 427
 
428 428
 #ifdef DEBUG
429 429
 #include "printint.c"
... ...
@@ -432,8 +432,8 @@ pcretest.c. It defines a function called print_internals(). */
432 432
 
433 433
 
434 434
 /*************************************************
435
-*          Return version string                 *
436
-*************************************************/
435
+ *          Return version string                 *
436
+ *************************************************/
437 437
 
438 438
 #define STRING(a)  # a
439 439
 #define XSTRING(s) STRING(s)
... ...
@@ -441,212 +441,212 @@ pcretest.c. It defines a function called print_internals(). */
441 441
 const char *
442 442
 pcre_version(void)
443 443
 {
444
-return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
444
+    return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
445 445
 }
446 446
 
447 447
 
448 448
 
449 449
 
450 450
 /*************************************************
451
-* (Obsolete) Return info about compiled pattern  *
452
-*************************************************/
451
+ * (Obsolete) Return info about compiled pattern  *
452
+ *************************************************/
453 453
 
454 454
 /* This is the original "info" function. It picks potentially useful data out
455
-of the private structure, but its interface was too rigid. It remains for
456
-backwards compatibility. The public options are passed back in an int - though
457
-the re->options field has been expanded to a long int, all the public options
458
-at the low end of it, and so even on 16-bit systems this will still be OK.
459
-Therefore, I haven't changed the API for pcre_info().
460
-
461
-Arguments:
462
-  external_re   points to compiled code
463
-  optptr        where to pass back the options
464
-  first_byte    where to pass back the first character,
465
-                or -1 if multiline and all branches start ^,
466
-                or -2 otherwise
467
-
468
-Returns:        number of capturing subpatterns
469
-                or negative values on error
470
-*/
455
+ of the private structure, but its interface was too rigid. It remains for
456
+ backwards compatibility. The public options are passed back in an int - though
457
+ the re->options field has been expanded to a long int, all the public options
458
+ at the low end of it, and so even on 16-bit systems this will still be OK.
459
+ Therefore, I haven't changed the API for pcre_info().
460
+ 
461
+ Arguments:
462
+ external_re   points to compiled code
463
+ optptr        where to pass back the options
464
+ first_byte    where to pass back the first character,
465
+ or -1 if multiline and all branches start ^,
466
+ or -2 otherwise
467
+ 
468
+ Returns:        number of capturing subpatterns
469
+ or negative values on error
470
+ */
471 471
 
472 472
 int
473 473
 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
474 474
 {
475
-const real_pcre *re = (const real_pcre *)external_re;
476
-if (re == NULL) return PCRE_ERROR_NULL;
477
-if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
478
-if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
479
-if (first_byte != NULL)
480
-  *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
481
-     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
482
-return re->top_bracket;
475
+    const real_pcre *re = (const real_pcre *)external_re;
476
+    if (re == NULL) return PCRE_ERROR_NULL;
477
+    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
478
+    if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
479
+    if (first_byte != NULL)
480
+        *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
481
+        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
482
+    return re->top_bracket;
483 483
 }
484 484
 
485 485
 
486 486
 
487 487
 /*************************************************
488
-*        Return info about compiled pattern      *
489
-*************************************************/
488
+ *        Return info about compiled pattern      *
489
+ *************************************************/
490 490
 
491 491
 /* This is a newer "info" function which has an extensible interface so
492
-that additional items can be added compatibly.
493
-
494
-Arguments:
495
-  external_re      points to compiled code
496
-  extra_data       points extra data, or NULL
497
-  what             what information is required
498
-  where            where to put the information
499
-
500
-Returns:           0 if data returned, negative on error
501
-*/
492
+ that additional items can be added compatibly.
493
+ 
494
+ Arguments:
495
+ external_re      points to compiled code
496
+ extra_data       points extra data, or NULL
497
+ what             what information is required
498
+ where            where to put the information
499
+ 
500
+ Returns:           0 if data returned, negative on error
501
+ */
502 502
 
503 503
 int
504 504
 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
505
-  void *where)
505
+              void *where)
506 506
 {
507
-const real_pcre *re = (const real_pcre *)external_re;
508
-const pcre_study_data *study = NULL;
509
-
510
-if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
511
-if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
512
-
513
-if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
514
-  study = extra_data->study_data;
515
-
516
-switch (what)
517
-  {
518
-  case PCRE_INFO_OPTIONS:
519
-  *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
520
-  break;
521
-
522
-  case PCRE_INFO_SIZE:
523
-  *((size_t *)where) = re->size;
524
-  break;
525
-
526
-  case PCRE_INFO_STUDYSIZE:
527
-  *((size_t *)where) = (study == NULL)? 0 : study->size;
528
-  break;
529
-
530
-  case PCRE_INFO_CAPTURECOUNT:
531
-  *((int *)where) = re->top_bracket;
532
-  break;
533
-
534
-  case PCRE_INFO_BACKREFMAX:
535
-  *((int *)where) = re->top_backref;
536
-  break;
537
-
538
-  case PCRE_INFO_FIRSTBYTE:
539
-  *((int *)where) =
540
-    ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
541
-    ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
542
-  break;
543
-
544
-  case PCRE_INFO_FIRSTTABLE:
545
-  *((const uschar **)where) =
546
-    (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
547
-      study->start_bits : NULL;
548
-  break;
549
-
550
-  case PCRE_INFO_LASTLITERAL:
551
-  *((int *)where) =
552
-    ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
553
-  break;
554
-
555
-  case PCRE_INFO_NAMEENTRYSIZE:
556
-  *((int *)where) = re->name_entry_size;
557
-  break;
558
-
559
-  case PCRE_INFO_NAMECOUNT:
560
-  *((int *)where) = re->name_count;
561
-  break;
562
-
563
-  case PCRE_INFO_NAMETABLE:
564
-  *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
565
-  break;
566
-
567
-  default: return PCRE_ERROR_BADOPTION;
568
-  }
569
-
570
-return 0;
507
+    const real_pcre *re = (const real_pcre *)external_re;
508
+    const pcre_study_data *study = NULL;
509
+    
510
+    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
511
+    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
512
+    
513
+    if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
514
+        study = extra_data->study_data;
515
+    
516
+    switch (what)
517
+    {
518
+        case PCRE_INFO_OPTIONS:
519
+            *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
520
+            break;
521
+            
522
+        case PCRE_INFO_SIZE:
523
+            *((size_t *)where) = re->size;
524
+            break;
525
+            
526
+        case PCRE_INFO_STUDYSIZE:
527
+            *((size_t *)where) = (study == NULL)? 0 : study->size;
528
+            break;
529
+            
530
+        case PCRE_INFO_CAPTURECOUNT:
531
+            *((int *)where) = re->top_bracket;
532
+            break;
533
+            
534
+        case PCRE_INFO_BACKREFMAX:
535
+            *((int *)where) = re->top_backref;
536
+            break;
537
+            
538
+        case PCRE_INFO_FIRSTBYTE:
539
+            *((int *)where) =
540
+            ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
541
+            ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
542
+            break;
543
+            
544
+        case PCRE_INFO_FIRSTTABLE:
545
+            *((const uschar **)where) =
546
+            (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
547
+            study->start_bits : NULL;
548
+            break;
549
+            
550
+        case PCRE_INFO_LASTLITERAL:
551
+            *((int *)where) =
552
+            ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
553
+            break;
554
+            
555
+        case PCRE_INFO_NAMEENTRYSIZE:
556
+            *((int *)where) = re->name_entry_size;
557
+            break;
558
+            
559
+        case PCRE_INFO_NAMECOUNT:
560
+            *((int *)where) = re->name_count;
561
+            break;
562
+            
563
+        case PCRE_INFO_NAMETABLE:
564
+            *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
565
+            break;
566
+            
567
+        default: return PCRE_ERROR_BADOPTION;
568
+    }
569
+    
570
+    return 0;
571 571
 }
572 572
 
573 573
 
574 574
 
575 575
 /*************************************************
576
-* Return info about what features are configured *
577
-*************************************************/
576
+ * Return info about what features are configured *
577
+ *************************************************/
578 578
 
579 579
 /* This is a function which has an extensible interface so that additional items
580
-can be added compatibly.
581
-
582
-Arguments:
583
-  what             what information is required
584
-  where            where to put the information
585
-
586
-Returns:           0 if data returned, negative on error
587
-*/
580
+ can be added compatibly.
581
+ 
582
+ Arguments:
583
+ what             what information is required
584
+ where            where to put the information
585
+ 
586
+ Returns:           0 if data returned, negative on error
587
+ */
588 588
 
589 589
 int
590 590
 pcre_config(int what, void *where)
591 591
 {
592
-switch (what)
593
-  {
594
-  case PCRE_CONFIG_UTF8:
595
-  #ifdef SUPPORT_UTF8
596
-  *((int *)where) = 1;
597
-  #else
598
-  *((int *)where) = 0;
599
-  #endif
600
-  break;
601
-
602
-  case PCRE_CONFIG_NEWLINE:
603
-  *((int *)where) = NEWLINE;
604
-  break;
605
-
606
-  case PCRE_CONFIG_LINK_SIZE:
607
-  *((int *)where) = LINK_SIZE;
608
-  break;
609
-
610
-  case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
611
-  *((int *)where) = POSIX_MALLOC_THRESHOLD;
612
-  break;
613
-
614
-  case PCRE_CONFIG_MATCH_LIMIT:
615
-  *((unsigned int *)where) = MATCH_LIMIT;
616
-  break;
617
-
618
-  default: return PCRE_ERROR_BADOPTION;
619
-  }
620
-
621
-return 0;
592
+    switch (what)
593
+    {
594
+        case PCRE_CONFIG_UTF8:
595
+#ifdef SUPPORT_UTF8
596
+            *((int *)where) = 1;
597
+#else
598
+            *((int *)where) = 0;
599
+#endif
600
+            break;
601
+            
602
+        case PCRE_CONFIG_NEWLINE:
603
+            *((int *)where) = NEWLINE;
604
+            break;
605
+            
606
+        case PCRE_CONFIG_LINK_SIZE:
607
+            *((int *)where) = LINK_SIZE;
608
+            break;
609
+            
610
+        case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
611
+            *((int *)where) = POSIX_MALLOC_THRESHOLD;
612
+            break;
613
+            
614
+        case PCRE_CONFIG_MATCH_LIMIT:
615
+            *((unsigned int *)where) = MATCH_LIMIT;
616
+            break;
617
+            
618
+        default: return PCRE_ERROR_BADOPTION;
619
+    }
620
+    
621
+    return 0;
622 622
 }
623 623
 
624 624
 
625 625
 
626 626
 #ifdef DEBUG
627 627
 /*************************************************
628
-*        Debugging function to print chars       *
629
-*************************************************/
628
+ *        Debugging function to print chars       *
629
+ *************************************************/
630 630
 
631 631
 /* Print a sequence of chars in printable format, stopping at the end of the
632
-subject if the requested.
633
-
634
-Arguments:
635
-  p           points to characters
636
-  length      number to print
637
-  is_subject  TRUE if printing from within md->start_subject
638
-  md          pointer to matching data block, if is_subject is TRUE
639
-
640
-Returns:     nothing
641
-*/
632
+ subject if requested.
633
+ 
634
+ Arguments:
635
+ p           points to characters
636
+ length      number to print
637
+ is_subject  TRUE if printing from within md->start_subject
638
+ md          pointer to matching data block, if is_subject is TRUE
639
+ 
640
+ Returns:     nothing
641
+ */
642 642
 
643 643
 static void
644 644
 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
645 645
 {
646
-int c;
647
-if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
648
-while (length-- > 0)
649
-  if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
646
+    int c;
647
+    if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
648
+    while (length-- > 0)
649
+        if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
650 650
 }
651 651
 #endif
652 652
 
... ...
@@ -654,6943 +654,6943 @@ while (length-- > 0)
654 654
 
655 655
 
656 656
 /*************************************************
657
-*            Handle escapes                      *
658
-*************************************************/
657
+ *            Handle escapes                      *
658
+ *************************************************/
659 659
 
660 660
 /* This function is called when a \ has been encountered. It either returns a
661
-positive value for a simple escape such as \n, or a negative value which
662
-encodes one of the more complicated things such as \d. When UTF-8 is enabled,
663
-a positive value greater than 255 may be returned. On entry, ptr is pointing at
664
-the \. On exit, it is on the final character of the escape sequence.
665
-
666
-Arguments:
667
-  ptrptr     points to the pattern position pointer
668
-  errorptr   points to the pointer to the error message
669
-  bracount   number of previous extracting brackets
670
-  options    the options bits
671
-  isclass    TRUE if inside a character class
672
-  cd         pointer to char tables block
673
-
674
-Returns:     zero or positive => a data character
675
-             negative => a special escape sequence
676
-             on error, errorptr is set
677
-*/
661
+ positive value for a simple escape such as \n, or a negative value which
662
+ encodes one of the more complicated things such as \d. When UTF-8 is enabled,
663
+ a positive value greater than 255 may be returned. On entry, ptr is pointing at
664
+ the \. On exit, it is on the final character of the escape sequence.
665
+ 
666
+ Arguments:
667
+ ptrptr     points to the pattern position pointer
668
+ errorptr   points to the pointer to the error message
669
+ bracount   number of previous extracting brackets
670
+ options    the options bits
671
+ isclass    TRUE if inside a character class
672
+ cd         pointer to char tables block
673
+ 
674
+ Returns:     zero or positive => a data character
675
+ negative => a special escape sequence
676
+ on error, errorptr is set
677
+ */
678 678
 
679 679
 static int
680 680
 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
681
-  int options, BOOL isclass, compile_data *cd)
681
+             int options, BOOL isclass, compile_data *cd)
682 682
 {
683
-const uschar *ptr = *ptrptr;
684
-int c, i;
685
-
686
-/* If backslash is at the end of the pattern, it's an error. */
687
-
688
-c = *(++ptr);
689
-if (c == 0) *errorptr = ERR1;
690
-
691
-/* Digits or letters may have special meaning; all others are literals. */
692
-
693
-else if (c < '0' || c > 'z') {}
694
-
695
-/* Do an initial lookup in a table. A non-zero result is something that can be
696
-returned immediately. Otherwise further processing may be required. */
697
-
698
-else if ((i = escapes[c - '0']) != 0) c = i;
699
-
700
-/* Escapes that need further processing, or are illegal. */
701
-
702
-else
703
-  {
704
-  const uschar *oldptr;
705
-  switch (c)
683
+    const uschar *ptr = *ptrptr;
684
+    int c, i;
685
+    
686
+    /* If backslash is at the end of the pattern, it's an error. */
687
+    
688
+    c = *(++ptr);
689
+    if (c == 0) *errorptr = ERR1;
690
+    
691
+    /* Digits or letters may have special meaning; all others are literals. */
692
+    
693
+    else if (c < '0' || c > 'z') {}
694
+    
695
+    /* Do an initial lookup in a table. A non-zero result is something that can be
696
+     returned immediately. Otherwise further processing may be required. */
697
+    
698
+    else if ((i = escapes[c - '0']) != 0) c = i;
699
+    
700
+    /* Escapes that need further processing, or are illegal. */
701
+    
702
+    else
706 703
     {
707
-    /* A number of Perl escapes are not handled by PCRE. We give an explicit
708
-    error. */
709
-
710
-    case 'l':
711
-    case 'L':
712
-    case 'N':
713
-    case 'p':
714
-    case 'P':
715
-    case 'u':
716
-    case 'U':
717
-    case 'X':
718
-    *errorptr = ERR37;
719
-    break;
720
-
721
-    /* The handling of escape sequences consisting of a string of digits
722
-    starting with one that is not zero is not straightforward. By experiment,
723
-    the way Perl works seems to be as follows:
724
-
725
-    Outside a character class, the digits are read as a decimal number. If the
726
-    number is less than 10, or if there are that many previous extracting
727
-    left brackets, then it is a back reference. Otherwise, up to three octal
728
-    digits are read to form an escaped byte. Thus \123 is likely to be octal
729
-    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
730
-    value is greater than 377, the least significant 8 bits are taken. Inside a
731
-    character class, \ followed by a digit is always an octal number. */
732
-
733
-    case '1': case '2': case '3': case '4': case '5':
734
-    case '6': case '7': case '8': case '9':
735
-
736
-    if (!isclass)
737
-      {
738
-      oldptr = ptr;
739
-      c -= '0';
740
-      while ((digitab[ptr[1]] & ctype_digit) != 0)
741
-        c = c * 10 + *(++ptr) - '0';
742
-      if (c < 10 || c <= bracount)
704
+        const uschar *oldptr;
705
+        switch (c)
743 706
         {
744
-        c = -(ESC_REF + c);
745
-        break;
746
-        }
747
-      ptr = oldptr;      /* Put the pointer back and fall through */
748
-      }
749
-
750
-    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
751
-    generates a binary zero byte and treats the digit as a following literal.
752
-    Thus we have to pull back the pointer by one. */
753
-
754
-    if ((c = *ptr) >= '8')
755
-      {
756
-      ptr--;
757
-      c = 0;
758
-      break;
759
-      }
760
-
761
-    /* \0 always starts an octal number, but we may drop through to here with a
762
-    larger first octal digit. */
763
-
764
-    case '0':
765
-    c -= '0';
766
-    while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
767
-        c = c * 8 + *(++ptr) - '0';
768
-    c &= 255;     /* Take least significant 8 bits */
769
-    break;
770
-
771
-    /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
772
-    which can be greater than 0xff, but only if the ddd are hex digits. */
773
-
774
-    case 'x':
707
+                /* A number of Perl escapes are not handled by PCRE. We give an explicit
708
+                 error. */
709
+                
710
+            case 'l':
711
+            case 'L':
712
+            case 'N':
713
+            case 'p':
714
+            case 'P':
715
+            case 'u':
716
+            case 'U':
717
+            case 'X':
718
+                *errorptr = ERR37;
719
+                break;
720
+                
721
+                /* The handling of escape sequences consisting of a string of digits
722
+                 starting with one that is not zero is not straightforward. By experiment,
723
+                 the way Perl works seems to be as follows:
724
+                 
725
+                 Outside a character class, the digits are read as a decimal number. If the
726
+                 number is less than 10, or if there are that many previous extracting
727
+                 left brackets, then it is a back reference. Otherwise, up to three octal
728
+                 digits are read to form an escaped byte. Thus \123 is likely to be octal
729
+                 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
730
+                 value is greater than 377, the least significant 8 bits are taken. Inside a
731
+                 character class, \ followed by a digit is always an octal number. */
732
+                
733
+            case '1': case '2': case '3': case '4': case '5':
734
+            case '6': case '7': case '8': case '9':
735
+                
736
+                if (!isclass)
737
+                {
738
+                    oldptr = ptr;
739
+                    c -= '0';
740
+                    while ((digitab[ptr[1]] & ctype_digit) != 0)
741
+                        c = c * 10 + *(++ptr) - '0';
742
+                    if (c < 10 || c <= bracount)
743
+                    {
744
+                        c = -(ESC_REF + c);
745
+                        break;
746
+                    }
747
+                    ptr = oldptr;      /* Put the pointer back and fall through */
748
+                }
749
+                
750
+                /* Handle an octal number following \. If the first digit is 8 or 9, Perl
751
+                 generates a binary zero byte and treats the digit as a following literal.
752
+                 Thus we have to pull back the pointer by one. */
753
+                
754
+                if ((c = *ptr) >= '8')
755
+                {
756
+                    ptr--;
757
+                    c = 0;
758
+                    break;
759
+                }
760
+                
761
+                /* \0 always starts an octal number, but we may drop through to here with a
762
+                 larger first octal digit. */
763
+                
764
+            case '0':
765
+                c -= '0';
766
+                while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
767
+                    c = c * 8 + *(++ptr) - '0';
768
+                c &= 255;     /* Take least significant 8 bits */
769
+                break;
770
+                
771
+                /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
772
+                 which can be greater than 0xff, but only if the ddd are hex digits. */
773
+                
774
+            case 'x':
775 775
 #ifdef SUPPORT_UTF8
776
-    if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
777
-      {
778
-      const uschar *pt = ptr + 2;
779
-      register int count = 0;
780
-      c = 0;
781
-      while ((digitab[*pt] & ctype_xdigit) != 0)
782
-        {
783
-        int cc = *pt++;
784
-        if (cc >= 'a') cc -= 32;            /* Convert to upper case */
785
-        count++;
786
-        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
787
-        }
788
-      if (*pt == '}')
789
-        {
790
-        if (c < 0 || count > 8) *errorptr = ERR34;
791
-        ptr = pt;
792
-        break;
793
-        }
794
-      /* If the sequence of hex digits does not end with '}', then we don't
795
-      recognize this construct; fall through to the normal \x handling. */
796
-      }
776
+                if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
777
+                {
778
+                    const uschar *pt = ptr + 2;
779
+                    register int count = 0;
780
+                    c = 0;
781
+                    while ((digitab[*pt] & ctype_xdigit) != 0)
782
+                    {
783
+                        int cc = *pt++;
784
+                        if (cc >= 'a') cc -= 32;            /* Convert to upper case */
785
+                        count++;
786
+                        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
787
+                    }
788
+                    if (*pt == '}')
789
+                    {
790
+                        if (c < 0 || count > 8) *errorptr = ERR34;
791
+                        ptr = pt;
792
+                        break;
793
+                    }
794
+                    /* If the sequence of hex digits does not end with '}', then we don't
795
+                     recognize this construct; fall through to the normal \x handling. */
796
+                }
797 797
 #endif
798
-
799
-    /* Read just a single hex char */
800
-
801
-    c = 0;
802
-    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
803
-      {
804
-      int cc = *(++ptr);
805
-      if (cc >= 'a') cc -= 32;              /* Convert to upper case */
806
-      c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
807
-      }
808
-    break;
809
-
810
-    /* Other special escapes not starting with a digit are straightforward */
811
-
812
-    case 'c':
813
-    c = *(++ptr);
814
-    if (c == 0)
815
-      {
816
-      *errorptr = ERR2;
817
-      return 0;
818
-      }
819
-
820
-    /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
821
-    is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
822
-
823
-    if (c >= 'a' && c <= 'z') c -= 32;
824
-    c ^= 0x40;
825
-    break;
826
-
827
-    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
828
-    other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
829
-    for Perl compatibility, it is a literal. This code looks a bit odd, but
830
-    there used to be some cases other than the default, and there may be again
831
-    in future, so I haven't "optimized" it. */
832
-
833
-    default:
834
-    if ((options & PCRE_EXTRA) != 0) switch(c)
835
-      {
836
-      default:
837
-      *errorptr = ERR3;
838
-      break;
839
-      }
840
-    break;
798
+                
799
+                /* Read just a single hex char */
800
+                
801
+                c = 0;
802
+                while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
803
+                {
804
+                    int cc = *(++ptr);
805
+                    if (cc >= 'a') cc -= 32;              /* Convert to upper case */
806
+                    c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
807
+                }
808
+                break;
809
+                
810
+                /* Other special escapes not starting with a digit are straightforward */
811
+                
812
+            case 'c':
813
+                c = *(++ptr);
814
+                if (c == 0)
815
+                {
816
+                    *errorptr = ERR2;
817
+                    return 0;
818
+                }
819
+                
820
+                /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
821
+                 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
822
+                
823
+                if (c >= 'a' && c <= 'z') c -= 32;
824
+                c ^= 0x40;
825
+                break;
826
+                
827
+                /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
828
+                 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
829
+                 for Perl compatibility, it is a literal. This code looks a bit odd, but
830
+                 there used to be some cases other than the default, and there may be again
831
+                 in future, so I haven't "optimized" it. */
832
+                
833
+            default:
834
+                if ((options & PCRE_EXTRA) != 0) switch(c)
835
+                {
836
+                    default:
837
+                        *errorptr = ERR3;
838
+                        break;
839
+                }
840
+                break;
841
+        }
841 842
     }
842
-  }
843
-
844
-*ptrptr = ptr;
845
-return c;
843
+    
844
+    *ptrptr = ptr;
845
+    return c;
846 846
 }
847 847
 
848 848
 
849 849
 
850 850
 /*************************************************
851
-*            Check for counted repeat            *
852
-*************************************************/
851
+ *            Check for counted repeat            *
852
+ *************************************************/
853 853
 
854 854
 /* This function is called when a '{' is encountered in a place where it might
855
-start a quantifier. It looks ahead to see if it really is a quantifier or not.
856
-It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
857
-where the ddds are digits.
858
-
859
-Arguments:
860
-  p         pointer to the first char after '{'
861
-  cd        pointer to char tables block
862
-
863
-Returns:    TRUE or FALSE
864
-*/
855
+ start a quantifier. It looks ahead to see if it really is a quantifier or not.
856
+ It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
857
+ where the ddds are digits.
858
+ 
859
+ Arguments:
860
+ p         pointer to the first char after '{'
861
+ cd        pointer to char tables block
862
+ 
863
+ Returns:    TRUE or FALSE
864
+ */
865 865
 
866 866
 static BOOL
867 867
 is_counted_repeat(const uschar *p, compile_data *cd)
868 868
 {
869 869
     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
870
-while ((digitab[*p] & ctype_digit) != 0) p++;
871
-if (*p == '}') return TRUE;
872
-
873
-if (*p++ != ',') return FALSE;
874
-if (*p == '}') return TRUE;
875
-
870
+    while ((digitab[*p] & ctype_digit) != 0) p++;
871
+    if (*p == '}') return TRUE;
872
+    
873
+    if (*p++ != ',') return FALSE;
874
+    if (*p == '}') return TRUE;
875
+    
876 876
     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
877
-while ((digitab[*p] & ctype_digit) != 0) p++;
878
-
879
-return (*p == '}');
877
+    while ((digitab[*p] & ctype_digit) != 0) p++;
878
+    
879
+    return (*p == '}');
880 880
 }
881 881
 
882 882
 
883 883
 
884 884
 /*************************************************
885
-*         Read repeat counts                     *
886
-*************************************************/
885
+ *         Read repeat counts                     *
886
+ *************************************************/
887 887
 
888 888
 /* Read an item of the form {n,m} and return the values. This is called only
889
-after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
890
-so the syntax is guaranteed to be correct, but we need to check the values.
891
-
892
-Arguments:
893
-  p          pointer to first char after '{'
894
-  minp       pointer to int for min
895
-  maxp       pointer to int for max
896
-             returned as -1 if no max
897
-  errorptr   points to pointer to error message
898
-  cd         pointer to character tables clock
899
-
900
-Returns:     pointer to '}' on success;
901
-             current ptr on error, with errorptr set
902
-*/
889
+ after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
890
+ so the syntax is guaranteed to be correct, but we need to check the values.
891
+ 
892
+ Arguments:
893
+ p          pointer to first char after '{'
894
+ minp       pointer to int for min
895
+ maxp       pointer to int for max
896
+ returned as -1 if no max
897
+ errorptr   points to pointer to error message
898
+ cd         pointer to character tables block
899
+ 
900
+ Returns:     pointer to '}' on success;
901
+ current ptr on error, with errorptr set
902
+ */
903 903
 
904 904
 static const uschar *
905 905
 read_repeat_counts(const uschar *p, int *minp, int *maxp,
906
-  const char **errorptr, compile_data *cd)
906
+                   const char **errorptr, compile_data *cd)
907 907
 {
908
-int min = 0;
909
-int max = -1;
910
-
911
-while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
912
-
913
-if (*p == '}') max = min; else
914
-  {
915
-  if (*(++p) != '}')
908
+    int min = 0;
909
+    int max = -1;
910
+    
911
+    while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
912
+    
913
+    if (*p == '}') max = min; else
914
+    {
915
+        if (*(++p) != '}')
916
+        {
917
+            max = 0;
918
+            while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919
+            if (max < min)
920
+            {
921
+                *errorptr = ERR4;
922
+                return p;
923
+            }
924
+        }
925
+    }
926
+    
927
+    /* Do paranoid checks, then fill in the required variables, and pass back the
928
+     pointer to the terminating '}'. */
929
+    
930
+    if (min > 65535 || max > 65535)
931
+        *errorptr = ERR5;
932
+    else
916 933
     {
917
-    max = 0;
918
-    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919
-    if (max < min)
920
-      {
921
-      *errorptr = ERR4;
922
-      return p;
923
-      }
934
+        *minp = min;
935
+        *maxp = max;
924 936
     }
925
-  }
926
-
927
-/* Do paranoid checks, then fill in the required variables, and pass back the
928
-pointer to the terminating '}'. */
929
-
930
-if (min > 65535 || max > 65535)
931
-  *errorptr = ERR5;
932
-else
933
-  {
934
-  *minp = min;
935
-  *maxp = max;
936
-  }
937
-return p;
937
+    return p;
938 938
 }
939 939
 
940 940
 
941 941
 
942 942
 /*************************************************
943
-*      Find first significant op code            *
944
-*************************************************/
943
+ *      Find first significant op code            *
944
+ *************************************************/
945 945
 
946 946
 /* This is called by several functions that scan a compiled expression looking
947
-for a fixed first character, or an anchoring op code etc. It skips over things
948
-that do not influence this. For some calls, a change of option is important.
949
-
950
-Arguments:
951
-  code       pointer to the start of the group
952
-  options    pointer to external options
953
-  optbit     the option bit whose changing is significant, or
954
-               zero if none are
955
-
956
-Returns:     pointer to the first significant opcode
957
-*/
947
+ for a fixed first character, or an anchoring op code etc. It skips over things
948
+ that do not influence this. For some calls, a change of option is important.
949
+ 
950
+ Arguments:
951
+ code       pointer to the start of the group
952
+ options    pointer to external options
953
+ optbit     the option bit whose changing is significant, or
954
+ zero if none are
955
+ 
956
+ Returns:     pointer to the first significant opcode
957
+ */
958 958
 
959 959
 static const uschar*
960 960
 first_significant_code(const uschar *code, int *options, int optbit)
961 961
 {
962
-for (;;)
963
-  {
964
-  switch ((int)*code)
962
+    for (;;)
965 963
     {
966
-    case OP_OPT:
967
-    if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
968
-      *options = (int)code[1];
969
-    code += 2;
970
-    break;
971
-
972
-    case OP_ASSERT_NOT:
973
-    case OP_ASSERTBACK:
974
-    case OP_ASSERTBACK_NOT:
975
-    do code += GET(code, 1); while (*code == OP_ALT);
976
-    /* Fall through */
977
-
978
-    case OP_CALLOUT:
979
-    case OP_CREF:
980
-    case OP_BRANUMBER:
981
-    case OP_WORD_BOUNDARY:
982
-    case OP_NOT_WORD_BOUNDARY:
983
-    code += OP_lengths[*code];
984
-    break;
985
-
986
-    default:
987
-    return code;
964
+        switch ((int)*code)
965
+        {
966
+            case OP_OPT:
967
+                if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
968
+                    *options = (int)code[1];
969
+                code += 2;
970
+                break;
971
+                
972
+            case OP_ASSERT_NOT:
973
+            case OP_ASSERTBACK:
974
+            case OP_ASSERTBACK_NOT:
975
+                do code += GET(code, 1); while (*code == OP_ALT);
976
+                /* Fall through */
977
+                
978
+            case OP_CALLOUT:
979
+            case OP_CREF:
980
+            case OP_BRANUMBER:
981
+            case OP_WORD_BOUNDARY:
982
+            case OP_NOT_WORD_BOUNDARY:
983
+                code += OP_lengths[*code];
984
+                break;
985
+                
986
+            default:
987
+                return code;
988
+        }
988 989
     }
989
-  }
990
-/* Control never reaches here */
990
+    /* Control never reaches here */
991 991
 }
992 992
 
993 993
 
994 994
 
995 995
 
996 996
 /*************************************************
997
-*        Find the fixed length of a pattern      *
998
-*************************************************/
997
+ *        Find the fixed length of a pattern      *
998
+ *************************************************/
999 999
 
1000 1000
 /* Scan a pattern and compute the fixed length of subject that will match it,
1001
-if the length is fixed. This is needed for dealing with backward assertions.
1002
-In UTF8 mode, the result is in characters rather than bytes.
1003
-
1004
-Arguments:
1005
-  code     points to the start of the pattern (the bracket)
1006
-  options  the compiling options
1007
-
1008
-Returns:   the fixed length, or -1 if there is no fixed length,
1009
-             or -2 if \C was encountered
1010
-*/
1001
+ if the length is fixed. This is needed for dealing with backward assertions.
1002
+ In UTF8 mode, the result is in characters rather than bytes.
1003
+ 
1004
+ Arguments:
1005
+ code     points to the start of the pattern (the bracket)
1006
+ options  the compiling options
1007
+ 
1008
+ Returns:   the fixed length, or -1 if there is no fixed length,
1009
+ or -2 if \C was encountered
1010
+ */
1011 1011
 
1012 1012
 static int
1013 1013
 find_fixedlength(uschar *code, int options)
1014 1014
 {
1015
-int length = -1;
1016
-
1017
-register int branchlength = 0;
1018
-register uschar *cc = code + 1 + LINK_SIZE;
1019
-
1020
-/* Scan along the opcodes for this branch. If we get to the end of the
1021
-branch, check the length against that of the other branches. */
1022
-
1023
-for (;;)
1024
-  {
1025
-  int d;
1026
-  register int op = *cc;
1027
-  if (op >= OP_BRA) op = OP_BRA;
1028
-
1029
-  switch (op)
1015
+    int length = -1;
1016
+    
1017
+    register int branchlength = 0;
1018
+    register uschar *cc = code + 1 + LINK_SIZE;
1019
+    
1020
+    /* Scan along the opcodes for this branch. If we get to the end of the
1021
+     branch, check the length against that of the other branches. */
1022
+    
1023
+    for (;;)
1030 1024
     {
1031
-    case OP_BRA:
1032
-    case OP_ONCE:
1033
-    case OP_COND:
1034
-    d = find_fixedlength(cc, options);
1035
-    if (d < 0) return d;
1036
-    branchlength += d;
1037
-    do cc += GET(cc, 1); while (*cc == OP_ALT);
1038
-    cc += 1 + LINK_SIZE;
1039
-    break;
1040
-
1041
-    /* Reached end of a branch; if it's a ket it is the end of a nested
1042
-    call. If it's ALT it is an alternation in a nested call. If it is
1043
-    END it's the end of the outer call. All can be handled by the same code. */
1044
-
1045
-    case OP_ALT:
1046
-    case OP_KET:
1047
-    case OP_KETRMAX:
1048
-    case OP_KETRMIN:
1049
-    case OP_END:
1050
-    if (length < 0) length = branchlength;
1051
-      else if (length != branchlength) return -1;
1052
-    if (*cc != OP_ALT) return length;
1053
-    cc += 1 + LINK_SIZE;
1054
-    branchlength = 0;
1055
-    break;
1056
-
1057
-    /* Skip over assertive subpatterns */
1058
-
1059
-    case OP_ASSERT:
1060
-    case OP_ASSERT_NOT:
1061
-    case OP_ASSERTBACK:
1062
-    case OP_ASSERTBACK_NOT:
1063
-    do cc += GET(cc, 1); while (*cc == OP_ALT);
1064
-    /* Fall through */
1065
-
1066
-    /* Skip over things that don't match chars */
1067
-
1068
-    case OP_REVERSE:
1069
-    case OP_BRANUMBER:
1070
-    case OP_CREF:
1071
-    case OP_OPT:
1072
-    case OP_CALLOUT:
1073
-    case OP_SOD:
1074
-    case OP_SOM:
1075
-    case OP_EOD:
1076
-    case OP_EODN:
1077
-    case OP_CIRC:
1078
-    case OP_DOLL:
1079
-    case OP_NOT_WORD_BOUNDARY:
1080
-    case OP_WORD_BOUNDARY:
1081
-    cc += OP_lengths[*cc];
1082
-    break;
1083
-
1084
-    /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1085
-    This requires a scan of the string, unfortunately. We assume valid UTF-8
1086
-    strings, so all we do is reduce the length by one for every byte whose bits
1087
-    are 10xxxxxx. */
1088
-
1089
-    case OP_CHARS:
1090
-    branchlength += *(++cc);
1025
+        int d;
1026
+        register int op = *cc;
1027
+        if (op >= OP_BRA) op = OP_BRA;
1028
+        
1029
+        switch (op)
1030
+        {
1031
+            case OP_BRA:
1032
+            case OP_ONCE:
1033
+            case OP_COND:
1034
+                d = find_fixedlength(cc, options);
1035
+                if (d < 0) return d;
1036
+                branchlength += d;
1037
+                do cc += GET(cc, 1); while (*cc == OP_ALT);
1038
+                cc += 1 + LINK_SIZE;
1039
+                break;
1040
+                
1041
+                /* Reached end of a branch; if it's a ket it is the end of a nested
1042
+                 call. If it's ALT it is an alternation in a nested call. If it is
1043
+                 END it's the end of the outer call. All can be handled by the same code. */
1044
+                
1045
+            case OP_ALT:
1046
+            case OP_KET:
1047
+            case OP_KETRMAX:
1048
+            case OP_KETRMIN:
1049
+            case OP_END:
1050
+                if (length < 0) length = branchlength;
1051
+                else if (length != branchlength) return -1;
1052
+                if (*cc != OP_ALT) return length;
1053
+                cc += 1 + LINK_SIZE;
1054
+                branchlength = 0;
1055
+                break;
1056
+                
1057
+                /* Skip over assertive subpatterns */
1058
+                
1059
+            case OP_ASSERT:
1060
+            case OP_ASSERT_NOT:
1061
+            case OP_ASSERTBACK:
1062
+            case OP_ASSERTBACK_NOT:
1063
+                do cc += GET(cc, 1); while (*cc == OP_ALT);
1064
+                /* Fall through */
1065
+                
1066
+                /* Skip over things that don't match chars */
1067
+                
1068
+            case OP_REVERSE:
1069
+            case OP_BRANUMBER:
1070
+            case OP_CREF:
1071
+            case OP_OPT:
1072
+            case OP_CALLOUT:
1073
+            case OP_SOD:
1074
+            case OP_SOM:
1075
+            case OP_EOD:
1076
+            case OP_EODN:
1077
+            case OP_CIRC:
1078
+            case OP_DOLL:
1079
+            case OP_NOT_WORD_BOUNDARY:
1080
+            case OP_WORD_BOUNDARY:
1081
+                cc += OP_lengths[*cc];
1082
+                break;
1083
+                
1084
+                /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1085
+                 This requires a scan of the string, unfortunately. We assume valid UTF-8
1086
+                 strings, so all we do is reduce the length by one for every byte whose bits
1087
+                 are 10xxxxxx. */
1088
+                
1089
+            case OP_CHARS:
1090
+                branchlength += *(++cc);
1091 1091
 #ifdef SUPPORT_UTF8
1092
-    if ((options & PCRE_UTF8) != 0)
1093
-      for (d = 1; d <= *cc; d++)
1094
-        if ((cc[d] & 0xc0) == 0x80) branchlength--;
1092
+                if ((options & PCRE_UTF8) != 0)
1093
+                    for (d = 1; d <= *cc; d++)
1094
+                        if ((cc[d] & 0xc0) == 0x80) branchlength--;
1095 1095
 #endif
1096
-    cc += *cc + 1;
1097
-    break;
1098
-
1099
-    /* Handle exact repetitions. The count is already in characters, but we
1100
-    need to skip over a multibyte character in UTF8 mode.  */
1101
-
1102
-    case OP_EXACT:
1103
-    branchlength += GET2(cc,1);
1104
-    cc += 4;
1096
+                cc += *cc + 1;
1097
+                break;
1098
+                
1099
+                /* Handle exact repetitions. The count is already in characters, but we
1100
+                 need to skip over a multibyte character in UTF8 mode.  */
1101
+                
1102
+            case OP_EXACT:
1103
+                branchlength += GET2(cc,1);
1104
+                cc += 4;
1105 1105
 #ifdef SUPPORT_UTF8
1106
-    if ((options & PCRE_UTF8) != 0)
1107
-      {
1108
-      while((*cc & 0x80) == 0x80) cc++;
1109
-      }
1106
+                if ((options & PCRE_UTF8) != 0)
1107
+                {
1108
+                    while((*cc & 0x80) == 0x80) cc++;
1109
+                }
1110 1110
 #endif
1111
-    break;
1112
-
1113
-    case OP_TYPEEXACT:
1114
-    branchlength += GET2(cc,1);
1115
-    cc += 4;
1116
-    break;
1117
-
1118
-    /* Handle single-char matchers */
1119
-
1120
-    case OP_NOT_DIGIT:
1121
-    case OP_DIGIT:
1122
-    case OP_NOT_WHITESPACE:
1123
-    case OP_WHITESPACE:
1124
-    case OP_NOT_WORDCHAR:
1125
-    case OP_WORDCHAR:
1126
-    case OP_ANY:
1127
-    branchlength++;
1128
-    cc++;
1129
-    break;
1130
-
1131
-    /* The single-byte matcher isn't allowed */
1132
-
1133
-    case OP_ANYBYTE:
1134
-    return -2;
1135
-
1136
-    /* Check a class for variable quantification */
1137
-
1111
+                break;
1112
+                
1113
+            case OP_TYPEEXACT:
1114
+                branchlength += GET2(cc,1);
1115
+                cc += 4;
1116
+                break;
1117
+                
1118
+                /* Handle single-char matchers */
1119
+                
1120
+            case OP_NOT_DIGIT:
1121
+            case OP_DIGIT:
1122
+            case OP_NOT_WHITESPACE:
1123
+            case OP_WHITESPACE:
1124
+            case OP_NOT_WORDCHAR:
1125
+            case OP_WORDCHAR:
1126
+            case OP_ANY:
1127
+                branchlength++;
1128
+                cc++;
1129
+                break;
1130
+                
1131
+                /* The single-byte matcher isn't allowed */
1132
+                
1133
+            case OP_ANYBYTE:
1134
+                return -2;
1135
+                
1136
+                /* Check a class for variable quantification */
1137
+                
1138 1138
 #ifdef SUPPORT_UTF8
1139
-    case OP_XCLASS:
1140
-    cc += GET(cc, 1) - 33;
1141
-    /* Fall through */
1139
+            case OP_XCLASS:
1140
+                cc += GET(cc, 1) - 33;
1141
+                /* Fall through */
1142 1142
 #endif
1143
-
1144
-    case OP_CLASS:
1145
-    case OP_NCLASS:
1146
-    cc += 33;
1147
-
1148
-    switch (*cc)
1149
-      {
1150
-      case OP_CRSTAR:
1151
-      case OP_CRMINSTAR:
1152
-      case OP_CRQUERY:
1153
-      case OP_CRMINQUERY:
1154
-      return -1;
1155
-
1156
-      case OP_CRRANGE:
1157
-      case OP_CRMINRANGE:
1158
-      if (GET2(cc,1) != GET2(cc,3)) return -1;
1159
-      branchlength += GET2(cc,1);
1160
-      cc += 5;
1161
-      break;
1162
-
1163
-      default:
1164
-      branchlength++;
1165
-      }
1166
-    break;
1167
-
1168
-    /* Anything else is variable length */
1169
-
1170
-    default:
1171
-    return -1;
1143
+                
1144
+            case OP_CLASS:
1145
+            case OP_NCLASS:
1146
+                cc += 33;
1147
+                
1148
+                switch (*cc)
1149
+            {
1150
+                case OP_CRSTAR:
1151
+                case OP_CRMINSTAR:
1152
+                case OP_CRQUERY:
1153
+                case OP_CRMINQUERY:
1154
+                    return -1;
1155
+                    
1156
+                case OP_CRRANGE:
1157
+                case OP_CRMINRANGE:
1158
+                    if (GET2(cc,1) != GET2(cc,3)) return -1;
1159
+                    branchlength += GET2(cc,1);
1160
+                    cc += 5;
1161
+                    break;
1162
+                    
1163
+                default:
1164
+                    branchlength++;
1165
+            }
1166
+                break;
1167
+                
1168
+                /* Anything else is variable length */
1169
+                
1170
+            default:
1171
+                return -1;
1172
+        }
1172 1173
     }
1173
-  }
1174
-/* Control never gets here */
1174
+    /* Control never gets here */
1175 1175
 }
1176 1176
 
1177 1177
 
1178 1178
 
1179 1179
 
1180 1180
 /*************************************************
1181
-*    Scan compiled regex for numbered bracket    *
1182
-*************************************************/
1181
+ *    Scan compiled regex for numbered bracket    *
1182
+ *************************************************/
1183 1183
 
1184 1184
 /* This little function scans through a compiled pattern until it finds a
1185
-capturing bracket with the given number.
1186
-
1187
-Arguments:
1188
-  code        points to start of expression
1189
-  utf8        TRUE in UTF-8 mode
1190
-  number      the required bracket number
1191
-
1192
-Returns:      pointer to the opcode for the bracket, or NULL if not found
1193
-*/
1185
+ capturing bracket with the given number.
1186
+ 
1187
+ Arguments:
1188
+ code        points to start of expression
1189
+ utf8        TRUE in UTF-8 mode
1190
+ number      the required bracket number
1191
+ 
1192
+ Returns:      pointer to the opcode for the bracket, or NULL if not found
1193
+ */
1194 1194
 
1195 1195
 static const uschar *
1196 1196
 find_bracket(const uschar *code, BOOL utf8, int number)
1197 1197
 {
1198 1198
 #ifndef SUPPORT_UTF8
1199
-utf8 = utf8;               /* Stop pedantic compilers complaining */
1199
+    utf8 = utf8;               /* Stop pedantic compilers complaining */
1200 1200
 #endif
1201
-
1202
-for (;;)
1203
-  {
1204
-  register int c = *code;
1205
-  if (c == OP_END) return NULL;
1206
-  else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1207
-  else if (c > OP_BRA)
1201
+    
1202
+    for (;;)
1208 1203
     {
1209
-    int n = c - OP_BRA;
1210
-    if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1211
-    if (n == number) return (uschar *)code;
1212
-    code += OP_lengths[OP_BRA];
1213
-    }
1214
-  else
1215
-    {
1216
-    code += OP_lengths[c];
1217
-
1218
-    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1219
-    by a multi-byte character. The length in the table is a minimum, so we have
1220
-    to scan along to skip the extra characters. All opcodes are less than 128,
1221
-    so we can use relatively efficient code. */
1222
-
1204
+        register int c = *code;
1205
+        if (c == OP_END) return NULL;
1206
+        else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1207
+        else if (c > OP_BRA)
1208
+        {
1209
+            int n = c - OP_BRA;
1210
+            if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1211
+            if (n == number) return (uschar *)code;
1212
+            code += OP_lengths[OP_BRA];
1213
+        }
1214
+        else
1215
+        {
1216
+            code += OP_lengths[c];
1217
+            
1218
+            /* In UTF-8 mode, opcodes that are followed by a character may be followed
1219
+             by a multi-byte character. The length in the table is a minimum, so we have
1220
+             to scan along to skip the extra characters. All opcodes are less than 128,
1221
+             so we can use relatively efficient code. */
1222
+            
1223 1223
 #ifdef SUPPORT_UTF8
1224
-    if (utf8) switch(c)
1225
-      {
1226
-      case OP_EXACT:
1227
-      case OP_UPTO:
1228
-      case OP_MINUPTO:
1229
-      case OP_STAR:
1230
-      case OP_MINSTAR:
1231
-      case OP_PLUS:
1232
-      case OP_MINPLUS:
1233
-      case OP_QUERY:
1234
-      case OP_MINQUERY:
1235
-      while ((*code & 0xc0) == 0x80) code++;
1236
-      break;
1237
-      }
1224
+            if (utf8) switch(c)
1225
+            {
1226
+                case OP_EXACT:
1227
+                case OP_UPTO:
1228
+                case OP_MINUPTO:
1229
+                case OP_STAR:
1230
+                case OP_MINSTAR:
1231
+                case OP_PLUS:
1232
+                case OP_MINPLUS:
1233
+                case OP_QUERY:
1234
+                case OP_MINQUERY:
1235
+                    while ((*code & 0xc0) == 0x80) code++;
1236
+                    break;
1237
+            }
1238 1238
 #endif
1239
+        }
1239 1240
     }
1240
-  }
1241 1241
 }
1242 1242
 
1243 1243
 
1244 1244
 
1245 1245
 /*************************************************
1246
-*    Scan compiled branch for non-emptiness      *
1247
-*************************************************/
1246
+ *    Scan compiled branch for non-emptiness      *
1247
+ *************************************************/
1248 1248
 
1249 1249
 /* This function scans through a branch of a compiled pattern to see whether it
1250
-can match the empty string or not. It is called only from could_be_empty()
1251
-below. Note that first_significant_code() skips over assertions. If we hit an
1252
-unclosed bracket, we return "empty" - this means we've struck an inner bracket
1253
-whose current branch will already have been scanned.
1254
-
1255
-Arguments:
1256
-  code        points to start of search
1257
-  endcode     points to where to stop
1258
-  utf8        TRUE if in UTF8 mode
1259
-
1260
-Returns:      TRUE if what is matched could be empty
1261
-*/
1250
+ can match the empty string or not. It is called only from could_be_empty()
1251
+ below. Note that first_significant_code() skips over assertions. If we hit an
1252
+ unclosed bracket, we return "empty" - this means we've struck an inner bracket
1253
+ whose current branch will already have been scanned.
1254
+ 
1255
+ Arguments:
1256
+ code        points to start of search
1257
+ endcode     points to where to stop
1258
+ utf8        TRUE if in UTF8 mode
1259
+ 
1260
+ Returns:      TRUE if what is matched could be empty
1261
+ */
1262 1262
 
1263 1263
 static BOOL
1264 1264
 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1265 1265
 {
1266
-register int c;
1267
-for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1268
-     code < endcode;
1269
-     code = first_significant_code(code + OP_lengths[c], NULL, 0))
1270
-  {
1271
-  const uschar *ccode;
1272
-
1273
-  c = *code;
1274
-
1275
-  if (c >= OP_BRA)
1276
-    {
1277
-    BOOL empty_branch;
1278
-    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1279
-
1280
-    /* Scan a closed bracket */
1281
-
1282
-    empty_branch = FALSE;
1283
-    do
1284
-      {
1285
-      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1286
-        empty_branch = TRUE;
1287
-      code += GET(code, 1);
1288
-      }
1289
-    while (*code == OP_ALT);
1290
-    if (!empty_branch) return FALSE;   /* All branches are non-empty */
1291
-    code += 1 + LINK_SIZE;
1292
-    c = *code;
1293
-    }
1294
-
1295
-  else switch (c)
1266
+    register int c;
1267
+    for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1268
+         code < endcode;
1269
+         code = first_significant_code(code + OP_lengths[c], NULL, 0))
1296 1270
     {
1297
-    /* Check for quantifiers after a class */
1298
-
1271
+        const uschar *ccode;
1272
+        
1273
+        c = *code;
1274
+        
1275
+        if (c >= OP_BRA)
1276
+        {
1277
+            BOOL empty_branch;
1278
+            if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1279
+            
1280
+            /* Scan a closed bracket */
1281
+            
1282
+            empty_branch = FALSE;
1283
+            do
1284
+            {
1285
+                if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1286
+                    empty_branch = TRUE;
1287
+                code += GET(code, 1);
1288
+            }
1289
+            while (*code == OP_ALT);
1290
+            if (!empty_branch) return FALSE;   /* All branches are non-empty */
1291
+            code += 1 + LINK_SIZE;
1292
+            c = *code;
1293
+        }
1294
+        
1295
+        else switch (c)
1296
+        {
1297
+                /* Check for quantifiers after a class */
1298
+                
1299 1299
 #ifdef SUPPORT_UTF8
1300
-    case OP_XCLASS:
1301
-    ccode = code + GET(code, 1);
1302
-    goto CHECK_CLASS_REPEAT;
1300
+            case OP_XCLASS:
1301
+                ccode = code + GET(code, 1);
1302
+                goto CHECK_CLASS_REPEAT;
1303 1303
 #endif
1304
-
1305
-    case OP_CLASS:
1306
-    case OP_NCLASS:
1307
-    ccode = code + 33;
1308
-
1304
+                
1305
+            case OP_CLASS:
1306
+            case OP_NCLASS:
1307
+                ccode = code + 33;
1308
+                
1309 1309
 #ifdef SUPPORT_UTF8
1310
-    CHECK_CLASS_REPEAT:
1310
+            CHECK_CLASS_REPEAT:
1311 1311
 #endif
1312
-
1313
-    switch (*ccode)
1314
-      {
1315
-      case OP_CRSTAR:            /* These could be empty; continue */
1316
-      case OP_CRMINSTAR:
1317
-      case OP_CRQUERY:
1318
-      case OP_CRMINQUERY:
1319
-      break;
1320
-
1321
-      default:                   /* Non-repeat => class must match */
1322
-      case OP_CRPLUS:            /* These repeats aren't empty */
1323
-      case OP_CRMINPLUS:
1324
-      return FALSE;
1325
-
1326
-      case OP_CRRANGE:
1327
-      case OP_CRMINRANGE:
1328
-      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1329
-      break;
1330
-      }
1331
-    break;
1332
-
1333
-    /* Opcodes that must match a character */
1334
-
1335
-    case OP_NOT_DIGIT:
1336
-    case OP_DIGIT:
1337
-    case OP_NOT_WHITESPACE:
1338
-    case OP_WHITESPACE:
1339
-    case OP_NOT_WORDCHAR:
1340
-    case OP_WORDCHAR:
1341
-    case OP_ANY:
1342
-    case OP_ANYBYTE:
1343
-    case OP_CHARS:
1344
-    case OP_NOT:
1345
-    case OP_PLUS:
1346
-    case OP_MINPLUS:
1347
-    case OP_EXACT:
1348
-    case OP_NOTPLUS:
1349
-    case OP_NOTMINPLUS:
1350
-    case OP_NOTEXACT:
1351
-    case OP_TYPEPLUS:
1352
-    case OP_TYPEMINPLUS:
1353
-    case OP_TYPEEXACT:
1354
-    return FALSE;
1355
-
1356
-    /* End of branch */
1357
-
1358
-    case OP_KET:
1359
-    case OP_KETRMAX:
1360
-    case OP_KETRMIN:
1361
-    case OP_ALT:
1362
-    return TRUE;
1363
-
1364
-    /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
1365
-    followed by a multibyte character */
1366
-
1312
+                
1313
+                switch (*ccode)
1314
+            {
1315
+                case OP_CRSTAR:            /* These could be empty; continue */
1316
+                case OP_CRMINSTAR:
1317
+                case OP_CRQUERY:
1318
+                case OP_CRMINQUERY:
1319
+                    break;
1320
+                    
1321
+                default:                   /* Non-repeat => class must match */
1322
+                case OP_CRPLUS:            /* These repeats aren't empty */
1323
+                case OP_CRMINPLUS:
1324
+                    return FALSE;
1325
+                    
1326
+                case OP_CRRANGE:
1327
+                case OP_CRMINRANGE:
1328
+                    if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1329
+                    break;
1330
+            }
1331
+                break;
1332
+                
1333
+                /* Opcodes that must match a character */
1334
+                
1335
+            case OP_NOT_DIGIT:
1336
+            case OP_DIGIT:
1337
+            case OP_NOT_WHITESPACE:
1338
+            case OP_WHITESPACE:
1339
+            case OP_NOT_WORDCHAR:
1340
+            case OP_WORDCHAR:
1341
+            case OP_ANY:
1342
+            case OP_ANYBYTE:
1343
+            case OP_CHARS:
1344
+            case OP_NOT:
1345
+            case OP_PLUS:
1346
+            case OP_MINPLUS:
1347
+            case OP_EXACT:
1348
+            case OP_NOTPLUS:
1349
+            case OP_NOTMINPLUS:
1350
+            case OP_NOTEXACT:
1351
+            case OP_TYPEPLUS:
1352
+            case OP_TYPEMINPLUS:
1353
+            case OP_TYPEEXACT:
1354
+                return FALSE;
1355
+                
1356
+                /* End of branch */
1357
+                
1358
+            case OP_KET:
1359
+            case OP_KETRMAX:
1360
+            case OP_KETRMIN:
1361
+            case OP_ALT:
1362
+                return TRUE;
1363
+                
1364
+                /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
1365
+                 followed by a multibyte character */
1366
+                
1367 1367
 #ifdef SUPPORT_UTF8
1368
-    case OP_STAR:
1369
-    case OP_MINSTAR:
1370
-    case OP_QUERY:
1371
-    case OP_MINQUERY:
1372
-    case OP_UPTO:
1373
-    case OP_MINUPTO:
1374
-    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1375
-    break;
1368
+            case OP_STAR:
1369
+            case OP_MINSTAR:
1370
+            case OP_QUERY:
1371
+            case OP_MINQUERY:
1372
+            case OP_UPTO:
1373
+            case OP_MINUPTO:
1374
+                if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1375
+                break;
1376 1376
 #endif
1377
+        }
1377 1378
     }
1378
-  }
1379
-
1380
-return TRUE;
1379
+    
1380
+    return TRUE;
1381 1381
 }
1382 1382
 
1383 1383
 
1384 1384
 
1385 1385
 /*************************************************
1386
-*    Scan compiled regex for non-emptiness       *
1387
-*************************************************/
1386
+ *    Scan compiled regex for non-emptiness       *
1387
+ *************************************************/
1388 1388
 
1389 1389
 /* This function is called to check for left recursive calls. We want to check
1390
-the current branch of the current pattern to see if it could match the empty
1391
-string. If it could, we must look outwards for branches at other levels,
1392
-stopping when we pass beyond the bracket which is the subject of the recursion.
1393
-
1394
-Arguments:
1395
-  code        points to start of the recursion
1396
-  endcode     points to where to stop (current RECURSE item)
1397
-  bcptr       points to the chain of current (unclosed) branch starts
1398
-  utf8        TRUE if in UTF-8 mode
1399
-
1400
-Returns:      TRUE if what is matched could be empty
1401
-*/
1390
+ the current branch of the current pattern to see if it could match the empty
1391
+ string. If it could, we must look outwards for branches at other levels,
1392
+ stopping when we pass beyond the bracket which is the subject of the recursion.
1393
+ 
1394
+ Arguments:
1395
+ code        points to start of the recursion
1396
+ endcode     points to where to stop (current RECURSE item)
1397
+ bcptr       points to the chain of current (unclosed) branch starts
1398
+ utf8        TRUE if in UTF-8 mode
1399
+ 
1400
+ Returns:      TRUE if what is matched could be empty
1401
+ */
1402 1402
 
1403 1403
 static BOOL
1404 1404
 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1405
-  BOOL utf8)
1405
+               BOOL utf8)
1406 1406
 {
1407
-while (bcptr != NULL && bcptr->current >= code)
1408
-  {
1409
-  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1410
-  bcptr = bcptr->outer;
1411
-  }
1412
-return TRUE;
1407
+    while (bcptr != NULL && bcptr->current >= code)
1408
+    {
1409
+        if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1410
+        bcptr = bcptr->outer;
1411
+    }
1412
+    return TRUE;
1413 1413
 }
1414 1414
 
1415 1415
 
1416 1416
 
1417 1417
 /*************************************************
1418
-*           Check for POSIX class syntax         *
1419
-*************************************************/
1418
+ *           Check for POSIX class syntax         *
1419
+ *************************************************/
1420 1420
 
1421 1421
 /* This function is called when the sequence "[:" or "[." or "[=" is
1422
-encountered in a character class. It checks whether this is followed by an
1423
-optional ^ and then a sequence of letters, terminated by a matching ":]" or
1424
-".]" or "=]".
1425
-
1426
-Argument:
1427
-  ptr      pointer to the initial [
1428
-  endptr   where to return the end pointer
1429
-  cd       pointer to compile data
1430
-
1431
-Returns:   TRUE or FALSE
1432
-*/
1422
+ encountered in a character class. It checks whether this is followed by an
1423
+ optional ^ and then a sequence of letters, terminated by a matching ":]" or
1424
+ ".]" or "=]".
1425
+ 
1426
+ Argument:
1427
+ ptr      pointer to the initial [
1428
+ endptr   where to return the end pointer
1429
+ cd       pointer to compile data
1430
+ 
1431
+ Returns:   TRUE or FALSE
1432
+ */
1433 1433
 
1434 1434
 static BOOL
1435 1435
 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1436 1436
 {
1437
-int terminator;          /* Don't combine these lines; the Solaris cc */
1438
-terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1439
-if (*(++ptr) == '^') ptr++;
1440
-while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1441
-if (*ptr == terminator && ptr[1] == ']')
1442
-  {
1443
-  *endptr = ptr;
1444
-  return TRUE;
1445
-  }
1446
-return FALSE;
1437
+    int terminator;          /* Don't combine these lines; the Solaris cc */
1438
+    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1439
+    if (*(++ptr) == '^') ptr++;
1440
+    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1441
+    if (*ptr == terminator && ptr[1] == ']')
1442
+    {
1443
+        *endptr = ptr;
1444
+        return TRUE;
1445
+    }
1446
+    return FALSE;
1447 1447
 }
1448 1448
 
1449 1449
 
1450 1450
 
1451 1451
 
1452 1452
 /*************************************************
1453
-*          Check POSIX class name                *
1454
-*************************************************/
1453
+ *          Check POSIX class name                *
1454
+ *************************************************/
1455 1455
 
1456 1456
 /* This function is called to check the name given in a POSIX-style class entry
1457
-such as [:alnum:].
1458
-
1459
-Arguments:
1460
-  ptr        points to the first letter
1461
-  len        the length of the name
1462
-
1463
-Returns:     a value representing the name, or -1 if unknown
1464
-*/
1457
+ such as [:alnum:].
1458
+ 
1459
+ Arguments:
1460
+ ptr        points to the first letter
1461
+ len        the length of the name
1462
+ 
1463
+ Returns:     a value representing the name, or -1 if unknown
1464
+ */
1465 1465
 
1466 1466
 static int
1467 1467
 check_posix_name(const uschar *ptr, int len)
1468 1468
 {
1469
-register int yield = 0;
1470
-while (posix_name_lengths[yield] != 0)
1471
-  {
1472
-  if (len == posix_name_lengths[yield] &&
1473
-    strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1474
-  yield++;
1475
-  }
1476
-return -1;
1469
+    register int yield = 0;
1470
+    while (posix_name_lengths[yield] != 0)
1471
+    {
1472
+        if (len == posix_name_lengths[yield] &&
1473
+            strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1474
+        yield++;
1475
+    }
1476
+    return -1;
1477 1477
 }
1478 1478
 
1479 1479
 
1480 1480
 
1481 1481
 
1482 1482
 /*************************************************
1483
-*           Compile one branch                   *
1484
-*************************************************/
1483
+ *           Compile one branch                   *
1484
+ *************************************************/
1485 1485
 
1486 1486
 /* Scan the pattern, compiling it into the code vector. If the options are
1487
-changed during the branch, the pointer is used to change the external options
1488
-bits.
1489
-
1490
-Arguments:
1491
-  optionsptr     pointer to the option bits
1492
-  brackets       points to number of extracting brackets used
1493
-  code           points to the pointer to the current code point
1494
-  ptrptr         points to the current pattern pointer
1495
-  errorptr       points to pointer to error message
1496
-  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1497
-  reqbyteptr     set to the last literal character required, else < 0
1498
-  bcptr          points to current branch chain
1499
-  cd             contains pointers to tables etc.
1500
-
1501
-Returns:         TRUE on success
1502
-                 FALSE, with *errorptr set on error
1503
-*/
1487
+ changed during the branch, the pointer is used to change the external options
1488
+ bits.
1489
+ 
1490
+ Arguments:
1491
+ optionsptr     pointer to the option bits
1492
+ brackets       points to number of extracting brackets used
1493
+ code           points to the pointer to the current code point
1494
+ ptrptr         points to the current pattern pointer
1495
+ errorptr       points to pointer to error message
1496
+ firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1497
+ reqbyteptr     set to the last literal character required, else < 0
1498
+ bcptr          points to current branch chain
1499
+ cd             contains pointers to tables etc.
1500
+ 
1501
+ Returns:         TRUE on success
1502
+ FALSE, with *errorptr set on error
1503
+ */
1504 1504
 
1505 1505
 static BOOL
1506 1506
 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1507
-  const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1508
-  int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1507
+               const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1508
+               int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1509 1509
 {
1510
-int repeat_type, op_type;
1511
-int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
1512
-int bravalue = 0;
1513
-int length;
1514
-int greedy_default, greedy_non_default;
1515
-int firstbyte, reqbyte;
1516
-int zeroreqbyte, zerofirstbyte;
1517
-int req_caseopt, reqvary, tempreqvary;
1518
-int condcount = 0;
1519
-int options = *optionsptr;
1520
-register int c;
1521
-register uschar *code = *codeptr;
1522
-uschar *tempcode;
1523
-BOOL inescq = FALSE;
1524
-BOOL groupsetfirstbyte = FALSE;
1525
-const uschar *ptr = *ptrptr;
1526
-const uschar *tempptr;
1527
-uschar *previous = NULL;
1528
-uschar class[32];
1529
-
1510
+    int repeat_type, op_type;
1511
+    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
1512
+    int bravalue = 0;
1513
+    int length;
1514
+    int greedy_default, greedy_non_default;
1515
+    int firstbyte, reqbyte;
1516
+    int zeroreqbyte, zerofirstbyte;
1517
+    int req_caseopt, reqvary, tempreqvary;
1518
+    int condcount = 0;
1519
+    int options = *optionsptr;
1520
+    register int c;
1521
+    register uschar *code = *codeptr;
1522
+    uschar *tempcode;
1523
+    BOOL inescq = FALSE;
1524
+    BOOL groupsetfirstbyte = FALSE;
1525
+    const uschar *ptr = *ptrptr;
1526
+    const uschar *tempptr;
1527
+    uschar *previous = NULL;
1528
+    uschar class[32];
1529
+    
1530 1530
 #ifdef SUPPORT_UTF8
1531
-BOOL class_utf8;
1532
-BOOL utf8 = (options & PCRE_UTF8) != 0;
1533
-uschar *class_utf8data;
1534
-uschar utf8_char[6];
1531
+    BOOL class_utf8;
1532
+    BOOL utf8 = (options & PCRE_UTF8) != 0;
1533
+    uschar *class_utf8data;
1534
+    uschar utf8_char[6];
1535 1535
 #else
1536
-BOOL utf8 = FALSE;
1536
+    BOOL utf8 = FALSE;
1537 1537
 #endif
1538
-
1539
-/* Set up the default and non-default settings for greediness */
1540
-
1541
-greedy_default = ((options & PCRE_UNGREEDY) != 0);
1542
-greedy_non_default = greedy_default ^ 1;
1543
-
1544
-/* Initialize no first char, no required char. REQ_UNSET means "no char
1545
-matching encountered yet". It gets changed to REQ_NONE if we hit something that
1546
-matches a non-fixed char first char; reqbyte just remains unset if we never
1547
-find one.
1548
-
1549
-When we hit a repeat whose minimum is zero, we may have to adjust these values
1550
-to take the zero repeat into account. This is implemented by setting them to
1551
-zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1552
-item types that can be repeated set these backoff variables appropriately. */
1553
-
1554
-firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1555
-
1556
-/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1557
-according to the current setting of the caseless flag. REQ_CASELESS is a bit
1558
-value > 255. It is added into the firstbyte or reqbyte variables to record the
1559
-case status of the value. */
1560
-
1561
-req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1562
-
1563
-/* Switch on next character until the end of the branch */
1564
-
1565
-for (;; ptr++)
1566
-  {
1567
-  BOOL negate_class;
1568
-  BOOL possessive_quantifier;
1569
-  int class_charcount;
1570
-  int class_lastchar;
1571
-  int newoptions;
1572
-  int recno;
1573
-  int skipbytes;
1574
-  int subreqbyte;
1575
-  int subfirstbyte;
1576
-
1577
-  c = *ptr;
1578
-  if (inescq && c != 0) goto NORMAL_CHAR;
1579
-
1580
-  if ((options & PCRE_EXTENDED) != 0)
1581
-    {
1582
-    if ((cd->ctypes[c] & ctype_space) != 0) continue;
1583
-    if (c == '#')
1584
-      {
1585
-      /* The space before the ; is to avoid a warning on a silly compiler
1586
-      on the Macintosh. */
1587
-      while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1588
-      if (c != 0) continue;   /* Else fall through to handle end of string */
1589
-      }
1590
-    }
1591
-
1592
-  switch(c)
1538
+    
1539
+    /* Set up the default and non-default settings for greediness */
1540
+    
1541
+    greedy_default = ((options & PCRE_UNGREEDY) != 0);
1542
+    greedy_non_default = greedy_default ^ 1;
1543
+    
1544
+    /* Initialize no first char, no required char. REQ_UNSET means "no char
1545
+     matching encountered yet". It gets changed to REQ_NONE if we hit something that
1546
+     matches a non-fixed char first char; reqbyte just remains unset if we never
1547
+     find one.
1548
+     
1549
+     When we hit a repeat whose minimum is zero, we may have to adjust these values
1550
+     to take the zero repeat into account. This is implemented by setting them to
1551
+     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1552
+     item types that can be repeated set these backoff variables appropriately. */
1553
+    
1554
+    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1555
+    
1556
+    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1557
+     according to the current setting of the caseless flag. REQ_CASELESS is a bit
1558
+     value > 255. It is added into the firstbyte or reqbyte variables to record the
1559
+     case status of the value. */
1560
+    
1561
+    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1562
+    
1563
+    /* Switch on next character until the end of the branch */
1564
+    
1565
+    for (;; ptr++)
1593 1566
     {
1594
-    /* The branch terminates at end of string, |, or ). */
1595
-
1596
-    case 0:
1597
-    case '|':
1598
-    case ')':
1599
-    *firstbyteptr = firstbyte;
1600
-    *reqbyteptr = reqbyte;
1601
-    *codeptr = code;
1602
-    *ptrptr = ptr;
1603
-    return TRUE;
1604
-
1605
-    /* Handle single-character metacharacters. In multiline mode, ^ disables
1606
-    the setting of any following char as a first character. */
1607
-
1608
-    case '^':
1609
-    if ((options & PCRE_MULTILINE) != 0)
1610
-      {
1611
-      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1612
-      }
1613
-    previous = NULL;
1614
-    *code++ = OP_CIRC;
1615
-    break;
1616
-
1617
-    case '$':
1618
-    previous = NULL;
1619
-    *code++ = OP_DOLL;
1620
-    break;
1621
-
1622
-    /* There can never be a first char if '.' is first, whatever happens about
1623
-    repeats. The value of reqbyte doesn't change either. */
1624
-
1625
-    case '.':
1626
-    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1627
-    zerofirstbyte = firstbyte;
1628
-    zeroreqbyte = reqbyte;
1629
-    previous = code;
1630
-    *code++ = OP_ANY;
1631
-    break;
1632
-
1633
-    /* Character classes. If the included characters are all < 255 in value, we
1634
-    build a 32-byte bitmap of the permitted characters, except in the special
1635
-    case where there is only one such character. For negated classes, we build
1636
-    the map as usual, then invert it at the end. However, we use a different
1637
-    opcode so that data characters > 255 can be handled correctly.
1638
-
1639
-    If the class contains characters outside the 0-255 range, a different
1640
-    opcode is compiled. It may optionally have a bit map for characters < 256,
1641
-    but those above are are explicitly listed afterwards. A flag byte tells
1642
-    whether the bitmap is present, and whether this is a negated class or not.
1643
-    */
1644
-
1645
-    case '[':
1646
-    previous = code;
1647
-
1648
-    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1649
-    they are encountered at the top level, so we'll do that too. */
1650
-
1651
-    if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1652
-        check_posix_syntax(ptr, &tempptr, cd))
1653
-      {
1654
-      *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1655
-      goto FAILED;
1656
-      }
1657
-
1658
-    /* If the first character is '^', set the negation flag and skip it. */
1659
-
1660
-    if ((c = *(++ptr)) == '^')
1661
-      {
1662
-      negate_class = TRUE;
1663
-      c = *(++ptr);
1664
-      }
1665
-    else
1666
-      {
1667
-      negate_class = FALSE;
1668
-      }
1669
-
1670
-    /* Keep a count of chars with values < 256 so that we can optimize the case
1671
-    of just a single character (as long as it's < 256). For higher valued UTF-8
1672
-    characters, we don't yet do any optimization. */
1673
-
1674
-    class_charcount = 0;
1675
-    class_lastchar = -1;
1676
-
1677
-#ifdef SUPPORT_UTF8
1678
-    class_utf8 = FALSE;                       /* No chars >= 256 */
1679
-    class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
1680
-#endif
1681
-
1682
-    /* Initialize the 32-char bit map to all zeros. We have to build the
1683
-    map in a temporary bit of store, in case the class contains only 1
1684
-    character (< 256), because in that case the compiled code doesn't use the
1685
-    bit map. */
1686
-
1687
-    memset(class, 0, 32 * sizeof(uschar));
1688
-
1689
-    /* Process characters until ] is reached. By writing this as a "do" it
1690
-    means that an initial ] is taken as a data character. The first pass
1691
-    through the regex checked the overall syntax, so we don't need to be very
1692
-    strict here. At the start of the loop, c contains the first byte of the
1693
-    character. */
1694
-
1695
-    do
1696
-      {
1697
-#ifdef SUPPORT_UTF8
1698
-      if (utf8 && c > 127)
1699
-        {                           /* Braces are required because the */
1700
-        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
1701
-        }
1702
-#endif
1703
-
1704
-      /* Inside \Q...\E everything is literal except \E */
1705
-
1706
-      if (inescq)
1567
+        BOOL negate_class;
1568
+        BOOL possessive_quantifier;
1569
+        int class_charcount;
1570
+        int class_lastchar;
1571
+        int newoptions;
1572
+        int recno;
1573
+        int skipbytes;
1574
+        int subreqbyte;
1575
+        int subfirstbyte;
1576
+        
1577
+        c = *ptr;
1578
+        if (inescq && c != 0) goto NORMAL_CHAR;
1579
+        
1580
+        if ((options & PCRE_EXTENDED) != 0)
1707 1581
         {
1708
-        if (c == '\\' && ptr[1] == 'E')
1709
-          {
1710
-          inescq = FALSE;
1711
-          ptr++;
1712
-          continue;
1713
-          }
1714
-        else goto LONE_SINGLE_CHARACTER;
1582
+            if ((cd->ctypes[c] & ctype_space) != 0) continue;
1583
+            if (c == '#')
1584
+            {
1585
+                /* The space before the ; is to avoid a warning on a silly compiler
1586
+                 on the Macintosh. */
1587
+                while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1588
+                if (c != 0) continue;   /* Else fall through to handle end of string */
1589
+            }
1715 1590
         }
1716
-
1717
-      /* Handle POSIX class names. Perl allows a negation extension of the
1718
-      form [:^name:]. A square bracket that doesn't match the syntax is
1719
-      treated as a literal. We also recognize the POSIX constructions
1720
-      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1721
-      5.6 and 5.8 do. */
1722
-
1723
-      if (c == '[' &&
1724
-          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1725
-          check_posix_syntax(ptr, &tempptr, cd))
1726
-        {
1727
-        BOOL local_negate = FALSE;
1728
-        int posix_class, i;
1729
-        register const uschar *cbits = cd->cbits;
1730
-
1731
-        if (ptr[1] != ':')
1732
-          {
1733
-          *errorptr = ERR31;
1734
-          goto FAILED;
1735
-          }
1736
-
1737
-        ptr += 2;
1738
-        if (*ptr == '^')
1739
-          {
1740
-          local_negate = TRUE;
1741
-          ptr++;
1742
-          }
1743
-
1744
-        posix_class = check_posix_name(ptr, tempptr - ptr);
1745
-        if (posix_class < 0)
1746
-          {
1747
-          *errorptr = ERR30;
1748
-          goto FAILED;
1749
-          }
1750
-
1751
-        /* If matching is caseless, upper and lower are converted to
1752
-        alpha. This relies on the fact that the class table starts with
1753
-        alpha, lower, upper as the first 3 entries. */
1754
-
1755
-        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1756
-          posix_class = 0;
1757
-
1758
-        /* Or into the map we are building up to 3 of the static class
1759
-        tables, or their negations. The [:blank:] class sets up the same
1760
-        chars as the [:space:] class (all white space). We remove the vertical
1761
-        white space chars afterwards. */
1762
-
1763
-        posix_class *= 3;
1764
-        for (i = 0; i < 3; i++)
1765
-          {
1766
-          BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1767
-          int taboffset = posix_class_maps[posix_class + i];
1768
-          if (taboffset < 0) break;
1769
-          if (local_negate)
1770
-            {
1771
-            for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1772
-            if (isblank) class[1] |= 0x3c;
1773
-            }
1774
-          else
1775
-            {
1776
-            for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1777
-            if (isblank) class[1] &= ~0x3c;
1778
-            }
1779
-          }
1780
-
1781
-        ptr = tempptr + 1;
1782
-        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1783
-        continue;    /* End of POSIX syntax handling */
1784
-        }
1785
-
1786
-      /* Backslash may introduce a single character, or it may introduce one
1787
-      of the specials, which just set a flag. Escaped items are checked for
1788
-      validity in the pre-compiling pass. The sequence \b is a special case.
1789
-      Inside a class (and only there) it is treated as backspace. Elsewhere
1790
-      it marks a word boundary. Other escapes have preset maps ready to
1791
-      or into the one we are building. We assume they have more than one
1792
-      character in them, so set class_charcount bigger than one. */
1793
-
1794
-      if (c == '\\')
1591
+        
1592
+        switch(c)
1795 1593
         {
1796
-        c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1797
-        if (-c == ESC_b) c = '\b';  /* \b is backslash in a class */
1798
-
1799
-        if (-c == ESC_Q)            /* Handle start of quoted string */
1800
-          {
1801
-          if (ptr[1] == '\\' && ptr[2] == 'E')
1802
-            {
1803
-            ptr += 2; /* avoid empty string */
1804
-            }
1805
-          else inescq = TRUE;
1806
-          continue;
1807
-          }
1808
-
1809
-        else if (c < 0)
1810
-          {
1811
-          register const uschar *cbits = cd->cbits;
1812
-          class_charcount = 10;     /* Greater than 1 is what matters */
1813
-          switch (-c)
1814
-            {
1815
-            case ESC_d:
1816
-            for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1817
-            continue;
1818
-
1819
-            case ESC_D:
1820
-            for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1821
-            continue;
1822
-
1823
-            case ESC_w:
1824
-            for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1825
-            continue;
1826
-
1827
-            case ESC_W:
1828
-            for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1829
-            continue;
1830
-
1831
-            case ESC_s:
1832
-            for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1833
-            class[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
1834
-            continue;
1835
-
1836
-            case ESC_S:
1837
-            for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1838
-            class[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
1839
-            continue;
1840
-
1841
-            /* Unrecognized escapes are faulted if PCRE is running in its
1842
-            strict mode. By default, for compatibility with Perl, they are
1843
-            treated as literals. */
1844
-
1594
+                /* The branch terminates at end of string, |, or ). */
1595
+                
1596
+            case 0:
1597
+            case '|':
1598
+            case ')':
1599
+                *firstbyteptr = firstbyte;
1600
+                *reqbyteptr = reqbyte;
1601
+                *codeptr = code;
1602
+                *ptrptr = ptr;
1603
+                return TRUE;
1604
+                
1605
+                /* Handle single-character metacharacters. In multiline mode, ^ disables
1606
+                 the setting of any following char as a first character. */
1607
+                
1608
+            case '^':
1609
+                if ((options & PCRE_MULTILINE) != 0)
1610
+                {
1611
+                    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1612
+                }
1613
+                previous = NULL;
1614
+                *code++ = OP_CIRC;
1615
+                break;
1616
+                
1617
+            case '$':
1618
+                previous = NULL;
1619
+                *code++ = OP_DOLL;
1620
+                break;
1621
+                
1622
+                /* There can never be a first char if '.' is first, whatever happens about
1623
+                 repeats. The value of reqbyte doesn't change either. */
1624
+                
1625
+            case '.':
1626
+                if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1627
+                zerofirstbyte = firstbyte;
1628
+                zeroreqbyte = reqbyte;
1629
+                previous = code;
1630
+                *code++ = OP_ANY;
1631
+                break;
1632
+                
1633
+                /* Character classes. If the included characters are all < 255 in value, we
1634
+                 build a 32-byte bitmap of the permitted characters, except in the special
1635
+                 case where there is only one such character. For negated classes, we build
1636
+                 the map as usual, then invert it at the end. However, we use a different
1637
+                 opcode so that data characters > 255 can be handled correctly.
1638
+                 
1639
+                 If the class contains characters outside the 0-255 range, a different
1640
+                 opcode is compiled. It may optionally have a bit map for characters < 256,
1641
+                 but those above are explicitly listed afterwards. A flag byte tells
1642
+                 whether the bitmap is present, and whether this is a negated class or not.
1643
+                 */
1644
+                
1645
+            case '[':
1646
+                previous = code;
1647
+                
1648
+                /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1649
+                 they are encountered at the top level, so we'll do that too. */
1650
+                
1651
+                if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1652
+                    check_posix_syntax(ptr, &tempptr, cd))
1653
+                {
1654
+                    *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1655
+                    goto FAILED;
1656
+                }
1657
+                
1658
+                /* If the first character is '^', set the negation flag and skip it. */
1659
+                
1660
+                if ((c = *(++ptr)) == '^')
1661
+                {
1662
+                    negate_class = TRUE;
1663
+                    c = *(++ptr);
1664
+                }
1665
+                else
1666
+                {
1667
+                    negate_class = FALSE;
1668
+                }
1669
+                
1670
+                /* Keep a count of chars with values < 256 so that we can optimize the case
1671
+                 of just a single character (as long as it's < 256). For higher valued UTF-8
1672
+                 characters, we don't yet do any optimization. */
1673
+                
1674
+                class_charcount = 0;
1675
+                class_lastchar = -1;
1676
+                
1677
+#ifdef SUPPORT_UTF8
1678
+                class_utf8 = FALSE;                       /* No chars >= 256 */
1679
+                class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
1680
+#endif
1681
+                
1682
+                /* Initialize the 32-char bit map to all zeros. We have to build the
1683
+                 map in a temporary bit of store, in case the class contains only 1
1684
+                 character (< 256), because in that case the compiled code doesn't use the
1685
+                 bit map. */
1686
+                
1687
+                memset(class, 0, 32 * sizeof(uschar));
1688
+                
1689
+                /* Process characters until ] is reached. By writing this as a "do" it
1690
+                 means that an initial ] is taken as a data character. The first pass
1691
+                 through the regex checked the overall syntax, so we don't need to be very
1692
+                 strict here. At the start of the loop, c contains the first byte of the
1693
+                 character. */
1694
+                
1695
+                do
1696
+                {
1697
+#ifdef SUPPORT_UTF8
1698
+                    if (utf8 && c > 127)
1699
+                    {                           /* Braces are required because the */
1700
+                        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
1701
+                    }
1702
+#endif
1703
+                    
1704
+                    /* Inside \Q...\E everything is literal except \E */
1705
+                    
1706
+                    if (inescq)
1707
+                    {
1708
+                        if (c == '\\' && ptr[1] == 'E')
1709
+                        {
1710
+                            inescq = FALSE;
1711
+                            ptr++;
1712
+                            continue;
1713
+                        }
1714
+                        else goto LONE_SINGLE_CHARACTER;
1715
+                    }
1716
+                    
1717
+                    /* Handle POSIX class names. Perl allows a negation extension of the
1718
+                     form [:^name:]. A square bracket that doesn't match the syntax is
1719
+                     treated as a literal. We also recognize the POSIX constructions
1720
+                     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1721
+                     5.6 and 5.8 do. */
1722
+                    
1723
+                    if (c == '[' &&
1724
+                        (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1725
+                        check_posix_syntax(ptr, &tempptr, cd))
1726
+                    {
1727
+                        BOOL local_negate = FALSE;
1728
+                        int posix_class, i;
1729
+                        register const uschar *cbits = cd->cbits;
1730
+                        
1731
+                        if (ptr[1] != ':')
1732
+                        {
1733
+                            *errorptr = ERR31;
1734
+                            goto FAILED;
1735
+                        }
1736
+                        
1737
+                        ptr += 2;
1738
+                        if (*ptr == '^')
1739
+                        {
1740
+                            local_negate = TRUE;
1741
+                            ptr++;
1742
+                        }
1743
+                        
1744
+                        posix_class = check_posix_name(ptr, tempptr - ptr);
1745
+                        if (posix_class < 0)
1746
+                        {
1747
+                            *errorptr = ERR30;
1748
+                            goto FAILED;
1749
+                        }
1750
+                        
1751
+                        /* If matching is caseless, upper and lower are converted to
1752
+                         alpha. This relies on the fact that the class table starts with
1753
+                         alpha, lower, upper as the first 3 entries. */
1754
+                        
1755
+                        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1756
+                            posix_class = 0;
1757
+                        
1758
+                        /* Or into the map we are building up to 3 of the static class
1759
+                         tables, or their negations. The [:blank:] class sets up the same
1760
+                         chars as the [:space:] class (all white space). We remove the vertical
1761
+                         white space chars afterwards. */
1762
+                        
1763
+                        posix_class *= 3;
1764
+                        for (i = 0; i < 3; i++)
1765
+                        {
1766
+                            BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
1767
+                            int taboffset = posix_class_maps[posix_class + i];
1768
+                            if (taboffset < 0) break;
1769
+                            if (local_negate)
1770
+                            {
1771
+                                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1772
+                                if (isblank) class[1] |= 0x3c;
1773
+                            }
1774
+                            else
1775
+                            {
1776
+                                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1777
+                                if (isblank) class[1] &= ~0x3c;
1778
+                            }
1779
+                        }
1780
+                        
1781
+                        ptr = tempptr + 1;
1782
+                        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1783
+                        continue;    /* End of POSIX syntax handling */
1784
+                    }
1785
+                    
1786
+                    /* Backslash may introduce a single character, or it may introduce one
1787
+                     of the specials, which just set a flag. Escaped items are checked for
1788
+                     validity in the pre-compiling pass. The sequence \b is a special case.
1789
+                     Inside a class (and only there) it is treated as backspace. Elsewhere
1790
+                     it marks a word boundary. Other escapes have preset maps ready to
1791
+                     or into the one we are building. We assume they have more than one
1792
+                     character in them, so set class_charcount bigger than one. */
1793
+                    
1794
+                    if (c == '\\')
1795
+                    {
1796
+                        c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1797
+                        if (-c == ESC_b) c = '\b';  /* \b is backspace in a class */
1798
+                        
1799
+                        if (-c == ESC_Q)            /* Handle start of quoted string */
1800
+                        {
1801
+                            if (ptr[1] == '\\' && ptr[2] == 'E')
1802
+                            {
1803
+                                ptr += 2; /* avoid empty string */
1804
+                            }
1805
+                            else inescq = TRUE;
1806
+                            continue;
1807
+                        }
1808
+                        
1809
+                        else if (c < 0)
1810
+                        {
1811
+                            register const uschar *cbits = cd->cbits;
1812
+                            class_charcount = 10;     /* Greater than 1 is what matters */
1813
+                            switch (-c)
1814
+                            {
1815
+                                case ESC_d:
1816
+                                    for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1817
+                                    continue;
1818
+                                    
1819
+                                case ESC_D:
1820
+                                    for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1821
+                                    continue;
1822
+                                    
1823
+                                case ESC_w:
1824
+                                    for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1825
+                                    continue;
1826
+                                    
1827
+                                case ESC_W:
1828
+                                    for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1829
+                                    continue;
1830
+                                    
1831
+                                case ESC_s:
1832
+                                    for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1833
+                                    class[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
1834
+                                    continue;
1835
+                                    
1836
+                                case ESC_S:
1837
+                                    for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1838
+                                    class[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
1839
+                                    continue;
1840
+                                    
1841
+                                    /* Unrecognized escapes are faulted if PCRE is running in its
1842
+                                     strict mode. By default, for compatibility with Perl, they are
1843
+                                     treated as literals. */
1844
+                                    
1845
+                                default:
1846
+                                    if ((options & PCRE_EXTRA) != 0)
1847
+                                    {
1848
+                                        *errorptr = ERR7;
1849
+                                        goto FAILED;
1850
+                                    }
1851
+                                    c = *ptr;    /* The final character */
1852
+                            }
1853
+                        }
1854
+                        
1855
+                        /* Fall through if we have a single character (c >= 0). This may be
1856
+                         > 256 in UTF-8 mode. */
1857
+                        
1858
+                    }   /* End of backslash handling */
1859
+                    
1860
+                    /* A single character may be followed by '-' to form a range. However,
1861
+                     Perl does not permit ']' to be the end of the range. A '-' character
1862
+                     here is treated as a literal. */
1863
+                    
1864
+                    if (ptr[1] == '-' && ptr[2] != ']')
1865
+                    {
1866
+                        int d;
1867
+                        ptr += 2;
1868
+                        
1869
+#ifdef SUPPORT_UTF8
1870
+                        if (utf8)
1871
+                        {                           /* Braces are required because the */
1872
+                            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
1873
+                        }
1874
+                        else
1875
+#endif
1876
+                            d = *ptr;
1877
+                        
1878
+                        /* The second part of a range can be a single-character escape, but
1879
+                         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1880
+                         in such circumstances. */
1881
+                        
1882
+                        if (d == '\\')
1883
+                        {
1884
+                            const uschar *oldptr = ptr;
1885
+                            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1886
+                            
1887
+                            /* \b is backspace; any other special means the '-' was literal */
1888
+                            
1889
+                            if (d < 0)
1890
+                            {
1891
+                                if (d == -ESC_b) d = '\b'; else
1892
+                                {
1893
+                                    ptr = oldptr - 2;
1894
+                                    goto LONE_SINGLE_CHARACTER;  /* A few lines below */
1895
+                                }
1896
+                            }
1897
+                        }
1898
+                        
1899
+                        /* Check that the two values are in the correct order */
1900
+                        
1901
+                        if (d < c)
1902
+                        {
1903
+                            *errorptr = ERR8;
1904
+                            goto FAILED;
1905
+                        }
1906
+                        
1907
+                        /* If d is greater than 255, we can't just use the bit map, so set up
1908
+                         for the UTF-8 supporting class type. If we are not caseless, we can
1909
+                         just set up a single range. If we are caseless, the characters < 256
1910
+                         are handled with a bitmap, in order to get the case-insensitive
1911
+                         handling. */
1912
+                        
1913
+#ifdef SUPPORT_UTF8
1914
+                        if (d > 255)
1915
+                        {
1916
+                            class_utf8 = TRUE;
1917
+                            *class_utf8data++ = XCL_RANGE;
1918
+                            if ((options & PCRE_CASELESS) == 0)
1919
+                            {
1920
+                                class_utf8data += ord2utf8(c, class_utf8data);
1921
+                                class_utf8data += ord2utf8(d, class_utf8data);
1922
+                                continue;  /* Go get the next char in the class */
1923
+                            }
1924
+                            class_utf8data += ord2utf8(256, class_utf8data);
1925
+                            class_utf8data += ord2utf8(d, class_utf8data);
1926
+                            d = 255;
1927
+                            /* Fall through */
1928
+                        }
1929
+#endif
1930
+                        /* We use the bit map if the range is entirely < 256, or if part of it
1930
+                         is < 256 and matching is caseless. */
1932
+                        
1933
+                        for (; c <= d; c++)
1934
+                        {
1935
+                            class[c/8] |= (1 << (c&7));
1936
+                            if ((options & PCRE_CASELESS) != 0)
1937
+                            {
1938
+                                int uc = cd->fcc[c];           /* flip case */
1939
+                                class[uc/8] |= (1 << (uc&7));
1940
+                            }
1941
+                            class_charcount++;                /* in case a one-char range */
1942
+                            class_lastchar = c;
1943
+                        }
1944
+                        
1945
+                        continue;   /* Go get the next char in the class */
1946
+                    }
1947
+                    
1948
+                    /* Handle a lone single character - we can get here for a normal
1949
+                     non-escape char, or after \ that introduces a single character. */
1950
+                    
1951
+                LONE_SINGLE_CHARACTER:
1952
+                    
1953
+                    /* Handle a multibyte character */
1954
+                    
1955
+#ifdef SUPPORT_UTF8
1956
+                    if (utf8 && c > 255)
1957
+                    {
1958
+                        class_utf8 = TRUE;
1959
+                        *class_utf8data++ = XCL_SINGLE;
1960
+                        class_utf8data += ord2utf8(c, class_utf8data);
1961
+                    }
1962
+                    else
1963
+#endif
1964
+                    /* Handle a single-byte character */
1965
+                    {
1966
+                        class [c/8] |= (1 << (c&7));
1967
+                        if ((options & PCRE_CASELESS) != 0)
1968
+                        {
1969
+                            c = cd->fcc[c];   /* flip case */
1970
+                            class[c/8] |= (1 << (c&7));
1971
+                        }
1972
+                        class_charcount++;
1973
+                        class_lastchar = c;
1974
+                    }
1975
+                }
1976
+                
1977
+                /* Loop until ']' reached; the check for end of string happens inside the
1978
+                 loop. This "while" is the end of the "do" above. */
1979
+                
1980
+                while ((c = *(++ptr)) != ']' || inescq);
1981
+                
1982
+                /* If class_charcount is 1, we saw precisely one character with a value <
1983
+                 256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1984
+                 the one character is < 128. In non-UTF-8 mode we can always optimize.
1985
+                 
1986
+                 The optimization throws away the bit map. We turn the item into a
1987
+                 1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1988
+                 that OP_NOT does not support multibyte characters. In the positive case, it
1989
+                 can cause firstbyte to be set. Otherwise, there can be no first char if
1990
+                 this item is first, whatever repeat count may follow. In the case of
1991
+                 reqbyte, save the previous value for reinstating. */
1992
+                
1993
+#ifdef SUPPORT_UTF8
1994
+                if (class_charcount == 1 &&
1995
+                    (!utf8 ||
1996
+                     (!class_utf8 && class_lastchar < 128)))
1997
+#else
1998
+                    if (class_charcount == 1)
1999
+#endif
2000
+                    {
2001
+                        zeroreqbyte = reqbyte;
2002
+                        if (negate_class)
2003
+                        {
2004
+                            if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2005
+                            zerofirstbyte = firstbyte;
2006
+                            *code++ = OP_NOT;
2007
+                        }
2008
+                        else
2009
+                        {
2010
+                            if (firstbyte == REQ_UNSET)
2011
+                            {
2012
+                                zerofirstbyte = REQ_NONE;
2013
+                                firstbyte = class_lastchar | req_caseopt;
2014
+                            }
2015
+                            else
2016
+                            {
2017
+                                zerofirstbyte = firstbyte;
2018
+                                reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
2019
+                            }
2020
+                            *code++ = OP_CHARS;
2021
+                            *code++ = 1;
2022
+                        }
2023
+                        *code++ = class_lastchar;
2024
+                        break;  /* End of class handling */
2025
+                    }       /* End of 1-byte optimization */
2026
+                
2027
+                /* Otherwise, if this is the first thing in the branch, there can be no
2028
+                 first char setting, whatever the repeat count. Any reqbyte setting must
2029
+                 remain unchanged after any kind of repeat. */
2030
+                
2031
+                if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2032
+                zerofirstbyte = firstbyte;
2033
+                zeroreqbyte = reqbyte;
2034
+                
2035
+                /* If there are characters with values > 255, we have to compile an
2036
+                 extended class, with its own opcode. If there are no characters < 256,
2037
+                 we can omit the bitmap. */
2038
+                
2039
+#ifdef SUPPORT_UTF8
2040
+                if (class_utf8)
2041
+                {
2042
+                    *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
2043
+                    *code++ = OP_XCLASS;
2044
+                    code += LINK_SIZE;
2045
+                    *code = negate_class? XCL_NOT : 0;
2046
+                    
2047
+                    /* If the map is required, install it, and move on to the end of
2048
+                     the extra data */
2049
+                    
2050
+                    if (class_charcount > 0)
2051
+                    {
2052
+                        *code++ |= XCL_MAP;
2053
+                        memcpy(code, class, 32);
2054
+                        code = class_utf8data;
2055
+                    }
2056
+                    
2057
+                    /* If the map is not required, slide down the extra data. */
2058
+                    
2059
+                    else
2060
+                    {
2061
+                        int len = class_utf8data - (code + 33);
2062
+                        memmove(code + 1, code + 33, len);
2063
+                        code += len + 1;
2064
+                    }
2065
+                    
2066
+                    /* Now fill in the complete length of the item */
2067
+                    
2068
+                    PUT(previous, 1, code - previous);
2069
+                    break;   /* End of class handling */
2070
+                }
2071
+#endif
2072
+                
2073
+                /* If there are no characters > 255, negate the 32-byte map if necessary,
2074
+                 and copy it into the code vector. If this is the first thing in the branch,
2075
+                 there can be no first char setting, whatever the repeat count. Any reqbyte
2076
+                 setting must remain unchanged after any kind of repeat. */
2077
+                
2078
+                if (negate_class)
2079
+                {
2080
+                    *code++ = OP_NCLASS;
2081
+                    for (c = 0; c < 32; c++) code[c] = ~class[c];
2082
+                }
2083
+                else
2084
+                {
2085
+                    *code++ = OP_CLASS;
2086
+                    memcpy(code, class, 32);
2087
+                }
2088
+                code += 32;
2089
+                break;
2090
+                
2091
+                /* Various kinds of repeat */
2092
+                
2093
+            case '{':
2094
+                if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
2095
+                ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
2096
+                if (*errorptr != NULL) goto FAILED;
2097
+                goto REPEAT;
2098
+                
2099
+            case '*':
2100
+                repeat_min = 0;
2101
+                repeat_max = -1;
2102
+                goto REPEAT;
2103
+                
2104
+            case '+':
2105
+                repeat_min = 1;
2106
+                repeat_max = -1;
2107
+                goto REPEAT;
2108
+                
2109
+            case '?':
2110
+                repeat_min = 0;
2111
+                repeat_max = 1;
2112
+                
2113
+            REPEAT:
2114
+                if (previous == NULL)
2115
+                {
2116
+                    *errorptr = ERR9;
2117
+                    goto FAILED;
2118
+                }
2119
+                
2120
+                if (repeat_min == 0)
2121
+                {
2122
+                    firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2123
+                    reqbyte = zeroreqbyte;        /* Ditto */
2124
+                }
2125
+                
2126
+                /* Remember whether this is a variable length repeat */
2127
+                
2128
+                reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2129
+                
2130
+                op_type = 0;                    /* Default single-char op codes */
2131
+                possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2132
+                
2133
+                /* Save start of previous item, in case we have to move it up to make space
2134
+                 for an inserted OP_ONCE for the additional '+' extension. */
2135
+                
2136
+                tempcode = previous;
2137
+                
2138
+                /* If the next character is '+', we have a possessive quantifier. This
2139
+                 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2140
+                 If the next character is '?' this is a minimizing repeat, by default,
2141
+                 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2142
+                 repeat type to the non-default. */
2143
+                
2144
+                if (ptr[1] == '+')
2145
+                {
2146
+                    repeat_type = 0;                  /* Force greedy */
2147
+                    possessive_quantifier = TRUE;
2148
+                    ptr++;
2149
+                }
2150
+                else if (ptr[1] == '?')
2151
+                {
2152
+                    repeat_type = greedy_non_default;
2153
+                    ptr++;
2154
+                }
2155
+                else repeat_type = greedy_default;
2156
+                
2157
+                /* If previous was a recursion, we need to wrap it inside brackets so that
2158
+                 it can be replicated if necessary. */
2159
+                
2160
+                if (*previous == OP_RECURSE)
2161
+                {
2162
+                    memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2163
+                    code += 1 + LINK_SIZE;
2164
+                    *previous = OP_BRA;
2165
+                    PUT(previous, 1, code - previous);
2166
+                    *code = OP_KET;
2167
+                    PUT(code, 1, code - previous);
2168
+                    code += 1 + LINK_SIZE;
2169
+                }
2170
+                
2171
+                /* If previous was a string of characters, chop off the last one and use it
2172
+                 as the subject of the repeat. If there was only one character, we can
2173
+                 abolish the previous item altogether. If a one-char item has a minimum of
2174
+                 more than one, ensure that it is set in reqbyte - it might not be if a
2175
+                 sequence such as x{3} is the first thing in a branch because the x will
2176
+                 have gone into firstbyte instead.  */
2177
+                
2178
+                if (*previous == OP_CHARS)
2179
+                {
2180
+                    /* Deal with UTF-8 characters that take up more than one byte. It's
2181
+                     easier to write this out separately than try to macrify it. Use c to
2182
+                     hold the length of the character in bytes, plus 0x80 to flag that it's a
2183
+                     length rather than a small character. */
2184
+                    
2185
+#ifdef SUPPORT_UTF8
2186
+                    if (utf8 && (code[-1] & 0x80) != 0)
2187
+                    {
2188
+                        uschar *lastchar = code - 1;
2189
+                        while((*lastchar & 0xc0) == 0x80) lastchar--;
2190
+                        c = code - lastchar;            /* Length of UTF-8 character */
2191
+                        memcpy(utf8_char, lastchar, c); /* Save the char */
2192
+                        if (lastchar == previous + 2)   /* There was only one character */
2193
+                        {
2194
+                            code = previous;              /* Abolish the previous item */
2195
+                        }
2196
+                        else
2197
+                        {
2198
+                            previous[1] -= c;             /* Adjust length of previous */
2199
+                            code = lastchar;              /* Lost char off the end */
2200
+                            tempcode = code;              /* Adjust position to be moved for '+' */
2201
+                        }
2202
+                        c |= 0x80;                      /* Flag c as a length */
2203
+                    }
2204
+                    else
2205
+#endif
2206
+                        
2207
+                    /* Handle the case of a single byte - either with no UTF8 support, or
2208
+                     with UTF-8 disabled, or for a UTF-8 character < 128. */
2209
+                        
2210
+                    {
2211
+                        c = *(--code);
2212
+                        if (code == previous + 2)   /* There was only one character */
2213
+                        {
2214
+                            code = previous;              /* Abolish the previous item */
2215
+                            if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2216
+                        }
2217
+                        else
2218
+                        {
2219
+                            previous[1]--;             /* adjust length */
2220
+                            tempcode = code;           /* Adjust position to be moved for '+' */
2221
+                        }
2222
+                    }
2223
+                    
2224
+                    goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2225
+                }
2226
+                
2227
+                /* If previous was a single negated character ([^a] or similar), we use
2228
+                 one of the special opcodes, replacing it. The code is shared with single-
2229
+                 character repeats by setting op_type to add a suitable offset into
2230
+                 repeat_type. OP_NOT is currently used only for single-byte chars. */
2231
+                
2232
+                else if (*previous == OP_NOT)
2233
+                {
2234
+                    op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2235
+                    c = previous[1];
2236
+                    code = previous;
2237
+                    goto OUTPUT_SINGLE_REPEAT;
2238
+                }
2239
+                
2240
+                /* If previous was a character type match (\d or similar), abolish it and
2241
+                 create a suitable repeat item. The code is shared with single-character
2242
+                 repeats by setting op_type to add a suitable offset into repeat_type. */
2243
+                
2244
+                else if (*previous < OP_EODN)
2245
+                {
2246
+                    op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2247
+                    c = *previous;
2248
+                    code = previous;
2249
+                    
2250
+                OUTPUT_SINGLE_REPEAT:
2251
+                    
2252
+                    /* If the maximum is zero then the minimum must also be zero; Perl allows
2253
+                     this case, so we do too - by simply omitting the item altogether. */
2254
+                    
2255
+                    if (repeat_max == 0) goto END_REPEAT;
2256
+                    
2257
+                    /* Combine the op_type with the repeat_type */
2258
+                    
2259
+                    repeat_type += op_type;
2260
+                    
2261
+                    /* A minimum of zero is handled either as the special case * or ?, or as
2262
+                     an UPTO, with the maximum given. */
2263
+                    
2264
+                    if (repeat_min == 0)
2265
+                    {
2266
+                        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2267
+                        else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2268
+                        else
2269
+                        {
2270
+                            *code++ = OP_UPTO + repeat_type;
2271
+                            PUT2INC(code, 0, repeat_max);
2272
+                        }
2273
+                    }
2274
+                    
2275
+                    /* The case {1,} is handled as the special case + */
2276
+                    
2277
+                    else if (repeat_min == 1 && repeat_max == -1)
2278
+                        *code++ = OP_PLUS + repeat_type;
2279
+                    
2280
+                    /* The case {n,n} is just an EXACT, while the general case {n,m} is
2281
+                     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2282
+                    
2283
+                    else
2284
+                    {
2285
+                        if (repeat_min != 1)
2286
+                        {
2287
+                            *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
2288
+                            PUT2INC(code, 0, repeat_min);
2289
+                        }
2290
+                        
2291
+                        /* If the minimum is 1 and the previous item was a character string,
2292
+                         we either have to put back the item that got cancelled if the string
2293
+                         length was 1, or add the character back onto the end of a longer
2294
+                         string. For a character type nothing need be done; it will just get
2295
+                         put back naturally. Note that the final character is always going to
2296
+                         get added below, so we leave code ready for its insertion. */
2297
+                        
2298
+                        else if (*previous == OP_CHARS)
2299
+                        {
2300
+                            if (code == previous) code += 2; else
2301
+                                
2302
+                            /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2303
+                             bit set as a flag. The length will always be between 2 and 6. */
2304
+                                
2305
+#ifdef SUPPORT_UTF8
2306
+                                if (utf8 && c >= 128) previous[1] += c & 7; else
2307
+#endif
2308
+                                    previous[1]++;
2309
+                        }
2310
+                        
2311
+                        /*  For a single negated character we also have to put back the
2312
+                         item that got cancelled. At present this applies only to single byte
2313
+                         characters in any mode. */
2314
+                        
2315
+                        else if (*previous == OP_NOT) code++;
2316
+                        
2317
+                        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2318
+                         we have to insert the character for the previous code. In UTF-8 mode,
2319
+                         long characters have their length in c, with the 0x80 bit as a flag. */
2320
+                        
2321
+                        if (repeat_max < 0)
2322
+                        {
2323
+#ifdef SUPPORT_UTF8
2324
+                            if (utf8 && c >= 128)
2325
+                            {
2326
+                                memcpy(code, utf8_char, c & 7);
2327
+                                code += c & 7;
2328
+                            }
2329
+                            else
2330
+#endif
2331
+                                *code++ = c;
2332
+                            *code++ = OP_STAR + repeat_type;
2333
+                        }
2334
+                        
2335
+                        /* Else insert an UPTO if the max is greater than the min, again
2336
+                         preceded by the character, for the previously inserted code. */
2337
+                        
2338
+                        else if (repeat_max != repeat_min)
2339
+                        {
2340
+#ifdef SUPPORT_UTF8
2341
+                            if (utf8 && c >= 128)
2342
+                            {
2343
+                                memcpy(code, utf8_char, c & 7);
2344
+                                code += c & 7;
2345
+                            }
2346
+                            else
2347
+#endif
2348
+                                *code++ = c;
2349
+                            repeat_max -= repeat_min;
2350
+                            *code++ = OP_UPTO + repeat_type;
2351
+                            PUT2INC(code, 0, repeat_max);
2352
+                        }
2353
+                    }
2354
+                    
2355
+                    /* The character or character type itself comes last in all cases. */
2356
+                    
2357
+#ifdef SUPPORT_UTF8
2358
+                    if (utf8 && c >= 128)
2359
+                    {
2360
+                        memcpy(code, utf8_char, c & 7);
2361
+                        code += c & 7;
2362
+                    }
2363
+                    else
2364
+#endif
2365
+                        
2366
+                        *code++ = c;
2367
+                }
2368
+                
2369
+                /* If previous was a character class or a back reference, we put the repeat
2370
+                 stuff after it, but just skip the item if the repeat was {0,0}. */
2371
+                
2372
+                else if (*previous == OP_CLASS ||
2373
+                         *previous == OP_NCLASS ||
2374
+#ifdef SUPPORT_UTF8
2375
+                         *previous == OP_XCLASS ||
2376
+#endif
2377
+                         *previous == OP_REF)
2378
+                {
2379
+                    if (repeat_max == 0)
2380
+                    {
2381
+                        code = previous;
2382
+                        goto END_REPEAT;
2383
+                    }
2384
+                    if (repeat_min == 0 && repeat_max == -1)
2385
+                        *code++ = OP_CRSTAR + repeat_type;
2386
+                    else if (repeat_min == 1 && repeat_max == -1)
2387
+                        *code++ = OP_CRPLUS + repeat_type;
2388
+                    else if (repeat_min == 0 && repeat_max == 1)
2389
+                        *code++ = OP_CRQUERY + repeat_type;
2390
+                    else
2391
+                    {
2392
+                        *code++ = OP_CRRANGE + repeat_type;
2393
+                        PUT2INC(code, 0, repeat_min);
2394
+                        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
2395
+                        PUT2INC(code, 0, repeat_max);
2396
+                    }
2397
+                }
2398
+                
2399
+                /* If previous was a bracket group, we may have to replicate it in certain
2400
+                 cases. */
2401
+                
2402
+                else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2403
+                         *previous == OP_COND)
2404
+                {
2405
+                    register int i;
2406
+                    int ketoffset = 0;
2407
+                    int len = code - previous;
2408
+                    uschar *bralink = NULL;
2409
+                    
2410
+                    /* If the maximum repeat count is unlimited, find the end of the bracket
2411
+                     by scanning through from the start, and compute the offset back to it
2412
+                     from the current code pointer. There may be an OP_OPT setting following
2413
+                     the final KET, so we can't find the end just by going back from the code
2414
+                     pointer. */
2415
+                    
2416
+                    if (repeat_max == -1)
2417
+                    {
2418
+                        register uschar *ket = previous;
2419
+                        do ket += GET(ket, 1); while (*ket != OP_KET);
2420
+                        ketoffset = code - ket;
2421
+                    }
2422
+                    
2423
+                    /* The case of a zero minimum is special because of the need to stick
2424
+                     OP_BRAZERO in front of it, and because the group appears once in the
2425
+                     data, whereas in other cases it appears the minimum number of times. For
2426
+                     this reason, it is simplest to treat this case separately, as otherwise
2427
+                     the code gets far too messy. There are several special subcases when the
2428
+                     minimum is zero. */
2429
+                    
2430
+                    if (repeat_min == 0)
2431
+                    {
2432
+                        /* If the maximum is also zero, we just omit the group from the output
2433
+                         altogether. */
2434
+                        
2435
+                        if (repeat_max == 0)
2436
+                        {
2437
+                            code = previous;
2438
+                            goto END_REPEAT;
2439
+                        }
2440
+                        
2441
+                        /* If the maximum is 1 or unlimited, we just have to stick in the
2442
+                         BRAZERO and do no more at this point. */
2443
+                        
2444
+                        if (repeat_max <= 1)
2445
+                        {
2446
+                            memmove(previous+1, previous, len);
2447
+                            code++;
2448
+                            *previous++ = OP_BRAZERO + repeat_type;
2449
+                        }
2450
+                        
2451
+                        /* If the maximum is greater than 1 and limited, we have to replicate
2452
+                         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2453
+                         The first one has to be handled carefully because it's the original
2454
+                         copy, which has to be moved up. The remainder can be handled by code
2455
+                         that is common with the non-zero minimum case below. We just have to
2456
+                         adjust the value of repeat_max, since one less copy is required. */
2457
+                        
2458
+                        else
2459
+                        {
2460
+                            int offset;
2461
+                            memmove(previous + 2 + LINK_SIZE, previous, len);
2462
+                            code += 2 + LINK_SIZE;
2463
+                            *previous++ = OP_BRAZERO + repeat_type;
2464
+                            *previous++ = OP_BRA;
2465
+                            
2466
+                            /* We chain together the bracket offset fields that have to be
2467
+                             filled in later when the ends of the brackets are reached. */
2468
+                            
2469
+                            offset = (bralink == NULL)? 0 : previous - bralink;
2470
+                            bralink = previous;
2471
+                            PUTINC(previous, 0, offset);
2472
+                        }
2473
+                        
2474
+                        repeat_max--;
2475
+                    }
2476
+                    
2477
+                    /* If the minimum is greater than zero, replicate the group as many
2478
+                     times as necessary, and adjust the maximum to the number of subsequent
2479
+                     copies that we need. If we set a first char from the group, and didn't
2480
+                     set a required char, copy the latter from the former. */
2481
+                    
2482
+                    else
2483
+                    {
2484
+                        if (repeat_min > 1)
2485
+                        {
2486
+                            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2487
+                            for (i = 1; i < repeat_min; i++)
2488
+                            {
2489
+                                memcpy(code, previous, len);
2490
+                                code += len;
2491
+                            }
2492
+                        }
2493
+                        if (repeat_max > 0) repeat_max -= repeat_min;
2494
+                    }
2495
+                    
2496
+                    /* This code is common to both the zero and non-zero minimum cases. If
2497
+                     the maximum is limited, it replicates the group in a nested fashion,
2498
+                     remembering the bracket starts on a stack. In the case of a zero minimum,
2499
+                     the first one was set up above. In all cases the repeat_max now specifies
2500
+                     the number of additional copies needed. */
2501
+                    
2502
+                    if (repeat_max >= 0)
2503
+                    {
2504
+                        for (i = repeat_max - 1; i >= 0; i--)
2505
+                        {
2506
+                            *code++ = OP_BRAZERO + repeat_type;
2507
+                            
2508
+                            /* All but the final copy start a new nesting, maintaining the
2509
+                             chain of brackets outstanding. */
2510
+                            
2511
+                            if (i != 0)
2512
+                            {
2513
+                                int offset;
2514
+                                *code++ = OP_BRA;
2515
+                                offset = (bralink == NULL)? 0 : code - bralink;
2516
+                                bralink = code;
2517
+                                PUTINC(code, 0, offset);
2518
+                            }
2519
+                            
2520
+                            memcpy(code, previous, len);
2521
+                            code += len;
2522
+                        }
2523
+                        
2524
+                        /* Now chain through the pending brackets, and fill in their length
2525
+                         fields (which are holding the chain links pro tem). */
2526
+                        
2527
+                        while (bralink != NULL)
2528
+                        {
2529
+                            int oldlinkoffset;
2530
+                            int offset = code - bralink + 1;
2531
+                            uschar *bra = code - offset;
2532
+                            oldlinkoffset = GET(bra, 1);
2533
+                            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2534
+                            *code++ = OP_KET;
2535
+                            PUTINC(code, 0, offset);
2536
+                            PUT(bra, 1, offset);
2537
+                        }
2538
+                    }
2539
+                    
2540
+                    /* If the maximum is unlimited, set a repeater in the final copy. We
2541
+                     can't just offset backwards from the current code point, because we
2542
+                     don't know if there's been an options resetting after the ket. The
2543
+                     correct offset was computed above. */
2544
+                    
2545
+                    else code[-ketoffset] = OP_KETRMAX + repeat_type;
2546
+                }
2547
+                
2548
+                /* Else there's some kind of shambles */
2549
+                
2550
+                else
2551
+                {
2552
+                    *errorptr = ERR11;
2553
+                    goto FAILED;
2554
+                }
2555
+                
2556
+                /* If the character following a repeat is '+', we wrap the entire repeated
2557
+                 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2558
+                 Sun's Java package. The repeated item starts at tempcode, not at previous,
2559
+                 which might be the first part of a string whose (former) last char we
2560
+                 repeated. However, we don't support '+' after a greediness '?'. */
2561
+                
2562
+                if (possessive_quantifier)
2563
+                {
2564
+                    int len = code - tempcode;
2565
+                    memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2566
+                    code += 1 + LINK_SIZE;
2567
+                    len += 1 + LINK_SIZE;
2568
+                    tempcode[0] = OP_ONCE;
2569
+                    *code++ = OP_KET;
2570
+                    PUTINC(code, 0, len);
2571
+                    PUT(tempcode, 1, len);
2572
+                }
2573
+                
2574
+                /* In all cases we no longer have a previous item. We also set the
2575
+                 "follows varying string" flag for subsequently encountered reqbytes if
2576
+                 it isn't already set and we have just passed a varying length item. */
2577
+                
2578
+            END_REPEAT:
2579
+                previous = NULL;
2580
+                cd->req_varyopt |= reqvary;
2581
+                break;
2582
+                
2583
+                
2584
+                /* Start of nested bracket sub-expression, or comment or lookahead or
2585
+                 lookbehind or option setting or condition. First deal with special things
2586
+                 that can come after a bracket; all are introduced by ?, and the appearance
2587
+                 of any of them means that this is not a referencing group. They were
2588
+                 checked for validity in the first pass over the string, so we don't have to
2589
+                 check for syntax errors here.  */
2590
+                
2591
+            case '(':
2592
+                newoptions = options;
2593
+                skipbytes = 0;
2594
+                
2595
+                if (*(++ptr) == '?')
2596
+                {
2597
+                    int set, unset;
2598
+                    int *optset;
2599
+                    
2600
+                    switch (*(++ptr))
2601
+                    {
2602
+                        case '#':                 /* Comment; skip to ket */
2603
+                            ptr++;
2604
+                            while (*ptr != ')') ptr++;
2605
+                            continue;
2606
+                            
2607
+                        case ':':                 /* Non-extracting bracket */
2608
+                            bravalue = OP_BRA;
2609
+                            ptr++;
2610
+                            break;
2611
+                            
2612
+                        case '(':
2613
+                            bravalue = OP_COND;       /* Conditional group */
2614
+                            
2615
+                            /* Condition to test for recursion */
2616
+                            
2617
+                            if (ptr[1] == 'R')
2618
+                            {
2619
+                                code[1+LINK_SIZE] = OP_CREF;
2620
+                                PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2621
+                                skipbytes = 3;
2622
+                                ptr += 3;
2623
+                            }
2624
+                            
2625
+                            /* Condition to test for a numbered subpattern match. We know that
2626
+                             if a digit follows ( then there will just be digits until ) because
2627
+                             the syntax was checked in the first pass. */
2628
+                            
2629
+                            else if ((digitab[ptr[1]] & ctype_digit) != 0)
2630
+                            {
2631
+                                int condref;                 /* Don't amalgamate; some compilers */
2632
+                                condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
2633
+                                while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2634
+                                if (condref == 0)
2635
+                                {
2636
+                                    *errorptr = ERR35;
2637
+                                    goto FAILED;
2638
+                                }
2639
+                                ptr++;
2640
+                                code[1+LINK_SIZE] = OP_CREF;
2641
+                                PUT2(code, 2+LINK_SIZE, condref);
2642
+                                skipbytes = 3;
2643
+                            }
2644
+                            /* For conditions that are assertions, we just fall through, having
2645
+                             set bravalue above. */
2646
+                            break;
2647
+                            
2648
+                        case '=':                 /* Positive lookahead */
2649
+                            bravalue = OP_ASSERT;
2650
+                            ptr++;
2651
+                            break;
2652
+                            
2653
+                        case '!':                 /* Negative lookahead */
2654
+                            bravalue = OP_ASSERT_NOT;
2655
+                            ptr++;
2656
+                            break;
2657
+                            
2658
+                        case '<':                 /* Lookbehinds */
2659
+                            switch (*(++ptr))
2660
+                        {
2661
+                            case '=':               /* Positive lookbehind */
2662
+                                bravalue = OP_ASSERTBACK;
2663
+                                ptr++;
2664
+                                break;
2665
+                                
2666
+                            case '!':               /* Negative lookbehind */
2667
+                                bravalue = OP_ASSERTBACK_NOT;
2668
+                                ptr++;
2669
+                                break;
2670
+                        }
2671
+                            break;
2672
+                            
2673
+                        case '>':                 /* One-time brackets */
2674
+                            bravalue = OP_ONCE;
2675
+                            ptr++;
2676
+                            break;
2677
+                            
2678
+                        case 'C':                 /* Callout - may be followed by digits */
2679
+                            *code++ = OP_CALLOUT;
2680
+                        {
2681
+                            int n = 0;
2682
+                            while ((digitab[*(++ptr)] & ctype_digit) != 0)
2683
+                                n = n * 10 + *ptr - '0';
2684
+                            if (n > 255)
2685
+                            {
2686
+                                *errorptr = ERR38;
2687
+                                goto FAILED;
2688
+                            }
2689
+                            *code++ = n;
2690
+                        }
2691
+                            previous = NULL;
2692
+                            continue;
2693
+                            
2694
+                        case 'P':                 /* Named subpattern handling */
2695
+                            if (*(++ptr) == '<')      /* Definition */
2696
+                            {
2697
+                                int i, namelen;
2698
+                                uschar *slot = cd->name_table;
2699
+                                const uschar *name;     /* Don't amalgamate; some compilers */
2700
+                                name = ++ptr;           /* grumble at autoincrement in declaration */
2701
+                                
2702
+                                while (*ptr++ != '>');
2703
+                                namelen = ptr - name - 1;
2704
+                                
2705
+                                for (i = 0; i < cd->names_found; i++)
2706
+                                {
2707
+                                    int crc = memcmp(name, slot+2, namelen);
2708
+                                    if (crc == 0)
2709
+                                    {
2710
+                                        if (slot[2+namelen] == 0)
2711
+                                        {
2712
+                                            *errorptr = ERR43;
2713
+                                            goto FAILED;
2714
+                                        }
2715
+                                        crc = -1;             /* Current name is substring */
2716
+                                    }
2717
+                                    if (crc < 0)
2718
+                                    {
2719
+                                        memmove(slot + cd->name_entry_size, slot,
2720
+                                                (cd->names_found - i) * cd->name_entry_size);
2721
+                                        break;
2722
+                                    }
2723
+                                    slot += cd->name_entry_size;
2724
+                                }
2725
+                                
2726
+                                PUT2(slot, 0, *brackets + 1);
2727
+                                memcpy(slot + 2, name, namelen);
2728
+                                slot[2+namelen] = 0;
2729
+                                cd->names_found++;
2730
+                                goto NUMBERED_GROUP;
2731
+                            }
2732
+                            
2733
+                            if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
2734
+                            {
2735
+                                int i, namelen;
2736
+                                int type = *ptr++;
2737
+                                const uschar *name = ptr;
2738
+                                uschar *slot = cd->name_table;
2739
+                                
2740
+                                while (*ptr != ')') ptr++;
2741
+                                namelen = ptr - name;
2742
+                                
2743
+                                for (i = 0; i < cd->names_found; i++)
2744
+                                {
2745
+                                    if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2746
+                                    slot += cd->name_entry_size;
2747
+                                }
2748
+                                if (i >= cd->names_found)
2749
+                                {
2750
+                                    *errorptr = ERR15;
2751
+                                    goto FAILED;
2752
+                                }
2753
+                                
2754
+                                recno = GET2(slot, 0);
2755
+                                
2756
+                                if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
2757
+                                
2758
+                                /* Back reference */
2759
+                                
2760
+                                previous = code;
2761
+                                *code++ = OP_REF;
2762
+                                PUT2INC(code, 0, recno);
2763
+                                cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2764
+                                if (recno > cd->top_backref) cd->top_backref = recno;
2765
+                                continue;
2766
+                            }
2767
+                            
2768
+                            /* Should never happen */
2769
+                            break;
2770
+                            
2771
+                        case 'R':                 /* Pattern recursion */
2772
+                            ptr++;                    /* Same as (?0)      */
2773
+                            /* Fall through */
2774
+                            
2775
+                            /* Recursion or "subroutine" call */
2776
+                            
2777
+                        case '0': case '1': case '2': case '3': case '4':
2778
+                        case '5': case '6': case '7': case '8': case '9':
2779
+                        {
2780
+                            const uschar *called;
2781
+                            recno = 0;
2782
+                            while((digitab[*ptr] & ctype_digit) != 0)
2783
+                                recno = recno * 10 + *ptr++ - '0';
2784
+                            
2785
+                            /* Come here from code above that handles a named recursion */
2786
+                            
2787
+                        HANDLE_RECURSION:
2788
+                            
2789
+                            previous = code;
2790
+                            
2791
+                            /* Find the bracket that is being referenced. Temporarily end the
2792
+                             regex in case it doesn't exist. */
2793
+                            
2794
+                            *code = OP_END;
2795
+                            called = (recno == 0)?
2796
+                            cd->start_code : find_bracket(cd->start_code, utf8, recno);
2797
+                            
2798
+                            if (called == NULL)
2799
+                            {
2800
+                                *errorptr = ERR15;
2801
+                                goto FAILED;
2802
+                            }
2803
+                            
2804
+                            /* If the subpattern is still open, this is a recursive call. We
2805
+                             check to see if this is a left recursion that could loop for ever,
2806
+                             and diagnose that case. */
2807
+                            
2808
+                            if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2809
+                            {
2810
+                                *errorptr = ERR40;
2811
+                                goto FAILED;
2812
+                            }
2813
+                            
2814
+                            /* Insert the recursion/subroutine item */
2815
+                            
2816
+                            *code = OP_RECURSE;
2817
+                            PUT(code, 1, called - cd->start_code);
2818
+                            code += 1 + LINK_SIZE;
2819
+                        }
2820
+                            continue;
2821
+                            
2822
+                            /* Character after (? not specially recognized */
2823
+                            
2824
+                        default:                  /* Option setting */
2825
+                            set = unset = 0;
2826
+                            optset = &set;
2827
+                            
2828
+                            while (*ptr != ')' && *ptr != ':')
2829
+                            {
2830
+                                switch (*ptr++)
2831
+                                {
2832
+                                    case '-': optset = &unset; break;
2833
+                                        
2834
+                                    case 'i': *optset |= PCRE_CASELESS; break;
2835
+                                    case 'm': *optset |= PCRE_MULTILINE; break;
2836
+                                    case 's': *optset |= PCRE_DOTALL; break;
2837
+                                    case 'x': *optset |= PCRE_EXTENDED; break;
2838
+                                    case 'U': *optset |= PCRE_UNGREEDY; break;
2839
+                                    case 'X': *optset |= PCRE_EXTRA; break;
2840
+                                }
2841
+                            }
2842
+                            
2843
+                            /* Set up the changed option bits, but don't change anything yet. */
2844
+                            
2845
+                            newoptions = (options | set) & (~unset);
2846
+                            
2847
+                            /* If the options ended with ')' this is not the start of a nested
2848
+                             group with option changes, so the options change at this level. Compile
2849
+                             code to change the ims options if this setting actually changes any of
2850
+                             them. We also pass the new setting back so that it can be put at the
2851
+                             start of any following branches, and when this group ends (if we are in
2852
+                             a group), a resetting item can be compiled.
2853
+                             
2854
+                             Note that if this item is right at the start of the pattern, the
2855
+                             options will have been abstracted and made global, so there will be no
2856
+                             change to compile. */
2857
+                            
2858
+                            if (*ptr == ')')
2859
+                            {
2860
+                                if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
2861
+                                {
2862
+                                    *code++ = OP_OPT;
2863
+                                    *code++ = newoptions & PCRE_IMS;
2864
+                                }
2865
+                                
2866
+                                /* Change options at this level, and pass them back for use
2867
+                                 in subsequent branches. Reset the greedy defaults and the case
2868
+                                 value for firstbyte and reqbyte. */
2869
+                                
2870
+                                *optionsptr = options = newoptions;
2871
+                                greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2872
+                                greedy_non_default = greedy_default ^ 1;
2873
+                                req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2874
+                                
2875
+                                previous = NULL;       /* This item can't be repeated */
2876
+                                continue;              /* It is complete */
2877
+                            }
2878
+                            
2879
+                            /* If the options ended with ':' we are heading into a nested group
2880
+                             with possible change of options. Such groups are non-capturing and are
2881
+                             not assertions of any kind. All we need to do is skip over the ':';
2882
+                             the newoptions value is handled below. */
2883
+                            
2884
+                            bravalue = OP_BRA;
2885
+                            ptr++;
2886
+                    }
2887
+                }
2888
+                
2889
+                /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2890
+                 non-capturing and behave like (?:...) brackets */
2891
+                
2892
+                else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2893
+                {
2894
+                    bravalue = OP_BRA;
2895
+                }
2896
+                
2897
+                /* Else we have a referencing group; adjust the opcode. If the bracket
2898
+                 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2899
+                 arrange for the true number to follow later, in an OP_BRANUMBER item. */
2900
+                
2901
+                else
2902
+                {
2903
+                NUMBERED_GROUP:
2904
+                    if (++(*brackets) > EXTRACT_BASIC_MAX)
2905
+                    {
2906
+                        bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2907
+                        code[1+LINK_SIZE] = OP_BRANUMBER;
2908
+                        PUT2(code, 2+LINK_SIZE, *brackets);
2909
+                        skipbytes = 3;
2910
+                    }
2911
+                    else bravalue = OP_BRA + *brackets;
2912
+                }
2913
+                
2914
+                /* Process nested bracketed re. Assertions may not be repeated, but other
2915
+                 kinds can be. We copy code into a non-register variable in order to be able
2916
+                 to pass its address because some compilers complain otherwise. Pass in a
2917
+                 new setting for the ims options if they have changed. */
2918
+                
2919
+                previous = (bravalue >= OP_ONCE)? code : NULL;
2920
+                *code = bravalue;
2921
+                tempcode = code;
2922
+                tempreqvary = cd->req_varyopt;     /* Save value before bracket */
2923
+                
2924
+                if (!compile_regex(
2925
+                                   newoptions,                   /* The complete new option state */
2926
+                                   options & PCRE_IMS,           /* The previous ims option state */
2927
+                                   brackets,                     /* Extracting bracket count */
2928
+                                   &tempcode,                    /* Where to put code (updated) */
2929
+                                   &ptr,                         /* Input pointer (updated) */
2930
+                                   errorptr,                     /* Where to put an error message */
2931
+                                   (bravalue == OP_ASSERTBACK ||
2932
+                                    bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2933
+                                   skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
2934
+                                   &subfirstbyte,                /* For possible first char */
2935
+                                   &subreqbyte,                  /* For possible last char */
2936
+                                   bcptr,                        /* Current branch chain */
2937
+                                   cd))                          /* Tables block */
2938
+                    goto FAILED;
2939
+                
2940
+                /* At the end of compiling, code is still pointing to the start of the
2941
+                 group, while tempcode has been updated to point past the end of the group
2942
+                 and any option resetting that may follow it. The pattern pointer (ptr)
2943
+                 is on the bracket. */
2944
+                
2945
+                /* If this is a conditional bracket, check that there are no more than
2946
+                 two branches in the group. */
2947
+                
2948
+                else if (bravalue == OP_COND)
2949
+                {
2950
+                    uschar *tc = code;
2951
+                    condcount = 0;
2952
+                    
2953
+                    do {
2954
+                        condcount++;
2955
+                        tc += GET(tc,1);
2956
+                    }
2957
+                    while (*tc != OP_KET);
2958
+                    
2959
+                    if (condcount > 2)
2960
+                    {
2961
+                        *errorptr = ERR27;
2962
+                        goto FAILED;
2963
+                    }
2964
+                    
2965
+                    /* If there is just one branch, we must not make use of its firstbyte or
2966
+                     reqbyte, because this is equivalent to an empty second branch. */
2967
+                    
2968
+                    if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2969
+                }
2970
+                
2971
+                /* Handle updating of the required and first characters. Update for normal
2972
+                 brackets of all kinds, and conditions with two branches (see code above).
2973
+                 If the bracket is followed by a quantifier with zero repeat, we have to
2974
+                 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2975
+                 main loop so that they can be accessed for the back off. */
2976
+                
2977
+                zeroreqbyte = reqbyte;
2978
+                zerofirstbyte = firstbyte;
2979
+                groupsetfirstbyte = FALSE;
2980
+                
2981
+                if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2982
+                {
2983
+                    /* If we have not yet set a firstbyte in this branch, take it from the
2984
+                     subpattern, remembering that it was set here so that a repeat of more
2985
+                     than one can replicate it as reqbyte if necessary. If the subpattern has
2986
+                     no firstbyte, set "none" for the whole branch. In both cases, a zero
2987
+                     repeat forces firstbyte to "none". */
2988
+                    
2989
+                    if (firstbyte == REQ_UNSET)
2990
+                    {
2991
+                        if (subfirstbyte >= 0)
2992
+                        {
2993
+                            firstbyte = subfirstbyte;
2994
+                            groupsetfirstbyte = TRUE;
2995
+                        }
2996
+                        else firstbyte = REQ_NONE;
2997
+                        zerofirstbyte = REQ_NONE;
2998
+                    }
2999
+                    
3000
+                    /* If firstbyte was previously set, convert the subpattern's firstbyte
3001
+                     into reqbyte if there wasn't one, using the vary flag that was in
3002
+                     existence beforehand. */
3003
+                    
3004
+                    else if (subfirstbyte >= 0 && subreqbyte < 0)
3005
+                        subreqbyte = subfirstbyte | tempreqvary;
3006
+                    
3007
+                    /* If the subpattern set a required byte (or set a first byte that isn't
3008
+                     really the first byte - see above), set it. */
3009
+                    
3010
+                    if (subreqbyte >= 0) reqbyte = subreqbyte;
3011
+                }
3012
+                
3013
+                /* For a forward assertion, we take the reqbyte, if set. This can be
3014
+                 helpful if the pattern that follows the assertion doesn't set a different
3015
+                 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3016
+                 for an assertion, however because it leads to incorrect effect for patterns
3017
+                 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3018
+                 of a firstbyte. This is overcome by a scan at the end if there's no
3019
+                 firstbyte, looking for an asserted first char. */
3020
+                
3021
+                else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3022
+                
3023
+                /* Now update the main code pointer to the end of the group. */
3024
+                
3025
+                code = tempcode;
3026
+                
3027
+                /* Error if hit end of pattern */
3028
+                
3029
+                if (*ptr != ')')
3030
+                {
3031
+                    *errorptr = ERR14;
3032
+                    goto FAILED;
3033
+                }
3034
+                break;
3035
+                
3036
+                /* Check \ for being a real metacharacter; if not, fall through and handle
3037
+                 it as a data character at the start of a string. Escape items are checked
3038
+                 for validity in the pre-compiling pass. */
3039
+                
3040
+            case '\\':
3041
+                tempptr = ptr;
3042
+                c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3043
+                
3044
+                /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3045
+                 are arranged to be the negation of the corresponding OP_values. For the
3046
+                 back references, the values are ESC_REF plus the reference number. Only
3047
+                 back references and those types that consume a character may be repeated.
3048
+                 We can test for values between ESC_b and ESC_Z for the latter; this may
3049
+                 have to change if any new ones are ever created. */
3050
+                
3051
+                if (c < 0)
3052
+                {
3053
+                    if (-c == ESC_Q)            /* Handle start of quoted string */
3054
+                    {
3055
+                        if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3056
+                        else inescq = TRUE;
3057
+                        continue;
3058
+                    }
3059
+                    
3060
+                    /* For metasequences that actually match a character, we disable the
3061
+                     setting of a first character if it hasn't already been set. */
3062
+                    
3063
+                    if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3064
+                        firstbyte = REQ_NONE;
3065
+                    
3066
+                    /* Set values to reset to if this is followed by a zero repeat. */
3067
+                    
3068
+                    zerofirstbyte = firstbyte;
3069
+                    zeroreqbyte = reqbyte;
3070
+                    
3071
+                    /* Back references are handled specially */
3072
+                    
3073
+                    if (-c >= ESC_REF)
3074
+                    {
3075
+                        int number = -c - ESC_REF;
3076
+                        previous = code;
3077
+                        *code++ = OP_REF;
3078
+                        PUT2INC(code, 0, number);
3079
+                    }
3080
+                    else
3081
+                    {
3082
+                        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3083
+                        *code++ = -c;
3084
+                    }
3085
+                    continue;
3086
+                }
3087
+                
3088
+                /* Data character: reset and fall through */
3089
+                
3090
+                ptr = tempptr;
3091
+                c = '\\';
3092
+                
3093
+                /* Handle a run of data characters until a metacharacter is encountered.
3094
+                 The first character is guaranteed not to be whitespace or # when the
3095
+                 extended flag is set. */
3096
+                
3097
+            NORMAL_CHAR:
1845 3098
             default:
1846
-            if ((options & PCRE_EXTRA) != 0)
1847
-              {
1848
-              *errorptr = ERR7;
1849
-              goto FAILED;
1850
-              }
1851
-            c = *ptr;    /* The final character */
1852
-            }
1853
-          }
1854
-
1855
-        /* Fall through if we have a single character (c >= 0). This may be
1856
-        > 256 in UTF-8 mode. */
1857
-
1858
-        }   /* End of backslash handling */
1859
-
1860
-      /* A single character may be followed by '-' to form a range. However,
1861
-      Perl does not permit ']' to be the end of the range. A '-' character
1862
-      here is treated as a literal. */
1863
-
1864
-      if (ptr[1] == '-' && ptr[2] != ']')
1865
-        {
1866
-        int d;
1867
-        ptr += 2;
1868
-
3099
+                previous = code;
3100
+                *code = OP_CHARS;
3101
+                code += 2;
3102
+                length = 0;
3103
+                
3104
+                do
3105
+                {
3106
+                    /* If in \Q...\E, check for the end; if not, we always have a literal */
3107
+                    
3108
+                    if (inescq)
3109
+                    {
3110
+                        if (c == '\\' && ptr[1] == 'E')
3111
+                        {
3112
+                            inescq = FALSE;
3113
+                            ptr++;
3114
+                        }
3115
+                        else
3116
+                        {
3117
+                            *code++ = c;
3118
+                            length++;
3119
+                        }
3120
+                        continue;
3121
+                    }
3122
+                    
3123
+                    /* Skip white space and comments for /x patterns */
3124
+                    
3125
+                    if ((options & PCRE_EXTENDED) != 0)
3126
+                    {
3127
+                        if ((cd->ctypes[c] & ctype_space) != 0) continue;
3128
+                        if (c == '#')
3129
+                        {
3130
+                            /* The space before the ; is to avoid a warning on a silly compiler
3131
+                             on the Macintosh. */
3132
+                            while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3133
+                            if (c == 0) break;
3134
+                            continue;
3135
+                        }
3136
+                    }
3137
+                    
3138
+                    /* Backslash may introduce a data char or a metacharacter. Escaped items
3139
+                     are checked for validity in the pre-compiling pass. Stop the string
3140
+                     before a metaitem. */
3141
+                    
3142
+                    if (c == '\\')
3143
+                    {
3144
+                        tempptr = ptr;
3145
+                        c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3146
+                        if (c < 0) { ptr = tempptr; break; }
3147
+                        
3148
+                        /* If a character is > 127 in UTF-8 mode, we have to turn it into
3149
+                         two or more characters in the UTF-8 encoding. */
3150
+                        
1869 3151
 #ifdef SUPPORT_UTF8
1870
-        if (utf8)
1871
-          {                           /* Braces are required because the */
1872
-          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
1873
-          }
1874
-        else
3152
+                        if (utf8 && c > 127)
3153
+                        {
3154
+                            uschar buffer[8];
3155
+                            int len = ord2utf8(c, buffer);
3156
+                            for (c = 0; c < len; c++) *code++ = buffer[c];
3157
+                            length += len;
3158
+                            continue;
3159
+                        }
1875 3160
 #endif
1876
-        d = *ptr;
1877
-
1878
-        /* The second part of a range can be a single-character escape, but
1879
-        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1880
-        in such circumstances. */
1881
-
1882
-        if (d == '\\')
1883
-          {
1884
-          const uschar *oldptr = ptr;
1885
-          d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1886
-
1887
-          /* \b is backslash; any other special means the '-' was literal */
1888
-
1889
-          if (d < 0)
1890
-            {
1891
-            if (d == -ESC_b) d = '\b'; else
1892
-              {
1893
-              ptr = oldptr - 2;
1894
-              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
1895
-              }
1896
-            }
1897
-          }
1898
-
1899
-        /* Check that the two values are in the correct order */
1900
-
1901
-        if (d < c)
1902
-          {
1903
-          *errorptr = ERR8;
1904
-          goto FAILED;
1905
-          }
1906
-
1907
-        /* If d is greater than 255, we can't just use the bit map, so set up
1908
-        for the UTF-8 supporting class type. If we are not caseless, we can
1909
-        just set up a single range. If we are caseless, the characters < 256
1910
-        are handled with a bitmap, in order to get the case-insensitive
1911
-        handling. */
1912
-
3161
+                    }
3162
+                    
3163
+                    /* Ordinary character or single-char escape */
3164
+                    
3165
+                    *code++ = c;
3166
+                    length++;
3167
+                }
3168
+                
3169
+                /* This "while" is the end of the "do" above. */
3170
+                
3171
+                while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3172
+                
3173
+                /* Update the first and last requirements. These are always bytes, even in
3174
+                 UTF-8 mode. However, there is a special case to be considered when there
3175
+                 are only one or two characters. Because this gets messy in UTF-8 mode, the
3176
+                 code is kept separate. When we get here "length" contains the number of
3177
+                 bytes. */
3178
+                
1913 3179
 #ifdef SUPPORT_UTF8
1914
-        if (d > 255)
1915
-          {
1916
-          class_utf8 = TRUE;
1917
-          *class_utf8data++ = XCL_RANGE;
1918
-          if ((options & PCRE_CASELESS) == 0)
1919
-            {
1920
-            class_utf8data += ord2utf8(c, class_utf8data);
1921
-            class_utf8data += ord2utf8(d, class_utf8data);
1922
-            continue;  /* Go get the next char in the class */
1923
-            }
1924
-          class_utf8data += ord2utf8(256, class_utf8data);
1925
-          class_utf8data += ord2utf8(d, class_utf8data);
1926
-          d = 255;
1927
-          /* Fall through */
1928
-          }
3180
+                if (utf8 && length > 1)
3181
+                {
3182
+                    uschar *t = previous + 3;                      /* After this code, t */
3183
+                    while (t < code && (*t & 0xc0) == 0x80) t++;   /* follows the 1st char */
3184
+                    
3185
+                    /* Handle the case when there is only one multibyte character. It must
3186
+                     have at least two bytes because of the "length > 1" test above. */
3187
+                    
3188
+                    if (t == code)
3189
+                    {
3190
+                        /* If no previous first byte, set it from this character, but revert to
3191
+                         none on a zero repeat. */
3192
+                        
3193
+                        if (firstbyte == REQ_UNSET)
3194
+                        {
3195
+                            zerofirstbyte = REQ_NONE;
3196
+                            firstbyte = previous[2];
3197
+                        }
3198
+                        
3199
+                        /* Otherwise, leave the first byte value alone, and don't change it on
3200
+                         a zero repeat */
3201
+                        
3202
+                        else zerofirstbyte = firstbyte;
3203
+                        
3204
+                        /* In both cases, a zero repeat resets the previous required byte */
3205
+                        
3206
+                        zeroreqbyte = reqbyte;
3207
+                    }
3208
+                    
3209
+                    /* Handle the case when there is more than one character. These may be
3210
+                     single-byte or multibyte characters */
3211
+                    
3212
+                    else
3213
+                    {
3214
+                        t = code - 1;                       /* After this code, t is at the */
3215
+                        while ((*t & 0xc0) == 0x80) t--;    /* start of the last character */
3216
+                        
3217
+                        /* If no previous first byte, set it from the first character, and
3218
+                         retain it on a zero repeat (of the last character). The required byte
3219
+                         is reset on a zero repeat, either to the byte before the last
3220
+                         character, unless this is the first byte of the string. In that case,
3221
+                         it reverts to its previous value. */
3222
+                        
3223
+                        if (firstbyte == REQ_UNSET)
3224
+                        {
3225
+                            zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3226
+                            zeroreqbyte = (t - 1 == previous + 2)?
3227
+                            reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3228
+                        }
3229
+                        
3230
+                        /* If there was a previous first byte, leave it alone, and don't change
3231
+                         it on a zero repeat. The required byte is reset on a zero repeat to the
3232
+                         byte before the last character. */
3233
+                        
3234
+                        else
3235
+                        {
3236
+                            zerofirstbyte = firstbyte;
3237
+                            zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3238
+                        }
3239
+                    }
3240
+                    
3241
+                    /* In all cases (we know length > 1), the new required byte is the last
3242
+                     byte of the string. */
3243
+                    
3244
+                    reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3245
+                }
3246
+                
3247
+                else   /* End of UTF-8 coding */
1929 3248
 #endif
1930
-        /* We use the bit map if the range is entirely < 255, or if part of it
1931
-        is < 255 and matching is caseless. */
3249
+                    
3250
+                /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3251
+                 or when UTF-8 is not enabled. */
3252
+                    
3253
+                {
3254
+                    /* firstbyte was not previously set; take it from this string */
3255
+                    
3256
+                    if (firstbyte == REQ_UNSET)
3257
+                    {
3258
+                        if (length == 1)
3259
+                        {
3260
+                            zerofirstbyte = REQ_NONE;
3261
+                            firstbyte = previous[2] | req_caseopt;
3262
+                            zeroreqbyte = reqbyte;
3263
+                        }
3264
+                        else
3265
+                        {
3266
+                            zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3267
+                            zeroreqbyte = (length > 2)?
3268
+                            (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3269
+                            reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3270
+                        }
3271
+                    }
3272
+                    
3273
+                    /* firstbyte was previously set */
3274
+                    
3275
+                    else
3276
+                    {
3277
+                        zerofirstbyte = firstbyte;
3278
+                        zeroreqbyte = (length == 1)? reqbyte :
3279
+                        code[-2] | req_caseopt | cd->req_varyopt;
3280
+                        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3281
+                    }
3282
+                }
3283
+                
3284
+                /* Set the length in the data vector, and advance to the next state. */
3285
+                
3286
+                previous[1] = length;
3287
+                if (length < MAXLIT) ptr--;
3288
+                break;
3289
+        }
3290
+    }                   /* end of big loop */
3291
+    
3292
+    /* Control never reaches here by falling through, only by a goto for all the
3293
+     error states. Pass back the position in the pattern so that it can be displayed
3294
+     to the user for diagnosing the error. */
3295
+    
3296
+FAILED:
3297
+    *ptrptr = ptr;
3298
+    return FALSE;
3299
+}
1932 3300
 
1933
-        for (; c <= d; c++)
1934
-          {
1935
-          class[c/8] |= (1 << (c&7));
1936
-          if ((options & PCRE_CASELESS) != 0)
1937
-            {
1938
-            int uc = cd->fcc[c];           /* flip case */
1939
-            class[uc/8] |= (1 << (uc&7));
1940
-            }
1941
-          class_charcount++;                /* in case a one-char range */
1942
-          class_lastchar = c;
1943
-          }
1944 3301
 
1945
-        continue;   /* Go get the next char in the class */
1946
-        }
1947 3302
 
1948
-      /* Handle a lone single character - we can get here for a normal
1949
-      non-escape char, or after \ that introduces a single character. */
1950 3303
 
1951
-      LONE_SINGLE_CHARACTER:
3304
+/*************************************************
3305
+ *     Compile sequence of alternatives           *
3306
+ *************************************************/
1952 3307
 
1953
-      /* Handle a multibyte character */
3308
+/* On entry, ptr is pointing past the bracket character, but on return
3309
+ it points to the closing bracket, or vertical bar, or end of string.
3310
+ The code variable is pointing at the byte into which the BRA operator has been
3311
+ stored. If the ims options are changed at the start (for a (?ims: group) or
3312
+ during any branch, we need to insert an OP_OPT item at the start of every
3313
+ following branch to ensure they get set correctly at run time, and also pass
3314
+ the new options into every subsequent branch compile.
3315
+ 
3316
+ Argument:
3317
+ options        option bits, including any changes for this subpattern
3318
+ oldims         previous settings of ims option bits
3319
+ brackets       -> int containing the number of extracting brackets used
3320
+ codeptr        -> the address of the current code pointer
3321
+ ptrptr         -> the address of the current pattern pointer
3322
+ errorptr       -> pointer to error message
3323
+ lookbehind     TRUE if this is a lookbehind assertion
3324
+ skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3325
+ firstbyteptr   place to put the first required character, or a negative number
3326
+ reqbyteptr     place to put the last required character, or a negative number
3327
+ bcptr          pointer to the chain of currently open branches
3328
+ cd             points to the data block with tables pointers etc.
3329
+ 
3330
+ Returns:      TRUE on success
3331
+ */
1954 3332
 
1955
-#ifdef SUPPORT_UTF8
1956
-      if (utf8 && c > 255)
3333
+static BOOL
3334
+compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3335
+              const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3336
+              int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3337
+{
3338
+    const uschar *ptr = *ptrptr;
3339
+    uschar *code = *codeptr;
3340
+    uschar *last_branch = code;
3341
+    uschar *start_bracket = code;
3342
+    uschar *reverse_count = NULL;
3343
+    int firstbyte, reqbyte;
3344
+    int branchfirstbyte, branchreqbyte;
3345
+    branch_chain bc;
3346
+    
3347
+    bc.outer = bcptr;
3348
+    bc.current = code;
3349
+    
3350
+    firstbyte = reqbyte = REQ_UNSET;
3351
+    
3352
+    /* Offset is set zero to mark that this bracket is still open */
3353
+    
3354
+    PUT(code, 1, 0);
3355
+    code += 1 + LINK_SIZE + skipbytes;
3356
+    
3357
+    /* Loop for each alternative branch */
3358
+    
3359
+    for (;;)
3360
+    {
3361
+        /* Handle a change of ims options at the start of the branch */
3362
+        
3363
+        if ((options & PCRE_IMS) != oldims)
1957 3364
         {
1958
-        class_utf8 = TRUE;
1959
-        *class_utf8data++ = XCL_SINGLE;
1960
-        class_utf8data += ord2utf8(c, class_utf8data);
3365
+            *code++ = OP_OPT;
3366
+            *code++ = options & PCRE_IMS;
1961 3367
         }
1962
-      else
1963
-#endif
1964
-      /* Handle a single-byte character */
3368
+        
3369
+        /* Set up dummy OP_REVERSE if lookbehind assertion */
3370
+        
3371
+        if (lookbehind)
1965 3372
         {
1966
-        class [c/8] |= (1 << (c&7));
1967
-        if ((options & PCRE_CASELESS) != 0)
1968
-          {
1969
-          c = cd->fcc[c];   /* flip case */
1970
-          class[c/8] |= (1 << (c&7));
1971
-          }
1972
-        class_charcount++;
1973
-        class_lastchar = c;
3373
+            *code++ = OP_REVERSE;
3374
+            reverse_count = code;
3375
+            PUTINC(code, 0, 0);
1974 3376
         }
1975
-      }
1976
-
1977
-    /* Loop until ']' reached; the check for end of string happens inside the
1978
-    loop. This "while" is the end of the "do" above. */
1979
-
1980
-    while ((c = *(++ptr)) != ']' || inescq);
1981
-
1982
-    /* If class_charcount is 1, we saw precisely one character with a value <
1983
-    256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1984
-    the one character is < 128. In non-UTF-8 mode we can always optimize.
1985
-
1986
-    The optimization throws away the bit map. We turn the item into a
1987
-    1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1988
-    that OP_NOT does not support multibyte characters. In the positive case, it
1989
-    can cause firstbyte to be set. Otherwise, there can be no first char if
1990
-    this item is first, whatever repeat count may follow. In the case of
1991
-    reqbyte, save the previous value for reinstating. */
1992
-
1993
-#ifdef SUPPORT_UTF8
1994
-    if (class_charcount == 1 &&
1995
-          (!utf8 ||
1996
-          (!class_utf8 && class_lastchar < 128)))
1997
-#else
1998
-    if (class_charcount == 1)
1999
-#endif
2000
-      {
2001
-      zeroreqbyte = reqbyte;
2002
-      if (negate_class)
3377
+        
3378
+        /* Now compile the branch */
3379
+        
3380
+        if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3381
+                            &branchfirstbyte, &branchreqbyte, &bc, cd))
2003 3382
         {
2004
-        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2005
-        zerofirstbyte = firstbyte;
2006
-        *code++ = OP_NOT;
3383
+            *ptrptr = ptr;
3384
+            return FALSE;
2007 3385
         }
2008
-      else
3386
+        
3387
+        /* If this is the first branch, the firstbyte and reqbyte values for the
3388
+         branch become the values for the regex. */
3389
+        
3390
+        if (*last_branch != OP_ALT)
2009 3391
         {
2010
-        if (firstbyte == REQ_UNSET)
2011
-          {
2012
-          zerofirstbyte = REQ_NONE;
2013
-          firstbyte = class_lastchar | req_caseopt;
2014
-          }
3392
+            firstbyte = branchfirstbyte;
3393
+            reqbyte = branchreqbyte;
3394
+        }
3395
+        
3396
+        /* If this is not the first branch, the first char and reqbyte have to
3397
+         match the values from all the previous branches, except that if the previous
3398
+         value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3399
+         REQ_VARY for the regex. */
3400
+        
2015 3401
         else
2016
-          {
2017
-          zerofirstbyte = firstbyte;
2018
-          reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
2019
-          }
2020
-        *code++ = OP_CHARS;
2021
-        *code++ = 1;
3402
+        {
3403
+            /* If we previously had a firstbyte, but it doesn't match the new branch,
3404
+             we have to abandon the firstbyte for the regex, but if there was previously
3405
+             no reqbyte, it takes on the value of the old firstbyte. */
3406
+            
3407
+            if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3408
+            {
3409
+                if (reqbyte < 0) reqbyte = firstbyte;
3410
+                firstbyte = REQ_NONE;
3411
+            }
3412
+            
3413
+            /* If we (now or from before) have no firstbyte, a firstbyte from the
3414
+             branch becomes a reqbyte if there isn't a branch reqbyte. */
3415
+            
3416
+            if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3417
+                branchreqbyte = branchfirstbyte;
3418
+            
3419
+            /* Now ensure that the reqbytes match */
3420
+            
3421
+            if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3422
+                reqbyte = REQ_NONE;
3423
+            else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
3424
+        }
3425
+        
3426
+        /* If lookbehind, check that this branch matches a fixed-length string,
3427
+         and put the length into the OP_REVERSE item. Temporarily mark the end of
3428
+         the branch with OP_END. */
3429
+        
3430
+        if (lookbehind)
3431
+        {
3432
+            int length;
3433
+            *code = OP_END;
3434
+            length = find_fixedlength(last_branch, options);
3435
+            DPRINTF(("fixed length = %d\n", length));
3436
+            if (length < 0)
3437
+            {
3438
+                *errorptr = (length == -2)? ERR36 : ERR25;
3439
+                *ptrptr = ptr;
3440
+                return FALSE;
3441
+            }
3442
+            PUT(reverse_count, 0, length);
3443
+        }
3444
+        
3445
+        /* Reached end of expression, either ')' or end of pattern. Go back through
3446
+         the alternative branches and reverse the chain of offsets, with the field in
3447
+         the BRA item now becoming an offset to the first alternative. If there are
3448
+         no alternatives, it points to the end of the group. The length in the
3449
+         terminating ket is always the length of the whole bracketed item. If any of
3450
+         the ims options were changed inside the group, compile a resetting op-code
3451
+         following, except at the very end of the pattern. Return leaving the pointer
3452
+         at the terminating char. */
3453
+        
3454
+        if (*ptr != '|')
3455
+        {
3456
+            int length = code - last_branch;
3457
+            do
3458
+            {
3459
+                int prev_length = GET(last_branch, 1);
3460
+                PUT(last_branch, 1, length);
3461
+                length = prev_length;
3462
+                last_branch -= length;
3463
+            }
3464
+            while (length > 0);
3465
+            
3466
+            /* Fill in the ket */
3467
+            
3468
+            *code = OP_KET;
3469
+            PUT(code, 1, code - start_bracket);
3470
+            code += 1 + LINK_SIZE;
3471
+            
3472
+            /* Resetting option if needed */
3473
+            
3474
+            if ((options & PCRE_IMS) != oldims && *ptr == ')')
3475
+            {
3476
+                *code++ = OP_OPT;
3477
+                *code++ = oldims;
3478
+            }
3479
+            
3480
+            /* Set values to pass back */
3481
+            
3482
+            *codeptr = code;
3483
+            *ptrptr = ptr;
3484
+            *firstbyteptr = firstbyte;
3485
+            *reqbyteptr = reqbyte;
3486
+            return TRUE;
2022 3487
         }
2023
-      *code++ = class_lastchar;
2024
-      break;  /* End of class handling */
2025
-      }       /* End of 1-byte optimization */
3488
+        
3489
+        /* Another branch follows; insert an "or" node. Its length field points back
3490
+         to the previous branch while the bracket remains open. At the end the chain
3491
+         is reversed. It's done like this so that the start of the bracket has a
3492
+         zero offset until it is closed, making it possible to detect recursion. */
3493
+        
3494
+        *code = OP_ALT;
3495
+        PUT(code, 1, code - last_branch);
3496
+        bc.current = last_branch = code;
3497
+        code += 1 + LINK_SIZE;
3498
+        ptr++;
3499
+    }
3500
+    /* Control never reaches here */
3501
+}
2026 3502
 
2027
-    /* Otherwise, if this is the first thing in the branch, there can be no
2028
-    first char setting, whatever the repeat count. Any reqbyte setting must
2029
-    remain unchanged after any kind of repeat. */
2030 3503
 
2031
-    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2032
-    zerofirstbyte = firstbyte;
2033
-    zeroreqbyte = reqbyte;
2034 3504
 
2035
-    /* If there are characters with values > 255, we have to compile an
2036
-    extended class, with its own opcode. If there are no characters < 256,
2037
-    we can omit the bitmap. */
2038 3505
 
2039
-#ifdef SUPPORT_UTF8
2040
-    if (class_utf8)
2041
-      {
2042
-      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
2043
-      *code++ = OP_XCLASS;
2044
-      code += LINK_SIZE;
2045
-      *code = negate_class? XCL_NOT : 0;
3506
+/*************************************************
3507
+ *          Check for anchored expression         *
3508
+ *************************************************/
2046 3509
 
2047
-      /* If the map is required, install it, and move on to the end of
2048
-      the extra data */
3510
+/* Try to find out if this is an anchored regular expression. Consider each
3511
+ alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3512
+ all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3513
+ it's anchored. However, if this is a multiline pattern, then only OP_SOD
3514
+ counts, since OP_CIRC can match in the middle.
3515
+ 
3516
+ We can also consider a regex to be anchored if OP_SOM starts all its branches.
3517
+ This is the code for \G, which means "match at start of match position, taking
3518
+ into account the match offset".
3519
+ 
3520
+ A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3521
+ because that will try the rest of the pattern at all possible matching points,
3522
+ so there is no point trying again.... er ....
3523
+ 
3524
+ .... except when the .* appears inside capturing parentheses, and there is a
3525
+ subsequent back reference to those parentheses. We haven't enough information
3526
+ to catch that case precisely.
3527
+ 
3528
+ At first, the best we could do was to detect when .* was in capturing brackets
3529
+ and the highest back reference was greater than or equal to that level.
3530
+ However, by keeping a bitmap of the first 31 back references, we can catch some
3531
+ of the more common cases more precisely.
3532
+ 
3533
+ Arguments:
3534
+ code           points to start of expression (the bracket)
3535
+ options        points to the options setting
3536
+ bracket_map    a bitmap of which brackets we are inside while testing; this
3537
+ handles up to substring 31; after that we just have to take
3538
+ the less precise approach
3539
+ backref_map    the back reference bitmap
3540
+ 
3541
+ Returns:     TRUE or FALSE
3542
+ */
2049 3543
 
2050
-      if (class_charcount > 0)
3544
+static BOOL
3545
+is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3546
+            unsigned int backref_map)
3547
+{
3548
+    do {
3549
+        const uschar *scode =
3550
+        first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3551
+        register int op = *scode;
3552
+        
3553
+        /* Capturing brackets */
3554
+        
3555
+        if (op > OP_BRA)
2051 3556
         {
2052
-        *code++ |= XCL_MAP;
2053
-        memcpy(code, class, 32);
2054
-        code = class_utf8data;
3557
+            int new_map;
3558
+            op -= OP_BRA;
3559
+            if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3560
+            new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3561
+            if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
2055 3562
         }
2056
-
2057
-      /* If the map is not required, slide down the extra data. */
2058
-
2059
-      else
3563
+        
3564
+        /* Other brackets */
3565
+        
3566
+        else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3567
+        {
3568
+            if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3569
+        }
3570
+        
3571
+        /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3572
+         are or may be referenced. */
3573
+        
3574
+        else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3575
+                 (*options & PCRE_DOTALL) != 0)
2060 3576
         {
2061
-        int len = class_utf8data - (code + 33);
2062
-        memmove(code + 1, code + 33, len);
2063
-        code += len + 1;
3577
+            if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
2064 3578
         }
3579
+        
3580
+        /* Check for explicit anchoring */
3581
+        
3582
+        else if (op != OP_SOD && op != OP_SOM &&
3583
+                 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3584
+            return FALSE;
3585
+        code += GET(code, 1);
3586
+    }
3587
+    while (*code == OP_ALT);   /* Loop for each alternative */
3588
+    return TRUE;
3589
+}
2065 3590
 
2066
-      /* Now fill in the complete length of the item */
2067 3591
 
2068
-      PUT(previous, 1, code - previous);
2069
-      break;   /* End of class handling */
2070
-      }
2071
-#endif
2072 3592
 
2073
-    /* If there are no characters > 255, negate the 32-byte map if necessary,
2074
-    and copy it into the code vector. If this is the first thing in the branch,
2075
-    there can be no first char setting, whatever the repeat count. Any reqbyte
2076
-    setting must remain unchanged after any kind of repeat. */
3593
+/*************************************************
3594
+ *         Check for starting with ^ or .*        *
3595
+ *************************************************/
2077 3596
 
2078
-    if (negate_class)
2079
-      {
2080
-      *code++ = OP_NCLASS;
2081
-      for (c = 0; c < 32; c++) code[c] = ~class[c];
2082
-      }
2083
-    else
2084
-      {
2085
-      *code++ = OP_CLASS;
2086
-      memcpy(code, class, 32);
2087
-      }
2088
-    code += 32;
2089
-    break;
2090
-
2091
-    /* Various kinds of repeat */
2092
-
2093
-    case '{':
2094
-    if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
2095
-    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
2096
-    if (*errorptr != NULL) goto FAILED;
2097
-    goto REPEAT;
2098
-
2099
-    case '*':
2100
-    repeat_min = 0;
2101
-    repeat_max = -1;
2102
-    goto REPEAT;
2103
-
2104
-    case '+':
2105
-    repeat_min = 1;
2106
-    repeat_max = -1;
2107
-    goto REPEAT;
2108
-
2109
-    case '?':
2110
-    repeat_min = 0;
2111
-    repeat_max = 1;
2112
-
2113
-    REPEAT:
2114
-    if (previous == NULL)
2115
-      {
2116
-      *errorptr = ERR9;
2117
-      goto FAILED;
2118
-      }
2119
-
2120
-    if (repeat_min == 0)
2121
-      {
2122
-      firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2123
-      reqbyte = zeroreqbyte;        /* Ditto */
2124
-      }
2125
-
2126
-    /* Remember whether this is a variable length repeat */
2127
-
2128
-    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2129
-
2130
-    op_type = 0;                    /* Default single-char op codes */
2131
-    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2132
-
2133
-    /* Save start of previous item, in case we have to move it up to make space
2134
-    for an inserted OP_ONCE for the additional '+' extension. */
2135
-
2136
-    tempcode = previous;
2137
-
2138
-    /* If the next character is '+', we have a possessive quantifier. This
2139
-    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2140
-    If the next character is '?' this is a minimizing repeat, by default,
2141
-    but if PCRE_UNGREEDY is set, it works the other way round. We change the
2142
-    repeat type to the non-default. */
2143
-
2144
-    if (ptr[1] == '+')
2145
-      {
2146
-      repeat_type = 0;                  /* Force greedy */
2147
-      possessive_quantifier = TRUE;
2148
-      ptr++;
2149
-      }
2150
-    else if (ptr[1] == '?')
2151
-      {
2152
-      repeat_type = greedy_non_default;
2153
-      ptr++;
2154
-      }
2155
-    else repeat_type = greedy_default;
2156
-
2157
-    /* If previous was a recursion, we need to wrap it inside brackets so that
2158
-    it can be replicated if necessary. */
2159
-
2160
-    if (*previous == OP_RECURSE)
2161
-      {
2162
-      memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2163
-      code += 1 + LINK_SIZE;
2164
-      *previous = OP_BRA;
2165
-      PUT(previous, 1, code - previous);
2166
-      *code = OP_KET;
2167
-      PUT(code, 1, code - previous);
2168
-      code += 1 + LINK_SIZE;
2169
-      }
2170
-
2171
-    /* If previous was a string of characters, chop off the last one and use it
2172
-    as the subject of the repeat. If there was only one character, we can
2173
-    abolish the previous item altogether. If a one-char item has a minumum of
2174
-    more than one, ensure that it is set in reqbyte - it might not be if a
2175
-    sequence such as x{3} is the first thing in a branch because the x will
2176
-    have gone into firstbyte instead.  */
2177
-
2178
-    if (*previous == OP_CHARS)
2179
-      {
2180
-      /* Deal with UTF-8 characters that take up more than one byte. It's
2181
-      easier to write this out separately than try to macrify it. Use c to
2182
-      hold the length of the character in bytes, plus 0x80 to flag that it's a
2183
-      length rather than a small character. */
3597
+/* This is called to find out if every branch starts with ^ or .* so that
3598
+ "first char" processing can be done to speed things up in multiline
3599
+ matching and for non-DOTALL patterns that start with .* (which must start at
3600
+ the beginning or after \n). As in the case of is_anchored() (see above), we
3601
+ have to take account of back references to capturing brackets that contain .*
3602
+ because in that case we can't make the assumption.
3603
+ 
3604
+ Arguments:
3605
+ code           points to start of expression (the bracket)
3606
+ bracket_map    a bitmap of which brackets we are inside while testing; this
3607
+ handles up to substring 31; after that we just have to take
3608
+ the less precise approach
3609
+ backref_map    the back reference bitmap
3610
+ 
3611
+ Returns:         TRUE or FALSE
3612
+ */
2184 3613
 
2185
-#ifdef SUPPORT_UTF8
2186
-      if (utf8 && (code[-1] & 0x80) != 0)
3614
+static BOOL
3615
+is_startline(const uschar *code, unsigned int bracket_map,
3616
+             unsigned int backref_map)
3617
+{
3618
+    do {
3619
+        const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3620
+        register int op = *scode;
3621
+        
3622
+        /* Capturing brackets */
3623
+        
3624
+        if (op > OP_BRA)
2187 3625
         {
2188
-        uschar *lastchar = code - 1;
2189
-        while((*lastchar & 0xc0) == 0x80) lastchar--;
2190
-        c = code - lastchar;            /* Length of UTF-8 character */
2191
-        memcpy(utf8_char, lastchar, c); /* Save the char */
2192
-        if (lastchar == previous + 2)   /* There was only one character */
2193
-          {
2194
-          code = previous;              /* Abolish the previous item */
2195
-          }
2196
-        else
2197
-          {
2198
-          previous[1] -= c;             /* Adjust length of previous */
2199
-          code = lastchar;              /* Lost char off the end */
2200
-          tempcode = code;              /* Adjust position to be moved for '+' */
2201
-          }
2202
-        c |= 0x80;                      /* Flag c as a length */
3626
+            int new_map;
3627
+            op -= OP_BRA;
3628
+            if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3629
+            new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3630
+            if (!is_startline(scode, new_map, backref_map)) return FALSE;
2203 3631
         }
2204
-      else
2205
-#endif
2206
-
2207
-      /* Handle the case of a single byte - either with no UTF8 support, or
2208
-      with UTF-8 disabled, or for a UTF-8 character < 128. */
2209
-
3632
+        
3633
+        /* Other brackets */
3634
+        
3635
+        else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3636
+        { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3637
+        
3638
+        /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3639
+         may be referenced. */
3640
+        
3641
+        else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2210 3642
         {
2211
-        c = *(--code);
2212
-        if (code == previous + 2)   /* There was only one character */
2213
-          {
2214
-          code = previous;              /* Abolish the previous item */
2215
-          if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2216
-          }
2217
-        else
2218
-          {
2219
-          previous[1]--;             /* adjust length */
2220
-          tempcode = code;           /* Adjust position to be moved for '+' */
2221
-          }
3643
+            if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
2222 3644
         }
3645
+        
3646
+        /* Check for explicit circumflex */
3647
+        
3648
+        else if (op != OP_CIRC) return FALSE;
3649
+        code += GET(code, 1);
3650
+    }
3651
+    while (*code == OP_ALT);  /* Loop for each alternative */
3652
+    return TRUE;
3653
+}
2223 3654
 
2224
-      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2225
-      }
2226
-
2227
-    /* If previous was a single negated character ([^a] or similar), we use
2228
-    one of the special opcodes, replacing it. The code is shared with single-
2229
-    character repeats by setting opt_type to add a suitable offset into
2230
-    repeat_type. OP_NOT is currently used only for single-byte chars. */
2231
-
2232
-    else if (*previous == OP_NOT)
2233
-      {
2234
-      op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2235
-      c = previous[1];
2236
-      code = previous;
2237
-      goto OUTPUT_SINGLE_REPEAT;
2238
-      }
2239
-
2240
-    /* If previous was a character type match (\d or similar), abolish it and
2241
-    create a suitable repeat item. The code is shared with single-character
2242
-    repeats by setting op_type to add a suitable offset into repeat_type. */
2243
-
2244
-    else if (*previous < OP_EODN)
2245
-      {
2246
-      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2247
-      c = *previous;
2248
-      code = previous;
2249
-
2250
-      OUTPUT_SINGLE_REPEAT:
2251
-
2252
-      /* If the maximum is zero then the minimum must also be zero; Perl allows
2253
-      this case, so we do too - by simply omitting the item altogether. */
2254
-
2255
-      if (repeat_max == 0) goto END_REPEAT;
2256 3655
 
2257
-      /* Combine the op_type with the repeat_type */
2258 3656
 
2259
-      repeat_type += op_type;
3657
+/*************************************************
3658
+ *       Check for asserted fixed first char      *
3659
+ *************************************************/
2260 3660
 
2261
-      /* A minimum of zero is handled either as the special case * or ?, or as
2262
-      an UPTO, with the maximum given. */
3661
+/* During compilation, the "first char" settings from forward assertions are
3662
+ discarded, because they can cause conflicts with actual literals that follow.
3663
+ However, if we end up without a first char setting for an unanchored pattern,
3664
+ it is worth scanning the regex to see if there is an initial asserted first
3665
+ char. If all branches start with the same asserted char, or with a bracket all
3666
+ of whose alternatives start with the same asserted char (recurse ad lib), then
3667
+ we return that char, otherwise -1.
3668
+ 
3669
+ Arguments:
3670
+ code       points to start of expression (the bracket)
3671
+ options    pointer to the options (used to check casing changes)
3672
+ inassert   TRUE if in an assertion
3673
+ 
3674
+ Returns:     -1 or the fixed first char
3675
+ */
2263 3676
 
2264
-      if (repeat_min == 0)
3677
+static int
3678
+find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3679
+{
3680
+    register int c = -1;
3681
+    do {
3682
+        int d;
3683
+        const uschar *scode =
3684
+        first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3685
+        register int op = *scode;
3686
+        
3687
+        if (op >= OP_BRA) op = OP_BRA;
3688
+        
3689
+        switch(op)
2265 3690
         {
2266
-        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2267
-          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2268
-        else
2269
-          {
2270
-          *code++ = OP_UPTO + repeat_type;
2271
-          PUT2INC(code, 0, repeat_max);
2272
-          }
3691
+            default:
3692
+                return -1;
3693
+                
3694
+            case OP_BRA:
3695
+            case OP_ASSERT:
3696
+            case OP_ONCE:
3697
+            case OP_COND:
3698
+                if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3699
+                    return -1;
3700
+                if (c < 0) c = d; else if (c != d) return -1;
3701
+                break;
3702
+                
3703
+            case OP_EXACT:       /* Fall through */
3704
+                scode++;
3705
+                
3706
+            case OP_CHARS:       /* Fall through */
3707
+                scode++;
3708
+                
3709
+            case OP_PLUS:
3710
+            case OP_MINPLUS:
3711
+                if (!inassert) return -1;
3712
+                if (c < 0)
3713
+                {
3714
+                    c = scode[1];
3715
+                    if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3716
+                }
3717
+                else if (c != scode[1]) return -1;
3718
+                break;
2273 3719
         }
3720
+        
3721
+        code += GET(code, 1);
3722
+    }
3723
+    while (*code == OP_ALT);
3724
+    return c;
3725
+}
2274 3726
 
2275
-      /* The case {1,} is handled as the special case + */
2276
-
2277
-      else if (repeat_min == 1 && repeat_max == -1)
2278
-        *code++ = OP_PLUS + repeat_type;
2279
-
2280
-      /* The case {n,n} is just an EXACT, while the general case {n,m} is
2281
-      handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2282
-
2283
-      else
2284
-        {
2285
-        if (repeat_min != 1)
2286
-          {
2287
-          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
2288
-          PUT2INC(code, 0, repeat_min);
2289
-          }
2290
-
2291
-        /* If the mininum is 1 and the previous item was a character string,
2292
-        we either have to put back the item that got cancelled if the string
2293
-        length was 1, or add the character back onto the end of a longer
2294
-        string. For a character type nothing need be done; it will just get
2295
-        put back naturally. Note that the final character is always going to
2296
-        get added below, so we leave code ready for its insertion. */
2297
-
2298
-        else if (*previous == OP_CHARS)
2299
-          {
2300
-          if (code == previous) code += 2; else
2301
-
2302
-          /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2303
-          bit set as a flag. The length will always be between 2 and 6. */
2304 3727
 
2305
-#ifdef SUPPORT_UTF8
2306
-          if (utf8 && c >= 128) previous[1] += c & 7; else
2307
-#endif
2308
-          previous[1]++;
2309
-          }
2310 3728
 
2311
-        /*  For a single negated character we also have to put back the
2312
-        item that got cancelled. At present this applies only to single byte
2313
-        characters in any mode. */
2314 3729
 
2315
-        else if (*previous == OP_NOT) code++;
3730
+/*************************************************
3731
+ *        Compile a Regular Expression            *
3732
+ *************************************************/
2316 3733
 
2317
-        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2318
-        we have to insert the character for the previous code. In UTF-8 mode,
2319
-        long characters have their length in c, with the 0x80 bit as a flag. */
3734
+/* This function takes a string and returns a pointer to a block of store
3735
+ holding a compiled version of the expression.
3736
+ 
3737
+ Arguments:
3738
+ pattern      the regular expression
3739
+ options      various option bits
3740
+ errorptr     pointer to pointer to error text
3741
+ erroroffset  ptr offset in pattern where error was detected
3742
+ tables       pointer to character tables or NULL
3743
+ 
3744
+ Returns:       pointer to compiled data block, or NULL on error,
3745
+ with errorptr and erroroffset set
3746
+ */
2320 3747
 
2321
-        if (repeat_max < 0)
2322
-          {
3748
+pcre *
3749
+pcre_compile(const char *pattern, int options, const char **errorptr,
3750
+             int *erroroffset, const unsigned char *tables)
3751
+{
3752
+    real_pcre *re;
3753
+    int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
3754
+    int runlength;
3755
+    int c, firstbyte, reqbyte;
3756
+    int bracount = 0;
3757
+    int branch_extra = 0;
3758
+    int branch_newextra;
3759
+    int item_count = -1;
3760
+    int name_count = 0;
3761
+    int max_name_size = 0;
2323 3762
 #ifdef SUPPORT_UTF8
2324
-          if (utf8 && c >= 128)
2325
-            {
2326
-            memcpy(code, utf8_char, c & 7);
2327
-            code += c & 7;
2328
-            }
2329
-          else
3763
+    int lastcharlength = 0;
3764
+    BOOL utf8;
3765
+    BOOL class_utf8;
2330 3766
 #endif
2331
-          *code++ = c;
2332
-          *code++ = OP_STAR + repeat_type;
2333
-          }
2334
-
2335
-        /* Else insert an UPTO if the max is greater than the min, again
2336
-        preceded by the character, for the previously inserted code. */
2337
-
2338
-        else if (repeat_max != repeat_min)
2339
-          {
3767
+    BOOL inescq = FALSE;
3768
+    unsigned int brastackptr = 0;
3769
+    size_t size;
3770
+    uschar *code;
3771
+    const uschar *codestart;
3772
+    const uschar *ptr;
3773
+    compile_data compile_block;
3774
+    int brastack[BRASTACK_SIZE];
3775
+    uschar bralenstack[BRASTACK_SIZE];
3776
+    
3777
+    /* We can't pass back an error message if errorptr is NULL; I guess the best we
3778
+     can do is just return NULL. */
3779
+    
3780
+    if (errorptr == NULL) return NULL;
3781
+    *errorptr = NULL;
3782
+    
3783
+    /* However, we can give a message for this error */
3784
+    
3785
+    if (erroroffset == NULL)
3786
+    {
3787
+        *errorptr = ERR16;
3788
+        return NULL;
3789
+    }
3790
+    *erroroffset = 0;
3791
+    
3792
+    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3793
+    
2340 3794
 #ifdef SUPPORT_UTF8
2341
-          if (utf8 && c >= 128)
3795
+    utf8 = (options & PCRE_UTF8) != 0;
3796
+#else
3797
+    if ((options & PCRE_UTF8) != 0)
3798
+    {
3799
+        *errorptr = ERR32;
3800
+        return NULL;
3801
+    }
3802
+#endif
3803
+    
3804
+    if ((options & ~PUBLIC_OPTIONS) != 0)
3805
+    {
3806
+        *errorptr = ERR17;
3807
+        return NULL;
3808
+    }
3809
+    
3810
+    /* Set up pointers to the individual character tables */
3811
+    
3812
+    if (tables == NULL) tables = pcre_default_tables;
3813
+    compile_block.lcc = tables + lcc_offset;
3814
+    compile_block.fcc = tables + fcc_offset;
3815
+    compile_block.cbits = tables + cbits_offset;
3816
+    compile_block.ctypes = tables + ctypes_offset;
3817
+    
3818
+    /* Maximum back reference and backref bitmap. This is updated for numeric
3819
+     references during the first pass, but for named references during the actual
3820
+     compile pass. The bitmap records up to 31 back references to help in deciding
3821
+     whether (.*) can be treated as anchored or not. */
3822
+    
3823
+    compile_block.top_backref = 0;
3824
+    compile_block.backref_map = 0;
3825
+    
3826
+    /* Reflect pattern for debugging output */
3827
+    
3828
+    DPRINTF(("------------------------------------------------------------------\n"));
3829
+    DPRINTF(("%s\n", pattern));
3830
+    
3831
+    /* The first thing to do is to make a pass over the pattern to compute the
3832
+     amount of store required to hold the compiled code. This does not have to be
3833
+     perfect as long as errors are overestimates. At the same time we can detect any
3834
+     flag settings right at the start, and extract them. Make an attempt to correct
3835
+     for any counted white space if an "extended" flag setting appears late in the
3836
+     pattern. We can't be so clever for #-comments. */
3837
+    
3838
+    ptr = (const uschar *)(pattern - 1);
3839
+    while ((c = *(++ptr)) != 0)
3840
+    {
3841
+        int min, max;
3842
+        int class_optcount;
3843
+        int bracket_length;
3844
+        int duplength;
3845
+        
3846
+        /* If we are inside a \Q...\E sequence, all chars are literal */
3847
+        
3848
+        if (inescq) goto NORMAL_CHAR;
3849
+        
3850
+        /* Otherwise, first check for ignored whitespace and comments */
3851
+        
3852
+        if ((options & PCRE_EXTENDED) != 0)
3853
+        {
3854
+            if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3855
+            if (c == '#')
2342 3856
             {
2343
-            memcpy(code, utf8_char, c & 7);
2344
-            code += c & 7;
3857
+                /* The space before the ; is to avoid a warning on a silly compiler
3858
+                 on the Macintosh. */
3859
+                while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3860
+                if (c == 0) break;
3861
+                continue;
2345 3862
             }
2346
-          else
2347
-#endif
2348
-          *code++ = c;
2349
-          repeat_max -= repeat_min;
2350
-          *code++ = OP_UPTO + repeat_type;
2351
-          PUT2INC(code, 0, repeat_max);
2352
-          }
2353 3863
         }
2354
-
2355
-      /* The character or character type itself comes last in all cases. */
2356
-
2357
-#ifdef SUPPORT_UTF8
2358
-      if (utf8 && c >= 128)
2359
-        {
2360
-        memcpy(code, utf8_char, c & 7);
2361
-        code += c & 7;
2362
-        }
2363
-      else
2364
-#endif
2365
-
2366
-      *code++ = c;
2367
-      }
2368
-
2369
-    /* If previous was a character class or a back reference, we put the repeat
2370
-    stuff after it, but just skip the item if the repeat was {0,0}. */
2371
-
2372
-    else if (*previous == OP_CLASS ||
2373
-             *previous == OP_NCLASS ||
2374
-#ifdef SUPPORT_UTF8
2375
-             *previous == OP_XCLASS ||
2376
-#endif
2377
-             *previous == OP_REF)
2378
-      {
2379
-      if (repeat_max == 0)
2380
-        {
2381
-        code = previous;
2382
-        goto END_REPEAT;
2383
-        }
2384
-      if (repeat_min == 0 && repeat_max == -1)
2385
-        *code++ = OP_CRSTAR + repeat_type;
2386
-      else if (repeat_min == 1 && repeat_max == -1)
2387
-        *code++ = OP_CRPLUS + repeat_type;
2388
-      else if (repeat_min == 0 && repeat_max == 1)
2389
-        *code++ = OP_CRQUERY + repeat_type;
2390
-      else
2391
-        {
2392
-        *code++ = OP_CRRANGE + repeat_type;
2393
-        PUT2INC(code, 0, repeat_min);
2394
-        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
2395
-        PUT2INC(code, 0, repeat_max);
2396
-        }
2397
-      }
2398
-
2399
-    /* If previous was a bracket group, we may have to replicate it in certain
2400
-    cases. */
2401
-
2402
-    else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2403
-             *previous == OP_COND)
2404
-      {
2405
-      register int i;
2406
-      int ketoffset = 0;
2407
-      int len = code - previous;
2408
-      uschar *bralink = NULL;
2409
-
2410
-      /* If the maximum repeat count is unlimited, find the end of the bracket
2411
-      by scanning through from the start, and compute the offset back to it
2412
-      from the current code pointer. There may be an OP_OPT setting following
2413
-      the final KET, so we can't find the end just by going back from the code
2414
-      pointer. */
2415
-
2416
-      if (repeat_max == -1)
2417
-        {
2418
-        register uschar *ket = previous;
2419
-        do ket += GET(ket, 1); while (*ket != OP_KET);
2420
-        ketoffset = code - ket;
2421
-        }
2422
-
2423
-      /* The case of a zero minimum is special because of the need to stick
2424
-      OP_BRAZERO in front of it, and because the group appears once in the
2425
-      data, whereas in other cases it appears the minimum number of times. For
2426
-      this reason, it is simplest to treat this case separately, as otherwise
2427
-      the code gets far too messy. There are several special subcases when the
2428
-      minimum is zero. */
2429
-
2430
-      if (repeat_min == 0)
2431
-        {
2432
-        /* If the maximum is also zero, we just omit the group from the output
2433
-        altogether. */
2434
-
2435
-        if (repeat_max == 0)
2436
-          {
2437
-          code = previous;
2438
-          goto END_REPEAT;
2439
-          }
2440
-
2441
-        /* If the maximum is 1 or unlimited, we just have to stick in the
2442
-        BRAZERO and do no more at this point. */
2443
-
2444
-        if (repeat_max <= 1)
2445
-          {
2446
-          memmove(previous+1, previous, len);
2447
-          code++;
2448
-          *previous++ = OP_BRAZERO + repeat_type;
2449
-          }
2450
-
2451
-        /* If the maximum is greater than 1 and limited, we have to replicate
2452
-        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2453
-        The first one has to be handled carefully because it's the original
2454
-        copy, which has to be moved up. The remainder can be handled by code
2455
-        that is common with the non-zero minimum case below. We just have to
2456
-        adjust the value or repeat_max, since one less copy is required. */
2457
-
2458
-        else
2459
-          {
2460
-          int offset;
2461
-          memmove(previous + 2 + LINK_SIZE, previous, len);
2462
-          code += 2 + LINK_SIZE;
2463
-          *previous++ = OP_BRAZERO + repeat_type;
2464
-          *previous++ = OP_BRA;
2465
-
2466
-          /* We chain together the bracket offset fields that have to be
2467
-          filled in later when the ends of the brackets are reached. */
2468
-
2469
-          offset = (bralink == NULL)? 0 : previous - bralink;
2470
-          bralink = previous;
2471
-          PUTINC(previous, 0, offset);
2472
-          }
2473
-
2474
-        repeat_max--;
2475
-        }
2476
-
2477
-      /* If the minimum is greater than zero, replicate the group as many
2478
-      times as necessary, and adjust the maximum to the number of subsequent
2479
-      copies that we need. If we set a first char from the group, and didn't
2480
-      set a required char, copy the latter from the former. */
2481
-
2482
-      else
2483
-        {
2484
-        if (repeat_min > 1)
2485
-          {
2486
-          if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2487
-          for (i = 1; i < repeat_min; i++)
2488
-            {
2489
-            memcpy(code, previous, len);
2490
-            code += len;
2491
-            }
2492
-          }
2493
-        if (repeat_max > 0) repeat_max -= repeat_min;
2494
-        }
2495
-
2496
-      /* This code is common to both the zero and non-zero minimum cases. If
2497
-      the maximum is limited, it replicates the group in a nested fashion,
2498
-      remembering the bracket starts on a stack. In the case of a zero minimum,
2499
-      the first one was set up above. In all cases the repeat_max now specifies
2500
-      the number of additional copies needed. */
2501
-
2502
-      if (repeat_max >= 0)
2503
-        {
2504
-        for (i = repeat_max - 1; i >= 0; i--)
2505
-          {
2506
-          *code++ = OP_BRAZERO + repeat_type;
2507
-
2508
-          /* All but the final copy start a new nesting, maintaining the
2509
-          chain of brackets outstanding. */
2510
-
2511
-          if (i != 0)
2512
-            {
2513
-            int offset;
2514
-            *code++ = OP_BRA;
2515
-            offset = (bralink == NULL)? 0 : code - bralink;
2516
-            bralink = code;
2517
-            PUTINC(code, 0, offset);
2518
-            }
2519
-
2520
-          memcpy(code, previous, len);
2521
-          code += len;
2522
-          }
2523
-
2524
-        /* Now chain through the pending brackets, and fill in their length
2525
-        fields (which are holding the chain links pro tem). */
2526
-
2527
-        while (bralink != NULL)
2528
-          {
2529
-          int oldlinkoffset;
2530
-          int offset = code - bralink + 1;
2531
-          uschar *bra = code - offset;
2532
-          oldlinkoffset = GET(bra, 1);
2533
-          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2534
-          *code++ = OP_KET;
2535
-          PUTINC(code, 0, offset);
2536
-          PUT(bra, 1, offset);
2537
-          }
2538
-        }
2539
-
2540
-      /* If the maximum is unlimited, set a repeater in the final copy. We
2541
-      can't just offset backwards from the current code point, because we
2542
-      don't know if there's been an options resetting after the ket. The
2543
-      correct offset was computed above. */
2544
-
2545
-      else code[-ketoffset] = OP_KETRMAX + repeat_type;
2546
-      }
2547
-
2548
-    /* Else there's some kind of shambles */
2549
-
2550
-    else
2551
-      {
2552
-      *errorptr = ERR11;
2553
-      goto FAILED;
2554
-      }
2555
-
2556
-    /* If the character following a repeat is '+', we wrap the entire repeated
2557
-    item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2558
-    Sun's Java package. The repeated item starts at tempcode, not at previous,
2559
-    which might be the first part of a string whose (former) last char we
2560
-    repeated. However, we don't support '+' after a greediness '?'. */
2561
-
2562
-    if (possessive_quantifier)
2563
-      {
2564
-      int len = code - tempcode;
2565
-      memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2566
-      code += 1 + LINK_SIZE;
2567
-      len += 1 + LINK_SIZE;
2568
-      tempcode[0] = OP_ONCE;
2569
-      *code++ = OP_KET;
2570
-      PUTINC(code, 0, len);
2571
-      PUT(tempcode, 1, len);
2572
-      }
2573
-
2574
-    /* In all case we no longer have a previous item. We also set the
2575
-    "follows varying string" flag for subsequently encountered reqbytes if
2576
-    it isn't already set and we have just passed a varying length item. */
2577
-
2578
-    END_REPEAT:
2579
-    previous = NULL;
2580
-    cd->req_varyopt |= reqvary;
2581
-    break;
2582
-
2583
-
2584
-    /* Start of nested bracket sub-expression, or comment or lookahead or
2585
-    lookbehind or option setting or condition. First deal with special things
2586
-    that can come after a bracket; all are introduced by ?, and the appearance
2587
-    of any of them means that this is not a referencing group. They were
2588
-    checked for validity in the first pass over the string, so we don't have to
2589
-    check for syntax errors here.  */
2590
-
2591
-    case '(':
2592
-    newoptions = options;
2593
-    skipbytes = 0;
2594
-
2595
-    if (*(++ptr) == '?')
2596
-      {
2597
-      int set, unset;
2598
-      int *optset;
2599
-
2600
-      switch (*(++ptr))
2601
-        {
2602
-        case '#':                 /* Comment; skip to ket */
2603
-        ptr++;
2604
-        while (*ptr != ')') ptr++;
2605
-        continue;
2606
-
2607
-        case ':':                 /* Non-extracting bracket */
2608
-        bravalue = OP_BRA;
2609
-        ptr++;
2610
-        break;
2611
-
2612
-        case '(':
2613
-        bravalue = OP_COND;       /* Conditional group */
2614
-
2615
-        /* Condition to test for recursion */
2616
-
2617
-        if (ptr[1] == 'R')
2618
-          {
2619
-          code[1+LINK_SIZE] = OP_CREF;
2620
-          PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2621
-          skipbytes = 3;
2622
-          ptr += 3;
2623
-          }
2624
-
2625
-        /* Condition to test for a numbered subpattern match. We know that
2626
-        if a digit follows ( then there will just be digits until ) because
2627
-        the syntax was checked in the first pass. */
2628
-
2629
-        else if ((digitab[ptr[1]] & ctype_digit) != 0)
2630
-          {
2631
-          int condref;                 /* Don't amalgamate; some compilers */
2632
-          condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
2633
-          while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2634
-          if (condref == 0)
2635
-            {
2636
-            *errorptr = ERR35;
2637
-            goto FAILED;
2638
-            }
2639
-          ptr++;
2640
-          code[1+LINK_SIZE] = OP_CREF;
2641
-          PUT2(code, 2+LINK_SIZE, condref);
2642
-          skipbytes = 3;
2643
-          }
2644
-        /* For conditions that are assertions, we just fall through, having
2645
-        set bravalue above. */
2646
-        break;
2647
-
2648
-        case '=':                 /* Positive lookahead */
2649
-        bravalue = OP_ASSERT;
2650
-        ptr++;
2651
-        break;
2652
-
2653
-        case '!':                 /* Negative lookahead */
2654
-        bravalue = OP_ASSERT_NOT;
2655
-        ptr++;
2656
-        break;
2657
-
2658
-        case '<':                 /* Lookbehinds */
2659
-        switch (*(++ptr))
2660
-          {
2661
-          case '=':               /* Positive lookbehind */
2662
-          bravalue = OP_ASSERTBACK;
2663
-          ptr++;
2664
-          break;
2665
-
2666
-          case '!':               /* Negative lookbehind */
2667
-          bravalue = OP_ASSERTBACK_NOT;
2668
-          ptr++;
2669
-          break;
2670
-          }
2671
-        break;
2672
-
2673
-        case '>':                 /* One-time brackets */
2674
-        bravalue = OP_ONCE;
2675
-        ptr++;
2676
-        break;
2677
-
2678
-        case 'C':                 /* Callout - may be followed by digits */
2679
-        *code++ = OP_CALLOUT;
2680
-          {
2681
-          int n = 0;
2682
-          while ((digitab[*(++ptr)] & ctype_digit) != 0)
2683
-            n = n * 10 + *ptr - '0';
2684
-          if (n > 255)
2685
-            {
2686
-            *errorptr = ERR38;
2687
-            goto FAILED;
2688
-            }
2689
-          *code++ = n;
2690
-          }
2691
-        previous = NULL;
2692
-        continue;
2693
-
2694
-        case 'P':                 /* Named subpattern handling */
2695
-        if (*(++ptr) == '<')      /* Definition */
2696
-          {
2697
-          int i, namelen;
2698
-          uschar *slot = cd->name_table;
2699
-          const uschar *name;     /* Don't amalgamate; some compilers */
2700
-          name = ++ptr;           /* grumble at autoincrement in declaration */
2701
-
2702
-          while (*ptr++ != '>');
2703
-          namelen = ptr - name - 1;
2704
-
2705
-          for (i = 0; i < cd->names_found; i++)
2706
-            {
2707
-            int crc = memcmp(name, slot+2, namelen);
2708
-            if (crc == 0)
2709
-              {
2710
-              if (slot[2+namelen] == 0)
2711
-                {
2712
-                *errorptr = ERR43;
2713
-                goto FAILED;
2714
-                }
2715
-              crc = -1;             /* Current name is substring */
2716
-              }
2717
-            if (crc < 0)
2718
-              {
2719
-              memmove(slot + cd->name_entry_size, slot,
2720
-                (cd->names_found - i) * cd->name_entry_size);
2721
-              break;
2722
-              }
2723
-            slot += cd->name_entry_size;
2724
-            }
2725
-
2726
-          PUT2(slot, 0, *brackets + 1);
2727
-          memcpy(slot + 2, name, namelen);
2728
-          slot[2+namelen] = 0;
2729
-          cd->names_found++;
2730
-          goto NUMBERED_GROUP;
2731
-          }
2732
-
2733
-        if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
2734
-          {
2735
-          int i, namelen;
2736
-          int type = *ptr++;
2737
-          const uschar *name = ptr;
2738
-          uschar *slot = cd->name_table;
2739
-
2740
-          while (*ptr != ')') ptr++;
2741
-          namelen = ptr - name;
2742
-
2743
-          for (i = 0; i < cd->names_found; i++)
2744
-            {
2745
-            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2746
-            slot += cd->name_entry_size;
2747
-            }
2748
-          if (i >= cd->names_found)
2749
-            {
2750
-            *errorptr = ERR15;
2751
-            goto FAILED;
2752
-            }
2753
-
2754
-          recno = GET2(slot, 0);
2755
-
2756
-          if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
2757
-
2758
-          /* Back reference */
2759
-
2760
-          previous = code;
2761
-          *code++ = OP_REF;
2762
-          PUT2INC(code, 0, recno);
2763
-          cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2764
-          if (recno > cd->top_backref) cd->top_backref = recno;
2765
-          continue;
2766
-          }
2767
-
2768
-        /* Should never happen */
2769
-        break;
2770
-
2771
-        case 'R':                 /* Pattern recursion */
2772
-        ptr++;                    /* Same as (?0)      */
2773
-        /* Fall through */
2774
-
2775
-        /* Recursion or "subroutine" call */
2776
-
2777
-        case '0': case '1': case '2': case '3': case '4':
2778
-        case '5': case '6': case '7': case '8': case '9':
2779
-          {
2780
-          const uschar *called;
2781
-          recno = 0;
2782
-          while((digitab[*ptr] & ctype_digit) != 0)
2783
-            recno = recno * 10 + *ptr++ - '0';
2784
-
2785
-          /* Come here from code above that handles a named recursion */
2786
-
2787
-          HANDLE_RECURSION:
2788
-
2789
-          previous = code;
2790
-
2791
-          /* Find the bracket that is being referenced. Temporarily end the
2792
-          regex in case it doesn't exist. */
2793
-
2794
-          *code = OP_END;
2795
-          called = (recno == 0)?
2796
-            cd->start_code : find_bracket(cd->start_code, utf8, recno);
2797
-
2798
-          if (called == NULL)
2799
-            {
2800
-            *errorptr = ERR15;
2801
-            goto FAILED;
2802
-            }
2803
-
2804
-          /* If the subpattern is still open, this is a recursive call. We
2805
-          check to see if this is a left recursion that could loop for ever,
2806
-          and diagnose that case. */
2807
-
2808
-          if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2809
-            {
2810
-            *errorptr = ERR40;
2811
-            goto FAILED;
2812
-            }
2813
-
2814
-          /* Insert the recursion/subroutine item */
2815
-
2816
-          *code = OP_RECURSE;
2817
-          PUT(code, 1, called - cd->start_code);
2818
-          code += 1 + LINK_SIZE;
2819
-          }
2820
-        continue;
2821
-
2822
-        /* Character after (? not specially recognized */
2823
-
2824
-        default:                  /* Option setting */
2825
-        set = unset = 0;
2826
-        optset = &set;
2827
-
2828
-        while (*ptr != ')' && *ptr != ':')
2829
-          {
2830
-          switch (*ptr++)
2831
-            {
2832
-            case '-': optset = &unset; break;
2833
-
2834
-            case 'i': *optset |= PCRE_CASELESS; break;
2835
-            case 'm': *optset |= PCRE_MULTILINE; break;
2836
-            case 's': *optset |= PCRE_DOTALL; break;
2837
-            case 'x': *optset |= PCRE_EXTENDED; break;
2838
-            case 'U': *optset |= PCRE_UNGREEDY; break;
2839
-            case 'X': *optset |= PCRE_EXTRA; break;
2840
-            }
2841
-          }
2842
-
2843
-        /* Set up the changed option bits, but don't change anything yet. */
2844
-
2845
-        newoptions = (options | set) & (~unset);
2846
-
2847
-        /* If the options ended with ')' this is not the start of a nested
2848
-        group with option changes, so the options change at this level. Compile
2849
-        code to change the ims options if this setting actually changes any of
2850
-        them. We also pass the new setting back so that it can be put at the
2851
-        start of any following branches, and when this group ends (if we are in
2852
-        a group), a resetting item can be compiled.
2853
-
2854
-        Note that if this item is right at the start of the pattern, the
2855
-        options will have been abstracted and made global, so there will be no
2856
-        change to compile. */
2857
-
2858
-        if (*ptr == ')')
2859
-          {
2860
-          if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
2861
-            {
2862
-            *code++ = OP_OPT;
2863
-            *code++ = newoptions & PCRE_IMS;
2864
-            }
2865
-
2866
-          /* Change options at this level, and pass them back for use
2867
-          in subsequent branches. Reset the greedy defaults and the case
2868
-          value for firstbyte and reqbyte. */
2869
-
2870
-          *optionsptr = options = newoptions;
2871
-          greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2872
-          greedy_non_default = greedy_default ^ 1;
2873
-          req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2874
-
2875
-          previous = NULL;       /* This item can't be repeated */
2876
-          continue;              /* It is complete */
2877
-          }
2878
-
2879
-        /* If the options ended with ':' we are heading into a nested group
2880
-        with possible change of options. Such groups are non-capturing and are
2881
-        not assertions of any kind. All we need to do is skip over the ':';
2882
-        the newoptions value is handled below. */
2883
-
2884
-        bravalue = OP_BRA;
2885
-        ptr++;
2886
-        }
2887
-      }
2888
-
2889
-    /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2890
-    non-capturing and behave like (?:...) brackets */
2891
-
2892
-    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2893
-      {
2894
-      bravalue = OP_BRA;
2895
-      }
2896
-
2897
-    /* Else we have a referencing group; adjust the opcode. If the bracket
2898
-    number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2899
-    arrange for the true number to follow later, in an OP_BRANUMBER item. */
2900
-
2901
-    else
2902
-      {
2903
-      NUMBERED_GROUP:
2904
-      if (++(*brackets) > EXTRACT_BASIC_MAX)
2905
-        {
2906
-        bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2907
-        code[1+LINK_SIZE] = OP_BRANUMBER;
2908
-        PUT2(code, 2+LINK_SIZE, *brackets);
2909
-        skipbytes = 3;
2910
-        }
2911
-      else bravalue = OP_BRA + *brackets;
2912
-      }
2913
-
2914
-    /* Process nested bracketed re. Assertions may not be repeated, but other
2915
-    kinds can be. We copy code into a non-register variable in order to be able
2916
-    to pass its address because some compilers complain otherwise. Pass in a
2917
-    new setting for the ims options if they have changed. */
2918
-
2919
-    previous = (bravalue >= OP_ONCE)? code : NULL;
2920
-    *code = bravalue;
2921
-    tempcode = code;
2922
-    tempreqvary = cd->req_varyopt;     /* Save value before bracket */
2923
-
2924
-    if (!compile_regex(
2925
-         newoptions,                   /* The complete new option state */
2926
-         options & PCRE_IMS,           /* The previous ims option state */
2927
-         brackets,                     /* Extracting bracket count */
2928
-         &tempcode,                    /* Where to put code (updated) */
2929
-         &ptr,                         /* Input pointer (updated) */
2930
-         errorptr,                     /* Where to put an error message */
2931
-         (bravalue == OP_ASSERTBACK ||
2932
-          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2933
-         skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
2934
-         &subfirstbyte,                /* For possible first char */
2935
-         &subreqbyte,                  /* For possible last char */
2936
-         bcptr,                        /* Current branch chain */
2937
-         cd))                          /* Tables block */
2938
-      goto FAILED;
2939
-
2940
-    /* At the end of compiling, code is still pointing to the start of the
2941
-    group, while tempcode has been updated to point past the end of the group
2942
-    and any option resetting that may follow it. The pattern pointer (ptr)
2943
-    is on the bracket. */
2944
-
2945
-    /* If this is a conditional bracket, check that there are no more than
2946
-    two branches in the group. */
2947
-
2948
-    else if (bravalue == OP_COND)
2949
-      {
2950
-      uschar *tc = code;
2951
-      condcount = 0;
2952
-
2953
-      do {
2954
-         condcount++;
2955
-         tc += GET(tc,1);
2956
-         }
2957
-      while (*tc != OP_KET);
2958
-
2959
-      if (condcount > 2)
2960
-        {
2961
-        *errorptr = ERR27;
2962
-        goto FAILED;
2963
-        }
2964
-
2965
-      /* If there is just one branch, we must not make use of its firstbyte or
2966
-      reqbyte, because this is equivalent to an empty second branch. */
2967
-
2968
-      if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2969
-      }
2970
-
2971
-    /* Handle updating of the required and first characters. Update for normal
2972
-    brackets of all kinds, and conditions with two branches (see code above).
2973
-    If the bracket is followed by a quantifier with zero repeat, we have to
2974
-    back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2975
-    main loop so that they can be accessed for the back off. */
2976
-
2977
-    zeroreqbyte = reqbyte;
2978
-    zerofirstbyte = firstbyte;
2979
-    groupsetfirstbyte = FALSE;
2980
-
2981
-    if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2982
-      {
2983
-      /* If we have not yet set a firstbyte in this branch, take it from the
2984
-      subpattern, remembering that it was set here so that a repeat of more
2985
-      than one can replicate it as reqbyte if necessary. If the subpattern has
2986
-      no firstbyte, set "none" for the whole branch. In both cases, a zero
2987
-      repeat forces firstbyte to "none". */
2988
-
2989
-      if (firstbyte == REQ_UNSET)
2990
-        {
2991
-        if (subfirstbyte >= 0)
2992
-          {
2993
-          firstbyte = subfirstbyte;
2994
-          groupsetfirstbyte = TRUE;
2995
-          }
2996
-        else firstbyte = REQ_NONE;
2997
-        zerofirstbyte = REQ_NONE;
2998
-        }
2999
-
3000
-      /* If firstbyte was previously set, convert the subpattern's firstbyte
3001
-      into reqbyte if there wasn't one, using the vary flag that was in
3002
-      existence beforehand. */
3003
-
3004
-      else if (subfirstbyte >= 0 && subreqbyte < 0)
3005
-        subreqbyte = subfirstbyte | tempreqvary;
3006
-
3007
-      /* If the subpattern set a required byte (or set a first byte that isn't
3008
-      really the first byte - see above), set it. */
3009
-
3010
-      if (subreqbyte >= 0) reqbyte = subreqbyte;
3011
-      }
3012
-
3013
-    /* For a forward assertion, we take the reqbyte, if set. This can be
3014
-    helpful if the pattern that follows the assertion doesn't set a different
3015
-    char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3016
-    for an assertion, however because it leads to incorrect effect for patterns
3017
-    such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3018
-    of a firstbyte. This is overcome by a scan at the end if there's no
3019
-    firstbyte, looking for an asserted first char. */
3020
-
3021
-    else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3022
-
3023
-    /* Now update the main code pointer to the end of the group. */
3024
-
3025
-    code = tempcode;
3026
-
3027
-    /* Error if hit end of pattern */
3028
-
3029
-    if (*ptr != ')')
3030
-      {
3031
-      *errorptr = ERR14;
3032
-      goto FAILED;
3033
-      }
3034
-    break;
3035
-
3036
-    /* Check \ for being a real metacharacter; if not, fall through and handle
3037
-    it as a data character at the start of a string. Escape items are checked
3038
-    for validity in the pre-compiling pass. */
3039
-
3040
-    case '\\':
3041
-    tempptr = ptr;
3042
-    c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3043
-
3044
-    /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3045
-    are arranged to be the negation of the corresponding OP_values. For the
3046
-    back references, the values are ESC_REF plus the reference number. Only
3047
-    back references and those types that consume a character may be repeated.
3048
-    We can test for values between ESC_b and ESC_Z for the latter; this may
3049
-    have to change if any new ones are ever created. */
3050
-
3051
-    if (c < 0)
3052
-      {
3053
-      if (-c == ESC_Q)            /* Handle start of quoted string */
3054
-        {
3055
-        if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3056
-          else inescq = TRUE;
3057
-        continue;
3058
-        }
3059
-
3060
-      /* For metasequences that actually match a character, we disable the
3061
-      setting of a first character if it hasn't already been set. */
3062
-
3063
-      if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3064
-        firstbyte = REQ_NONE;
3065
-
3066
-      /* Set values to reset to if this is followed by a zero repeat. */
3067
-
3068
-      zerofirstbyte = firstbyte;
3069
-      zeroreqbyte = reqbyte;
3070
-
3071
-      /* Back references are handled specially */
3072
-
3073
-      if (-c >= ESC_REF)
3074
-        {
3075
-        int number = -c - ESC_REF;
3076
-        previous = code;
3077
-        *code++ = OP_REF;
3078
-        PUT2INC(code, 0, number);
3079
-        }
3080
-      else
3081
-        {
3082
-        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3083
-        *code++ = -c;
3084
-        }
3085
-      continue;
3086
-      }
3087
-
3088
-    /* Data character: reset and fall through */
3089
-
3090
-    ptr = tempptr;
3091
-    c = '\\';
3092
-
3093
-    /* Handle a run of data characters until a metacharacter is encountered.
3094
-    The first character is guaranteed not to be whitespace or # when the
3095
-    extended flag is set. */
3096
-
3097
-    NORMAL_CHAR:
3098
-    default:
3099
-    previous = code;
3100
-    *code = OP_CHARS;
3101
-    code += 2;
3102
-    length = 0;
3103
-
3104
-    do
3105
-      {
3106
-      /* If in \Q...\E, check for the end; if not, we always have a literal */
3107
-
3108
-      if (inescq)
3109
-        {
3110
-        if (c == '\\' && ptr[1] == 'E')
3111
-          {
3112
-          inescq = FALSE;
3113
-          ptr++;
3114
-          }
3115
-        else
3116
-          {
3117
-          *code++ = c;
3118
-          length++;
3119
-          }
3120
-        continue;
3121
-        }
3122
-
3123
-      /* Skip white space and comments for /x patterns */
3124
-
3125
-      if ((options & PCRE_EXTENDED) != 0)
3126
-        {
3127
-        if ((cd->ctypes[c] & ctype_space) != 0) continue;
3128
-        if (c == '#')
3129
-          {
3130
-          /* The space before the ; is to avoid a warning on a silly compiler
3131
-          on the Macintosh. */
3132
-          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3133
-          if (c == 0) break;
3134
-          continue;
3135
-          }
3136
-        }
3137
-
3138
-      /* Backslash may introduce a data char or a metacharacter. Escaped items
3139
-      are checked for validity in the pre-compiling pass. Stop the string
3140
-      before a metaitem. */
3141
-
3142
-      if (c == '\\')
3143
-        {
3144
-        tempptr = ptr;
3145
-        c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3146
-        if (c < 0) { ptr = tempptr; break; }
3147
-
3148
-        /* If a character is > 127 in UTF-8 mode, we have to turn it into
3149
-        two or more characters in the UTF-8 encoding. */
3150
-
3151
-#ifdef SUPPORT_UTF8
3152
-        if (utf8 && c > 127)
3153
-          {
3154
-          uschar buffer[8];
3155
-          int len = ord2utf8(c, buffer);
3156
-          for (c = 0; c < len; c++) *code++ = buffer[c];
3157
-          length += len;
3158
-          continue;
3159
-          }
3160
-#endif
3161
-        }
3162
-
3163
-      /* Ordinary character or single-char escape */
3164
-
3165
-      *code++ = c;
3166
-      length++;
3167
-      }
3168
-
3169
-    /* This "while" is the end of the "do" above. */
3170
-
3171
-    while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3172
-
3173
-    /* Update the first and last requirements. These are always bytes, even in
3174
-    UTF-8 mode. However, there is a special case to be considered when there
3175
-    are only one or two characters. Because this gets messy in UTF-8 mode, the
3176
-    code is kept separate. When we get here "length" contains the number of
3177
-    bytes. */
3178
-
3179
-#ifdef SUPPORT_UTF8
3180
-    if (utf8 && length > 1)
3181
-      {
3182
-      uschar *t = previous + 3;                      /* After this code, t */
3183
-      while (t < code && (*t & 0xc0) == 0x80) t++;   /* follows the 1st char */
3184
-
3185
-      /* Handle the case when there is only one multibyte character. It must
3186
-      have at least two bytes because of the "length > 1" test above. */
3187
-
3188
-      if (t == code)
3189
-        {
3190
-        /* If no previous first byte, set it from this character, but revert to
3191
-        none on a zero repeat. */
3192
-
3193
-        if (firstbyte == REQ_UNSET)
3194
-          {
3195
-          zerofirstbyte = REQ_NONE;
3196
-          firstbyte = previous[2];
3197
-          }
3198
-
3199
-        /* Otherwise, leave the first byte value alone, and don't change it on
3200
-        a zero repeat */
3201
-
3202
-        else zerofirstbyte = firstbyte;
3203
-
3204
-        /* In both cases, a zero repeat resets the previous required byte */
3205
-
3206
-        zeroreqbyte = reqbyte;
3207
-        }
3208
-
3209
-      /* Handle the case when there is more than one character. These may be
3210
-      single-byte or multibyte characters */
3211
-
3212
-      else
3213
-        {
3214
-        t = code - 1;                       /* After this code, t is at the */
3215
-        while ((*t & 0xc0) == 0x80) t--;    /* start of the last character */
3216
-
3217
-        /* If no previous first byte, set it from the first character, and
3218
-        retain it on a zero repeat (of the last character). The required byte
3219
-        is reset on a zero repeat, either to the byte before the last
3220
-        character, unless this is the first byte of the string. In that case,
3221
-        it reverts to its previous value. */
3222
-
3223
-        if (firstbyte == REQ_UNSET)
3224
-          {
3225
-          zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3226
-          zeroreqbyte = (t - 1 == previous + 2)?
3227
-            reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3228
-          }
3229
-
3230
-        /* If there was a previous first byte, leave it alone, and don't change
3231
-        it on a zero repeat. The required byte is reset on a zero repeat to the
3232
-        byte before the last character. */
3233
-
3234
-        else
3235
-          {
3236
-          zerofirstbyte = firstbyte;
3237
-          zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3238
-          }
3239
-        }
3240
-
3241
-      /* In all cases (we know length > 1), the new required byte is the last
3242
-      byte of the string. */
3243
-
3244
-      reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3245
-      }
3246
-
3247
-    else   /* End of UTF-8 coding */
3248
-#endif
3249
-
3250
-    /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3251
-    or when UTF-8 is not enabled. */
3252
-
3253
-      {
3254
-      /* firstbyte was not previously set; take it from this string */
3255
-
3256
-      if (firstbyte == REQ_UNSET)
3257
-        {
3258
-        if (length == 1)
3259
-          {
3260
-          zerofirstbyte = REQ_NONE;
3261
-          firstbyte = previous[2] | req_caseopt;
3262
-          zeroreqbyte = reqbyte;
3263
-          }
3264
-        else
3265
-          {
3266
-          zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3267
-          zeroreqbyte = (length > 2)?
3268
-            (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3269
-          reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3270
-          }
3271
-        }
3272
-
3273
-      /* firstbyte was previously set */
3274
-
3275
-      else
3276
-        {
3277
-        zerofirstbyte = firstbyte;
3278
-        zeroreqbyte = (length == 1)? reqbyte :
3279
-          code[-2] | req_caseopt | cd->req_varyopt;
3280
-        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3281
-        }
3282
-      }
3283
-
3284
-    /* Set the length in the data vector, and advance to the next state. */
3285
-
3286
-    previous[1] = length;
3287
-    if (length < MAXLIT) ptr--;
3288
-    break;
3289
-    }
3290
-  }                   /* end of big loop */
3291
-
3292
-/* Control never reaches here by falling through, only by a goto for all the
3293
-error states. Pass back the position in the pattern so that it can be displayed
3294
-to the user for diagnosing the error. */
3295
-
3296
-FAILED:
3297
-*ptrptr = ptr;
3298
-return FALSE;
3299
-}
3300
-
3301
-
3302
-
3303
-
3304
-/*************************************************
3305
-*     Compile sequence of alternatives           *
3306
-*************************************************/
3307
-
3308
-/* On entry, ptr is pointing past the bracket character, but on return
3309
-it points to the closing bracket, or vertical bar, or end of string.
3310
-The code variable is pointing at the byte into which the BRA operator has been
3311
-stored. If the ims options are changed at the start (for a (?ims: group) or
3312
-during any branch, we need to insert an OP_OPT item at the start of every
3313
-following branch to ensure they get set correctly at run time, and also pass
3314
-the new options into every subsequent branch compile.
3315
-
3316
-Argument:
3317
-  options        option bits, including any changes for this subpattern
3318
-  oldims         previous settings of ims option bits
3319
-  brackets       -> int containing the number of extracting brackets used
3320
-  codeptr        -> the address of the current code pointer
3321
-  ptrptr         -> the address of the current pattern pointer
3322
-  errorptr       -> pointer to error message
3323
-  lookbehind     TRUE if this is a lookbehind assertion
3324
-  skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3325
-  firstbyteptr   place to put the first required character, or a negative number
3326
-  reqbyteptr     place to put the last required character, or a negative number
3327
-  bcptr          pointer to the chain of currently open branches
3328
-  cd             points to the data block with tables pointers etc.
3329
-
3330
-Returns:      TRUE on success
3331
-*/
3332
-
3333
-static BOOL
3334
-compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3335
-  const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3336
-  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3337
-{
3338
-const uschar *ptr = *ptrptr;
3339
-uschar *code = *codeptr;
3340
-uschar *last_branch = code;
3341
-uschar *start_bracket = code;
3342
-uschar *reverse_count = NULL;
3343
-int firstbyte, reqbyte;
3344
-int branchfirstbyte, branchreqbyte;
3345
-branch_chain bc;
3346
-
3347
-bc.outer = bcptr;
3348
-bc.current = code;
3349
-
3350
-firstbyte = reqbyte = REQ_UNSET;
3351
-
3352
-/* Offset is set zero to mark that this bracket is still open */
3353
-
3354
-PUT(code, 1, 0);
3355
-code += 1 + LINK_SIZE + skipbytes;
3356
-
3357
-/* Loop for each alternative branch */
3358
-
3359
-for (;;)
3360
-  {
3361
-  /* Handle a change of ims options at the start of the branch */
3362
-
3363
-  if ((options & PCRE_IMS) != oldims)
3364
-    {
3365
-    *code++ = OP_OPT;
3366
-    *code++ = options & PCRE_IMS;
3367
-    }
3368
-
3369
-  /* Set up dummy OP_REVERSE if lookbehind assertion */
3370
-
3371
-  if (lookbehind)
3372
-    {
3373
-    *code++ = OP_REVERSE;
3374
-    reverse_count = code;
3375
-    PUTINC(code, 0, 0);
3376
-    }
3377
-
3378
-  /* Now compile the branch */
3379
-
3380
-  if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3381
-        &branchfirstbyte, &branchreqbyte, &bc, cd))
3382
-    {
3383
-    *ptrptr = ptr;
3384
-    return FALSE;
3385
-    }
3386
-
3387
-  /* If this is the first branch, the firstbyte and reqbyte values for the
3388
-  branch become the values for the regex. */
3389
-
3390
-  if (*last_branch != OP_ALT)
3391
-    {
3392
-    firstbyte = branchfirstbyte;
3393
-    reqbyte = branchreqbyte;
3394
-    }
3395
-
3396
-  /* If this is not the first branch, the first char and reqbyte have to
3397
-  match the values from all the previous branches, except that if the previous
3398
-  value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3399
-  REQ_VARY for the regex. */
3400
-
3401
-  else
3402
-    {
3403
-    /* If we previously had a firstbyte, but it doesn't match the new branch,
3404
-    we have to abandon the firstbyte for the regex, but if there was previously
3405
-    no reqbyte, it takes on the value of the old firstbyte. */
3406
-
3407
-    if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3408
-      {
3409
-      if (reqbyte < 0) reqbyte = firstbyte;
3410
-      firstbyte = REQ_NONE;
3411
-      }
3412
-
3413
-    /* If we (now or from before) have no firstbyte, a firstbyte from the
3414
-    branch becomes a reqbyte if there isn't a branch reqbyte. */
3415
-
3416
-    if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3417
-        branchreqbyte = branchfirstbyte;
3418
-
3419
-    /* Now ensure that the reqbytes match */
3420
-
3421
-    if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3422
-      reqbyte = REQ_NONE;
3423
-    else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
3424
-    }
3425
-
3426
-  /* If lookbehind, check that this branch matches a fixed-length string,
3427
-  and put the length into the OP_REVERSE item. Temporarily mark the end of
3428
-  the branch with OP_END. */
3429
-
3430
-  if (lookbehind)
3431
-    {
3432
-    int length;
3433
-    *code = OP_END;
3434
-    length = find_fixedlength(last_branch, options);
3435
-    DPRINTF(("fixed length = %d\n", length));
3436
-    if (length < 0)
3437
-      {
3438
-      *errorptr = (length == -2)? ERR36 : ERR25;
3439
-      *ptrptr = ptr;
3440
-      return FALSE;
3441
-      }
3442
-    PUT(reverse_count, 0, length);
3443
-    }
3444
-
3445
-  /* Reached end of expression, either ')' or end of pattern. Go back through
3446
-  the alternative branches and reverse the chain of offsets, with the field in
3447
-  the BRA item now becoming an offset to the first alternative. If there are
3448
-  no alternatives, it points to the end of the group. The length in the
3449
-  terminating ket is always the length of the whole bracketed item. If any of
3450
-  the ims options were changed inside the group, compile a resetting op-code
3451
-  following, except at the very end of the pattern. Return leaving the pointer
3452
-  at the terminating char. */
3453
-
3454
-  if (*ptr != '|')
3455
-    {
3456
-    int length = code - last_branch;
3457
-    do
3458
-      {
3459
-      int prev_length = GET(last_branch, 1);
3460
-      PUT(last_branch, 1, length);
3461
-      length = prev_length;
3462
-      last_branch -= length;
3463
-      }
3464
-    while (length > 0);
3465
-
3466
-    /* Fill in the ket */
3467
-
3468
-    *code = OP_KET;
3469
-    PUT(code, 1, code - start_bracket);
3470
-    code += 1 + LINK_SIZE;
3471
-
3472
-    /* Resetting option if needed */
3473
-
3474
-    if ((options & PCRE_IMS) != oldims && *ptr == ')')
3475
-      {
3476
-      *code++ = OP_OPT;
3477
-      *code++ = oldims;
3478
-      }
3479
-
3480
-    /* Set values to pass back */
3481
-
3482
-    *codeptr = code;
3483
-    *ptrptr = ptr;
3484
-    *firstbyteptr = firstbyte;
3485
-    *reqbyteptr = reqbyte;
3486
-    return TRUE;
3487
-    }
3488
-
3489
-  /* Another branch follows; insert an "or" node. Its length field points back
3490
-  to the previous branch while the bracket remains open. At the end the chain
3491
-  is reversed. It's done like this so that the start of the bracket has a
3492
-  zero offset until it is closed, making it possible to detect recursion. */
3493
-
3494
-  *code = OP_ALT;
3495
-  PUT(code, 1, code - last_branch);
3496
-  bc.current = last_branch = code;
3497
-  code += 1 + LINK_SIZE;
3498
-  ptr++;
3499
-  }
3500
-/* Control never reaches here */
3501
-}
3502
-
3503
-
3504
-
3505
-
3506
-/*************************************************
3507
-*          Check for anchored expression         *
3508
-*************************************************/
3509
-
3510
-/* Try to find out if this is an anchored regular expression. Consider each
3511
-alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3512
-all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3513
-it's anchored. However, if this is a multiline pattern, then only OP_SOD
3514
-counts, since OP_CIRC can match in the middle.
3515
-
3516
-We can also consider a regex to be anchored if OP_SOM starts all its branches.
3517
-This is the code for \G, which means "match at start of match position, taking
3518
-into account the match offset".
3519
-
3520
-A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3521
-because that will try the rest of the pattern at all possible matching points,
3522
-so there is no point trying again.... er ....
3523
-
3524
-.... except when the .* appears inside capturing parentheses, and there is a
3525
-subsequent back reference to those parentheses. We haven't enough information
3526
-to catch that case precisely.
3527
-
3528
-At first, the best we could do was to detect when .* was in capturing brackets
3529
-and the highest back reference was greater than or equal to that level.
3530
-However, by keeping a bitmap of the first 31 back references, we can catch some
3531
-of the more common cases more precisely.
3532
-
3533
-Arguments:
3534
-  code           points to start of expression (the bracket)
3535
-  options        points to the options setting
3536
-  bracket_map    a bitmap of which brackets we are inside while testing; this
3537
-                  handles up to substring 31; after that we just have to take
3538
-                  the less precise approach
3539
-  backref_map    the back reference bitmap
3540
-
3541
-Returns:     TRUE or FALSE
3542
-*/
3543
-
3544
-static BOOL
3545
-is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3546
-  unsigned int backref_map)
3547
-{
3548
-do {
3549
-   const uschar *scode =
3550
-     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3551
-   register int op = *scode;
3552
-
3553
-   /* Capturing brackets */
3554
-
3555
-   if (op > OP_BRA)
3556
-     {
3557
-     int new_map;
3558
-     op -= OP_BRA;
3559
-     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3560
-     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3561
-     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3562
-     }
3563
-
3564
-   /* Other brackets */
3565
-
3566
-   else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3567
-     {
3568
-     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3569
-     }
3570
-
3571
-   /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3572
-   are or may be referenced. */
3573
-
3574
-   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3575
-            (*options & PCRE_DOTALL) != 0)
3576
-     {
3577
-     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3578
-     }
3579
-
3580
-   /* Check for explicit anchoring */
3581
-
3582
-   else if (op != OP_SOD && op != OP_SOM &&
3583
-           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3584
-     return FALSE;
3585
-   code += GET(code, 1);
3586
-   }
3587
-while (*code == OP_ALT);   /* Loop for each alternative */
3588
-return TRUE;
3589
-}
3590
-
3591
-
3592
-
3593
-/*************************************************
3594
-*         Check for starting with ^ or .*        *
3595
-*************************************************/
3596
-
3597
-/* This is called to find out if every branch starts with ^ or .* so that
3598
-"first char" processing can be done to speed things up in multiline
3599
-matching and for non-DOTALL patterns that start with .* (which must start at
3600
-the beginning or after \n). As in the case of is_anchored() (see above), we
3601
-have to take account of back references to capturing brackets that contain .*
3602
-because in that case we can't make the assumption.
3603
-
3604
-Arguments:
3605
-  code           points to start of expression (the bracket)
3606
-  bracket_map    a bitmap of which brackets we are inside while testing; this
3607
-                  handles up to substring 31; after that we just have to take
3608
-                  the less precise approach
3609
-  backref_map    the back reference bitmap
3610
-
3611
-Returns:         TRUE or FALSE
3612
-*/
3613
-
3614
-static BOOL
3615
-is_startline(const uschar *code, unsigned int bracket_map,
3616
-  unsigned int backref_map)
3617
-{
3618
-do {
3619
-   const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3620
-   register int op = *scode;
3621
-
3622
-   /* Capturing brackets */
3623
-
3624
-   if (op > OP_BRA)
3625
-     {
3626
-     int new_map;
3627
-     op -= OP_BRA;
3628
-     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3629
-     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3630
-     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3631
-     }
3632
-
3633
-   /* Other brackets */
3634
-
3635
-   else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3636
-     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3637
-
3638
-   /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3639
-   may be referenced. */
3640
-
3641
-   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3642
-     {
3643
-     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3644
-     }
3645
-
3646
-   /* Check for explicit circumflex */
3647
-
3648
-   else if (op != OP_CIRC) return FALSE;
3649
-   code += GET(code, 1);
3650
-   }
3651
-while (*code == OP_ALT);  /* Loop for each alternative */
3652
-return TRUE;
3653
-}
3654
-
3655
-
3656
-
3657
-/*************************************************
3658
-*       Check for asserted fixed first char      *
3659
-*************************************************/
3660
-
3661
-/* During compilation, the "first char" settings from forward assertions are
3662
-discarded, because they can cause conflicts with actual literals that follow.
3663
-However, if we end up without a first char setting for an unanchored pattern,
3664
-it is worth scanning the regex to see if there is an initial asserted first
3665
-char. If all branches start with the same asserted char, or with a bracket all
3666
-of whose alternatives start with the same asserted char (recurse ad lib), then
3667
-we return that char, otherwise -1.
3668
-
3669
-Arguments:
3670
-  code       points to start of expression (the bracket)
3671
-  options    pointer to the options (used to check casing changes)
3672
-  inassert   TRUE if in an assertion
3673
-
3674
-Returns:     -1 or the fixed first char
3675
-*/
3676
-
3677
-static int
3678
-find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3679
-{
3680
-register int c = -1;
3681
-do {
3682
-   int d;
3683
-   const uschar *scode =
3684
-     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3685
-   register int op = *scode;
3686
-
3687
-   if (op >= OP_BRA) op = OP_BRA;
3688
-
3689
-   switch(op)
3690
-     {
3691
-     default:
3692
-     return -1;
3693
-
3694
-     case OP_BRA:
3695
-     case OP_ASSERT:
3696
-     case OP_ONCE:
3697
-     case OP_COND:
3698
-     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3699
-       return -1;
3700
-     if (c < 0) c = d; else if (c != d) return -1;
3701
-     break;
3702
-
3703
-     case OP_EXACT:       /* Fall through */
3704
-     scode++;
3705
-
3706
-     case OP_CHARS:       /* Fall through */
3707
-     scode++;
3708
-
3709
-     case OP_PLUS:
3710
-     case OP_MINPLUS:
3711
-     if (!inassert) return -1;
3712
-     if (c < 0)
3713
-       {
3714
-       c = scode[1];
3715
-       if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3716
-       }
3717
-     else if (c != scode[1]) return -1;
3718
-     break;
3719
-     }
3720
-
3721
-   code += GET(code, 1);
3722
-   }
3723
-while (*code == OP_ALT);
3724
-return c;
3725
-}
3726
-
3727
-
3728
-
3729
-
3730
-/*************************************************
3731
-*        Compile a Regular Expression            *
3732
-*************************************************/
3733
-
3734
-/* This function takes a string and returns a pointer to a block of store
3735
-holding a compiled version of the expression.
3736
-
3737
-Arguments:
3738
-  pattern      the regular expression
3739
-  options      various option bits
3740
-  errorptr     pointer to pointer to error text
3741
-  erroroffset  ptr offset in pattern where error was detected
3742
-  tables       pointer to character tables or NULL
3743
-
3744
-Returns:       pointer to compiled data block, or NULL on error,
3745
-               with errorptr and erroroffset set
3746
-*/
3747
-
3748
-pcre *
3749
-pcre_compile(const char *pattern, int options, const char **errorptr,
3750
-  int *erroroffset, const unsigned char *tables)
3751
-{
3752
-real_pcre *re;
3753
-int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
3754
-int runlength;
3755
-int c, firstbyte, reqbyte;
3756
-int bracount = 0;
3757
-int branch_extra = 0;
3758
-int branch_newextra;
3759
-int item_count = -1;
3760
-int name_count = 0;
3761
-int max_name_size = 0;
3762
-#ifdef SUPPORT_UTF8
3763
-int lastcharlength = 0;
3764
-BOOL utf8;
3765
-BOOL class_utf8;
3766
-#endif
3767
-BOOL inescq = FALSE;
3768
-unsigned int brastackptr = 0;
3769
-size_t size;
3770
-uschar *code;
3771
-const uschar *codestart;
3772
-const uschar *ptr;
3773
-compile_data compile_block;
3774
-int brastack[BRASTACK_SIZE];
3775
-uschar bralenstack[BRASTACK_SIZE];
3776
-
3777
-/* We can't pass back an error message if errorptr is NULL; I guess the best we
3778
-can do is just return NULL. */
3779
-
3780
-if (errorptr == NULL) return NULL;
3781
-*errorptr = NULL;
3782
-
3783
-/* However, we can give a message for this error */
3784
-
3785
-if (erroroffset == NULL)
3786
-  {
3787
-  *errorptr = ERR16;
3788
-  return NULL;
3789
-  }
3790
-*erroroffset = 0;
3791
-
3792
-/* Can't support UTF8 unless PCRE has been compiled to include the code. */
3793
-
3794
-#ifdef SUPPORT_UTF8
3795
-utf8 = (options & PCRE_UTF8) != 0;
3796
-#else
3797
-if ((options & PCRE_UTF8) != 0)
3798
-  {
3799
-  *errorptr = ERR32;
3800
-  return NULL;
3801
-  }
3802
-#endif
3803
-
3804
-if ((options & ~PUBLIC_OPTIONS) != 0)
3805
-  {
3806
-  *errorptr = ERR17;
3807
-  return NULL;
3808
-  }
3809
-
3810
-/* Set up pointers to the individual character tables */
3811
-
3812
-if (tables == NULL) tables = pcre_default_tables;
3813
-compile_block.lcc = tables + lcc_offset;
3814
-compile_block.fcc = tables + fcc_offset;
3815
-compile_block.cbits = tables + cbits_offset;
3816
-compile_block.ctypes = tables + ctypes_offset;
3817
-
3818
-/* Maximum back reference and backref bitmap. This is updated for numeric
3819
-references during the first pass, but for named references during the actual
3820
-compile pass. The bitmap records up to 31 back references to help in deciding
3821
-whether (.*) can be treated as anchored or not. */
3822
-
3823
-compile_block.top_backref = 0;
3824
-compile_block.backref_map = 0;
3825
-
3826
-/* Reflect pattern for debugging output */
3827
-
3828
-DPRINTF(("------------------------------------------------------------------\n"));
3829
-DPRINTF(("%s\n", pattern));
3830
-
3831
-/* The first thing to do is to make a pass over the pattern to compute the
3832
-amount of store required to hold the compiled code. This does not have to be
3833
-perfect as long as errors are overestimates. At the same time we can detect any
3834
-flag settings right at the start, and extract them. Make an attempt to correct
3835
-for any counted white space if an "extended" flag setting appears late in the
3836
-pattern. We can't be so clever for #-comments. */
3837
-
3838
-ptr = (const uschar *)(pattern - 1);
3839
-while ((c = *(++ptr)) != 0)
3840
-  {
3841
-  int min, max;
3842
-  int class_optcount;
3843
-  int bracket_length;
3844
-  int duplength;
3845
-
3846
-  /* If we are inside a \Q...\E sequence, all chars are literal */
3847
-
3848
-  if (inescq) goto NORMAL_CHAR;
3849
-
3850
-  /* Otherwise, first check for ignored whitespace and comments */
3851
-
3852
-  if ((options & PCRE_EXTENDED) != 0)
3853
-    {
3854
-    if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3855
-    if (c == '#')
3856
-      {
3857
-      /* The space before the ; is to avoid a warning on a silly compiler
3858
-      on the Macintosh. */
3859
-      while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3860
-      if (c == 0) break;
3861
-      continue;
3862
-      }
3863
-    }
3864
-
3865
-  item_count++;    /* Is zero for the first non-comment item */
3866
-
3867
-  switch(c)
3868
-    {
3869
-    /* A backslashed item may be an escaped "normal" character or a
3870
-    character type. For a "normal" character, put the pointers and
3871
-    character back so that tests for whitespace etc. in the input
3872
-    are done correctly. */
3873
-
3874
-    case '\\':
3875
-      {
3876
-      const uschar *save_ptr = ptr;
3877
-      c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
3878
-      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3879
-      if (c >= 0)
3880
-        {
3881
-        ptr = save_ptr;
3882
-        c = '\\';
3883
-        goto NORMAL_CHAR;
3884
-        }
3885
-      }
3886
-
3887
-    /* If \Q, enter "literal" mode */
3888
-
3889
-    if (-c == ESC_Q)
3890
-      {
3891
-      inescq = TRUE;
3892
-      continue;
3893
-      }
3894
-
3895
-    /* Other escapes need one byte, and are of length one for repeats */
3896
-
3897
-    length++;
3898
-#ifdef SUPPORT_UTF8
3899
-    lastcharlength = 1;
3900
-#endif
3901
-
3902
-    /* A back reference needs an additional 2 bytes, plus either one or 5
3903
-    bytes for a repeat. We also need to keep the value of the highest
3904
-    back reference. */
3905
-
3906
-    if (c <= -ESC_REF)
3907
-      {
3908
-      int refnum = -c - ESC_REF;
3909
-      compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3910
-      if (refnum > compile_block.top_backref)
3911
-        compile_block.top_backref = refnum;
3912
-      length += 2;   /* For single back reference */
3913
-      if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3914
-        {
3915
-        ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
3916
-        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3917
-        if ((min == 0 && (max == 1 || max == -1)) ||
3918
-          (min == 1 && max == -1))
3919
-            length++;
3920
-        else length += 5;
3921
-        if (ptr[1] == '?') ptr++;
3922
-        }
3923
-      }
3924
-    continue;
3925
-
3926
-    case '^':     /* Single-byte metacharacters */
3927
-    case '.':
3928
-    case '$':
3929
-    length++;
3930
-#ifdef SUPPORT_UTF8
3931
-    lastcharlength = 1;
3932
-#endif
3933
-    continue;
3934
-
3935
-    case '*':            /* These repeats won't be after brackets; */
3936
-    case '+':            /* those are handled separately */
3937
-    case '?':
3938
-    length++;
3939
-    goto POSESSIVE;      /* A few lines below */
3940
-
3941
-    /* This covers the cases of braced repeats after a single char, metachar,
3942
-    class, or back reference. */
3943
-
3944
-    case '{':
3945
-    if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3946
-    ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3947
-    if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3948
-
3949
-    /* These special cases just insert one extra opcode */
3950
-
3951
-    if ((min == 0 && (max == 1 || max == -1)) ||
3952
-      (min == 1 && max == -1))
3953
-        length++;
3954
-
3955
-    /* These cases might insert additional copies of a preceding character. */
3956
-
3957
-    else
3958
-      {
3959
-#ifdef SUPPORT_UTF8
3960
-      /* In UTF-8 mode, we should find the length in lastcharlength */
3961
-      if (utf8)
3962
-        {
3963
-        if (min != 1)
3964
-          {
3965
-          length -= lastcharlength;   /* Uncount the original char or metachar */
3966
-          if (min > 0) length += 3 + lastcharlength;
3967
-          }
3968
-        length += lastcharlength + ((max > 0)? 3 : 1);
3969
-        }
3970
-      else
3971
-#endif
3972
-
3973
-      /* Not UTF-8 mode: all characters are one byte */
3974
-        {
3975
-        if (min != 1)
3976
-          {
3977
-          length--;   /* Uncount the original char or metachar */
3978
-          if (min > 0) length += 4;
3979
-          }
3980
-
3981
-        length += (max > 0)? 4 : 2;
3982
-        }
3983
-      }
3984
-
3985
-    if (ptr[1] == '?') ptr++;      /* Needs no extra length */
3986
-
3987
-    POSESSIVE:                     /* Test for possessive quantifier */
3988
-    if (ptr[1] == '+')
3989
-      {
3990
-      ptr++;
3991
-      length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
3992
-      }
3993
-    continue;
3994
-
3995
-    /* An alternation contains an offset to the next branch or ket. If any ims
3996
-    options changed in the previous branch(es), and/or if we are in a
3997
-    lookbehind assertion, extra space will be needed at the start of the
3998
-    branch. This is handled by branch_extra. */
3999
-
4000
-    case '|':
4001
-    length += 1 + LINK_SIZE + branch_extra;
4002
-    continue;
4003
-
4004
-    /* A character class uses 33 characters provided that all the character
4005
-    values are less than 256. Otherwise, it uses a bit map for low valued
4006
-    characters, and individual items for others. Don't worry about character
4007
-    types that aren't allowed in classes - they'll get picked up during the
4008
-    compile. A character class that contains only one single-byte character
4009
-    uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4010
-    where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4011
-
4012
-    case '[':
4013
-    class_optcount = 0;
4014
-
4015
-#ifdef SUPPORT_UTF8
4016
-    class_utf8 = FALSE;
4017
-#endif
4018
-
4019
-    if (*(++ptr) == '^') ptr++;
4020
-
4021
-    /* Written as a "do" so that an initial ']' is taken as data */
4022
-
4023
-    if (*ptr != 0) do
4024
-      {
4025
-      /* Inside \Q...\E everything is literal except \E */
4026
-
4027
-      if (inescq)
4028
-        {
4029
-        if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
4030
-        inescq = FALSE;
4031
-        ptr += 1;
4032
-        continue;
4033
-        }
4034
-
4035
-      /* Outside \Q...\E, check for escapes */
4036
-
4037
-      if (*ptr == '\\')
4038
-        {
4039
-#ifdef SUPPORT_UTF8
4040
-        int prevchar = ptr[-1];
4041
-#endif
4042
-        int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
4043
-          &compile_block);
4044
-        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4045
-
4046
-        /* \b is backspace inside a class */
4047
-
4048
-        if (-ch == ESC_b) ch = '\b';
4049
-
4050
-        /* \Q enters quoting mode */
4051
-
4052
-        if (-ch == ESC_Q)
4053
-          {
4054
-          inescq = TRUE;
4055
-          continue;
4056
-          }
4057
-
4058
-        /* Handle escapes that turn into characters */
4059
-
4060
-        if (ch >= 0)
4061
-          {
4062
-#ifdef SUPPORT_UTF8
4063
-          if (utf8)
4064
-            {
4065
-            if (ch > 127) class_optcount = 10;  /* Ensure > 1 */
4066
-            if (ch > 255)
4067
-              {
4068
-              uschar buffer[6];
4069
-              if (!class_utf8)
4070
-                {
4071
-                class_utf8 = TRUE;
4072
-                length += LINK_SIZE + 1 + 1;
4073
-                }
4074
-              length += 1 + ord2utf8(ch, buffer);
4075
-
4076
-              /* If this wide character is preceded by '-', add an extra 2 to
4077
-              the length in case the previous character was < 128, because in
4078
-              this case the whole range will be put into the list. */
4079
-
4080
-              if (prevchar == '-') length += 2;
4081
-              }
4082
-            }
4083
-#endif
4084
-          class_optcount++;            /* for possible optimization */
4085
-          }
4086
-        else class_optcount = 10;      /* \d, \s etc; make sure > 1 */
4087
-        }
4088
-
4089
-      /* Check the syntax for POSIX stuff. The bits we actually handle are
4090
-      checked during the real compile phase. */
4091
-
4092
-      else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4093
-        {
4094
-        ptr++;
4095
-        class_optcount = 10;    /* Make sure > 1 */
4096
-        }
4097
-
4098
-      /* Anything else just increments the possible optimization count. If
4099
-      there are wide characters, we are going to have to use an XCLASS. */
4100
-
4101
-      else
4102
-        {
4103
-        NON_SPECIAL_CHARACTER:
4104
-        class_optcount++;
4105
-
4106
-#ifdef SUPPORT_UTF8
4107
-        if (utf8)
4108
-          {
4109
-          int ch;
4110
-          int extra = 0;
4111
-          GETCHARLEN(ch, ptr, extra);
4112
-          if (ch > 127) class_optcount = 10;   /* No optimization possible */
4113
-          if (ch > 255)
4114
-            {
4115
-            if (!class_utf8)
4116
-              {
4117
-              class_utf8 = TRUE;
4118
-              length += LINK_SIZE + 1 + 1;
4119
-              }
4120
-            length += 2 + extra;
4121
-
4122
-            /* If this wide character is preceded by '-', add an extra 2 to
4123
-            the length in case the previous character was < 128, because in
4124
-            this case the whole range will be put into the list. */
4125
-
4126
-            if (ptr[-1] == '-') length += 2;
4127
-
4128
-            /* Advance to the end of this character */
4129
-
4130
-            ptr += extra;
4131
-            }
4132
-          }
4133
-#endif
4134
-        }
4135
-      }
4136
-    while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4137
-
4138
-    if (*ptr == 0)                          /* Missing terminating ']' */
4139
-      {
4140
-      *errorptr = ERR6;
4141
-      goto PCRE_ERROR_RETURN;
4142
-      }
4143
-
4144
-    /* We can optimize when there was only one optimizable character. Repeats
4145
-    for positive and negated single one-byte chars are handled by the general
4146
-    code. Here, we handle repeats for the class opcodes. */
4147
-
4148
-    if (class_optcount == 1) length += 3; else
4149
-      {
4150
-      length += 33;
4151
-
4152
-      /* A repeat needs either 1 or 5 bytes. */
4153
-
4154
-      if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
4155
-        {
4156
-        ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4157
-        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4158
-        if ((min == 0 && (max == 1 || max == -1)) ||
4159
-          (min == 1 && max == -1))
4160
-            length++;
4161
-        else length += 5;
4162
-        if (ptr[1] == '?') ptr++;
4163
-        }
4164
-      }
4165
-    continue;
4166
-
4167
-    /* Brackets may be genuine groups or special things */
4168
-
4169
-    case '(':
4170
-    branch_newextra = 0;
4171
-    bracket_length = 1 + LINK_SIZE;
4172
-
4173
-    /* Handle special forms of bracket, which all start (? */
4174
-
4175
-    if (ptr[1] == '?')
4176
-      {
4177
-      int set, unset;
4178
-      int *optset;
4179
-
4180
-      switch (c = ptr[2])
4181
-        {
4182
-        /* Skip over comments entirely */
4183
-        case '#':
4184
-        ptr += 3;
4185
-        while (*ptr != 0 && *ptr != ')') ptr++;
4186
-        if (*ptr == 0)
4187
-          {
4188
-          *errorptr = ERR18;
4189
-          goto PCRE_ERROR_RETURN;
4190
-          }
4191
-        continue;
4192
-
4193
-        /* Non-referencing groups and lookaheads just move the pointer on, and
4194
-        then behave like a non-special bracket, except that they don't increment
4195
-        the count of extracting brackets. Ditto for the "once only" bracket,
4196
-        which is in Perl from version 5.005. */
4197
-
4198
-        case ':':
4199
-        case '=':
4200
-        case '!':
4201
-        case '>':
4202
-        ptr += 2;
4203
-        break;
4204
-
4205
-        /* (?R) specifies a recursive call to the regex, which is an extension
4206
-        to provide the facility which can be obtained by (?p{perl-code}) in
4207
-        Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4208
-
4209
-        From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4210
-        the appropriate numbered brackets. This includes both recursive and
4211
-        non-recursive calls. (?R) is now synonymous with (?0). */
4212
-
4213
-        case 'R':
4214
-        ptr++;
4215
-
4216
-        case '0': case '1': case '2': case '3': case '4':
4217
-        case '5': case '6': case '7': case '8': case '9':
4218
-        ptr += 2;
4219
-        if (c != 'R')
4220
-          while ((digitab[*(++ptr)] & ctype_digit) != 0);
4221
-        if (*ptr != ')')
4222
-          {
4223
-          *errorptr = ERR29;
4224
-          goto PCRE_ERROR_RETURN;
4225
-          }
4226
-        length += 1 + LINK_SIZE;
4227
-
4228
-        /* If this item is quantified, it will get wrapped inside brackets so
4229
-        as to use the code for quantified brackets. We jump down and use the
4230
-        code that handles this for real brackets. */
4231
-
4232
-        if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4233
-          {
4234
-          length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
4235
-          duplength = 5 + 3 * LINK_SIZE;
4236
-          goto HANDLE_QUANTIFIED_BRACKETS;
4237
-          }
4238
-        continue;
4239
-
4240
-        /* (?C) is an extension which provides "callout" - to provide a bit of
4241
-        the functionality of the Perl (?{...}) feature. An optional number may
4242
-        follow (default is zero). */
4243
-
4244
-        case 'C':
4245
-        ptr += 2;
4246
-        while ((digitab[*(++ptr)] & ctype_digit) != 0);
4247
-        if (*ptr != ')')
4248
-          {
4249
-          *errorptr = ERR39;
4250
-          goto PCRE_ERROR_RETURN;
4251
-          }
4252
-        length += 2;
4253
-        continue;
4254
-
4255
-        /* Named subpatterns are an extension copied from Python */
4256
-
4257
-        case 'P':
4258
-        ptr += 3;
4259
-        if (*ptr == '<')
4260
-          {
4261
-          const uschar *p;    /* Don't amalgamate; some compilers */
4262
-          p = ++ptr;          /* grumble at autoincrement in declaration */
4263
-          while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4264
-          if (*ptr != '>')
4265
-            {
4266
-            *errorptr = ERR42;
4267
-            goto PCRE_ERROR_RETURN;
4268
-            }
4269
-          name_count++;
4270
-          if (ptr - p > max_name_size) max_name_size = (ptr - p);
4271
-          break;
4272
-          }
4273
-
4274
-        if (*ptr == '=' || *ptr == '>')
4275
-          {
4276
-          while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4277
-          if (*ptr != ')')
4278
-            {
4279
-            *errorptr = ERR42;
4280
-            goto PCRE_ERROR_RETURN;
4281
-            }
4282
-          break;
4283
-          }
4284
-
4285
-        /* Unknown character after (?P */
4286
-
4287
-        *errorptr = ERR41;
4288
-        goto PCRE_ERROR_RETURN;
4289
-
4290
-        /* Lookbehinds are in Perl from version 5.005 */
4291
-
4292
-        case '<':
4293
-        ptr += 3;
4294
-        if (*ptr == '=' || *ptr == '!')
4295
-          {
4296
-          branch_newextra = 1 + LINK_SIZE;
4297
-          length += 1 + LINK_SIZE;         /* For the first branch */
4298
-          break;
4299
-          }
4300
-        *errorptr = ERR24;
4301
-        goto PCRE_ERROR_RETURN;
4302
-
4303
-        /* Conditionals are in Perl from version 5.005. The bracket must either
4304
-        be followed by a number (for bracket reference) or by an assertion
4305
-        group, or (a PCRE extension) by 'R' for a recursion test. */
4306
-
4307
-        case '(':
4308
-        if (ptr[3] == 'R' && ptr[4] == ')')
4309
-          {
4310
-          ptr += 4;
4311
-          length += 3;
4312
-          }
4313
-        else if ((digitab[ptr[3]] & ctype_digit) != 0)
4314
-          {
4315
-          ptr += 4;
4316
-          length += 3;
4317
-          while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4318
-          if (*ptr != ')')
4319
-            {
4320
-            *errorptr = ERR26;
4321
-            goto PCRE_ERROR_RETURN;
4322
-            }
4323
-          }
4324
-        else   /* An assertion must follow */
4325
-          {
4326
-          ptr++;   /* Can treat like ':' as far as spacing is concerned */
4327
-          if (ptr[2] != '?' ||
4328
-             (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4329
-            {
4330
-            ptr += 2;    /* To get right offset in message */
4331
-            *errorptr = ERR28;
4332
-            goto PCRE_ERROR_RETURN;
4333
-            }
4334
-          }
4335
-        break;
4336
-
4337
-        /* Else loop checking valid options until ) is met. Anything else is an
4338
-        error. If we are without any brackets, i.e. at top level, the settings
4339
-        act as if specified in the options, so massage the options immediately.
4340
-        This is for backward compatibility with Perl 5.004. */
4341
-
4342
-        default:
4343
-        set = unset = 0;
4344
-        optset = &set;
4345
-        ptr += 2;
4346
-
4347
-        for (;; ptr++)
4348
-          {
4349
-          c = *ptr;
4350
-          switch (c)
4351
-            {
4352
-            case 'i':
4353
-            *optset |= PCRE_CASELESS;
4354
-            continue;
4355
-
4356
-            case 'm':
4357
-            *optset |= PCRE_MULTILINE;
4358
-            continue;
4359
-
4360
-            case 's':
4361
-            *optset |= PCRE_DOTALL;
4362
-            continue;
4363
-
4364
-            case 'x':
4365
-            *optset |= PCRE_EXTENDED;
4366
-            continue;
4367
-
4368
-            case 'X':
4369
-            *optset |= PCRE_EXTRA;
4370
-            continue;
4371
-
4372
-            case 'U':
4373
-            *optset |= PCRE_UNGREEDY;
4374
-            continue;
4375
-
4376
-            case '-':
4377
-            optset = &unset;
4378
-            continue;
4379
-
4380
-            /* A termination by ')' indicates an options-setting-only item; if
4381
-            this is at the very start of the pattern (indicated by item_count
4382
-            being zero), we use it to set the global options. This is helpful
4383
-            when analyzing the pattern for first characters, etc. Otherwise
4384
-            nothing is done here and it is handled during the compiling
4385
-            process.
4386
-
4387
-            [Historical note: Up to Perl 5.8, options settings at top level
4388
-            were always global settings, wherever they appeared in the pattern.
4389
-            That is, they were equivalent to an external setting. From 5.8
4390
-            onwards, they apply only to what follows (which is what you might
4391
-            expect).] */
4392
-
4393
-            case ')':
4394
-            if (item_count == 0)
4395
-              {
4396
-              options = (options | set) & (~unset);
4397
-              set = unset = 0;     /* To save length */
4398
-              item_count--;        /* To allow for several */
4399
-              }
4400
-
4401
-            /* Fall through */
4402
-
4403
-            /* A termination by ':' indicates the start of a nested group with
4404
-            the given options set. This is again handled at compile time, but
4405
-            we must allow for compiled space if any of the ims options are
4406
-            set. We also have to allow for resetting space at the end of
4407
-            the group, which is why 4 is added to the length and not just 2.
4408
-            If there are several changes of options within the same group, this
4409
-            will lead to an over-estimate on the length, but this shouldn't
4410
-            matter very much. We also have to allow for resetting options at
4411
-            the start of any alternations, which we do by setting
4412
-            branch_newextra to 2. Finally, we record whether the case-dependent
4413
-            flag ever changes within the regex. This is used by the "required
4414
-            character" code. */
4415
-
4416
-            case ':':
4417
-            if (((set|unset) & PCRE_IMS) != 0)
4418
-              {
4419
-              length += 4;
4420
-              branch_newextra = 2;
4421
-              if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4422
-              }
4423
-            goto END_OPTIONS;
4424
-
4425
-            /* Unrecognized option character */
4426
-
4427
-            default:
4428
-            *errorptr = ERR12;
4429
-            goto PCRE_ERROR_RETURN;
4430
-            }
4431
-          }
4432
-
4433
-        /* If we hit a closing bracket, that's it - this is a freestanding
4434
-        option-setting. We need to ensure that branch_extra is updated if
4435
-        necessary. The only values branch_newextra can have here are 0 or 2.
4436
-        If the value is 2, then branch_extra must either be 2 or 5, depending
4437
-        on whether this is a lookbehind group or not. */
4438
-
4439
-        END_OPTIONS:
4440
-        if (c == ')')
4441
-          {
4442
-          if (branch_newextra == 2 &&
4443
-              (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4444
-            branch_extra += branch_newextra;
4445
-          continue;
4446
-          }
4447
-
4448
-        /* If options were terminated by ':' control comes here. Fall through
4449
-        to handle the group below. */
4450
-        }
4451
-      }
4452
-
4453
-    /* Extracting brackets must be counted so we can process escapes in a
4454
-    Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4455
-    need an additional 3 bytes of store per extracting bracket. However, if
4456
-    PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4457
-    must leave the count alone (it will always be zero). */
4458
-
4459
-    else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4460
-      {
4461
-      bracount++;
4462
-      if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4463
-      }
4464
-
4465
-    /* Save length for computing whole length at end if there's a repeat that
4466
-    requires duplication of the group. Also save the current value of
4467
-    branch_extra, and start the new group with the new value. If non-zero, this
4468
-    will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4469
-
4470
-    if (brastackptr >= sizeof(brastack)/sizeof(int))
4471
-      {
4472
-      *errorptr = ERR19;
4473
-      goto PCRE_ERROR_RETURN;
4474
-      }
4475
-
4476
-    bralenstack[brastackptr] = branch_extra;
4477
-    branch_extra = branch_newextra;
4478
-
4479
-    brastack[brastackptr++] = length;
4480
-    length += bracket_length;
4481
-    continue;
4482
-
4483
-    /* Handle ket. Look for subsequent max/min; for certain sets of values we
4484
-    have to replicate this bracket up to that many times. If brastackptr is
4485
-    0 this is an unmatched bracket which will generate an error, but take care
4486
-    not to try to access brastack[-1] when computing the length and restoring
4487
-    the branch_extra value. */
4488
-
4489
-    case ')':
4490
-    length += 1 + LINK_SIZE;
4491
-    if (brastackptr > 0)
4492
-      {
4493
-      duplength = length - brastack[--brastackptr];
4494
-      branch_extra = bralenstack[brastackptr];
4495
-      }
4496
-    else duplength = 0;
4497
-
4498
-    /* The following code is also used when a recursion such as (?3) is
4499
-    followed by a quantifier, because in that case, it has to be wrapped inside
4500
-    brackets so that the quantifier works. The value of duplength must be
4501
-    set before arrival. */
4502
-
4503
-    HANDLE_QUANTIFIED_BRACKETS:
4504
-
4505
-    /* Leave ptr at the final char; for read_repeat_counts this happens
4506
-    automatically; for the others we need an increment. */
4507
-
4508
-    if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
4509
-      {
4510
-      ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4511
-      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4512
-      }
4513
-    else if (c == '*') { min = 0; max = -1; ptr++; }
4514
-    else if (c == '+') { min = 1; max = -1; ptr++; }
4515
-    else if (c == '?') { min = 0; max = 1;  ptr++; }
4516
-    else { min = 1; max = 1; }
4517
-
4518
-    /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4519
-    group, and if the maximum is greater than zero, we have to replicate
4520
-    maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4521
-    bracket set. */
4522
-
4523
-    if (min == 0)
4524
-      {
4525
-      length++;
4526
-      if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4527
-      }
4528
-
4529
-    /* When the minimum is greater than zero, we have to replicate up to
4530
-    minval-1 times, with no additions required in the copies. Then, if there
4531
-    is a limited maximum we have to replicate up to maxval-1 times allowing
4532
-    for a BRAZERO item before each optional copy and nesting brackets for all
4533
-    but one of the optional copies. */
4534
-
4535
-    else
4536
-      {
4537
-      length += (min - 1) * duplength;
4538
-      if (max > min)   /* Need this test as max=-1 means no limit */
4539
-        length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4540
-          - (2 + 2*LINK_SIZE);
4541
-      }
4542
-
4543
-    /* Allow space for once brackets for "possessive quantifier" */
4544
-
4545
-    if (ptr[1] == '+')
4546
-      {
4547
-      ptr++;
4548
-      length += 2 + 2*LINK_SIZE;
4549
-      }
4550
-    continue;
4551
-
4552
-    /* Non-special character. For a run of such characters the length required
4553
-    is the number of characters + 2, except that the maximum run length is
4554
-    MAXLIT. We won't get a skipped space or a non-data escape or the start of a
4555
-    # comment as the first character, so the length can't be zero. */
4556
-
4557
-    NORMAL_CHAR:
4558
-    default:
4559
-    length += 2;
4560
-    runlength = 0;
4561
-    do
4562
-      {
4563
-#ifdef SUPPORT_UTF8
4564
-      lastcharlength = 1;     /* Need length of last char for UTF-8 repeats */
4565
-#endif
4566
-
4567
-      /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
4568
-      if (inescq)
4569
-        {
4570
-        if (c == '\\' && ptr[1] == 'E')
4571
-          {
4572
-          inescq = FALSE;
4573
-          ptr++;
4574
-          }
4575
-        else runlength++;
4576
-        continue;
4577
-        }
4578
-
4579
-      /* Skip whitespace and comments for /x */
4580
-
4581
-      if ((options & PCRE_EXTENDED) != 0)
4582
-        {
4583
-        if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4584
-        if (c == '#')
4585
-          {
4586
-          /* The space before the ; is to avoid a warning on a silly compiler
4587
-          on the Macintosh. */
4588
-          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4589
-          continue;
4590
-          }
4591
-        }
4592
-
4593
-      /* Backslash may introduce a data char or a metacharacter; stop the
4594
-      string before the latter. */
4595
-
4596
-      if (c == '\\')
4597
-        {
4598
-        const uschar *saveptr = ptr;
4599
-        c = check_escape(&ptr, errorptr, bracount, options, FALSE,
4600
-          &compile_block);
4601
-        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4602
-        if (c < 0) { ptr = saveptr; break; }
4603
-
4604
-        /* In UTF-8 mode, add on the number of additional bytes needed to
4605
-        encode this character, and save the total length in case this is a
4606
-        final char that is repeated. */
4607
-
4608
-#ifdef SUPPORT_UTF8
4609
-        if (utf8 && c > 127)
4610
-          {
4611
-          int i;
4612
-          for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4613
-            if (c <= utf8_table1[i]) break;
4614
-          runlength += i;
4615
-          lastcharlength += i;
4616
-          }
4617
-#endif
4618
-        }
4619
-
4620
-      /* Ordinary character or single-char escape */
4621
-
4622
-      runlength++;
4623
-      }
4624
-
4625
-    /* This "while" is the end of the "do" above. */
4626
-
4627
-    while (runlength < MAXLIT &&
4628
-      (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
4629
-
4630
-    /* If we hit a meta-character, back off to point to it */
4631
-
4632
-    if (runlength < MAXLIT) ptr--;
4633
-
4634
-    /* If the last char in the string is a UTF-8 multibyte character, we must
4635
-    set lastcharlength correctly. If it was specified as an escape, this will
4636
-    already have been done above. However, we also have to support in-line
4637
-    UTF-8 characters, so check backwards from where we are. */
4638
-
4639
-#ifdef SUPPORT_UTF8
4640
-    if (utf8)
4641
-      {
4642
-      const uschar *lastptr = ptr - 1;
4643
-      if ((*lastptr & 0x80) != 0)
4644
-        {
4645
-        while((*lastptr & 0xc0) == 0x80) lastptr--;
4646
-        lastcharlength = ptr - lastptr;
4647
-        }
4648
-      }
4649
-#endif
4650
-
4651
-    length += runlength;
4652
-    continue;
4653
-    }
4654
-  }
4655
-
4656
-length += 2 + LINK_SIZE;    /* For final KET and END */
4657
-
4658
-if (length > MAX_PATTERN_SIZE)
4659
-  {
4660
-  *errorptr = ERR20;
4661
-  return NULL;
4662
-  }
4663
-
4664
-/* Compute the size of data block needed and get it, either from malloc or
4665
-externally provided function. */
4666
-
4667
-size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4668
-re = (real_pcre *)(pcre_malloc)(size);
4669
-
4670
-if (re == NULL)
4671
-  {
4672
-  *errorptr = ERR21;
4673
-  return NULL;
4674
-  }
4675
-
4676
-/* Put in the magic number, and save the size, options, and table pointer */
4677
-
4678
-re->magic_number = MAGIC_NUMBER;
4679
-re->size = size;
4680
-re->options = options;
4681
-re->tables = tables;
4682
-re->name_entry_size = max_name_size + 3;
4683
-re->name_count = name_count;
4684
-
4685
-/* The starting points of the name/number translation table and of the code are
4686
-passed around in the compile data block. */
4687
-
4688
-compile_block.names_found = 0;
4689
-compile_block.name_entry_size = max_name_size + 3;
4690
-compile_block.name_table = (uschar *)re + sizeof(real_pcre);
4691
-codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4692
-compile_block.start_code = codestart;
4693
-compile_block.req_varyopt = 0;
4694
-
4695
-/* Set up a starting, non-extracting bracket, then compile the expression. On
4696
-error, *errorptr will be set non-NULL, so we don't need to look at the result
4697
-of the function here. */
4698
-
4699
-ptr = (const uschar *)pattern;
4700
-code = (uschar *)codestart;
4701
-*code = OP_BRA;
4702
-bracount = 0;
4703
-(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4704
-  errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4705
-re->top_bracket = bracount;
4706
-re->top_backref = compile_block.top_backref;
4707
-
4708
-/* If not reached end of pattern on success, there's an excess bracket. */
4709
-
4710
-if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
4711
-
4712
-/* Fill in the terminating state and check for disastrous overflow, but
4713
-if debugging, leave the test till after things are printed out. */
4714
-
4715
-*code++ = OP_END;
4716
-
4717
-#ifndef DEBUG
4718
-if (code - codestart > length) *errorptr = ERR23;
4719
-#endif
4720
-
4721
-/* Give an error if there's back reference to a non-existent capturing
4722
-subpattern. */
4723
-
4724
-if (re->top_backref > re->top_bracket) *errorptr = ERR15;
4725
-
4726
-/* Failed to compile, or error while post-processing */
4727
-
4728
-if (*errorptr != NULL)
4729
-  {
4730
-  (pcre_free)(re);
4731
-  PCRE_ERROR_RETURN:
4732
-  *erroroffset = ptr - (const uschar *)pattern;
4733
-  return NULL;
4734
-  }
4735
-
4736
-/* If the anchored option was not passed, set the flag if we can determine that
4737
-the pattern is anchored by virtue of ^ characters or \A or anything else (such
4738
-as starting with .* when DOTALL is set).
4739
-
4740
-Otherwise, if we know what the first character has to be, save it, because that
4741
-speeds up unanchored matches no end. If not, see if we can set the
4742
-PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4743
-start with ^. and also when all branches start with .* for non-DOTALL matches.
4744
-*/
4745
-
4746
-if ((options & PCRE_ANCHORED) == 0)
4747
-  {
4748
-  int temp_options = options;
4749
-  if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4750
-    re->options |= PCRE_ANCHORED;
4751
-  else
4752
-    {
4753
-    if (firstbyte < 0)
4754
-      firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4755
-    if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
4756
-      {
4757
-      int ch = firstbyte & 255;
4758
-      re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4759
-         compile_block.fcc[ch] == ch)? ch : firstbyte;
4760
-      re->options |= PCRE_FIRSTSET;
4761
-      }
4762
-    else if (is_startline(codestart, 0, compile_block.backref_map))
4763
-      re->options |= PCRE_STARTLINE;
4764
-    }
4765
-  }
4766
-
4767
-/* For an anchored pattern, we use the "required byte" only if it follows a
4768
-variable length item in the regex. Remove the caseless flag for non-caseable
4769
-chars. */
4770
-
4771
-if (reqbyte >= 0 &&
4772
-     ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4773
-  {
4774
-  int ch = reqbyte & 255;
4775
-  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4776
-    compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4777
-  re->options |= PCRE_REQCHSET;
4778
-  }
4779
-
4780
-/* Print out the compiled data for debugging */
4781
-
4782
-#ifdef DEBUG
4783
-
4784
-printf("Length = %d top_bracket = %d top_backref = %d\n",
4785
-  length, re->top_bracket, re->top_backref);
4786
-
4787
-if (re->options != 0)
4788
-  {
4789
-  printf("%s%s%s%s%s%s%s%s%s\n",
4790
-    ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4791
-    ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4792
-    ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4793
-    ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4794
-    ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4795
-    ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4796
-    ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4797
-    ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4798
-    ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4799
-  }
4800
-
4801
-if ((re->options & PCRE_FIRSTSET) != 0)
4802
-  {
4803
-  int ch = re->first_byte & 255;
4804
-  char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4805
-  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4806
-    else printf("First char = \\x%02x%s\n", ch, caseless);
4807
-  }
4808
-
4809
-if ((re->options & PCRE_REQCHSET) != 0)
4810
-  {
4811
-  int ch = re->req_byte & 255;
4812
-  char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4813
-  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
4814
-    else printf("Req char = \\x%02x%s\n", ch, caseless);
4815
-  }
4816
-
4817
-print_internals(re, stdout);
4818
-
4819
-/* This check is done here in the debugging case so that the code that
4820
-was compiled can be seen. */
4821
-
4822
-if (code - codestart > length)
4823
-  {
4824
-  *errorptr = ERR23;
4825
-  (pcre_free)(re);
4826
-  *erroroffset = ptr - (uschar *)pattern;
4827
-  return NULL;
4828
-  }
4829
-#endif
4830
-
4831
-return (pcre *)re;
4832
-}
4833
-
4834
-
4835
-
4836
-/*************************************************
4837
-*          Match a back-reference                *
4838
-*************************************************/
4839
-
4840
-/* If a back reference hasn't been set, the length that is passed is greater
4841
-than the number of characters left in the string, so the match fails.
4842
-
4843
-Arguments:
4844
-  offset      index into the offset vector
4845
-  eptr        points into the subject
4846
-  length      length to be matched
4847
-  md          points to match data block
4848
-  ims         the ims flags
4849
-
4850
-Returns:      TRUE if matched
4851
-*/
4852
-
4853
-static BOOL
4854
-match_ref(int offset, register const uschar *eptr, int length, match_data *md,
4855
-  unsigned long int ims)
4856
-{
4857
-const uschar *p = md->start_subject + md->offset_vector[offset];
4858
-
4859
-#ifdef DEBUG
4860
-if (eptr >= md->end_subject)
4861
-  printf("matching subject <null>");
4862
-else
4863
-  {
4864
-  printf("matching subject ");
4865
-  pchars(eptr, length, TRUE, md);
4866
-  }
4867
-printf(" against backref ");
4868
-pchars(p, length, FALSE, md);
4869
-printf("\n");
4870
-#endif
4871
-
4872
-/* Always fail if not enough characters left */
4873
-
4874
-if (length > md->end_subject - eptr) return FALSE;
4875
-
4876
-/* Separate the caseless case for speed */
4877
-
4878
-if ((ims & PCRE_CASELESS) != 0)
4879
-  {
4880
-  while (length-- > 0)
4881
-    if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
4882
-  }
4883
-else
4884
-  { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
4885
-
4886
-return TRUE;
4887
-}
4888
-
4889
-
4890
-#ifdef SUPPORT_UTF8
4891
-/*************************************************
4892
-*       Match character against an XCLASS        *
4893
-*************************************************/
4894
-
4895
-/* This function is called from within the XCLASS code below, to match a
4896
-character against an extended class which might match values > 255.
4897
-
4898
-Arguments:
4899
-  c           the character
4900
-  data        points to the flag byte of the XCLASS data
4901
-
4902
-Returns:      TRUE if character matches, else FALSE
4903
-*/
4904
-
4905
-static BOOL
4906
-match_xclass(int c, const uschar *data)
4907
-{
4908
-int t;
4909
-BOOL negated = (*data & XCL_NOT) != 0;
4910
-
4911
-/* Character values < 256 are matched against a bitmap, if one is present. If
4912
-not, we still carry on, because there may be ranges that start below 256 in the
4913
-additional data. */
4914
-
4915
-if (c < 256)
4916
-  {
4917
-  if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
4918
-    return !negated;   /* char found */
4919
-  }
4920
-
4921
-/* Now match against the list of large chars or ranges that end with a large
4922
-char. First skip the bit map if present. */
4923
-
4924
-if ((*data++ & XCL_MAP) != 0) data += 32;
4925
-
4926
-while ((t = *data++) != XCL_END)
4927
-  {
4928
-  int x, y;
4929
-  GETCHARINC(x, data);
4930
-  if (t == XCL_SINGLE)
4931
-    {
4932
-    if (c == x) return !negated;
4933
-    }
4934
-  else
4935
-    {
4936
-    GETCHARINC(y, data);
4937
-    if (c >= x && c <= y) return !negated;
4938
-    }
4939
-  }
4940
-
4941
-return negated;   /* char was not found */
4942
-}
4943
-#endif
4944
-
4945
-
4946
-
4947
-
4948
-/*************************************************
4949
-*         Match from current position            *
4950
-*************************************************/
4951
-
4952
-/* On entry ecode points to the first opcode, and eptr to the first character
4953
-in the subject string, while eptrb holds the value of eptr at the start of the
4954
-last bracketed group - used for breaking infinite loops matching zero-length
4955
-strings. This function is called recursively in many circumstances. Whenever it
4956
-returns a negative (error) response, the outer incarnation must also return the
4957
-same response.
4958
-
4959
-Performance note: It might be tempting to extract commonly used fields from the
4960
-md structure (e.g. utf8, end_subject) into individual variables to improve
4961
-performance. Tests using gcc on a SPARC disproved this; in the first case, it
4962
-made performance worse.
4963
-
4964
-Arguments:
4965
-   eptr        pointer in subject
4966
-   ecode       position in code
4967
-   offset_top  current top pointer
4968
-   md          pointer to "static" info for the match
4969
-   ims         current /i, /m, and /s options
4970
-   eptrb       pointer to chain of blocks containing eptr at start of
4971
-                 brackets - for testing for empty matches
4972
-   flags       can contain
4973
-                 match_condassert - this is an assertion condition
4974
-                 match_isgroup - this is the start of a bracketed group
4975
-
4976
-Returns:       MATCH_MATCH if matched            )  these values are >= 0
4977
-               MATCH_NOMATCH if failed to match  )
4978
-               a negative PCRE_ERROR_xxx value if aborted by an error condition
4979
-                 (e.g. stopped by recursion limit)
4980
-*/
4981
-
4982
-static int
4983
-match(register const uschar *eptr, register const uschar *ecode,
4984
-  int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
4985
-  int flags)
4986
-{
4987
-unsigned long int original_ims = ims;   /* Save for resetting on ')' */
4988
-register int rrc;
4989
-eptrblock newptrb;
4990
-
4991
-if (md->match_call_count++ >= md->match_limit) return PCRE_ERROR_MATCHLIMIT;
4992
-
4993
-/* At the start of a bracketed group, add the current subject pointer to the
4994
-stack of such pointers, to be re-instated at the end of the group when we hit
4995
-the closing ket. When match() is called in other circumstances, we don't add to
4996
-the stack. */
4997
-
4998
-if ((flags & match_isgroup) != 0)
4999
-  {
5000
-  newptrb.prev = eptrb;
5001
-  newptrb.saved_eptr = eptr;
5002
-  eptrb = &newptrb;
5003
-  }
5004
-
5005
-/* Now start processing the operations. */
5006
-
5007
-for (;;)
5008
-  {
5009
-  int op = (int)*ecode;
5010
-  int min, max, ctype;
5011
-  register int i;
5012
-  register int c;
5013
-  BOOL minimize = FALSE;
5014
-
5015
-  /* Opening capturing bracket. If there is space in the offset vector, save
5016
-  the current subject position in the working slot at the top of the vector. We
5017
-  mustn't change the current values of the data slot, because they may be set
5018
-  from a previous iteration of this group, and be referred to by a reference
5019
-  inside the group.
5020
-
5021
-  If the bracket fails to match, we need to restore this value and also the
5022
-  values of the final offsets, in case they were set by a previous iteration of
5023
-  the same bracket.
5024
-
5025
-  If there isn't enough space in the offset vector, treat this as if it were a
5026
-  non-capturing bracket. Don't worry about setting the flag for the error case
5027
-  here; that is handled in the code for KET. */
5028
-
5029
-  if (op > OP_BRA)
5030
-    {
5031
-    int offset;
5032
-    int number = op - OP_BRA;
5033
-
5034
-    /* For extended extraction brackets (large number), we have to fish out the
5035
-    number from a dummy opcode at the start. */
5036
-
5037
-    if (number > EXTRACT_BASIC_MAX)
5038
-      number = GET2(ecode, 2+LINK_SIZE);
5039
-    offset = number << 1;
5040
-
5041
-#ifdef DEBUG
5042
-    printf("start bracket %d subject=", number);
5043
-    pchars(eptr, 16, TRUE, md);
5044
-    printf("\n");
5045
-#endif
5046
-
5047
-    if (offset < md->offset_max)
5048
-      {
5049
-      int save_offset1 = md->offset_vector[offset];
5050
-      int save_offset2 = md->offset_vector[offset+1];
5051
-      int save_offset3 = md->offset_vector[md->offset_end - number];
5052
-      int save_capture_last = md->capture_last;
5053
-
5054
-      DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
5055
-      md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
5056
-
5057
-      do
5058
-        {
5059
-        if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5060
-              eptrb, match_isgroup)) != MATCH_NOMATCH) return rrc;
5061
-        md->capture_last = save_capture_last;
5062
-        ecode += GET(ecode, 1);
5063
-        }
5064
-      while (*ecode == OP_ALT);
5065
-
5066
-      DPRINTF(("bracket %d failed\n", number));
5067
-
5068
-      md->offset_vector[offset] = save_offset1;
5069
-      md->offset_vector[offset+1] = save_offset2;
5070
-      md->offset_vector[md->offset_end - number] = save_offset3;
5071
-
5072
-      return MATCH_NOMATCH;
5073
-      }
5074
-
5075
-    /* Insufficient room for saving captured contents */
5076
-
5077
-    else op = OP_BRA;
5078
-    }
5079
-
5080
-  /* Other types of node can be handled by a switch */
5081
-
5082
-  switch(op)
5083
-    {
5084
-    case OP_BRA:     /* Non-capturing bracket: optimized */
5085
-    DPRINTF(("start bracket 0\n"));
5086
-    do
5087
-      {
5088
-      if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5089
-        match_isgroup)) != MATCH_NOMATCH) return rrc;
5090
-      ecode += GET(ecode, 1);
5091
-      }
5092
-    while (*ecode == OP_ALT);
5093
-    DPRINTF(("bracket 0 failed\n"));
5094
-    return MATCH_NOMATCH;
5095
-
5096
-    /* Conditional group: compilation checked that there are no more than
5097
-    two branches. If the condition is false, skipping the first branch takes us
5098
-    past the end if there is only one branch, but that's OK because that is
5099
-    exactly what going to the ket would do. */
5100
-
5101
-    case OP_COND:
5102
-    if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
5103
-      {
5104
-      int offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
5105
-      BOOL condition = (offset == CREF_RECURSE * 2)?
5106
-        (md->recursive != NULL) :
5107
-        (offset < offset_top && md->offset_vector[offset] >= 0);
5108
-      return match(eptr, ecode + (condition?
5109
-        (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
5110
-        offset_top, md, ims, eptrb, match_isgroup);
5111
-      }
5112
-
5113
-    /* The condition is an assertion. Call match() to evaluate it - setting
5114
-    the final argument TRUE causes it to stop at the end of an assertion. */
5115
-
5116
-    else
5117
-      {
5118
-      if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5119
-          match_condassert | match_isgroup)) == MATCH_MATCH)
5120
-        {
5121
-        ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
5122
-        while (*ecode == OP_ALT) ecode += GET(ecode, 1);
5123
-        }
5124
-      else if (rrc != MATCH_NOMATCH) return rrc;
5125
-      else ecode += GET(ecode, 1);
5126
-      return match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5127
-        match_isgroup);
5128
-      }
5129
-    /* Control never reaches here */
5130
-
5131
-    /* Skip over conditional reference or large extraction number data if
5132
-    encountered. */
5133
-
5134
-    case OP_CREF:
5135
-    case OP_BRANUMBER:
5136
-    ecode += 3;
5137
-    break;
5138
-
5139
-    /* End of the pattern. If we are in a recursion, we should restore the
5140
-    offsets appropriately and continue from after the call. */
5141
-
5142
-    case OP_END:
5143
-    if (md->recursive != NULL && md->recursive->group_num == 0)
5144
-      {
5145
-      recursion_info *rec = md->recursive;
5146
-      DPRINTF(("Hit the end in a (?0) recursion\n"));
5147
-      md->recursive = rec->prev;
5148
-      memmove(md->offset_vector, rec->offset_save,
5149
-        rec->saved_max * sizeof(int));
5150
-      md->start_match = rec->save_start;
5151
-      ims = original_ims;
5152
-      ecode = rec->after_call;
5153
-      break;
5154
-      }
5155
-
5156
-    /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
5157
-    string - backtracking will then try other alternatives, if any. */
5158
-
5159
-    if (md->notempty && eptr == md->start_match) return MATCH_NOMATCH;
5160
-    md->end_match_ptr = eptr;          /* Record where we ended */
5161
-    md->end_offset_top = offset_top;   /* and how many extracts were taken */
5162
-    return MATCH_MATCH;
5163
-
5164
-    /* Change option settings */
5165
-
5166
-    case OP_OPT:
5167
-    ims = ecode[1];
5168
-    ecode += 2;
5169
-    DPRINTF(("ims set to %02lx\n", ims));
5170
-    break;
5171
-
5172
-    /* Assertion brackets. Check the alternative branches in turn - the
5173
-    matching won't pass the KET for an assertion. If any one branch matches,
5174
-    the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
5175
-    start of each branch to move the current point backwards, so the code at
5176
-    this level is identical to the lookahead case. */
5177
-
5178
-    case OP_ASSERT:
5179
-    case OP_ASSERTBACK:
5180
-    do
5181
-      {
5182
-      if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5183
-        match_isgroup)) == MATCH_MATCH) break;
5184
-      if (rrc != MATCH_NOMATCH) return rrc;
5185
-      ecode += GET(ecode, 1);
5186
-      }
5187
-    while (*ecode == OP_ALT);
5188
-    if (*ecode == OP_KET) return MATCH_NOMATCH;
5189
-
5190
-    /* If checking an assertion for a condition, return MATCH_MATCH. */
5191
-
5192
-    if ((flags & match_condassert) != 0) return MATCH_MATCH;
5193
-
5194
-    /* Continue from after the assertion, updating the offsets high water
5195
-    mark, since extracts may have been taken during the assertion. */
5196
-
5197
-    do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5198
-    ecode += 1 + LINK_SIZE;
5199
-    offset_top = md->end_offset_top;
5200
-    continue;
5201
-
5202
-    /* Negative assertion: all branches must fail to match */
5203
-
5204
-    case OP_ASSERT_NOT:
5205
-    case OP_ASSERTBACK_NOT:
5206
-    do
5207
-      {
5208
-      if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5209
-        match_isgroup)) == MATCH_MATCH) return MATCH_NOMATCH;
5210
-      if (rrc != MATCH_NOMATCH) return rrc;
5211
-      ecode += GET(ecode,1);
5212
-      }
5213
-    while (*ecode == OP_ALT);
5214
-
5215
-    if ((flags & match_condassert) != 0) return MATCH_MATCH;
5216
-
5217
-    ecode += 1 + LINK_SIZE;
5218
-    continue;
5219
-
5220
-    /* Move the subject pointer back. This occurs only at the start of
5221
-    each branch of a lookbehind assertion. If we are too close to the start to
5222
-    move back, this match function fails. When working with UTF-8 we move
5223
-    back a number of characters, not bytes. */
5224
-
5225
-    case OP_REVERSE:
5226
-#ifdef SUPPORT_UTF8
5227
-    if (md->utf8)
5228
-      {
5229
-      c = GET(ecode,1);
5230
-      for (i = 0; i < c; i++)
5231
-        {
5232
-        eptr--;
5233
-        if (eptr < md->start_subject) return MATCH_NOMATCH;
5234
-        BACKCHAR(eptr)
5235
-        }
5236
-      }
5237
-    else
5238
-#endif
5239
-
5240
-    /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
5241
-
5242
-      {
5243
-      eptr -= GET(ecode,1);
5244
-      if (eptr < md->start_subject) return MATCH_NOMATCH;
5245
-      }
5246
-
5247
-    /* Skip to next op code */
5248
-
5249
-    ecode += 1 + LINK_SIZE;
5250
-    break;
5251
-
5252
-    /* The callout item calls an external function, if one is provided, passing
5253
-    details of the match so far. This is mainly for debugging, though the
5254
-    function is able to force a failure. */
5255
-
5256
-    case OP_CALLOUT:
5257
-    if (pcre_callout != NULL)
5258
-      {
5259
-      pcre_callout_block cb;
5260
-      cb.version          = 0;   /* Version 0 of the callout block */
5261
-      cb.callout_number   = ecode[1];
5262
-      cb.offset_vector    = md->offset_vector;
5263
-      cb.subject          = (const char *)md->start_subject;
5264
-      cb.subject_length   = md->end_subject - md->start_subject;
5265
-      cb.start_match      = md->start_match - md->start_subject;
5266
-      cb.current_position = eptr - md->start_subject;
5267
-      cb.capture_top      = offset_top/2;
5268
-      cb.capture_last     = md->capture_last;
5269
-      cb.callout_data     = md->callout_data;
5270
-      if ((rrc = (*pcre_callout)(&cb)) > 0) return MATCH_NOMATCH;
5271
-      if (rrc < 0) return rrc;
5272
-      }
5273
-    ecode += 2;
5274
-    break;
5275
-
5276
-    /* Recursion either matches the current regex, or some subexpression. The
5277
-    offset data is the offset to the starting bracket from the start of the
5278
-    whole pattern. However, it is possible that a BRAZERO was inserted before
5279
-    this bracket after we took the offset - we just skip it if encountered.
5280
-
5281
-    If there are any capturing brackets started but not finished, we have to
5282
-    save their starting points and reinstate them after the recursion. However,
5283
-    we don't know how many such there are (offset_top records the completed
5284
-    total) so we just have to save all the potential data. There may be up to
5285
-    65535 such values, which is too large to put on the stack, but using malloc
5286
-    for small numbers seems expensive. As a compromise, the stack is used when
5287
-    there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
5288
-    is used. A problem is what to do if the malloc fails ... there is no way of
5289
-    returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
5290
-    values on the stack, and accept that the rest may be wrong.
5291
-
5292
-    There are also other values that have to be saved. We use a chained
5293
-    sequence of blocks that actually live on the stack. Thanks to Robin Houston
5294
-    for the original version of this logic. */
5295
-
5296
-    case OP_RECURSE:
5297
-      {
5298
-      int stacksave[REC_STACK_SAVE_MAX];
5299
-      recursion_info new_recursive;
5300
-      const uschar *callpat = md->start_code + GET(ecode, 1);
5301
-
5302
-      if (*callpat == OP_BRAZERO) callpat++;
5303
-
5304
-      new_recursive.group_num = *callpat - OP_BRA;
5305
-
5306
-      /* For extended extraction brackets (large number), we have to fish out
5307
-      the number from a dummy opcode at the start. */
5308
-
5309
-      if (new_recursive.group_num > EXTRACT_BASIC_MAX)
5310
-        new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
5311
-
5312
-      /* Add to "recursing stack" */
5313
-
5314
-      new_recursive.prev = md->recursive;
5315
-      md->recursive = &new_recursive;
5316
-
5317
-      /* Find where to continue from afterwards */
5318
-
5319
-      ecode += 1 + LINK_SIZE;
5320
-      new_recursive.after_call = ecode;
5321
-
5322
-      /* Now save the offset data. */
5323
-
5324
-      new_recursive.saved_max = md->offset_end;
5325
-      if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
5326
-        new_recursive.offset_save = stacksave;
5327
-      else
5328
-        {
5329
-        new_recursive.offset_save =
5330
-          (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
5331
-        if (new_recursive.offset_save == NULL) return PCRE_ERROR_NOMEMORY;
5332
-        }
5333
-
5334
-      memcpy(new_recursive.offset_save, md->offset_vector,
5335
-            new_recursive.saved_max * sizeof(int));
5336
-      new_recursive.save_start = md->start_match;
5337
-      md->start_match = eptr;
5338
-
5339
-      /* OK, now we can do the recursion. For each top-level alternative we
5340
-      restore the offset and recursion data. */
5341
-
5342
-      DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
5343
-      do
5344
-        {
5345
-        if ((rrc = match(eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
5346
-            eptrb, match_isgroup)) == MATCH_MATCH)
5347
-          {
5348
-          md->recursive = new_recursive.prev;
5349
-          if (new_recursive.offset_save != stacksave)
5350
-            (pcre_free)(new_recursive.offset_save);
5351
-          return MATCH_MATCH;
5352
-          }
5353
-        else if (rrc != MATCH_NOMATCH) return rrc;
5354
-
5355
-        md->recursive = &new_recursive;
5356
-        memcpy(md->offset_vector, new_recursive.offset_save,
5357
-            new_recursive.saved_max * sizeof(int));
5358
-        callpat += GET(callpat, 1);
5359
-        }
5360
-      while (*callpat == OP_ALT);
5361
-
5362
-      DPRINTF(("Recursion didn't match\n"));
5363
-      md->recursive = new_recursive.prev;
5364
-      if (new_recursive.offset_save != stacksave)
5365
-        (pcre_free)(new_recursive.offset_save);
5366
-      return MATCH_NOMATCH;
5367
-      }
5368
-    /* Control never reaches here */
5369
-
5370
-    /* "Once" brackets are like assertion brackets except that after a match,
5371
-    the point in the subject string is not moved back. Thus there can never be
5372
-    a move back into the brackets. Friedl calls these "atomic" subpatterns.
5373
-    Check the alternative branches in turn - the matching won't pass the KET
5374
-    for this kind of subpattern. If any one branch matches, we carry on as at
5375
-    the end of a normal bracket, leaving the subject pointer. */
5376
-
5377
-    case OP_ONCE:
5378
-      {
5379
-      const uschar *prev = ecode;
5380
-      const uschar *saved_eptr = eptr;
5381
-
5382
-      do
5383
-        {
5384
-        if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5385
-          eptrb, match_isgroup)) == MATCH_MATCH) break;
5386
-        if (rrc != MATCH_NOMATCH) return rrc;
5387
-        ecode += GET(ecode,1);
5388
-        }
5389
-      while (*ecode == OP_ALT);
5390
-
5391
-      /* If hit the end of the group (which could be repeated), fail */
5392
-
5393
-      if (*ecode != OP_ONCE && *ecode != OP_ALT) return MATCH_NOMATCH;
5394
-
5395
-      /* Continue as from after the assertion, updating the offsets high water
5396
-      mark, since extracts may have been taken. */
5397
-
5398
-      do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5399
-
5400
-      offset_top = md->end_offset_top;
5401
-      eptr = md->end_match_ptr;
5402
-
5403
-      /* For a non-repeating ket, just continue at this level. This also
5404
-      happens for a repeating ket if no characters were matched in the group.
5405
-      This is the forcible breaking of infinite loops as implemented in Perl
5406
-      5.005. If there is an options reset, it will get obeyed in the normal
5407
-      course of events. */
5408
-
5409
-      if (*ecode == OP_KET || eptr == saved_eptr)
5410
-        {
5411
-        ecode += 1+LINK_SIZE;
5412
-        break;
5413
-        }
5414
-
5415
-      /* The repeating kets try the rest of the pattern or restart from the
5416
-      preceding bracket, in the appropriate order. We need to reset any options
5417
-      that changed within the bracket before re-running it, so check the next
5418
-      opcode. */
5419
-
5420
-      if (ecode[1+LINK_SIZE] == OP_OPT)
5421
-        {
5422
-        ims = (ims & ~PCRE_IMS) | ecode[4];
5423
-        DPRINTF(("ims set to %02lx at group repeat\n", ims));
5424
-        }
5425
-
5426
-      if (*ecode == OP_KETRMIN)
5427
-        {
5428
-        if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5429
-             eptrb, 0)) != MATCH_NOMATCH) return rrc;
5430
-        if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5431
-             match_isgroup)) != MATCH_NOMATCH) return rrc;
5432
-        }
5433
-      else  /* OP_KETRMAX */
5434
-        {
5435
-        if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5436
-             match_isgroup)) != MATCH_NOMATCH) return rrc;
5437
-        if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5438
-             0)) != MATCH_NOMATCH) return rrc;
5439
-        }
5440
-      }
5441
-    return MATCH_NOMATCH;
5442
-
5443
-    /* An alternation is the end of a branch; scan along to find the end of the
5444
-    bracketed group and go to there. */
5445
-
5446
-    case OP_ALT:
5447
-    do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5448
-    break;
5449
-
5450
-    /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
5451
-    that it may occur zero times. It may repeat infinitely, or not at all -
5452
-    i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
5453
-    repeat limits are compiled as a number of copies, with the optional ones
5454
-    preceded by BRAZERO or BRAMINZERO. */
5455
-
5456
-    case OP_BRAZERO:
5457
-      {
5458
-      const uschar *next = ecode+1;
5459
-      if ((rrc = match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
5460
-           != MATCH_NOMATCH) return rrc;
5461
-      do next += GET(next,1); while (*next == OP_ALT);
5462
-      ecode = next + 1+LINK_SIZE;
5463
-      }
5464
-    break;
5465
-
5466
-    case OP_BRAMINZERO:
5467
-      {
5468
-      const uschar *next = ecode+1;
5469
-      do next += GET(next,1); while (*next == OP_ALT);
5470
-      if ((rrc = match(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5471
-        match_isgroup)) != MATCH_NOMATCH) return rrc;
5472
-      ecode++;
5473
-      }
5474
-    break;
5475
-
5476
-    /* End of a group, repeated or non-repeating. If we are at the end of
5477
-    an assertion "group", stop matching and return MATCH_MATCH, but record the
5478
-    current high water mark for use by positive assertions. Do this also
5479
-    for the "once" (not-backup up) groups. */
5480
-
5481
-    case OP_KET:
5482
-    case OP_KETRMIN:
5483
-    case OP_KETRMAX:
5484
-      {
5485
-      const uschar *prev = ecode - GET(ecode, 1);
5486
-      const uschar *saved_eptr = eptrb->saved_eptr;
5487
-
5488
-      eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
5489
-
5490
-      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
5491
-          *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
5492
-          *prev == OP_ONCE)
5493
-        {
5494
-        md->end_match_ptr = eptr;      /* For ONCE */
5495
-        md->end_offset_top = offset_top;
5496
-        return MATCH_MATCH;
5497
-        }
5498
-
5499
-      /* In all other cases except a conditional group we have to check the
5500
-      group number back at the start and if necessary complete handling an
5501
-      extraction by setting the offsets and bumping the high water mark. */
5502
-
5503
-      if (*prev != OP_COND)
3864
+        
3865
+        item_count++;    /* Is zero for the first non-comment item */
3866
+        
3867
+        switch(c)
5504 3868
         {
5505
-        int offset;
5506
-        int number = *prev - OP_BRA;
5507
-
5508
-        /* For extended extraction brackets (large number), we have to fish out
5509
-        the number from a dummy opcode at the start. */
5510
-
5511
-        if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
5512
-        offset = number << 1;
5513
-
5514
-#ifdef DEBUG
5515
-        printf("end bracket %d", number);
5516
-        printf("\n");
5517
-#endif
5518
-
5519
-        /* Test for a numbered group. This includes groups called as a result
5520
-        of recursion. Note that whole-pattern recursion is coded as a recurse
5521
-        into group 0, so it won't be picked up here. Instead, we catch it when
5522
-        the OP_END is reached. */
5523
-
5524
-        if (number > 0)
5525
-          {
5526
-          md->capture_last = number;
5527
-          if (offset >= md->offset_max) md->offset_overflow = TRUE; else
5528
-            {
5529
-            md->offset_vector[offset] =
5530
-              md->offset_vector[md->offset_end - number];
5531
-            md->offset_vector[offset+1] = eptr - md->start_subject;
5532
-            if (offset_top <= offset) offset_top = offset + 2;
5533
-            }
5534
-
5535
-          /* Handle a recursively called group. Restore the offsets
5536
-          appropriately and continue from after the call. */
5537
-
5538
-          if (md->recursive != NULL && md->recursive->group_num == number)
3869
+                /* A backslashed item may be an escaped "normal" character or a
3870
+                 character type. For a "normal" character, put the pointers and
3871
+                 character back so that tests for whitespace etc. in the input
3872
+                 are done correctly. */
3873
+                
3874
+            case '\\':
5539 3875
             {
5540
-            recursion_info *rec = md->recursive;
5541
-            DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
5542
-            md->recursive = rec->prev;
5543
-            md->start_match = rec->save_start;
5544
-            memcpy(md->offset_vector, rec->offset_save,
5545
-              rec->saved_max * sizeof(int));
5546
-            ecode = rec->after_call;
5547
-            ims = original_ims;
5548
-            break;
3876
+                const uschar *save_ptr = ptr;
3877
+                c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
3878
+                if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3879
+                if (c >= 0)
3880
+                {
3881
+                    ptr = save_ptr;
3882
+                    c = '\\';
3883
+                    goto NORMAL_CHAR;
3884
+                }
5549 3885
             }
5550
-          }
5551
-        }
5552
-
5553
-      /* Reset the value of the ims flags, in case they got changed during
5554
-      the group. */
5555
-
5556
-      ims = original_ims;
5557
-      DPRINTF(("ims reset to %02lx\n", ims));
5558
-
5559
-      /* For a non-repeating ket, just continue at this level. This also
5560
-      happens for a repeating ket if no characters were matched in the group.
5561
-      This is the forcible breaking of infinite loops as implemented in Perl
5562
-      5.005. If there is an options reset, it will get obeyed in the normal
5563
-      course of events. */
5564
-
5565
-      if (*ecode == OP_KET || eptr == saved_eptr)
5566
-        {
5567
-        ecode += 1 + LINK_SIZE;
5568
-        break;
5569
-        }
5570
-
5571
-      /* The repeating kets try the rest of the pattern or restart from the
5572
-      preceding bracket, in the appropriate order. */
5573
-
5574
-      if (*ecode == OP_KETRMIN)
5575
-        {
5576
-        if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5577
-             0)) != MATCH_NOMATCH) return rrc;
5578
-        if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5579
-             match_isgroup)) != MATCH_NOMATCH) return rrc;
5580
-        }
5581
-      else  /* OP_KETRMAX */
5582
-        {
5583
-        if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5584
-             match_isgroup)) != MATCH_NOMATCH) return rrc;
5585
-        if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5586
-             0)) != MATCH_NOMATCH) return rrc;
5587
-        }
5588
-      }
5589
-    return MATCH_NOMATCH;
5590
-
5591
-    /* Start of subject unless notbol, or after internal newline if multiline */
5592
-
5593
-    case OP_CIRC:
5594
-    if (md->notbol && eptr == md->start_subject) return MATCH_NOMATCH;
5595
-    if ((ims & PCRE_MULTILINE) != 0)
5596
-      {
5597
-      if (eptr != md->start_subject && eptr[-1] != NEWLINE)
5598
-        return MATCH_NOMATCH;
5599
-      ecode++;
5600
-      break;
5601
-      }
5602
-    /* ... else fall through */
5603
-
5604
-    /* Start of subject assertion */
5605
-
5606
-    case OP_SOD:
5607
-    if (eptr != md->start_subject) return MATCH_NOMATCH;
5608
-    ecode++;
5609
-    break;
5610
-
5611
-    /* Start of match assertion */
5612
-
5613
-    case OP_SOM:
5614
-    if (eptr != md->start_subject + md->start_offset) return MATCH_NOMATCH;
5615
-    ecode++;
5616
-    break;
5617
-
5618
-    /* Assert before internal newline if multiline, or before a terminating
5619
-    newline unless endonly is set, else end of subject unless noteol is set. */
5620
-
5621
-    case OP_DOLL:
5622
-    if ((ims & PCRE_MULTILINE) != 0)
5623
-      {
5624
-      if (eptr < md->end_subject)
5625
-        { if (*eptr != NEWLINE) return MATCH_NOMATCH; }
5626
-      else
5627
-        { if (md->noteol) return MATCH_NOMATCH; }
5628
-      ecode++;
5629
-      break;
5630
-      }
5631
-    else
5632
-      {
5633
-      if (md->noteol) return MATCH_NOMATCH;
5634
-      if (!md->endonly)
5635
-        {
5636
-        if (eptr < md->end_subject - 1 ||
5637
-           (eptr == md->end_subject - 1 && *eptr != NEWLINE))
5638
-          return MATCH_NOMATCH;
5639
-        ecode++;
5640
-        break;
5641
-        }
5642
-      }
5643
-    /* ... else fall through */
5644
-
5645
-    /* End of subject assertion (\z) */
5646
-
5647
-    case OP_EOD:
5648
-    if (eptr < md->end_subject) return MATCH_NOMATCH;
5649
-    ecode++;
5650
-    break;
5651
-
5652
-    /* End of subject or ending \n assertion (\Z) */
5653
-
5654
-    case OP_EODN:
5655
-    if (eptr < md->end_subject - 1 ||
5656
-       (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return MATCH_NOMATCH;
5657
-    ecode++;
5658
-    break;
5659
-
5660
-    /* Word boundary assertions */
5661
-
5662
-    case OP_NOT_WORD_BOUNDARY:
5663
-    case OP_WORD_BOUNDARY:
5664
-      {
5665
-      BOOL prev_is_word, cur_is_word;
5666
-
5667
-      /* Find out if the previous and current characters are "word" characters.
5668
-      It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
5669
-      be "non-word" characters. */
5670
-
5671
-#ifdef SUPPORT_UTF8
5672
-      if (md->utf8)
5673
-        {
5674
-        if (eptr == md->start_subject) prev_is_word = FALSE; else
5675
-          {
5676
-          const uschar *lastptr = eptr - 1;
5677
-          while((*lastptr & 0xc0) == 0x80) lastptr--;
5678
-          GETCHAR(c, lastptr);
5679
-          prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5680
-          }
5681
-        if (eptr >= md->end_subject) cur_is_word = FALSE; else
5682
-          {
5683
-          GETCHAR(c, eptr);
5684
-          cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5685
-          }
5686
-        }
5687
-      else
5688
-#endif
5689
-
5690
-      /* More streamlined when not in UTF-8 mode */
5691
-
5692
-        {
5693
-        prev_is_word = (eptr != md->start_subject) &&
5694
-          ((md->ctypes[eptr[-1]] & ctype_word) != 0);
5695
-        cur_is_word = (eptr < md->end_subject) &&
5696
-          ((md->ctypes[*eptr] & ctype_word) != 0);
5697
-        }
5698
-
5699
-      /* Now see if the situation is what we want */
5700
-
5701
-      if ((*ecode++ == OP_WORD_BOUNDARY)?
5702
-           cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5703
-        return MATCH_NOMATCH;
5704
-      }
5705
-    break;
5706
-
5707
-    /* Match a single character type; inline for speed */
5708
-
5709
-    case OP_ANY:
5710
-    if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
5711
-      return MATCH_NOMATCH;
5712
-    if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5713
-#ifdef SUPPORT_UTF8
5714
-    if (md->utf8)
5715
-      while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5716
-#endif
5717
-    ecode++;
5718
-    break;
5719
-
5720
-    /* Match a single byte, even in UTF-8 mode. This opcode really does match
5721
-    any byte, even newline, independent of the setting of PCRE_DOTALL. */
5722
-
5723
-    case OP_ANYBYTE:
5724
-    if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5725
-    ecode++;
5726
-    break;
5727
-
5728
-    case OP_NOT_DIGIT:
5729
-    if (eptr >= md->end_subject) return MATCH_NOMATCH;
5730
-    GETCHARINCTEST(c, eptr);
5731
-    if (
3886
+                
3887
+                /* If \Q, enter "literal" mode */
3888
+                
3889
+                if (-c == ESC_Q)
3890
+                {
3891
+                    inescq = TRUE;
3892
+                    continue;
3893
+                }
3894
+                
3895
+                /* Other escapes need one byte, and are of length one for repeats */
3896
+                
3897
+                length++;
5732 3898
 #ifdef SUPPORT_UTF8
5733
-       c < 256 &&
3899
+                lastcharlength = 1;
5734 3900
 #endif
5735
-       (md->ctypes[c] & ctype_digit) != 0
5736
-       )
5737
-      return MATCH_NOMATCH;
5738
-    ecode++;
5739
-    break;
5740
-
5741
-    case OP_DIGIT:
5742
-    if (eptr >= md->end_subject) return MATCH_NOMATCH;
5743
-    GETCHARINCTEST(c, eptr);
5744
-    if (
3901
+                
3902
+                /* A back reference needs an additional 2 bytes, plus either one or 5
3903
+                 bytes for a repeat. We also need to keep the value of the highest
3904
+                 back reference. */
3905
+                
3906
+                if (c <= -ESC_REF)
3907
+                {
3908
+                    int refnum = -c - ESC_REF;
3909
+                    compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3910
+                    if (refnum > compile_block.top_backref)
3911
+                        compile_block.top_backref = refnum;
3912
+                    length += 2;   /* For single back reference */
3913
+                    if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3914
+                    {
3915
+                        ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
3916
+                        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3917
+                        if ((min == 0 && (max == 1 || max == -1)) ||
3918
+                            (min == 1 && max == -1))
3919
+                            length++;
3920
+                        else length += 5;
3921
+                        if (ptr[1] == '?') ptr++;
3922
+                    }
3923
+                }
3924
+                continue;
3925
+                
3926
+            case '^':     /* Single-byte metacharacters */
3927
+            case '.':
3928
+            case '$':
3929
+                length++;
5745 3930
 #ifdef SUPPORT_UTF8
5746
-       c >= 256 ||
3931
+                lastcharlength = 1;
5747 3932
 #endif
5748
-       (md->ctypes[c] & ctype_digit) == 0
5749
-       )
5750
-      return MATCH_NOMATCH;
5751
-    ecode++;
5752
-    break;
5753
-
5754
-    case OP_NOT_WHITESPACE:
5755
-    if (eptr >= md->end_subject) return MATCH_NOMATCH;
5756
-    GETCHARINCTEST(c, eptr);
5757
-    if (
3933
+                continue;
3934
+                
3935
+            case '*':            /* These repeats won't be after brackets; */
3936
+            case '+':            /* those are handled separately */
3937
+            case '?':
3938
+                length++;
3939
+                goto POSESSIVE;      /* A few lines below */
3940
+                
3941
+                /* This covers the cases of braced repeats after a single char, metachar,
3942
+                 class, or back reference. */
3943
+                
3944
+            case '{':
3945
+                if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3946
+                ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3947
+                if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3948
+                
3949
+                /* These special cases just insert one extra opcode */
3950
+                
3951
+                if ((min == 0 && (max == 1 || max == -1)) ||
3952
+                    (min == 1 && max == -1))
3953
+                    length++;
3954
+                
3955
+                /* These cases might insert additional copies of a preceding character. */
3956
+                
3957
+                else
3958
+                {
5758 3959
 #ifdef SUPPORT_UTF8
5759
-       c < 256 &&
3960
+                    /* In UTF-8 mode, we should find the length in lastcharlength */
3961
+                    if (utf8)
3962
+                    {
3963
+                        if (min != 1)
3964
+                        {
3965
+                            length -= lastcharlength;   /* Uncount the original char or metachar */
3966
+                            if (min > 0) length += 3 + lastcharlength;
3967
+                        }
3968
+                        length += lastcharlength + ((max > 0)? 3 : 1);
3969
+                    }
3970
+                    else
5760 3971
 #endif
5761
-       (md->ctypes[c] & ctype_space) != 0
5762
-       )
5763
-      return MATCH_NOMATCH;
5764
-    ecode++;
5765
-    break;
5766
-
5767
-    case OP_WHITESPACE:
5768
-    if (eptr >= md->end_subject) return MATCH_NOMATCH;
5769
-    GETCHARINCTEST(c, eptr);
5770
-    if (
3972
+                        
3973
+                    /* Not UTF-8 mode: all characters are one byte */
3974
+                    {
3975
+                        if (min != 1)
3976
+                        {
3977
+                            length--;   /* Uncount the original char or metachar */
3978
+                            if (min > 0) length += 4;
3979
+                        }
3980
+                        
3981
+                        length += (max > 0)? 4 : 2;
3982
+                    }
3983
+                }
3984
+                
3985
+                if (ptr[1] == '?') ptr++;      /* Needs no extra length */
3986
+                
3987
+            POSESSIVE:                     /* Test for possessive quantifier */
3988
+                if (ptr[1] == '+')
3989
+                {
3990
+                    ptr++;
3991
+                    length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
3992
+                }
3993
+                continue;
3994
+                
3995
+                /* An alternation contains an offset to the next branch or ket. If any ims
3996
+                 options changed in the previous branch(es), and/or if we are in a
3997
+                 lookbehind assertion, extra space will be needed at the start of the
3998
+                 branch. This is handled by branch_extra. */
3999
+                
4000
+            case '|':
4001
+                length += 1 + LINK_SIZE + branch_extra;
4002
+                continue;
4003
+                
4004
+                /* A character class uses 33 characters provided that all the character
4005
+                 values are less than 256. Otherwise, it uses a bit map for low valued
4006
+                 characters, and individual items for others. Don't worry about character
4007
+                 types that aren't allowed in classes - they'll get picked up during the
4008
+                 compile. A character class that contains only one single-byte character
4009
+                 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4010
+                 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4011
+                
4012
+            case '[':
4013
+                class_optcount = 0;
4014
+                
5771 4015
 #ifdef SUPPORT_UTF8
5772
-       c >= 256 ||
4016
+                class_utf8 = FALSE;
5773 4017
 #endif
5774
-       (md->ctypes[c] & ctype_space) == 0
5775
-       )
5776
-      return MATCH_NOMATCH;
5777
-    ecode++;
5778
-    break;
5779
-
5780
-    case OP_NOT_WORDCHAR:
5781
-    if (eptr >= md->end_subject) return MATCH_NOMATCH;
5782
-    GETCHARINCTEST(c, eptr);
5783
-    if (
4018
+                
4019
+                if (*(++ptr) == '^') ptr++;
4020
+                
4021
+                /* Written as a "do" so that an initial ']' is taken as data */
4022
+                
4023
+                if (*ptr != 0) do
4024
+                {
4025
+                    /* Inside \Q...\E everything is literal except \E */
4026
+                    
4027
+                    if (inescq)
4028
+                    {
4029
+                        if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
4030
+                        inescq = FALSE;
4031
+                        ptr += 1;
4032
+                        continue;
4033
+                    }
4034
+                    
4035
+                    /* Outside \Q...\E, check for escapes */
4036
+                    
4037
+                    if (*ptr == '\\')
4038
+                    {
5784 4039
 #ifdef SUPPORT_UTF8
5785
-       c < 256 &&
4040
+                        int prevchar = ptr[-1];
5786 4041
 #endif
5787
-       (md->ctypes[c] & ctype_word) != 0
5788
-       )
5789
-      return MATCH_NOMATCH;
5790
-    ecode++;
5791
-    break;
5792
-
5793
-    case OP_WORDCHAR:
5794
-    if (eptr >= md->end_subject) return MATCH_NOMATCH;
5795
-    GETCHARINCTEST(c, eptr);
5796
-    if (
4042
+                        int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
4043
+                                              &compile_block);
4044
+                        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4045
+                        
4046
+                        /* \b is backspace inside a class */
4047
+                        
4048
+                        if (-ch == ESC_b) ch = '\b';
4049
+                        
4050
+                        /* \Q enters quoting mode */
4051
+                        
4052
+                        if (-ch == ESC_Q)
4053
+                        {
4054
+                            inescq = TRUE;
4055
+                            continue;
4056
+                        }
4057
+                        
4058
+                        /* Handle escapes that turn into characters */
4059
+                        
4060
+                        if (ch >= 0)
4061
+                        {
5797 4062
 #ifdef SUPPORT_UTF8
5798
-       c >= 256 ||
4063
+                            if (utf8)
4064
+                            {
4065
+                                if (ch > 127) class_optcount = 10;  /* Ensure > 1 */
4066
+                                if (ch > 255)
4067
+                                {
4068
+                                    uschar buffer[6];
4069
+                                    if (!class_utf8)
4070
+                                    {
4071
+                                        class_utf8 = TRUE;
4072
+                                        length += LINK_SIZE + 1 + 1;
4073
+                                    }
4074
+                                    length += 1 + ord2utf8(ch, buffer);
4075
+                                    
4076
+                                    /* If this wide character is preceded by '-', add an extra 2 to
4077
+                                     the length in case the previous character was < 128, because in
4078
+                                     this case the whole range will be put into the list. */
4079
+                                    
4080
+                                    if (prevchar == '-') length += 2;
4081
+                                }
4082
+                            }
5799 4083
 #endif
5800
-       (md->ctypes[c] & ctype_word) == 0
5801
-       )
5802
-      return MATCH_NOMATCH;
5803
-    ecode++;
5804
-    break;
5805
-
5806
-    /* Match a back reference, possibly repeatedly. Look past the end of the
5807
-    item to see if there is repeat information following. The code is similar
5808
-    to that for character classes, but repeated for efficiency. Then obey
5809
-    similar code to character type repeats - written out again for speed.
5810
-    However, if the referenced string is the empty string, always treat
5811
-    it as matched, any number of times (otherwise there could be infinite
5812
-    loops). */
5813
-
5814
-    case OP_REF:
5815
-      {
5816
-      int length;
5817
-      int offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
5818
-      ecode += 3;                                     /* Advance past item */
5819
-
5820
-      /* If the reference is unset, set the length to be longer than the amount
5821
-      of subject left; this ensures that every attempt at a match fails. We
5822
-      can't just fail here, because of the possibility of quantifiers with zero
5823
-      minima. */
5824
-
5825
-      length = (offset >= offset_top || md->offset_vector[offset] < 0)?
5826
-        md->end_subject - eptr + 1 :
5827
-        md->offset_vector[offset+1] - md->offset_vector[offset];
5828
-
5829
-      /* Set up for repetition, or handle the non-repeated case */
5830
-
5831
-      switch (*ecode)
5832
-        {
5833
-        case OP_CRSTAR:
5834
-        case OP_CRMINSTAR:
5835
-        case OP_CRPLUS:
5836
-        case OP_CRMINPLUS:
5837
-        case OP_CRQUERY:
5838
-        case OP_CRMINQUERY:
5839
-        c = *ecode++ - OP_CRSTAR;
5840
-        minimize = (c & 1) != 0;
5841
-        min = rep_min[c];                 /* Pick up values from tables; */
5842
-        max = rep_max[c];                 /* zero for max => infinity */
5843
-        if (max == 0) max = INT_MAX;
5844
-        break;
5845
-
5846
-        case OP_CRRANGE:
5847
-        case OP_CRMINRANGE:
5848
-        minimize = (*ecode == OP_CRMINRANGE);
5849
-        min = GET2(ecode, 1);
5850
-        max = GET2(ecode, 3);
5851
-        if (max == 0) max = INT_MAX;
5852
-        ecode += 5;
5853
-        break;
5854
-
5855
-        default:               /* No repeat follows */
5856
-        if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5857
-        eptr += length;
5858
-        continue;              /* With the main loop */
5859
-        }
5860
-
5861
-      /* If the length of the reference is zero, just continue with the
5862
-      main loop. */
5863
-
5864
-      if (length == 0) continue;
5865
-
5866
-      /* First, ensure the minimum number of matches are present. We get back
5867
-      the length of the reference string explicitly rather than passing the
5868
-      address of eptr, so that eptr can be a register variable. */
5869
-
5870
-      for (i = 1; i <= min; i++)
5871
-        {
5872
-        if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5873
-        eptr += length;
5874
-        }
5875
-
5876
-      /* If min = max, continue at the same level without recursion.
5877
-      They are not both allowed to be zero. */
5878
-
5879
-      if (min == max) continue;
5880
-
5881
-      /* If minimizing, keep trying and advancing the pointer */
5882
-
5883
-      if (minimize)
5884
-        {
5885
-        for (i = min;; i++)
5886
-          {
5887
-          if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5888
-               MATCH_NOMATCH) return rrc;
5889
-          if (i >= max || !match_ref(offset, eptr, length, md, ims))
5890
-            return MATCH_NOMATCH;
5891
-          eptr += length;
5892
-          }
5893
-        /* Control never gets here */
5894
-        }
5895
-
5896
-      /* If maximizing, find the longest string and work backwards */
5897
-
5898
-      else
5899
-        {
5900
-        const uschar *pp = eptr;
5901
-        for (i = min; i < max; i++)
5902
-          {
5903
-          if (!match_ref(offset, eptr, length, md, ims)) break;
5904
-          eptr += length;
5905
-          }
5906
-        while (eptr >= pp)
5907
-          {
5908
-          if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5909
-               MATCH_NOMATCH) return rrc;
5910
-          eptr -= length;
5911
-          }
5912
-        return MATCH_NOMATCH;
5913
-        }
5914
-      }
5915
-    /* Control never gets here */
5916
-
5917
-
5918
-
5919
-    /* Match a bit-mapped character class, possibly repeatedly. This op code is
5920
-    used when all the characters in the class have values in the range 0-255.
5921
-    The only difference between OP_CLASS and OP_NCLASS occurs when a data
5922
-    character outside the range is encountered.
5923
-
5924
-    First, look past the end of the item to see if there is repeat information
5925
-    following. Then obey similar code to character type repeats - written out
5926
-    again for speed. */
5927
-
5928
-    case OP_NCLASS:
5929
-    case OP_CLASS:
5930
-      {
5931
-      const uschar *data = ecode + 1;  /* Save for matching */
5932
-      ecode += 33;                     /* Advance past the item */
5933
-
5934
-      switch (*ecode)
5935
-        {
5936
-        case OP_CRSTAR:
5937
-        case OP_CRMINSTAR:
5938
-        case OP_CRPLUS:
5939
-        case OP_CRMINPLUS:
5940
-        case OP_CRQUERY:
5941
-        case OP_CRMINQUERY:
5942
-        c = *ecode++ - OP_CRSTAR;
5943
-        minimize = (c & 1) != 0;
5944
-        min = rep_min[c];                 /* Pick up values from tables; */
5945
-        max = rep_max[c];                 /* zero for max => infinity */
5946
-        if (max == 0) max = INT_MAX;
5947
-        break;
5948
-
5949
-        case OP_CRRANGE:
5950
-        case OP_CRMINRANGE:
5951
-        minimize = (*ecode == OP_CRMINRANGE);
5952
-        min = GET2(ecode, 1);
5953
-        max = GET2(ecode, 3);
5954
-        if (max == 0) max = INT_MAX;
5955
-        ecode += 5;
5956
-        break;
5957
-
5958
-        default:               /* No repeat follows */
5959
-        min = max = 1;
5960
-        break;
5961
-        }
5962
-
5963
-      /* First, ensure the minimum number of matches are present. */
5964
-
4084
+                            class_optcount++;            /* for possible optimization */
4085
+                        }
4086
+                        else class_optcount = 10;      /* \d, \s etc; make sure > 1 */
4087
+                    }
4088
+                    
4089
+                    /* Check the syntax for POSIX stuff. The bits we actually handle are
4090
+                     checked during the real compile phase. */
4091
+                    
4092
+                    else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4093
+                    {
4094
+                        ptr++;
4095
+                        class_optcount = 10;    /* Make sure > 1 */
4096
+                    }
4097
+                    
4098
+                    /* Anything else just increments the possible optimization count. If
4099
+                     there are wide characters, we are going to have to use an XCLASS. */
4100
+                    
4101
+                    else
4102
+                    {
4103
+                    NON_SPECIAL_CHARACTER:
4104
+                        class_optcount++;
4105
+                        
5965 4106
 #ifdef SUPPORT_UTF8
5966
-      /* UTF-8 mode */
5967
-      if (md->utf8)
5968
-        {
5969
-        for (i = 1; i <= min; i++)
5970
-          {
5971
-          if (eptr >= md->end_subject) return MATCH_NOMATCH;
5972
-          GETCHARINC(c, eptr);
5973
-          if (c > 255)
5974
-            {
5975
-            if (op == OP_CLASS) return MATCH_NOMATCH;
5976
-            }
5977
-          else
5978
-            {
5979
-            if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5980
-            }
5981
-          }
5982
-        }
5983
-      else
4107
+                        if (utf8)
4108
+                        {
4109
+                            int ch;
4110
+                            int extra = 0;
4111
+                            GETCHARLEN(ch, ptr, extra);
4112
+                            if (ch > 127) class_optcount = 10;   /* No optimization possible */
4113
+                            if (ch > 255)
4114
+                            {
4115
+                                if (!class_utf8)
4116
+                                {
4117
+                                    class_utf8 = TRUE;
4118
+                                    length += LINK_SIZE + 1 + 1;
4119
+                                }
4120
+                                length += 2 + extra;
4121
+                                
4122
+                                /* If this wide character is preceded by '-', add an extra 2 to
4123
+                                 the length in case the previous character was < 128, because in
4124
+                                 this case the whole range will be put into the list. */
4125
+                                
4126
+                                if (ptr[-1] == '-') length += 2;
4127
+                                
4128
+                                /* Advance to the end of this character */
4129
+                                
4130
+                                ptr += extra;
4131
+                            }
4132
+                        }
5984 4133
 #endif
5985
-      /* Not UTF-8 mode */
5986
-        {
5987
-        for (i = 1; i <= min; i++)
5988
-          {
5989
-          if (eptr >= md->end_subject) return MATCH_NOMATCH;
5990
-          c = *eptr++;
5991
-          if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5992
-          }
5993
-        }
5994
-
5995
-      /* If max == min we can continue with the main loop without the
5996
-      need to recurse. */
5997
-
5998
-      if (min == max) continue;
5999
-
6000
-      /* If minimizing, keep testing the rest of the expression and advancing
6001
-      the pointer while it matches the class. */
6002
-
6003
-      if (minimize)
6004
-        {
4134
+                    }
4135
+                }
4136
+                while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4137
+                
4138
+                if (*ptr == 0)                          /* Missing terminating ']' */
4139
+                {
4140
+                    *errorptr = ERR6;
4141
+                    goto PCRE_ERROR_RETURN;
4142
+                }
4143
+                
4144
+                /* We can optimize when there was only one optimizable character. Repeats
4145
+                 for positive and negated single one-byte chars are handled by the general
4146
+                 code. Here, we handle repeats for the class opcodes. */
4147
+                
4148
+                if (class_optcount == 1) length += 3; else
4149
+                {
4150
+                    length += 33;
4151
+                    
4152
+                    /* A repeat needs either 1 or 5 bytes. */
4153
+                    
4154
+                    if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
4155
+                    {
4156
+                        ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4157
+                        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4158
+                        if ((min == 0 && (max == 1 || max == -1)) ||
4159
+                            (min == 1 && max == -1))
4160
+                            length++;
4161
+                        else length += 5;
4162
+                        if (ptr[1] == '?') ptr++;
4163
+                    }
4164
+                }
4165
+                continue;
4166
+                
4167
+                /* Brackets may be genuine groups or special things */
4168
+                
4169
+            case '(':
4170
+                branch_newextra = 0;
4171
+                bracket_length = 1 + LINK_SIZE;
4172
+                
4173
+                /* Handle special forms of bracket, which all start (? */
4174
+                
4175
+                if (ptr[1] == '?')
4176
+                {
4177
+                    int set, unset;
4178
+                    int *optset;
4179
+                    
4180
+                    switch (c = ptr[2])
4181
+                    {
4182
+                            /* Skip over comments entirely */
4183
+                        case '#':
4184
+                            ptr += 3;
4185
+                            while (*ptr != 0 && *ptr != ')') ptr++;
4186
+                            if (*ptr == 0)
4187
+                            {
4188
+                                *errorptr = ERR18;
4189
+                                goto PCRE_ERROR_RETURN;
4190
+                            }
4191
+                            continue;
4192
+                            
4193
+                            /* Non-referencing groups and lookaheads just move the pointer on, and
4194
+                             then behave like a non-special bracket, except that they don't increment
4195
+                             the count of extracting brackets. Ditto for the "once only" bracket,
4196
+                             which is in Perl from version 5.005. */
4197
+                            
4198
+                        case ':':
4199
+                        case '=':
4200
+                        case '!':
4201
+                        case '>':
4202
+                            ptr += 2;
4203
+                            break;
4204
+                            
4205
+                            /* (?R) specifies a recursive call to the regex, which is an extension
4206
+                             to provide the facility which can be obtained by (?p{perl-code}) in
4207
+                             Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4208
+                             
4209
+                             From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4210
+                             the appropriate numbered brackets. This includes both recursive and
4211
+                             non-recursive calls. (?R) is now synonymous with (?0). */
4212
+                            
4213
+                        case 'R':
4214
+                            ptr++;
4215
+                            
4216
+                        case '0': case '1': case '2': case '3': case '4':
4217
+                        case '5': case '6': case '7': case '8': case '9':
4218
+                            ptr += 2;
4219
+                            if (c != 'R')
4220
+                                while ((digitab[*(++ptr)] & ctype_digit) != 0);
4221
+                            if (*ptr != ')')
4222
+                            {
4223
+                                *errorptr = ERR29;
4224
+                                goto PCRE_ERROR_RETURN;
4225
+                            }
4226
+                            length += 1 + LINK_SIZE;
4227
+                            
4228
+                            /* If this item is quantified, it will get wrapped inside brackets so
4229
+                             as to use the code for quantified brackets. We jump down and use the
4230
+                             code that handles this for real brackets. */
4231
+                            
4232
+                            if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4233
+                            {
4234
+                                length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
4235
+                                duplength = 5 + 3 * LINK_SIZE;
4236
+                                goto HANDLE_QUANTIFIED_BRACKETS;
4237
+                            }
4238
+                            continue;
4239
+                            
4240
+                            /* (?C) is an extension which provides "callout" - to provide a bit of
4241
+                             the functionality of the Perl (?{...}) feature. An optional number may
4242
+                             follow (default is zero). */
4243
+                            
4244
+                        case 'C':
4245
+                            ptr += 2;
4246
+                            while ((digitab[*(++ptr)] & ctype_digit) != 0);
4247
+                            if (*ptr != ')')
4248
+                            {
4249
+                                *errorptr = ERR39;
4250
+                                goto PCRE_ERROR_RETURN;
4251
+                            }
4252
+                            length += 2;
4253
+                            continue;
4254
+                            
4255
+                            /* Named subpatterns are an extension copied from Python */
4256
+                            
4257
+                        case 'P':
4258
+                            ptr += 3;
4259
+                            if (*ptr == '<')
4260
+                            {
4261
+                                const uschar *p;    /* Don't amalgamate; some compilers */
4262
+                                p = ++ptr;          /* grumble at autoincrement in declaration */
4263
+                                while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4264
+                                if (*ptr != '>')
4265
+                                {
4266
+                                    *errorptr = ERR42;
4267
+                                    goto PCRE_ERROR_RETURN;
4268
+                                }
4269
+                                name_count++;
4270
+                                if (ptr - p > max_name_size) max_name_size = (ptr - p);
4271
+                                break;
4272
+                            }
4273
+                            
4274
+                            if (*ptr == '=' || *ptr == '>')
4275
+                            {
4276
+                                while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4277
+                                if (*ptr != ')')
4278
+                                {
4279
+                                    *errorptr = ERR42;
4280
+                                    goto PCRE_ERROR_RETURN;
4281
+                                }
4282
+                                break;
4283
+                            }
4284
+                            
4285
+                            /* Unknown character after (?P */
4286
+                            
4287
+                            *errorptr = ERR41;
4288
+                            goto PCRE_ERROR_RETURN;
4289
+                            
4290
+                            /* Lookbehinds are in Perl from version 5.005 */
4291
+                            
4292
+                        case '<':
4293
+                            ptr += 3;
4294
+                            if (*ptr == '=' || *ptr == '!')
4295
+                            {
4296
+                                branch_newextra = 1 + LINK_SIZE;
4297
+                                length += 1 + LINK_SIZE;         /* For the first branch */
4298
+                                break;
4299
+                            }
4300
+                            *errorptr = ERR24;
4301
+                            goto PCRE_ERROR_RETURN;
4302
+                            
4303
+                            /* Conditionals are in Perl from version 5.005. The bracket must either
4304
+                             be followed by a number (for bracket reference) or by an assertion
4305
+                             group, or (a PCRE extension) by 'R' for a recursion test. */
4306
+                            
4307
+                        case '(':
4308
+                            if (ptr[3] == 'R' && ptr[4] == ')')
4309
+                            {
4310
+                                ptr += 4;
4311
+                                length += 3;
4312
+                            }
4313
+                            else if ((digitab[ptr[3]] & ctype_digit) != 0)
4314
+                            {
4315
+                                ptr += 4;
4316
+                                length += 3;
4317
+                                while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4318
+                                if (*ptr != ')')
4319
+                                {
4320
+                                    *errorptr = ERR26;
4321
+                                    goto PCRE_ERROR_RETURN;
4322
+                                }
4323
+                            }
4324
+                            else   /* An assertion must follow */
4325
+                            {
4326
+                                ptr++;   /* Can treat like ':' as far as spacing is concerned */
4327
+                                if (ptr[2] != '?' ||
4328
+                                    (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4329
+                                {
4330
+                                    ptr += 2;    /* To get right offset in message */
4331
+                                    *errorptr = ERR28;
4332
+                                    goto PCRE_ERROR_RETURN;
4333
+                                }
4334
+                            }
4335
+                            break;
4336
+                            
4337
+                            /* Else loop checking valid options until ) is met. Anything else is an
4338
+                             error. If we are without any brackets, i.e. at top level, the settings
4339
+                             act as if specified in the options, so massage the options immediately.
4340
+                             This is for backward compatibility with Perl 5.004. */
4341
+                            
4342
+                        default:
4343
+                            set = unset = 0;
4344
+                            optset = &set;
4345
+                            ptr += 2;
4346
+                            
4347
+                            for (;; ptr++)
4348
+                            {
4349
+                                c = *ptr;
4350
+                                switch (c)
4351
+                                {
4352
+                                    case 'i':
4353
+                                        *optset |= PCRE_CASELESS;
4354
+                                        continue;
4355
+                                        
4356
+                                    case 'm':
4357
+                                        *optset |= PCRE_MULTILINE;
4358
+                                        continue;
4359
+                                        
4360
+                                    case 's':
4361
+                                        *optset |= PCRE_DOTALL;
4362
+                                        continue;
4363
+                                        
4364
+                                    case 'x':
4365
+                                        *optset |= PCRE_EXTENDED;
4366
+                                        continue;
4367
+                                        
4368
+                                    case 'X':
4369
+                                        *optset |= PCRE_EXTRA;
4370
+                                        continue;
4371
+                                        
4372
+                                    case 'U':
4373
+                                        *optset |= PCRE_UNGREEDY;
4374
+                                        continue;
4375
+                                        
4376
+                                    case '-':
4377
+                                        optset = &unset;
4378
+                                        continue;
4379
+                                        
4380
+                                        /* A termination by ')' indicates an options-setting-only item; if
4381
+                                         this is at the very start of the pattern (indicated by item_count
4382
+                                         being zero), we use it to set the global options. This is helpful
4383
+                                         when analyzing the pattern for first characters, etc. Otherwise
4384
+                                         nothing is done here and it is handled during the compiling
4385
+                                         process.
4386
+                                         
4387
+                                         [Historical note: Up to Perl 5.8, options settings at top level
4388
+                                         were always global settings, wherever they appeared in the pattern.
4389
+                                         That is, they were equivalent to an external setting. From 5.8
4390
+                                         onwards, they apply only to what follows (which is what you might
4391
+                                         expect).] */
4392
+                                        
4393
+                                    case ')':
4394
+                                        if (item_count == 0)
4395
+                                        {
4396
+                                            options = (options | set) & (~unset);
4397
+                                            set = unset = 0;     /* To save length */
4398
+                                            item_count--;        /* To allow for several */
4399
+                                        }
4400
+                                        
4401
+                                        /* Fall through */
4402
+                                        
4403
+                                        /* A termination by ':' indicates the start of a nested group with
4404
+                                         the given options set. This is again handled at compile time, but
4405
+                                         we must allow for compiled space if any of the ims options are
4406
+                                         set. We also have to allow for resetting space at the end of
4407
+                                         the group, which is why 4 is added to the length and not just 2.
4408
+                                         If there are several changes of options within the same group, this
4409
+                                         will lead to an over-estimate on the length, but this shouldn't
4410
+                                         matter very much. We also have to allow for resetting options at
4411
+                                         the start of any alternations, which we do by setting
4412
+                                         branch_newextra to 2. Finally, we record whether the case-dependent
4413
+                                         flag ever changes within the regex. This is used by the "required
4414
+                                         character" code. */
4415
+                                        
4416
+                                    case ':':
4417
+                                        if (((set|unset) & PCRE_IMS) != 0)
4418
+                                        {
4419
+                                            length += 4;
4420
+                                            branch_newextra = 2;
4421
+                                            if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4422
+                                        }
4423
+                                        goto END_OPTIONS;
4424
+                                        
4425
+                                        /* Unrecognized option character */
4426
+                                        
4427
+                                    default:
4428
+                                        *errorptr = ERR12;
4429
+                                        goto PCRE_ERROR_RETURN;
4430
+                                }
4431
+                            }
4432
+                            
4433
+                            /* If we hit a closing bracket, that's it - this is a freestanding
4434
+                             option-setting. We need to ensure that branch_extra is updated if
4435
+                             necessary. The only values branch_newextra can have here are 0 or 2.
4436
+                             If the value is 2, then branch_extra must either be 2 or 5, depending
4437
+                             on whether this is a lookbehind group or not. */
4438
+                            
4439
+                        END_OPTIONS:
4440
+                            if (c == ')')
4441
+                            {
4442
+                                if (branch_newextra == 2 &&
4443
+                                    (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4444
+                                    branch_extra += branch_newextra;
4445
+                                continue;
4446
+                            }
4447
+                            
4448
+                            /* If options were terminated by ':' control comes here. Fall through
4449
+                             to handle the group below. */
4450
+                    }
4451
+                }
4452
+                
4453
+                /* Extracting brackets must be counted so we can process escapes in a
4454
+                 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
4455
+                 need an additional 3 bytes of store per extracting bracket. However, if
4456
+                 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
4457
+                 must leave the count alone (it will always be zero). */
4458
+                
4459
+                else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
4460
+                {
4461
+                    bracount++;
4462
+                    if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4463
+                }
4464
+                
4465
+                /* Save length for computing whole length at end if there's a repeat that
4466
+                 requires duplication of the group. Also save the current value of
4467
+                 branch_extra, and start the new group with the new value. If non-zero, this
4468
+                 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
4469
+                
4470
+                if (brastackptr >= sizeof(brastack)/sizeof(int))
4471
+                {
4472
+                    *errorptr = ERR19;
4473
+                    goto PCRE_ERROR_RETURN;
4474
+                }
4475
+                
4476
+                bralenstack[brastackptr] = branch_extra;
4477
+                branch_extra = branch_newextra;
4478
+                
4479
+                brastack[brastackptr++] = length;
4480
+                length += bracket_length;
4481
+                continue;
4482
+                
4483
+                /* Handle ket. Look for subsequent max/min; for certain sets of values we
4484
+                 have to replicate this bracket up to that many times. If brastackptr is
4485
+                 0 this is an unmatched bracket which will generate an error, but take care
4486
+                 not to try to access brastack[-1] when computing the length and restoring
4487
+                 the branch_extra value. */
4488
+                
4489
+            case ')':
4490
+                length += 1 + LINK_SIZE;
4491
+                if (brastackptr > 0)
4492
+                {
4493
+                    duplength = length - brastack[--brastackptr];
4494
+                    branch_extra = bralenstack[brastackptr];
4495
+                }
4496
+                else duplength = 0;
4497
+                
4498
+                /* The following code is also used when a recursion such as (?3) is
4499
+                 followed by a quantifier, because in that case, it has to be wrapped inside
4500
+                 brackets so that the quantifier works. The value of duplength must be
4501
+                 set before arrival. */
4502
+                
4503
+            HANDLE_QUANTIFIED_BRACKETS:
4504
+                
4505
+                /* Leave ptr at the final char; for read_repeat_counts this happens
4506
+                 automatically; for the others we need an increment. */
4507
+                
4508
+                if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
4509
+                {
4510
+                    ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
4511
+                    if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4512
+                }
4513
+                else if (c == '*') { min = 0; max = -1; ptr++; }
4514
+                else if (c == '+') { min = 1; max = -1; ptr++; }
4515
+                else if (c == '?') { min = 0; max = 1;  ptr++; }
4516
+                else { min = 1; max = 1; }
4517
+                
4518
+                /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4519
+                 group, and if the maximum is greater than zero, we have to replicate
4520
+                 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4521
+                 bracket set. */
4522
+                
4523
+                if (min == 0)
4524
+                {
4525
+                    length++;
4526
+                    if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4527
+                }
4528
+                
4529
+                /* When the minimum is greater than zero, we have to replicate up to
4530
+                 minval-1 times, with no additions required in the copies. Then, if there
4531
+                 is a limited maximum we have to replicate up to maxval-1 times allowing
4532
+                 for a BRAZERO item before each optional copy and nesting brackets for all
4533
+                 but one of the optional copies. */
4534
+                
4535
+                else
4536
+                {
4537
+                    length += (min - 1) * duplength;
4538
+                    if (max > min)   /* Need this test as max=-1 means no limit */
4539
+                        length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4540
+                        - (2 + 2*LINK_SIZE);
4541
+                }
4542
+                
4543
+                /* Allow space for once brackets for "possessive quantifier" */
4544
+                
4545
+                if (ptr[1] == '+')
4546
+                {
4547
+                    ptr++;
4548
+                    length += 2 + 2*LINK_SIZE;
4549
+                }
4550
+                continue;
4551
+                
4552
+                /* Non-special character. For a run of such characters the length required
4553
+                 is the number of characters + 2, except that the maximum run length is
4554
+                 MAXLIT. We won't get a skipped space or a non-data escape or the start of a
4555
+                 # comment as the first character, so the length can't be zero. */
4556
+                
4557
+            NORMAL_CHAR:
4558
+            default:
4559
+                length += 2;
4560
+                runlength = 0;
4561
+                do
4562
+                {
6005 4563
 #ifdef SUPPORT_UTF8
6006
-        /* UTF-8 mode */
6007
-        if (md->utf8)
6008
-          {
6009
-          for (i = min;; i++)
6010
-            {
6011
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6012
-                 MATCH_NOMATCH) return rrc;
6013
-            if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6014
-            GETCHARINC(c, eptr);
6015
-            if (c > 255)
6016
-              {
6017
-              if (op == OP_CLASS) return MATCH_NOMATCH;
6018
-              }
6019
-            else
6020
-              {
6021
-              if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6022
-              }
6023
-            }
6024
-          }
6025
-        else
4564
+                    lastcharlength = 1;     /* Need length of last char for UTF-8 repeats */
6026 4565
 #endif
6027
-        /* Not UTF-8 mode */
6028
-          {
6029
-          for (i = min;; i++)
6030
-            {
6031
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6032
-                 MATCH_NOMATCH) return rrc;
6033
-            if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6034
-            c = *eptr++;
6035
-            if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6036
-            }
6037
-          }
6038
-        /* Control never gets here */
6039
-        }
6040
-
6041
-      /* If maximizing, find the longest possible run, then work backwards. */
6042
-
6043
-      else
6044
-        {
6045
-        const uschar *pp = eptr;
6046
-
4566
+                    
4567
+                    /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
4568
+                    if (inescq)
4569
+                    {
4570
+                        if (c == '\\' && ptr[1] == 'E')
4571
+                        {
4572
+                            inescq = FALSE;
4573
+                            ptr++;
4574
+                        }
4575
+                        else runlength++;
4576
+                        continue;
4577
+                    }
4578
+                    
4579
+                    /* Skip whitespace and comments for /x */
4580
+                    
4581
+                    if ((options & PCRE_EXTENDED) != 0)
4582
+                    {
4583
+                        if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4584
+                        if (c == '#')
4585
+                        {
4586
+                            /* The space before the ; is to avoid a warning on a silly compiler
4587
+                             on the Macintosh. */
4588
+                            while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4589
+                            continue;
4590
+                        }
4591
+                    }
4592
+                    
4593
+                    /* Backslash may introduce a data char or a metacharacter; stop the
4594
+                     string before the latter. */
4595
+                    
4596
+                    if (c == '\\')
4597
+                    {
4598
+                        const uschar *saveptr = ptr;
4599
+                        c = check_escape(&ptr, errorptr, bracount, options, FALSE,
4600
+                                         &compile_block);
4601
+                        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4602
+                        if (c < 0) { ptr = saveptr; break; }
4603
+                        
4604
+                        /* In UTF-8 mode, add on the number of additional bytes needed to
4605
+                         encode this character, and save the total length in case this is a
4606
+                         final char that is repeated. */
4607
+                        
6047 4608
 #ifdef SUPPORT_UTF8
6048
-        /* UTF-8 mode */
6049
-        if (md->utf8)
6050
-          {
6051
-          for (i = min; i < max; i++)
6052
-            {
6053
-            int len = 1;
6054
-            if (eptr >= md->end_subject) break;
6055
-            GETCHARLEN(c, eptr, len);
6056
-            if (c > 255)
6057
-              {
6058
-              if (op == OP_CLASS) break;
6059
-              }
6060
-            else
6061
-              {
6062
-              if ((data[c/8] & (1 << (c&7))) == 0) break;
6063
-              }
6064
-            eptr += len;
6065
-            }
6066
-          for (;;)
6067
-            {
6068
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6069
-                 MATCH_NOMATCH) return rrc;
6070
-            if (eptr-- == pp) break;        /* Stop if tried at original pos */
6071
-            BACKCHAR(eptr);
6072
-            }
6073
-          }
6074
-        else
4609
+                        if (utf8 && c > 127)
4610
+                        {
4611
+                            int i;
4612
+                            for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4613
+                                if (c <= utf8_table1[i]) break;
4614
+                            runlength += i;
4615
+                            lastcharlength += i;
4616
+                        }
6075 4617
 #endif
6076
-          /* Not UTF-8 mode */
6077
-          {
6078
-          for (i = min; i < max; i++)
6079
-            {
6080
-            if (eptr >= md->end_subject) break;
6081
-            c = *eptr;
6082
-            if ((data[c/8] & (1 << (c&7))) == 0) break;
6083
-            eptr++;
6084
-            }
6085
-          while (eptr >= pp)
6086
-            {
6087
-            if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6088
-                 MATCH_NOMATCH) return rrc;
6089
-            }
6090
-          }
6091
-
6092
-        return MATCH_NOMATCH;
6093
-        }
6094
-      }
6095
-    /* Control never gets here */
6096
-
6097
-
6098
-    /* Match an extended character class. This opcode is encountered only
6099
-    in UTF-8 mode, because that's the only time it is compiled. */
6100
-
4618
+                    }
4619
+                    
4620
+                    /* Ordinary character or single-char escape */
4621
+                    
4622
+                    runlength++;
4623
+                }
4624
+                
4625
+                /* This "while" is the end of the "do" above. */
4626
+                
4627
+                while (runlength < MAXLIT &&
4628
+                       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
4629
+                
4630
+                /* If we hit a meta-character, back off to point to it */
4631
+                
4632
+                if (runlength < MAXLIT) ptr--;
4633
+                
4634
+                /* If the last char in the string is a UTF-8 multibyte character, we must
4635
+                 set lastcharlength correctly. If it was specified as an escape, this will
4636
+                 already have been done above. However, we also have to support in-line
4637
+                 UTF-8 characters, so check backwards from where we are. */
4638
+                
6101 4639
 #ifdef SUPPORT_UTF8
6102
-    case OP_XCLASS:
6103
-      {
6104
-      const uschar *data = ecode + 1 + LINK_SIZE;  /* Save for matching */
6105
-      ecode += GET(ecode, 1);                      /* Advance past the item */
6106
-
6107
-      switch (*ecode)
6108
-        {
6109
-        case OP_CRSTAR:
6110
-        case OP_CRMINSTAR:
6111
-        case OP_CRPLUS:
6112
-        case OP_CRMINPLUS:
6113
-        case OP_CRQUERY:
6114
-        case OP_CRMINQUERY:
6115
-        c = *ecode++ - OP_CRSTAR;
6116
-        minimize = (c & 1) != 0;
6117
-        min = rep_min[c];                 /* Pick up values from tables; */
6118
-        max = rep_max[c];                 /* zero for max => infinity */
6119
-        if (max == 0) max = INT_MAX;
6120
-        break;
6121
-
6122
-        case OP_CRRANGE:
6123
-        case OP_CRMINRANGE:
6124
-        minimize = (*ecode == OP_CRMINRANGE);
6125
-        min = GET2(ecode, 1);
6126
-        max = GET2(ecode, 3);
6127
-        if (max == 0) max = INT_MAX;
6128
-        ecode += 5;
6129
-        break;
6130
-
6131
-        default:               /* No repeat follows */
6132
-        min = max = 1;
6133
-        break;
6134
-        }
6135
-
6136
-      /* First, ensure the minimum number of matches are present. */
6137
-
6138
-      for (i = 1; i <= min; i++)
6139
-        {
6140
-        if (eptr >= md->end_subject) return MATCH_NOMATCH;
6141
-        GETCHARINC(c, eptr);
6142
-        if (!match_xclass(c, data)) return MATCH_NOMATCH;
4640
+                if (utf8)
4641
+                {
4642
+                    const uschar *lastptr = ptr - 1;
4643
+                    if ((*lastptr & 0x80) != 0)
4644
+                    {
4645
+                        while((*lastptr & 0xc0) == 0x80) lastptr--;
4646
+                        lastcharlength = ptr - lastptr;
4647
+                    }
4648
+                }
4649
+#endif
4650
+                
4651
+                length += runlength;
4652
+                continue;
6143 4653
         }
6144
-
6145
-      /* If max == min we can continue with the main loop without the
6146
-      need to recurse. */
6147
-
6148
-      if (min == max) continue;
6149
-
6150
-      /* If minimizing, keep testing the rest of the expression and advancing
6151
-      the pointer while it matches the class. */
6152
-
6153
-      if (minimize)
4654
+    }
4655
+    
4656
+    length += 2 + LINK_SIZE;    /* For final KET and END */
4657
+    
4658
+    if (length > MAX_PATTERN_SIZE)
4659
+    {
4660
+        *errorptr = ERR20;
4661
+        return NULL;
4662
+    }
4663
+    
4664
+    /* Compute the size of data block needed and get it, either from malloc or
4665
+     externally provided function. */
4666
+    
4667
+    size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4668
+    re = (real_pcre *)(pcre_malloc)(size);
4669
+    
4670
+    if (re == NULL)
4671
+    {
4672
+        *errorptr = ERR21;
4673
+        return NULL;
4674
+    }
4675
+    
4676
+    /* Put in the magic number, and save the size, options, and table pointer */
4677
+    
4678
+    re->magic_number = MAGIC_NUMBER;
4679
+    re->size = size;
4680
+    re->options = options;
4681
+    re->tables = tables;
4682
+    re->name_entry_size = max_name_size + 3;
4683
+    re->name_count = name_count;
4684
+    
4685
+    /* The starting points of the name/number translation table and of the code are
4686
+     passed around in the compile data block. */
4687
+    
4688
+    compile_block.names_found = 0;
4689
+    compile_block.name_entry_size = max_name_size + 3;
4690
+    compile_block.name_table = (uschar *)re + sizeof(real_pcre);
4691
+    codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4692
+    compile_block.start_code = codestart;
4693
+    compile_block.req_varyopt = 0;
4694
+    
4695
+    /* Set up a starting, non-extracting bracket, then compile the expression. On
4696
+     error, *errorptr will be set non-NULL, so we don't need to look at the result
4697
+     of the function here. */
4698
+    
4699
+    ptr = (const uschar *)pattern;
4700
+    code = (uschar *)codestart;
4701
+    *code = OP_BRA;
4702
+    bracount = 0;
4703
+    (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4704
+                        errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4705
+    re->top_bracket = bracount;
4706
+    re->top_backref = compile_block.top_backref;
4707
+    
4708
+    /* If not reached end of pattern on success, there's an excess bracket. */
4709
+    
4710
+    if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
4711
+    
4712
+    /* Fill in the terminating state and check for disastrous overflow, but
4713
+     if debugging, leave the test till after things are printed out. */
4714
+    
4715
+    *code++ = OP_END;
4716
+    
4717
+#ifndef DEBUG
4718
+    if (code - codestart > length) *errorptr = ERR23;
4719
+#endif
4720
+    
4721
+    /* Give an error if there's a back reference to a non-existent capturing
4722
+     subpattern. */
4723
+    
4724
+    if (re->top_backref > re->top_bracket) *errorptr = ERR15;
4725
+    
4726
+    /* Failed to compile, or error while post-processing */
4727
+    
4728
+    if (*errorptr != NULL)
4729
+    {
4730
+        (pcre_free)(re);
4731
+    PCRE_ERROR_RETURN:
4732
+        *erroroffset = ptr - (const uschar *)pattern;
4733
+        return NULL;
4734
+    }
4735
+    
4736
+    /* If the anchored option was not passed, set the flag if we can determine that
4737
+     the pattern is anchored by virtue of ^ characters or \A or anything else (such
4738
+     as starting with .* when DOTALL is set).
4739
+     
4740
+     Otherwise, if we know what the first character has to be, save it, because that
4741
+     speeds up unanchored matches no end. If not, see if we can set the
4742
+     PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4743
+     start with ^. and also when all branches start with .* for non-DOTALL matches.
4744
+     */
4745
+    
4746
+    if ((options & PCRE_ANCHORED) == 0)
4747
+    {
4748
+        int temp_options = options;
4749
+        if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4750
+            re->options |= PCRE_ANCHORED;
4751
+        else
6154 4752
         {
6155
-        for (i = min;; i++)
6156
-          {
6157
-          if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6158
-               MATCH_NOMATCH) return rrc;
6159
-          if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6160
-          GETCHARINC(c, eptr);
6161
-          if (!match_xclass(c, data)) return MATCH_NOMATCH;
6162
-          }
6163
-        /* Control never gets here */
4753
+            if (firstbyte < 0)
4754
+                firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4755
+            if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
4756
+            {
4757
+                int ch = firstbyte & 255;
4758
+                re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4759
+                                  compile_block.fcc[ch] == ch)? ch : firstbyte;
4760
+                re->options |= PCRE_FIRSTSET;
4761
+            }
4762
+            else if (is_startline(codestart, 0, compile_block.backref_map))
4763
+                re->options |= PCRE_STARTLINE;
6164 4764
         }
4765
+    }
4766
+    
4767
+    /* For an anchored pattern, we use the "required byte" only if it follows a
4768
+     variable length item in the regex. Remove the caseless flag for non-caseable
4769
+     chars. */
4770
+    
4771
+    if (reqbyte >= 0 &&
4772
+        ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4773
+    {
4774
+        int ch = reqbyte & 255;
4775
+        re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4776
+                        compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
4777
+        re->options |= PCRE_REQCHSET;
4778
+    }
4779
+    
4780
+    /* Print out the compiled data for debugging */
4781
+    
4782
+#ifdef DEBUG
4783
+    
4784
+    printf("Length = %d top_bracket = %d top_backref = %d\n",
4785
+           length, re->top_bracket, re->top_backref);
4786
+    
4787
+    if (re->options != 0)
4788
+    {
4789
+        printf("%s%s%s%s%s%s%s%s%s\n",
4790
+               ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
4791
+               ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
4792
+               ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
4793
+               ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
4794
+               ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
4795
+               ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
4796
+               ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
4797
+               ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
4798
+               ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
4799
+    }
4800
+    
4801
+    if ((re->options & PCRE_FIRSTSET) != 0)
4802
+    {
4803
+        int ch = re->first_byte & 255;
4804
+        char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4805
+        if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
4806
+        else printf("First char = \\x%02x%s\n", ch, caseless);
4807
+    }
4808
+    
4809
+    if ((re->options & PCRE_REQCHSET) != 0)
4810
+    {
4811
+        int ch = re->req_byte & 255;
4812
+        char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
4813
+        if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
4814
+        else printf("Req char = \\x%02x%s\n", ch, caseless);
4815
+    }
4816
+    
4817
+    print_internals(re, stdout);
4818
+    
4819
+    /* This check is done here in the debugging case so that the code that
4820
+     was compiled can be seen. */
4821
+    
4822
+    if (code - codestart > length)
4823
+    {
4824
+        *errorptr = ERR23;
4825
+        (pcre_free)(re);
4826
+        *erroroffset = ptr - (uschar *)pattern;
4827
+        return NULL;
4828
+    }
4829
+#endif
4830
+    
4831
+    return (pcre *)re;
4832
+}
6165 4833
 
6166
-      /* If maximizing, find the longest possible run, then work backwards. */
6167
-
6168
-      else
6169
-        {
6170
-        const uschar *pp = eptr;
6171
-        for (i = min; i < max; i++)
6172
-          {
6173
-          int len = 1;
6174
-          if (eptr >= md->end_subject) break;
6175
-          GETCHARLEN(c, eptr, len);
6176
-          if (!match_xclass(c, data)) break;
6177
-          eptr += len;
6178
-          }
6179
-        for(;;)
6180
-          {
6181
-          if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6182
-               MATCH_NOMATCH) return rrc;
6183
-          if (eptr-- == pp) break;        /* Stop if tried at original pos */
6184
-          BACKCHAR(eptr)
6185
-          }
6186
-        return MATCH_NOMATCH;
6187
-        }
6188 4834
 
6189
-      /* Control never gets here */
6190
-      }
6191
-#endif    /* End of XCLASS */
6192 4835
 
6193
-    /* Match a run of characters */
4836
+/*************************************************
4837
+ *          Match a back-reference                *
4838
+ *************************************************/
6194 4839
 
6195
-    case OP_CHARS:
6196
-      {
6197
-      register int length = ecode[1];
6198
-      ecode += 2;
4840
+/* If a back reference hasn't been set, the length that is passed is greater
4841
+ than the number of characters left in the string, so the match fails.
4842
+ 
4843
+ Arguments:
4844
+ offset      index into the offset vector
4845
+ eptr        points into the subject
4846
+ length      length to be matched
4847
+ md          points to match data block
4848
+ ims         the ims flags
4849
+ 
4850
+ Returns:      TRUE if matched
4851
+ */
6199 4852
 
6200
-#ifdef DEBUG    /* Sigh. Some compilers never learn. */
6201
-      if (eptr >= md->end_subject)
6202
-        printf("matching subject <null> against pattern ");
6203
-      else
6204
-        {
4853
+static BOOL
4854
+match_ref(int offset, register const uschar *eptr, int length, match_data *md,
4855
+          unsigned long int ims)
4856
+{
4857
+    const uschar *p = md->start_subject + md->offset_vector[offset];
4858
+    
4859
+#ifdef DEBUG
4860
+    if (eptr >= md->end_subject)
4861
+        printf("matching subject <null>");
4862
+    else
4863
+    {
6205 4864
         printf("matching subject ");
6206 4865
         pchars(eptr, length, TRUE, md);
6207
-        printf(" against pattern ");
6208
-        }
6209
-      pchars(ecode, length, FALSE, md);
6210
-      printf("\n");
4866
+    }
4867
+    printf(" against backref ");
4868
+    pchars(p, length, FALSE, md);
4869
+    printf("\n");
6211 4870
 #endif
6212
-
6213
-      if (length > md->end_subject - eptr) return MATCH_NOMATCH;
6214
-      if ((ims & PCRE_CASELESS) != 0)
6215
-        {
4871
+    
4872
+    /* Always fail if not enough characters left */
4873
+    
4874
+    if (length > md->end_subject - eptr) return FALSE;
4875
+    
4876
+    /* Separate the caseless case for speed */
4877
+    
4878
+    if ((ims & PCRE_CASELESS) != 0)
4879
+    {
6216 4880
         while (length-- > 0)
6217
-          if (md->lcc[*ecode++] != md->lcc[*eptr++])
6218
-            return MATCH_NOMATCH;
6219
-        }
6220
-      else
6221
-        {
6222
-        while (length-- > 0) if (*ecode++ != *eptr++) return MATCH_NOMATCH;
6223
-        }
6224
-      }
6225
-    break;
6226
-
6227
-    /* Match a single character repeatedly; different opcodes share code. */
6228
-
6229
-    case OP_EXACT:
6230
-    min = max = GET2(ecode, 1);
6231
-    ecode += 3;
6232
-    goto REPEATCHAR;
6233
-
6234
-    case OP_UPTO:
6235
-    case OP_MINUPTO:
6236
-    min = 0;
6237
-    max = GET2(ecode, 1);
6238
-    minimize = *ecode == OP_MINUPTO;
6239
-    ecode += 3;
6240
-    goto REPEATCHAR;
6241
-
6242
-    case OP_STAR:
6243
-    case OP_MINSTAR:
6244
-    case OP_PLUS:
6245
-    case OP_MINPLUS:
6246
-    case OP_QUERY:
6247
-    case OP_MINQUERY:
6248
-    c = *ecode++ - OP_STAR;
6249
-    minimize = (c & 1) != 0;
6250
-    min = rep_min[c];                 /* Pick up values from tables; */
6251
-    max = rep_max[c];                 /* zero for max => infinity */
6252
-    if (max == 0) max = INT_MAX;
6253
-
6254
-    /* Common code for all repeated single-character matches. We can give
6255
-    up quickly if there are fewer than the minimum number of characters left in
6256
-    the subject. */
6257
-
6258
-    REPEATCHAR:
6259
-#ifdef SUPPORT_UTF8
6260
-    if (md->utf8)
6261
-      {
6262
-      int len = 1;
6263
-      const uschar *charptr = ecode;
6264
-      GETCHARLEN(c, ecode, len);
6265
-      if (min * len > md->end_subject - eptr) return MATCH_NOMATCH;
6266
-      ecode += len;
6267
-
6268
-      /* Handle multibyte character matching specially here. There is no
6269
-      support for any kind of casing for multibyte characters. */
6270
-
6271
-      if (len > 1)
6272
-        {
6273
-        for (i = 1; i <= min; i++)
6274
-          {
6275
-          if (memcmp(eptr, charptr, len) != 0) return MATCH_NOMATCH;
6276
-          eptr += len;
6277
-          }
6278
-
6279
-        if (min == max) continue;
6280
-
6281
-        if (minimize)
6282
-          {
6283
-          for (i = min;; i++)
6284
-            {
6285
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6286
-                 MATCH_NOMATCH) return rrc;
6287
-            if (i >= max ||
6288
-                eptr >= md->end_subject ||
6289
-                memcmp(eptr, charptr, len) != 0)
6290
-              return MATCH_NOMATCH;
6291
-            eptr += len;
6292
-            }
6293
-          /* Control never gets here */
6294
-          }
6295
-        else
6296
-          {
6297
-          const uschar *pp = eptr;
6298
-          for (i = min; i < max; i++)
6299
-            {
6300
-            if (eptr > md->end_subject - len ||
6301
-                memcmp(eptr, charptr, len) != 0)
6302
-              break;
6303
-            eptr += len;
6304
-            }
6305
-          while (eptr >= pp)
6306
-           {
6307
-           if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6308
-                MATCH_NOMATCH) return rrc;
6309
-           eptr -= len;
6310
-           }
6311
-          return MATCH_NOMATCH;
6312
-          }
6313
-        /* Control never gets here */
6314
-        }
6315
-
6316
-      /* If the length of a UTF-8 character is 1, we fall through here, and
6317
-      obey the code as for non-UTF-8 characters below, though in this case the
6318
-      value of c will always be < 128. */
6319
-      }
4881
+            if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
4882
+    }
6320 4883
     else
6321
-#endif
6322
-
6323
-    /* When not in UTF-8 mode, load a single-byte character. */
6324
-      {
6325
-      if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6326
-      c = *ecode++;
6327
-      }
6328
-
6329
-    /* The value of c at this point is always less than 256, though we may or
6330
-    may not be in UTF-8 mode. The code is duplicated for the caseless and
6331
-    caseful cases, for speed, since matching characters is likely to be quite
6332
-    common. First, ensure the minimum number of matches are present. If min =
6333
-    max, continue at the same level without recursing. Otherwise, if
6334
-    minimizing, keep trying the rest of the expression and advancing one
6335
-    matching character if failing, up to the maximum. Alternatively, if
6336
-    maximizing, find the maximum number of characters and work backwards. */
4884
+    { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
4885
+    
4886
+    return TRUE;
4887
+}
6337 4888
 
6338
-    DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
6339
-      max, eptr));
6340 4889
 
6341
-    if ((ims & PCRE_CASELESS) != 0)
6342
-      {
6343
-      c = md->lcc[c];
6344
-      for (i = 1; i <= min; i++)
6345
-        if (c != md->lcc[*eptr++]) return MATCH_NOMATCH;
6346
-      if (min == max) continue;
6347
-      if (minimize)
6348
-        {
6349
-        for (i = min;; i++)
6350
-          {
6351
-          if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6352
-               MATCH_NOMATCH) return rrc;
6353
-          if (i >= max || eptr >= md->end_subject ||
6354
-              c != md->lcc[*eptr++])
6355
-            return MATCH_NOMATCH;
6356
-          }
6357
-        /* Control never gets here */
6358
-        }
6359
-      else
6360
-        {
6361
-        const uschar *pp = eptr;
6362
-        for (i = min; i < max; i++)
6363
-          {
6364
-          if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
6365
-          eptr++;
6366
-          }
6367
-        while (eptr >= pp)
6368
-          if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6369
-               MATCH_NOMATCH) return rrc;
6370
-        return MATCH_NOMATCH;
6371
-        }
6372
-      /* Control never gets here */
6373
-      }
4890
+#ifdef SUPPORT_UTF8
4891
+/*************************************************
4892
+ *       Match character against an XCLASS        *
4893
+ *************************************************/
6374 4894
 
6375
-    /* Caseful comparisons (includes all multi-byte characters) */
4895
+/* This function is called from within the XCLASS code below, to match a
4896
+ character against an extended class which might match values > 255.
4897
+ 
4898
+ Arguments:
4899
+ c           the character
4900
+ data        points to the flag byte of the XCLASS data
4901
+ 
4902
+ Returns:      TRUE if character matches, else FALSE
4903
+ */
6376 4904
 
6377
-    else
6378
-      {
6379
-      for (i = 1; i <= min; i++) if (c != *eptr++) return MATCH_NOMATCH;
6380
-      if (min == max) continue;
6381
-      if (minimize)
4905
+static BOOL
4906
+match_xclass(int c, const uschar *data)
4907
+{
4908
+    int t;
4909
+    BOOL negated = (*data & XCL_NOT) != 0;
4910
+    
4911
+    /* Character values < 256 are matched against a bitmap, if one is present. If
4912
+     not, we still carry on, because there may be ranges that start below 256 in the
4913
+     additional data. */
4914
+    
4915
+    if (c < 256)
4916
+    {
4917
+        if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
4918
+            return !negated;   /* char found */
4919
+    }
4920
+    
4921
+    /* Now match against the list of large chars or ranges that end with a large
4922
+     char. First skip the bit map if present. */
4923
+    
4924
+    if ((*data++ & XCL_MAP) != 0) data += 32;
4925
+    
4926
+    while ((t = *data++) != XCL_END)
4927
+    {
4928
+        int x, y;
4929
+        GETCHARINC(x, data);
4930
+        if (t == XCL_SINGLE)
6382 4931
         {
6383
-        for (i = min;; i++)
6384
-          {
6385
-          if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6386
-               MATCH_NOMATCH) return rrc;
6387
-          if (i >= max || eptr >= md->end_subject || c != *eptr++)
6388
-            return MATCH_NOMATCH;
6389
-          }
6390
-        /* Control never gets here */
4932
+            if (c == x) return !negated;
6391 4933
         }
6392
-      else
4934
+        else
6393 4935
         {
6394
-        const uschar *pp = eptr;
6395
-        for (i = min; i < max; i++)
6396
-          {
6397
-          if (eptr >= md->end_subject || c != *eptr) break;
6398
-          eptr++;
6399
-          }
6400
-        while (eptr >= pp)
6401
-         if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6402
-              MATCH_NOMATCH) return rrc;
6403
-        return MATCH_NOMATCH;
4936
+            GETCHARINC(y, data);
4937
+            if (c >= x && c <= y) return !negated;
6404 4938
         }
6405
-      }
6406
-    /* Control never gets here */
6407
-
6408
-    /* Match a negated single one-byte character. The character we are
6409
-    checking can be multibyte. */
6410
-
6411
-    case OP_NOT:
6412
-    if (eptr >= md->end_subject) return MATCH_NOMATCH;
6413
-    ecode++;
6414
-    GETCHARINCTEST(c, eptr);
6415
-    if ((ims & PCRE_CASELESS) != 0)
6416
-      {
6417
-#ifdef SUPPORT_UTF8
6418
-      if (c < 256)
4939
+    }
4940
+    
4941
+    return negated;   /* char was not found */
4942
+}
6419 4943
 #endif
6420
-      c = md->lcc[c];
6421
-      if (md->lcc[*ecode++] == c) return MATCH_NOMATCH;
6422
-      }
6423
-    else
6424
-      {
6425
-      if (*ecode++ == c) return MATCH_NOMATCH;
6426
-      }
6427
-    break;
6428
-
6429
-    /* Match a negated single one-byte character repeatedly. This is almost a
6430
-    repeat of the code for a repeated single character, but I haven't found a
6431
-    nice way of commoning these up that doesn't require a test of the
6432
-    positive/negative option for each character match. Maybe that wouldn't add
6433
-    very much to the time taken, but character matching *is* what this is all
6434
-    about... */
6435
-
6436
-    case OP_NOTEXACT:
6437
-    min = max = GET2(ecode, 1);
6438
-    ecode += 3;
6439
-    goto REPEATNOTCHAR;
6440
-
6441
-    case OP_NOTUPTO:
6442
-    case OP_NOTMINUPTO:
6443
-    min = 0;
6444
-    max = GET2(ecode, 1);
6445
-    minimize = *ecode == OP_NOTMINUPTO;
6446
-    ecode += 3;
6447
-    goto REPEATNOTCHAR;
6448
-
6449
-    case OP_NOTSTAR:
6450
-    case OP_NOTMINSTAR:
6451
-    case OP_NOTPLUS:
6452
-    case OP_NOTMINPLUS:
6453
-    case OP_NOTQUERY:
6454
-    case OP_NOTMINQUERY:
6455
-    c = *ecode++ - OP_NOTSTAR;
6456
-    minimize = (c & 1) != 0;
6457
-    min = rep_min[c];                 /* Pick up values from tables; */
6458
-    max = rep_max[c];                 /* zero for max => infinity */
6459
-    if (max == 0) max = INT_MAX;
6460
-
6461
-    /* Common code for all repeated single-character (less than 255) matches.
6462
-    We can give up quickly if there are fewer than the minimum number of
6463
-    characters left in the subject. */
6464
-
6465
-    REPEATNOTCHAR:
6466
-    if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6467
-    c = *ecode++;
6468
-
6469
-    /* The code is duplicated for the caseless and caseful cases, for speed,
6470
-    since matching characters is likely to be quite common. First, ensure the
6471
-    minimum number of matches are present. If min = max, continue at the same
6472
-    level without recursing. Otherwise, if minimizing, keep trying the rest of
6473
-    the expression and advancing one matching character if failing, up to the
6474
-    maximum. Alternatively, if maximizing, find the maximum number of
6475
-    characters and work backwards. */
6476
-
6477
-    DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
6478
-      max, eptr));
6479 4944
 
6480
-    if ((ims & PCRE_CASELESS) != 0)
6481
-      {
6482
-      c = md->lcc[c];
6483 4945
 
6484
-#ifdef SUPPORT_UTF8
6485
-      /* UTF-8 mode */
6486
-      if (md->utf8)
6487
-        {
6488
-        register int d;
6489
-        for (i = 1; i <= min; i++)
6490
-          {
6491
-          GETCHARINC(d, eptr);
6492
-          if (d < 256) d = md->lcc[d];
6493
-          if (c == d) return MATCH_NOMATCH;
6494
-          }
6495
-        }
6496
-      else
6497
-#endif
6498 4946
 
6499
-      /* Not UTF-8 mode */
6500
-        {
6501
-        for (i = 1; i <= min; i++)
6502
-          if (c == md->lcc[*eptr++]) return MATCH_NOMATCH;
6503
-        }
6504 4947
 
6505
-      if (min == max) continue;
4948
+/*************************************************
4949
+ *         Match from current position            *
4950
+ *************************************************/
4951
+
4952
+/* On entry ecode points to the first opcode, and eptr to the first character
4953
+ in the subject string, while eptrb holds the value of eptr at the start of the
4954
+ last bracketed group - used for breaking infinite loops matching zero-length
4955
+ strings. This function is called recursively in many circumstances. Whenever it
4956
+ returns a negative (error) response, the outer incarnation must also return the
4957
+ same response.
4958
+ 
4959
+ Performance note: It might be tempting to extract commonly used fields from the
4960
+ md structure (e.g. utf8, end_subject) into individual variables to improve
4961
+ performance. Tests using gcc on a SPARC disproved this; in the first case, it
4962
+ made performance worse.
4963
+ 
4964
+ Arguments:
4965
+ eptr        pointer in subject
4966
+ ecode       position in code
4967
+ offset_top  current top pointer
4968
+ md          pointer to "static" info for the match
4969
+ ims         current /i, /m, and /s options
4970
+ eptrb       pointer to chain of blocks containing eptr at start of
4971
+ brackets - for testing for empty matches
4972
+ flags       can contain
4973
+ match_condassert - this is an assertion condition
4974
+ match_isgroup - this is the start of a bracketed group
4975
+ 
4976
+ Returns:       MATCH_MATCH if matched            )  these values are >= 0
4977
+ MATCH_NOMATCH if failed to match  )
4978
+ a negative PCRE_ERROR_xxx value if aborted by an error condition
4979
+ (e.g. stopped by recursion limit)
4980
+ */
6506 4981
 
6507
-      if (minimize)
4982
+static int
4983
+match(register const uschar *eptr, register const uschar *ecode,
4984
+      int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
4985
+      int flags)
4986
+{
4987
+    unsigned long int original_ims = ims;   /* Save for resetting on ')' */
4988
+    register int rrc;
4989
+    eptrblock newptrb;
4990
+    
4991
+    if (md->match_call_count++ >= md->match_limit) return PCRE_ERROR_MATCHLIMIT;
4992
+    
4993
+    /* At the start of a bracketed group, add the current subject pointer to the
4994
+     stack of such pointers, to be re-instated at the end of the group when we hit
4995
+     the closing ket. When match() is called in other circumstances, we don't add to
4996
+     the stack. */
4997
+    
4998
+    if ((flags & match_isgroup) != 0)
4999
+    {
5000
+        newptrb.prev = eptrb;
5001
+        newptrb.saved_eptr = eptr;
5002
+        eptrb = &newptrb;
5003
+    }
5004
+    
5005
+    /* Now start processing the operations. */
5006
+    
5007
+    for (;;)
5008
+    {
5009
+        int op = (int)*ecode;
5010
+        int min, max, ctype;
5011
+        register int i;
5012
+        register int c;
5013
+        BOOL minimize = FALSE;
5014
+        
5015
+        /* Opening capturing bracket. If there is space in the offset vector, save
5016
+         the current subject position in the working slot at the top of the vector. We
5017
+         mustn't change the current values of the data slot, because they may be set
5018
+         from a previous iteration of this group, and be referred to by a reference
5019
+         inside the group.
5020
+         
5021
+         If the bracket fails to match, we need to restore this value and also the
5022
+         values of the final offsets, in case they were set by a previous iteration of
5023
+         the same bracket.
5024
+         
5025
+         If there isn't enough space in the offset vector, treat this as if it were a
5026
+         non-capturing bracket. Don't worry about setting the flag for the error case
5027
+         here; that is handled in the code for KET. */
5028
+        
5029
+        if (op > OP_BRA)
6508 5030
         {
6509
-#ifdef SUPPORT_UTF8
6510
-        /* UTF-8 mode */
6511
-        if (md->utf8)
6512
-          {
6513
-          register int d;
6514
-          for (i = min;; i++)
6515
-            {
6516
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6517
-                 MATCH_NOMATCH) return rrc;
6518
-            GETCHARINC(d, eptr);
6519
-            if (d < 256) d = md->lcc[d];
6520
-            if (i >= max || eptr >= md->end_subject || c == d)
6521
-              return MATCH_NOMATCH;
6522
-            }
6523
-          }
6524
-        else
5031
+            int offset;
5032
+            int number = op - OP_BRA;
5033
+            
5034
+            /* For extended extraction brackets (large number), we have to fish out the
5035
+             number from a dummy opcode at the start. */
5036
+            
5037
+            if (number > EXTRACT_BASIC_MAX)
5038
+                number = GET2(ecode, 2+LINK_SIZE);
5039
+            offset = number << 1;
5040
+            
5041
+#ifdef DEBUG
5042
+            printf("start bracket %d subject=", number);
5043
+            pchars(eptr, 16, TRUE, md);
5044
+            printf("\n");
6525 5045
 #endif
6526
-        /* Not UTF-8 mode */
6527
-          {
6528
-          for (i = min;; i++)
5046
+            
5047
+            if (offset < md->offset_max)
6529 5048
             {
6530
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6531
-                 MATCH_NOMATCH) return rrc;
6532
-            if (i >= max || eptr >= md->end_subject || c == md->lcc[*eptr++])
6533
-              return MATCH_NOMATCH;
5049
+                int save_offset1 = md->offset_vector[offset];
5050
+                int save_offset2 = md->offset_vector[offset+1];
5051
+                int save_offset3 = md->offset_vector[md->offset_end - number];
5052
+                int save_capture_last = md->capture_last;
5053
+                
5054
+                DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
5055
+                md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
5056
+                
5057
+                do
5058
+                {
5059
+                    if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5060
+                                     eptrb, match_isgroup)) != MATCH_NOMATCH) return rrc;
5061
+                    md->capture_last = save_capture_last;
5062
+                    ecode += GET(ecode, 1);
5063
+                }
5064
+                while (*ecode == OP_ALT);
5065
+                
5066
+                DPRINTF(("bracket %d failed\n", number));
5067
+                
5068
+                md->offset_vector[offset] = save_offset1;
5069
+                md->offset_vector[offset+1] = save_offset2;
5070
+                md->offset_vector[md->offset_end - number] = save_offset3;
5071
+                
5072
+                return MATCH_NOMATCH;
6534 5073
             }
6535
-          }
6536
-        /* Control never gets here */
5074
+            
5075
+            /* Insufficient room for saving captured contents */
5076
+            
5077
+            else op = OP_BRA;
6537 5078
         }
6538
-
6539
-      /* Maximize case */
6540
-
6541
-      else
5079
+        
5080
+        /* Other types of node can be handled by a switch */
5081
+        
5082
+        switch(op)
6542 5083
         {
6543
-        const uschar *pp = eptr;
6544
-
5084
+            case OP_BRA:     /* Non-capturing bracket: optimized */
5085
+                DPRINTF(("start bracket 0\n"));
5086
+                do
5087
+                {
5088
+                    if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5089
+                                     match_isgroup)) != MATCH_NOMATCH) return rrc;
5090
+                    ecode += GET(ecode, 1);
5091
+                }
5092
+                while (*ecode == OP_ALT);
5093
+                DPRINTF(("bracket 0 failed\n"));
5094
+                return MATCH_NOMATCH;
5095
+                
5096
+                /* Conditional group: compilation checked that there are no more than
5097
+                 two branches. If the condition is false, skipping the first branch takes us
5098
+                 past the end if there is only one branch, but that's OK because that is
5099
+                 exactly what going to the ket would do. */
5100
+                
5101
+            case OP_COND:
5102
+                if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
5103
+                {
5104
+                    int offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
5105
+                    BOOL condition = (offset == CREF_RECURSE * 2)?
5106
+                    (md->recursive != NULL) :
5107
+                    (offset < offset_top && md->offset_vector[offset] >= 0);
5108
+                    return match(eptr, ecode + (condition?
5109
+                                                (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
5110
+                                 offset_top, md, ims, eptrb, match_isgroup);
5111
+                }
5112
+                
5113
+                /* The condition is an assertion. Call match() to evaluate it - setting
5114
+                 the final argument TRUE causes it to stop at the end of an assertion. */
5115
+                
5116
+                else
5117
+                {
5118
+                    if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5119
+                                     match_condassert | match_isgroup)) == MATCH_MATCH)
5120
+                    {
5121
+                        ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
5122
+                        while (*ecode == OP_ALT) ecode += GET(ecode, 1);
5123
+                    }
5124
+                    else if (rrc != MATCH_NOMATCH) return rrc;
5125
+                    else ecode += GET(ecode, 1);
5126
+                    return match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
5127
+                                 match_isgroup);
5128
+                }
5129
+                /* Control never reaches here */
5130
+                
5131
+                /* Skip over conditional reference or large extraction number data if
5132
+                 encountered. */
5133
+                
5134
+            case OP_CREF:
5135
+            case OP_BRANUMBER:
5136
+                ecode += 3;
5137
+                break;
5138
+                
5139
+                /* End of the pattern. If we are in a recursion, we should restore the
5140
+                 offsets appropriately and continue from after the call. */
5141
+                
5142
+            case OP_END:
5143
+                if (md->recursive != NULL && md->recursive->group_num == 0)
5144
+                {
5145
+                    recursion_info *rec = md->recursive;
5146
+                    DPRINTF(("Hit the end in a (?0) recursion\n"));
5147
+                    md->recursive = rec->prev;
5148
+                    memmove(md->offset_vector, rec->offset_save,
5149
+                            rec->saved_max * sizeof(int));
5150
+                    md->start_match = rec->save_start;
5151
+                    ims = original_ims;
5152
+                    ecode = rec->after_call;
5153
+                    break;
5154
+                }
5155
+                
5156
+                /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
5157
+                 string - backtracking will then try other alternatives, if any. */
5158
+                
5159
+                if (md->notempty && eptr == md->start_match) return MATCH_NOMATCH;
5160
+                md->end_match_ptr = eptr;          /* Record where we ended */
5161
+                md->end_offset_top = offset_top;   /* and how many extracts were taken */
5162
+                return MATCH_MATCH;
5163
+                
5164
+                /* Change option settings */
5165
+                
5166
+            case OP_OPT:
5167
+                ims = ecode[1];
5168
+                ecode += 2;
5169
+                DPRINTF(("ims set to %02lx\n", ims));
5170
+                break;
5171
+                
5172
+                /* Assertion brackets. Check the alternative branches in turn - the
5173
+                 matching won't pass the KET for an assertion. If any one branch matches,
5174
+                 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
5175
+                 start of each branch to move the current point backwards, so the code at
5176
+                 this level is identical to the lookahead case. */
5177
+                
5178
+            case OP_ASSERT:
5179
+            case OP_ASSERTBACK:
5180
+                do
5181
+                {
5182
+                    if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5183
+                                     match_isgroup)) == MATCH_MATCH) break;
5184
+                    if (rrc != MATCH_NOMATCH) return rrc;
5185
+                    ecode += GET(ecode, 1);
5186
+                }
5187
+                while (*ecode == OP_ALT);
5188
+                if (*ecode == OP_KET) return MATCH_NOMATCH;
5189
+                
5190
+                /* If checking an assertion for a condition, return MATCH_MATCH. */
5191
+                
5192
+                if ((flags & match_condassert) != 0) return MATCH_MATCH;
5193
+                
5194
+                /* Continue from after the assertion, updating the offsets high water
5195
+                 mark, since extracts may have been taken during the assertion. */
5196
+                
5197
+                do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5198
+                ecode += 1 + LINK_SIZE;
5199
+                offset_top = md->end_offset_top;
5200
+                continue;
5201
+                
5202
+                /* Negative assertion: all branches must fail to match */
5203
+                
5204
+            case OP_ASSERT_NOT:
5205
+            case OP_ASSERTBACK_NOT:
5206
+                do
5207
+                {
5208
+                    if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
5209
+                                     match_isgroup)) == MATCH_MATCH) return MATCH_NOMATCH;
5210
+                    if (rrc != MATCH_NOMATCH) return rrc;
5211
+                    ecode += GET(ecode,1);
5212
+                }
5213
+                while (*ecode == OP_ALT);
5214
+                
5215
+                if ((flags & match_condassert) != 0) return MATCH_MATCH;
5216
+                
5217
+                ecode += 1 + LINK_SIZE;
5218
+                continue;
5219
+                
5220
+                /* Move the subject pointer back. This occurs only at the start of
5221
+                 each branch of a lookbehind assertion. If we are too close to the start to
5222
+                 move back, this match function fails. When working with UTF-8 we move
5223
+                 back a number of characters, not bytes. */
5224
+                
5225
+            case OP_REVERSE:
6545 5226
 #ifdef SUPPORT_UTF8
6546
-        /* UTF-8 mode */
6547
-        if (md->utf8)
6548
-          {
6549
-          register int d;
6550
-          for (i = min; i < max; i++)
6551
-            {
6552
-            int len = 1;
6553
-            if (eptr >= md->end_subject) break;
6554
-            GETCHARLEN(d, eptr, len);
6555
-            if (d < 256) d = md->lcc[d];
6556
-            if (c == d) break;
6557
-            eptr += len;
6558
-            }
6559
-          for(;;)
6560
-            {
6561
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6562
-                 MATCH_NOMATCH) return rrc;
6563
-            if (eptr-- == pp) break;        /* Stop if tried at original pos */
6564
-            BACKCHAR(eptr);
6565
-            }
6566
-          }
6567
-        else
5227
+                if (md->utf8)
5228
+                {
5229
+                    c = GET(ecode,1);
5230
+                    for (i = 0; i < c; i++)
5231
+                    {
5232
+                        eptr--;
5233
+                        if (eptr < md->start_subject) return MATCH_NOMATCH;
5234
+                        BACKCHAR(eptr)
5235
+                    }
5236
+                }
5237
+                else
6568 5238
 #endif
6569
-        /* Not UTF-8 mode */
6570
-          {
6571
-          for (i = min; i < max; i++)
6572
-            {
6573
-            if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
6574
-            eptr++;
6575
-            }
6576
-          while (eptr >= pp)
5239
+                    
5240
+                /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
5241
+                    
5242
+                {
5243
+                    eptr -= GET(ecode,1);
5244
+                    if (eptr < md->start_subject) return MATCH_NOMATCH;
5245
+                }
5246
+                
5247
+                /* Skip to next op code */
5248
+                
5249
+                ecode += 1 + LINK_SIZE;
5250
+                break;
5251
+                
5252
+                /* The callout item calls an external function, if one is provided, passing
5253
+                 details of the match so far. This is mainly for debugging, though the
5254
+                 function is able to force a failure. */
5255
+                
5256
+            case OP_CALLOUT:
5257
+                if (pcre_callout != NULL)
5258
+                {
5259
+                    pcre_callout_block cb;
5260
+                    cb.version          = 0;   /* Version 0 of the callout block */
5261
+                    cb.callout_number   = ecode[1];
5262
+                    cb.offset_vector    = md->offset_vector;
5263
+                    cb.subject          = (const char *)md->start_subject;
5264
+                    cb.subject_length   = md->end_subject - md->start_subject;
5265
+                    cb.start_match      = md->start_match - md->start_subject;
5266
+                    cb.current_position = eptr - md->start_subject;
5267
+                    cb.capture_top      = offset_top/2;
5268
+                    cb.capture_last     = md->capture_last;
5269
+                    cb.callout_data     = md->callout_data;
5270
+                    if ((rrc = (*pcre_callout)(&cb)) > 0) return MATCH_NOMATCH;
5271
+                    if (rrc < 0) return rrc;
5272
+                }
5273
+                ecode += 2;
5274
+                break;
5275
+                
5276
+                /* Recursion either matches the current regex, or some subexpression. The
5277
+                 offset data is the offset to the starting bracket from the start of the
5278
+                 whole pattern. However, it is possible that a BRAZERO was inserted before
5279
+                 this bracket after we took the offset - we just skip it if encountered.
5280
+                 
5281
+                 If there are any capturing brackets started but not finished, we have to
5282
+                 save their starting points and reinstate them after the recursion. However,
5283
+                 we don't know how many such there are (offset_top records the completed
5284
+                 total) so we just have to save all the potential data. There may be up to
5285
+                 65535 such values, which is too large to put on the stack, but using malloc
5286
+                 for small numbers seems expensive. As a compromise, the stack is used when
5287
+                 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
5288
+                 is used. A problem is what to do if the malloc fails ... there is no way of
5289
+                 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
5290
+                 values on the stack, and accept that the rest may be wrong.
5291
+                 
5292
+                 There are also other values that have to be saved. We use a chained
5293
+                 sequence of blocks that actually live on the stack. Thanks to Robin Houston
5294
+                 for the original version of this logic. */
5295
+                
5296
+            case OP_RECURSE:
6577 5297
             {
6578
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6579
-                 MATCH_NOMATCH) return rrc;
6580
-            eptr--;
5298
+                int stacksave[REC_STACK_SAVE_MAX];
5299
+                recursion_info new_recursive;
5300
+                const uschar *callpat = md->start_code + GET(ecode, 1);
5301
+                
5302
+                if (*callpat == OP_BRAZERO) callpat++;
5303
+                
5304
+                new_recursive.group_num = *callpat - OP_BRA;
5305
+                
5306
+                /* For extended extraction brackets (large number), we have to fish out
5307
+                 the number from a dummy opcode at the start. */
5308
+                
5309
+                if (new_recursive.group_num > EXTRACT_BASIC_MAX)
5310
+                    new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
5311
+                
5312
+                /* Add to "recursing stack" */
5313
+                
5314
+                new_recursive.prev = md->recursive;
5315
+                md->recursive = &new_recursive;
5316
+                
5317
+                /* Find where to continue from afterwards */
5318
+                
5319
+                ecode += 1 + LINK_SIZE;
5320
+                new_recursive.after_call = ecode;
5321
+                
5322
+                /* Now save the offset data. */
5323
+                
5324
+                new_recursive.saved_max = md->offset_end;
5325
+                if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
5326
+                    new_recursive.offset_save = stacksave;
5327
+                else
5328
+                {
5329
+                    new_recursive.offset_save =
5330
+                    (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
5331
+                    if (new_recursive.offset_save == NULL) return PCRE_ERROR_NOMEMORY;
5332
+                }
5333
+                
5334
+                memcpy(new_recursive.offset_save, md->offset_vector,
5335
+                       new_recursive.saved_max * sizeof(int));
5336
+                new_recursive.save_start = md->start_match;
5337
+                md->start_match = eptr;
5338
+                
5339
+                /* OK, now we can do the recursion. For each top-level alternative we
5340
+                 restore the offset and recursion data. */
5341
+                
5342
+                DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
5343
+                do
5344
+                {
5345
+                    if ((rrc = match(eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
5346
+                                     eptrb, match_isgroup)) == MATCH_MATCH)
5347
+                    {
5348
+                        md->recursive = new_recursive.prev;
5349
+                        if (new_recursive.offset_save != stacksave)
5350
+                            (pcre_free)(new_recursive.offset_save);
5351
+                        return MATCH_MATCH;
5352
+                    }
5353
+                    else if (rrc != MATCH_NOMATCH) return rrc;
5354
+                    
5355
+                    md->recursive = &new_recursive;
5356
+                    memcpy(md->offset_vector, new_recursive.offset_save,
5357
+                           new_recursive.saved_max * sizeof(int));
5358
+                    callpat += GET(callpat, 1);
5359
+                }
5360
+                while (*callpat == OP_ALT);
5361
+                
5362
+                DPRINTF(("Recursion didn't match\n"));
5363
+                md->recursive = new_recursive.prev;
5364
+                if (new_recursive.offset_save != stacksave)
5365
+                    (pcre_free)(new_recursive.offset_save);
5366
+                return MATCH_NOMATCH;
6581 5367
             }
6582
-          }
6583
-
6584
-        return MATCH_NOMATCH;
6585
-        }
6586
-      /* Control never gets here */
6587
-      }
6588
-
6589
-    /* Caseful comparisons */
6590
-
6591
-    else
6592
-      {
6593
-#ifdef SUPPORT_UTF8
6594
-      /* UTF-8 mode */
6595
-      if (md->utf8)
6596
-        {
6597
-        register int d;
6598
-        for (i = 1; i <= min; i++)
6599
-          {
6600
-          GETCHARINC(d, eptr);
6601
-          if (c == d) return MATCH_NOMATCH;
6602
-          }
6603
-        }
6604
-      else
6605
-#endif
6606
-      /* Not UTF-8 mode */
6607
-        {
6608
-        for (i = 1; i <= min; i++)
6609
-          if (c == *eptr++) return MATCH_NOMATCH;
6610
-        }
6611
-
6612
-      if (min == max) continue;
6613
-
6614
-      if (minimize)
6615
-        {
6616
-#ifdef SUPPORT_UTF8
6617
-        /* UTF-8 mode */
6618
-        if (md->utf8)
6619
-          {
6620
-          register int d;
6621
-          for (i = min;; i++)
5368
+                /* Control never reaches here */
5369
+                
5370
+                /* "Once" brackets are like assertion brackets except that after a match,
5371
+                 the point in the subject string is not moved back. Thus there can never be
5372
+                 a move back into the brackets. Friedl calls these "atomic" subpatterns.
5373
+                 Check the alternative branches in turn - the matching won't pass the KET
5374
+                 for this kind of subpattern. If any one branch matches, we carry on as at
5375
+                 the end of a normal bracket, leaving the subject pointer. */
5376
+                
5377
+            case OP_ONCE:
6622 5378
             {
6623
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6624
-                 MATCH_NOMATCH) return rrc;
6625
-            GETCHARINC(d, eptr);
6626
-            if (i >= max || eptr >= md->end_subject || c == d)
6627
-              return MATCH_NOMATCH;
5379
+                const uschar *prev = ecode;
5380
+                const uschar *saved_eptr = eptr;
5381
+                
5382
+                do
5383
+                {
5384
+                    if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5385
+                                     eptrb, match_isgroup)) == MATCH_MATCH) break;
5386
+                    if (rrc != MATCH_NOMATCH) return rrc;
5387
+                    ecode += GET(ecode,1);
5388
+                }
5389
+                while (*ecode == OP_ALT);
5390
+                
5391
+                /* If hit the end of the group (which could be repeated), fail */
5392
+                
5393
+                if (*ecode != OP_ONCE && *ecode != OP_ALT) return MATCH_NOMATCH;
5394
+                
5395
+                /* Continue as from after the assertion, updating the offsets high water
5396
+                 mark, since extracts may have been taken. */
5397
+                
5398
+                do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5399
+                
5400
+                offset_top = md->end_offset_top;
5401
+                eptr = md->end_match_ptr;
5402
+                
5403
+                /* For a non-repeating ket, just continue at this level. This also
5404
+                 happens for a repeating ket if no characters were matched in the group.
5405
+                 This is the forcible breaking of infinite loops as implemented in Perl
5406
+                 5.005. If there is an options reset, it will get obeyed in the normal
5407
+                 course of events. */
5408
+                
5409
+                if (*ecode == OP_KET || eptr == saved_eptr)
5410
+                {
5411
+                    ecode += 1+LINK_SIZE;
5412
+                    break;
5413
+                }
5414
+                
5415
+                /* The repeating kets try the rest of the pattern or restart from the
5416
+                 preceding bracket, in the appropriate order. We need to reset any options
5417
+                 that changed within the bracket before re-running it, so check the next
5418
+                 opcode. */
5419
+                
5420
+                if (ecode[1+LINK_SIZE] == OP_OPT)
5421
+                {
5422
+                    ims = (ims & ~PCRE_IMS) | ecode[4];
5423
+                    DPRINTF(("ims set to %02lx at group repeat\n", ims));
5424
+                }
5425
+                
5426
+                if (*ecode == OP_KETRMIN)
5427
+                {
5428
+                    if ((rrc = match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
5429
+                                     eptrb, 0)) != MATCH_NOMATCH) return rrc;
5430
+                    if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5431
+                                     match_isgroup)) != MATCH_NOMATCH) return rrc;
5432
+                }
5433
+                else  /* OP_KETRMAX */
5434
+                {
5435
+                    if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5436
+                                     match_isgroup)) != MATCH_NOMATCH) return rrc;
5437
+                    if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5438
+                                     0)) != MATCH_NOMATCH) return rrc;
5439
+                }
6628 5440
             }
6629
-          }
6630
-        else
6631
-#endif
6632
-        /* Not UTF-8 mode */
6633
-          {
6634
-          for (i = min;; i++)
5441
+                return MATCH_NOMATCH;
5442
+                
5443
+                /* An alternation is the end of a branch; scan along to find the end of the
5444
+                 bracketed group and go to there. */
5445
+                
5446
+            case OP_ALT:
5447
+                do ecode += GET(ecode,1); while (*ecode == OP_ALT);
5448
+                break;
5449
+                
5450
+                /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
5451
+                 that it may occur zero times. It may repeat infinitely, or not at all -
5452
+                 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
5453
+                 repeat limits are compiled as a number of copies, with the optional ones
5454
+                 preceded by BRAZERO or BRAMINZERO. */
5455
+                
5456
+            case OP_BRAZERO:
6635 5457
             {
6636
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6637
-                 MATCH_NOMATCH) return rrc;
6638
-            if (i >= max || eptr >= md->end_subject || c == *eptr++)
6639
-              return MATCH_NOMATCH;
5458
+                const uschar *next = ecode+1;
5459
+                if ((rrc = match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
5460
+                    != MATCH_NOMATCH) return rrc;
5461
+                do next += GET(next,1); while (*next == OP_ALT);
5462
+                ecode = next + 1+LINK_SIZE;
6640 5463
             }
6641
-          }
6642
-        /* Control never gets here */
6643
-        }
6644
-
6645
-      /* Maximize case */
6646
-
6647
-      else
6648
-        {
6649
-        const uschar *pp = eptr;
6650
-
6651
-#ifdef SUPPORT_UTF8
6652
-        /* UTF-8 mode */
6653
-        if (md->utf8)
6654
-          {
6655
-          register int d;
6656
-          for (i = min; i < max; i++)
5464
+                break;
5465
+                
5466
+            case OP_BRAMINZERO:
6657 5467
             {
6658
-            int len = 1;
6659
-            if (eptr >= md->end_subject) break;
6660
-            GETCHARLEN(d, eptr, len);
6661
-            if (c == d) break;
6662
-            eptr += len;
5468
+                const uschar *next = ecode+1;
5469
+                do next += GET(next,1); while (*next == OP_ALT);
5470
+                if ((rrc = match(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5471
+                                 match_isgroup)) != MATCH_NOMATCH) return rrc;
5472
+                ecode++;
6663 5473
             }
6664
-          for(;;)
5474
+                break;
5475
+                
5476
+                /* End of a group, repeated or non-repeating. If we are at the end of
5477
+                 an assertion "group", stop matching and return MATCH_MATCH, but record the
5478
+                 current high water mark for use by positive assertions. Do this also
5479
+                 for the "once" (not-backup up) groups. */
5480
+                
5481
+            case OP_KET:
5482
+            case OP_KETRMIN:
5483
+            case OP_KETRMAX:
6665 5484
             {
6666
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6667
-                MATCH_NOMATCH) return rrc;
6668
-            if (eptr-- == pp) break;        /* Stop if tried at original pos */
6669
-            BACKCHAR(eptr);
6670
-            }
6671
-          }
6672
-        else
5485
+                const uschar *prev = ecode - GET(ecode, 1);
5486
+                const uschar *saved_eptr = eptrb->saved_eptr;
5487
+                
5488
+                eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
5489
+                
5490
+                if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
5491
+                    *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
5492
+                    *prev == OP_ONCE)
5493
+                {
5494
+                    md->end_match_ptr = eptr;      /* For ONCE */
5495
+                    md->end_offset_top = offset_top;
5496
+                    return MATCH_MATCH;
5497
+                }
5498
+                
5499
+                /* In all other cases except a conditional group we have to check the
5500
+                 group number back at the start and if necessary complete handling an
5501
+                 extraction by setting the offsets and bumping the high water mark. */
5502
+                
5503
+                if (*prev != OP_COND)
5504
+                {
5505
+                    int offset;
5506
+                    int number = *prev - OP_BRA;
5507
+                    
5508
+                    /* For extended extraction brackets (large number), we have to fish out
5509
+                     the number from a dummy opcode at the start. */
5510
+                    
5511
+                    if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
5512
+                    offset = number << 1;
5513
+                    
5514
+#ifdef DEBUG
5515
+                    printf("end bracket %d", number);
5516
+                    printf("\n");
6673 5517
 #endif
6674
-        /* Not UTF-8 mode */
6675
-          {
6676
-          for (i = min; i < max; i++)
6677
-            {
6678
-            if (eptr >= md->end_subject || c == *eptr) break;
6679
-            eptr++;
5518
+                    
5519
+                    /* Test for a numbered group. This includes groups called as a result
5520
+                     of recursion. Note that whole-pattern recursion is coded as a recurse
5521
+                     into group 0, so it won't be picked up here. Instead, we catch it when
5522
+                     the OP_END is reached. */
5523
+                    
5524
+                    if (number > 0)
5525
+                    {
5526
+                        md->capture_last = number;
5527
+                        if (offset >= md->offset_max) md->offset_overflow = TRUE; else
5528
+                        {
5529
+                            md->offset_vector[offset] =
5530
+                            md->offset_vector[md->offset_end - number];
5531
+                            md->offset_vector[offset+1] = eptr - md->start_subject;
5532
+                            if (offset_top <= offset) offset_top = offset + 2;
5533
+                        }
5534
+                        
5535
+                        /* Handle a recursively called group. Restore the offsets
5536
+                         appropriately and continue from after the call. */
5537
+                        
5538
+                        if (md->recursive != NULL && md->recursive->group_num == number)
5539
+                        {
5540
+                            recursion_info *rec = md->recursive;
5541
+                            DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
5542
+                            md->recursive = rec->prev;
5543
+                            md->start_match = rec->save_start;
5544
+                            memcpy(md->offset_vector, rec->offset_save,
5545
+                                   rec->saved_max * sizeof(int));
5546
+                            ecode = rec->after_call;
5547
+                            ims = original_ims;
5548
+                            break;
5549
+                        }
5550
+                    }
5551
+                }
5552
+                
5553
+                /* Reset the value of the ims flags, in case they got changed during
5554
+                 the group. */
5555
+                
5556
+                ims = original_ims;
5557
+                DPRINTF(("ims reset to %02lx\n", ims));
5558
+                
5559
+                /* For a non-repeating ket, just continue at this level. This also
5560
+                 happens for a repeating ket if no characters were matched in the group.
5561
+                 This is the forcible breaking of infinite loops as implemented in Perl
5562
+                 5.005. If there is an options reset, it will get obeyed in the normal
5563
+                 course of events. */
5564
+                
5565
+                if (*ecode == OP_KET || eptr == saved_eptr)
5566
+                {
5567
+                    ecode += 1 + LINK_SIZE;
5568
+                    break;
5569
+                }
5570
+                
5571
+                /* The repeating kets try the rest of the pattern or restart from the
5572
+                 preceding bracket, in the appropriate order. */
5573
+                
5574
+                if (*ecode == OP_KETRMIN)
5575
+                {
5576
+                    if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5577
+                                     0)) != MATCH_NOMATCH) return rrc;
5578
+                    if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5579
+                                     match_isgroup)) != MATCH_NOMATCH) return rrc;
5580
+                }
5581
+                else  /* OP_KETRMAX */
5582
+                {
5583
+                    if ((rrc = match(eptr, prev, offset_top, md, ims, eptrb,
5584
+                                     match_isgroup)) != MATCH_NOMATCH) return rrc;
5585
+                    if ((rrc = match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb,
5586
+                                     0)) != MATCH_NOMATCH) return rrc;
5587
+                }
6680 5588
             }
6681
-          while (eptr >= pp)
5589
+                return MATCH_NOMATCH;
5590
+                
5591
+                /* Start of subject unless notbol, or after internal newline if multiline */
5592
+                
5593
+            case OP_CIRC:
5594
+                if (md->notbol && eptr == md->start_subject) return MATCH_NOMATCH;
5595
+                if ((ims & PCRE_MULTILINE) != 0)
5596
+                {
5597
+                    if (eptr != md->start_subject && eptr[-1] != NEWLINE)
5598
+                        return MATCH_NOMATCH;
5599
+                    ecode++;
5600
+                    break;
5601
+                }
5602
+                /* ... else fall through */
5603
+                
5604
+                /* Start of subject assertion */
5605
+                
5606
+            case OP_SOD:
5607
+                if (eptr != md->start_subject) return MATCH_NOMATCH;
5608
+                ecode++;
5609
+                break;
5610
+                
5611
+                /* Start of match assertion */
5612
+                
5613
+            case OP_SOM:
5614
+                if (eptr != md->start_subject + md->start_offset) return MATCH_NOMATCH;
5615
+                ecode++;
5616
+                break;
5617
+                
5618
+                /* Assert before internal newline if multiline, or before a terminating
5619
+                 newline unless endonly is set, else end of subject unless noteol is set. */
5620
+                
5621
+            case OP_DOLL:
5622
+                if ((ims & PCRE_MULTILINE) != 0)
5623
+                {
5624
+                    if (eptr < md->end_subject)
5625
+                    { if (*eptr != NEWLINE) return MATCH_NOMATCH; }
5626
+                    else
5627
+                    { if (md->noteol) return MATCH_NOMATCH; }
5628
+                    ecode++;
5629
+                    break;
5630
+                }
5631
+                else
5632
+                {
5633
+                    if (md->noteol) return MATCH_NOMATCH;
5634
+                    if (!md->endonly)
5635
+                    {
5636
+                        if (eptr < md->end_subject - 1 ||
5637
+                            (eptr == md->end_subject - 1 && *eptr != NEWLINE))
5638
+                            return MATCH_NOMATCH;
5639
+                        ecode++;
5640
+                        break;
5641
+                    }
5642
+                }
5643
+                /* ... else fall through */
5644
+                
5645
+                /* End of subject assertion (\z) */
5646
+                
5647
+            case OP_EOD:
5648
+                if (eptr < md->end_subject) return MATCH_NOMATCH;
5649
+                ecode++;
5650
+                break;
5651
+                
5652
+                /* End of subject or ending \n assertion (\Z) */
5653
+                
5654
+            case OP_EODN:
5655
+                if (eptr < md->end_subject - 1 ||
5656
+                    (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return MATCH_NOMATCH;
5657
+                ecode++;
5658
+                break;
5659
+                
5660
+                /* Word boundary assertions */
5661
+                
5662
+            case OP_NOT_WORD_BOUNDARY:
5663
+            case OP_WORD_BOUNDARY:
6682 5664
             {
6683
-            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6684
-                MATCH_NOMATCH) return rrc;
6685
-            eptr--;
6686
-            }
6687
-          }
6688
-
6689
-        return MATCH_NOMATCH;
6690
-        }
6691
-      }
6692
-    /* Control never gets here */
6693
-
6694
-    /* Match a single character type repeatedly; several different opcodes
6695
-    share code. This is very similar to the code for single characters, but we
6696
-    repeat it in the interests of efficiency. */
6697
-
6698
-    case OP_TYPEEXACT:
6699
-    min = max = GET2(ecode, 1);
6700
-    minimize = TRUE;
6701
-    ecode += 3;
6702
-    goto REPEATTYPE;
6703
-
6704
-    case OP_TYPEUPTO:
6705
-    case OP_TYPEMINUPTO:
6706
-    min = 0;
6707
-    max = GET2(ecode, 1);
6708
-    minimize = *ecode == OP_TYPEMINUPTO;
6709
-    ecode += 3;
6710
-    goto REPEATTYPE;
6711
-
6712
-    case OP_TYPESTAR:
6713
-    case OP_TYPEMINSTAR:
6714
-    case OP_TYPEPLUS:
6715
-    case OP_TYPEMINPLUS:
6716
-    case OP_TYPEQUERY:
6717
-    case OP_TYPEMINQUERY:
6718
-    c = *ecode++ - OP_TYPESTAR;
6719
-    minimize = (c & 1) != 0;
6720
-    min = rep_min[c];                 /* Pick up values from tables; */
6721
-    max = rep_max[c];                 /* zero for max => infinity */
6722
-    if (max == 0) max = INT_MAX;
6723
-
6724
-    /* Common code for all repeated single character type matches. Note that
6725
-    in UTF-8 mode, '.' matches a character of any length, but for the other
6726
-    character types, the valid characters are all one-byte long. */
6727
-
6728
-    REPEATTYPE:
6729
-    ctype = *ecode++;      /* Code for the character type */
6730
-
6731
-    /* First, ensure the minimum number of matches are present. Use inline
6732
-    code for maximizing the speed, and do the type test once at the start
6733
-    (i.e. keep it out of the loop). Also we can test that there are at least
6734
-    the minimum number of bytes before we start. This isn't as effective in
6735
-    UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
6736
-    is tidier. */
6737
-
6738
-    if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6739
-    if (min > 0)
6740
-      {
6741
-#ifdef SUPPORT_UTF8
6742
-      if (md->utf8) switch(ctype)
6743
-        {
6744
-        case OP_ANY:
6745
-        for (i = 1; i <= min; i++)
6746
-          {
6747
-          if (eptr >= md->end_subject ||
6748
-             (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
6749
-            return MATCH_NOMATCH;
6750
-          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6751
-          }
6752
-        break;
6753
-
6754
-        case OP_ANYBYTE:
6755
-        eptr += min;
6756
-        break;
6757
-
6758
-        case OP_NOT_DIGIT:
6759
-        for (i = 1; i <= min; i++)
6760
-          {
6761
-          if (eptr >= md->end_subject) return MATCH_NOMATCH;
6762
-          GETCHARINC(c, eptr);
6763
-          if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
6764
-            return MATCH_NOMATCH;
6765
-          }
6766
-        break;
6767
-
6768
-        case OP_DIGIT:
6769
-        for (i = 1; i <= min; i++)
6770
-          {
6771
-          if (eptr >= md->end_subject ||
6772
-             *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
6773
-            return MATCH_NOMATCH;
6774
-          /* No need to skip more bytes - we know it's a 1-byte character */
6775
-          }
6776
-        break;
6777
-
6778
-        case OP_NOT_WHITESPACE:
6779
-        for (i = 1; i <= min; i++)
6780
-          {
6781
-          if (eptr >= md->end_subject ||
6782
-             (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
6783
-            return MATCH_NOMATCH;
6784
-          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6785
-          }
6786
-        break;
6787
-
6788
-        case OP_WHITESPACE:
6789
-        for (i = 1; i <= min; i++)
6790
-          {
6791
-          if (eptr >= md->end_subject ||
6792
-             *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
6793
-            return MATCH_NOMATCH;
6794
-          /* No need to skip more bytes - we know it's a 1-byte character */
6795
-          }
6796
-        break;
6797
-
6798
-        case OP_NOT_WORDCHAR:
6799
-        for (i = 1; i <= min; i++)
6800
-          {
6801
-          if (eptr >= md->end_subject ||
6802
-             (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
6803
-            return MATCH_NOMATCH;
6804
-          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6805
-          }
6806
-        break;
6807
-
6808
-        case OP_WORDCHAR:
6809
-        for (i = 1; i <= min; i++)
6810
-          {
6811
-          if (eptr >= md->end_subject ||
6812
-             *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
6813
-            return MATCH_NOMATCH;
6814
-          /* No need to skip more bytes - we know it's a 1-byte character */
6815
-          }
6816
-        break;
6817
-        }
6818
-      else
6819
-#endif
6820
-
6821
-      /* Code for the non-UTF-8 case for minimum matching */
6822
-
6823
-      switch(ctype)
6824
-        {
6825
-        case OP_ANY:
6826
-        if ((ims & PCRE_DOTALL) == 0)
6827
-          {
6828
-          for (i = 1; i <= min; i++)
6829
-            if (*eptr++ == NEWLINE) return MATCH_NOMATCH;
6830
-          }
6831
-        else eptr += min;
6832
-        break;
6833
-
6834
-        case OP_ANYBYTE:
6835
-        eptr += min;
6836
-        break;
6837
-
6838
-        case OP_NOT_DIGIT:
6839
-        for (i = 1; i <= min; i++)
6840
-          if ((md->ctypes[*eptr++] & ctype_digit) != 0) return MATCH_NOMATCH;
6841
-        break;
6842
-
6843
-        case OP_DIGIT:
6844
-        for (i = 1; i <= min; i++)
6845
-          if ((md->ctypes[*eptr++] & ctype_digit) == 0) return MATCH_NOMATCH;
6846
-        break;
6847
-
6848
-        case OP_NOT_WHITESPACE:
6849
-        for (i = 1; i <= min; i++)
6850
-          if ((md->ctypes[*eptr++] & ctype_space) != 0) return MATCH_NOMATCH;
6851
-        break;
6852
-
6853
-        case OP_WHITESPACE:
6854
-        for (i = 1; i <= min; i++)
6855
-          if ((md->ctypes[*eptr++] & ctype_space) == 0) return MATCH_NOMATCH;
6856
-        break;
6857
-
6858
-        case OP_NOT_WORDCHAR:
6859
-        for (i = 1; i <= min; i++)
6860
-          if ((md->ctypes[*eptr++] & ctype_word) != 0)
6861
-            return MATCH_NOMATCH;
6862
-        break;
6863
-
6864
-        case OP_WORDCHAR:
6865
-        for (i = 1; i <= min; i++)
6866
-          if ((md->ctypes[*eptr++] & ctype_word) == 0)
6867
-            return MATCH_NOMATCH;
6868
-        break;
6869
-        }
6870
-      }
6871
-
6872
-    /* If min = max, continue at the same level without recursing */
6873
-
6874
-    if (min == max) continue;
6875
-
6876
-    /* If minimizing, we have to test the rest of the pattern before each
6877
-    subsequent match. Again, separate the UTF-8 case for speed. */
6878
-
6879
-    if (minimize)
6880
-      {
5665
+                BOOL prev_is_word, cur_is_word;
5666
+                
5667
+                /* Find out if the previous and current characters are "word" characters.
5668
+                 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
5669
+                 be "non-word" characters. */
5670
+                
6881 5671
 #ifdef SUPPORT_UTF8
6882
-      /* UTF-8 mode */
6883
-      if (md->utf8)
6884
-        {
6885
-        for (i = min;; i++)
6886
-          {
6887
-          if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6888
-               MATCH_NOMATCH) return rrc;
6889
-          if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6890
-
6891
-          GETCHARINC(c, eptr);
6892
-          switch(ctype)
6893
-            {
6894
-            case OP_ANY:
6895
-            if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return MATCH_NOMATCH;
6896
-            break;
6897
-
6898
-            case OP_ANYBYTE:
6899
-            break;
6900
-
6901
-            case OP_NOT_DIGIT:
6902
-            if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
6903
-              return MATCH_NOMATCH;
6904
-            break;
6905
-
6906
-            case OP_DIGIT:
6907
-            if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
6908
-              return MATCH_NOMATCH;
6909
-            break;
6910
-
6911
-            case OP_NOT_WHITESPACE:
6912
-            if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
6913
-              return MATCH_NOMATCH;
6914
-            break;
6915
-
6916
-            case OP_WHITESPACE:
6917
-            if  (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
6918
-              return MATCH_NOMATCH;
6919
-            break;
6920
-
6921
-            case OP_NOT_WORDCHAR:
6922
-            if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
6923
-              return MATCH_NOMATCH;
6924
-            break;
6925
-
6926
-            case OP_WORDCHAR:
6927
-            if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
6928
-              return MATCH_NOMATCH;
6929
-            break;
6930
-            }
6931
-          }
6932
-        }
6933
-      else
5672
+                if (md->utf8)
5673
+                {
5674
+                    if (eptr == md->start_subject) prev_is_word = FALSE; else
5675
+                    {
5676
+                        const uschar *lastptr = eptr - 1;
5677
+                        while((*lastptr & 0xc0) == 0x80) lastptr--;
5678
+                        GETCHAR(c, lastptr);
5679
+                        prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5680
+                    }
5681
+                    if (eptr >= md->end_subject) cur_is_word = FALSE; else
5682
+                    {
5683
+                        GETCHAR(c, eptr);
5684
+                        cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
5685
+                    }
5686
+                }
5687
+                else
6934 5688
 #endif
6935
-      /* Not UTF-8 mode */
6936
-        {
6937
-        for (i = min;; i++)
6938
-          {
6939
-          if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6940
-               MATCH_NOMATCH) return rrc;
6941
-          if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6942
-          c = *eptr++;
6943
-          switch(ctype)
6944
-            {
5689
+                    
5690
+                /* More streamlined when not in UTF-8 mode */
5691
+                    
5692
+                {
5693
+                    prev_is_word = (eptr != md->start_subject) &&
5694
+                    ((md->ctypes[eptr[-1]] & ctype_word) != 0);
5695
+                    cur_is_word = (eptr < md->end_subject) &&
5696
+                    ((md->ctypes[*eptr] & ctype_word) != 0);
5697
+                }
5698
+                
5699
+                /* Now see if the situation is what we want */
5700
+                
5701
+                if ((*ecode++ == OP_WORD_BOUNDARY)?
5702
+                    cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5703
+                    return MATCH_NOMATCH;
5704
+            }
5705
+                break;
5706
+                
5707
+                /* Match a single character type; inline for speed */
5708
+                
6945 5709
             case OP_ANY:
6946
-            if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return MATCH_NOMATCH;
6947
-            break;
6948
-
5710
+                if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
5711
+                    return MATCH_NOMATCH;
5712
+                if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5713
+#ifdef SUPPORT_UTF8
5714
+                if (md->utf8)
5715
+                    while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5716
+#endif
5717
+                ecode++;
5718
+                break;
5719
+                
5720
+                /* Match a single byte, even in UTF-8 mode. This opcode really does match
5721
+                 any byte, even newline, independent of the setting of PCRE_DOTALL. */
5722
+                
6949 5723
             case OP_ANYBYTE:
6950
-            break;
6951
-
5724
+                if (eptr++ >= md->end_subject) return MATCH_NOMATCH;
5725
+                ecode++;
5726
+                break;
5727
+                
6952 5728
             case OP_NOT_DIGIT:
6953
-            if ((md->ctypes[c] & ctype_digit) != 0) return MATCH_NOMATCH;
6954
-            break;
6955
-
5729
+                if (eptr >= md->end_subject) return MATCH_NOMATCH;
5730
+                GETCHARINCTEST(c, eptr);
5731
+                if (
5732
+#ifdef SUPPORT_UTF8
5733
+                    c < 256 &&
5734
+#endif
5735
+                    (md->ctypes[c] & ctype_digit) != 0
5736
+                    )
5737
+                    return MATCH_NOMATCH;
5738
+                ecode++;
5739
+                break;
5740
+                
6956 5741
             case OP_DIGIT:
6957
-            if ((md->ctypes[c] & ctype_digit) == 0) return MATCH_NOMATCH;
6958
-            break;
6959
-
5742
+                if (eptr >= md->end_subject) return MATCH_NOMATCH;
5743
+                GETCHARINCTEST(c, eptr);
5744
+                if (
5745
+#ifdef SUPPORT_UTF8
5746
+                    c >= 256 ||
5747
+#endif
5748
+                    (md->ctypes[c] & ctype_digit) == 0
5749
+                    )
5750
+                    return MATCH_NOMATCH;
5751
+                ecode++;
5752
+                break;
5753
+                
6960 5754
             case OP_NOT_WHITESPACE:
6961
-            if ((md->ctypes[c] & ctype_space) != 0) return MATCH_NOMATCH;
6962
-            break;
6963
-
5755
+                if (eptr >= md->end_subject) return MATCH_NOMATCH;
5756
+                GETCHARINCTEST(c, eptr);
5757
+                if (
5758
+#ifdef SUPPORT_UTF8
5759
+                    c < 256 &&
5760
+#endif
5761
+                    (md->ctypes[c] & ctype_space) != 0
5762
+                    )
5763
+                    return MATCH_NOMATCH;
5764
+                ecode++;
5765
+                break;
5766
+                
6964 5767
             case OP_WHITESPACE:
6965
-            if  ((md->ctypes[c] & ctype_space) == 0) return MATCH_NOMATCH;
6966
-            break;
6967
-
5768
+                if (eptr >= md->end_subject) return MATCH_NOMATCH;
5769
+                GETCHARINCTEST(c, eptr);
5770
+                if (
5771
+#ifdef SUPPORT_UTF8
5772
+                    c >= 256 ||
5773
+#endif
5774
+                    (md->ctypes[c] & ctype_space) == 0
5775
+                    )
5776
+                    return MATCH_NOMATCH;
5777
+                ecode++;
5778
+                break;
5779
+                
6968 5780
             case OP_NOT_WORDCHAR:
6969
-            if ((md->ctypes[c] & ctype_word) != 0) return MATCH_NOMATCH;
6970
-            break;
6971
-
5781
+                if (eptr >= md->end_subject) return MATCH_NOMATCH;
5782
+                GETCHARINCTEST(c, eptr);
5783
+                if (
5784
+#ifdef SUPPORT_UTF8
5785
+                    c < 256 &&
5786
+#endif
5787
+                    (md->ctypes[c] & ctype_word) != 0
5788
+                    )
5789
+                    return MATCH_NOMATCH;
5790
+                ecode++;
5791
+                break;
5792
+                
6972 5793
             case OP_WORDCHAR:
6973
-            if ((md->ctypes[c] & ctype_word) == 0) return MATCH_NOMATCH;
6974
-            break;
6975
-            }
6976
-          }
6977
-        }
6978
-      /* Control never gets here */
6979
-      }
6980
-
6981
-    /* If maximizing it is worth using inline code for speed, doing the type
6982
-    test once at the start (i.e. keep it out of the loop). Again, keep the
6983
-    UTF-8 stuff separate. */
6984
-
6985
-    else
6986
-      {
6987
-      const uschar *pp = eptr;
6988
-
5794
+                if (eptr >= md->end_subject) return MATCH_NOMATCH;
5795
+                GETCHARINCTEST(c, eptr);
5796
+                if (
6989 5797
 #ifdef SUPPORT_UTF8
6990
-      /* UTF-8 mode */
6991
-
6992
-      if (md->utf8)
6993
-        {
6994
-        switch(ctype)
6995
-          {
6996
-          case OP_ANY:
6997
-
6998
-          /* Special code is required for UTF8, but when the maximum is unlimited
6999
-          we don't need it, so we repeat the non-UTF8 code. This is probably
7000
-          worth it, because .* is quite a common idiom. */
7001
-
7002
-          if (max < INT_MAX)
5798
+                    c >= 256 ||
5799
+#endif
5800
+                    (md->ctypes[c] & ctype_word) == 0
5801
+                    )
5802
+                    return MATCH_NOMATCH;
5803
+                ecode++;
5804
+                break;
5805
+                
5806
+                /* Match a back reference, possibly repeatedly. Look past the end of the
5807
+                 item to see if there is repeat information following. The code is similar
5808
+                 to that for character classes, but repeated for efficiency. Then obey
5809
+                 similar code to character type repeats - written out again for speed.
5810
+                 However, if the referenced string is the empty string, always treat
5811
+                 it as matched, any number of times (otherwise there could be infinite
5812
+                 loops). */
5813
+                
5814
+            case OP_REF:
7003 5815
             {
7004
-            if ((ims & PCRE_DOTALL) == 0)
7005
-              {
7006
-              for (i = min; i < max; i++)
5816
+                int length;
5817
+                int offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
5818
+                ecode += 3;                                     /* Advance past item */
5819
+                
5820
+                /* If the reference is unset, set the length to be longer than the amount
5821
+                 of subject left; this ensures that every attempt at a match fails. We
5822
+                 can't just fail here, because of the possibility of quantifiers with zero
5823
+                 minima. */
5824
+                
5825
+                length = (offset >= offset_top || md->offset_vector[offset] < 0)?
5826
+                md->end_subject - eptr + 1 :
5827
+                md->offset_vector[offset+1] - md->offset_vector[offset];
5828
+                
5829
+                /* Set up for repetition, or handle the non-repeated case */
5830
+                
5831
+                switch (*ecode)
7007 5832
                 {
7008
-                if (eptr >= md->end_subject || *eptr == NEWLINE) break;
7009
-                eptr++;
7010
-                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5833
+                    case OP_CRSTAR:
5834
+                    case OP_CRMINSTAR:
5835
+                    case OP_CRPLUS:
5836
+                    case OP_CRMINPLUS:
5837
+                    case OP_CRQUERY:
5838
+                    case OP_CRMINQUERY:
5839
+                        c = *ecode++ - OP_CRSTAR;
5840
+                        minimize = (c & 1) != 0;
5841
+                        min = rep_min[c];                 /* Pick up values from tables; */
5842
+                        max = rep_max[c];                 /* zero for max => infinity */
5843
+                        if (max == 0) max = INT_MAX;
5844
+                        break;
5845
+                        
5846
+                    case OP_CRRANGE:
5847
+                    case OP_CRMINRANGE:
5848
+                        minimize = (*ecode == OP_CRMINRANGE);
5849
+                        min = GET2(ecode, 1);
5850
+                        max = GET2(ecode, 3);
5851
+                        if (max == 0) max = INT_MAX;
5852
+                        ecode += 5;
5853
+                        break;
5854
+                        
5855
+                    default:               /* No repeat follows */
5856
+                        if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5857
+                        eptr += length;
5858
+                        continue;              /* With the main loop */
7011 5859
                 }
7012
-              }
7013
-            else
7014
-              {
7015
-              for (i = min; i < max; i++)
5860
+                
5861
+                /* If the length of the reference is zero, just continue with the
5862
+                 main loop. */
5863
+                
5864
+                if (length == 0) continue;
5865
+                
5866
+                /* First, ensure the minimum number of matches are present. We get back
5867
+                 the length of the reference string explicitly rather than passing the
5868
+                 address of eptr, so that eptr can be a register variable. */
5869
+                
5870
+                for (i = 1; i <= min; i++)
7016 5871
                 {
7017
-                eptr++;
7018
-                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5872
+                    if (!match_ref(offset, eptr, length, md, ims)) return MATCH_NOMATCH;
5873
+                    eptr += length;
7019 5874
                 }
7020
-              }
7021
-            }
7022
-
7023
-          /* Handle unlimited UTF-8 repeat */
7024
-
7025
-          else
7026
-            {
7027
-            if ((ims & PCRE_DOTALL) == 0)
7028
-              {
7029
-              for (i = min; i < max; i++)
5875
+                
5876
+                /* If min = max, continue at the same level without recursion.
5877
+                 They are not both allowed to be zero. */
5878
+                
5879
+                if (min == max) continue;
5880
+                
5881
+                /* If minimizing, keep trying and advancing the pointer */
5882
+                
5883
+                if (minimize)
7030 5884
                 {
7031
-                if (eptr >= md->end_subject || *eptr == NEWLINE) break;
7032
-                eptr++;
5885
+                    for (i = min;; i++)
5886
+                    {
5887
+                        if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5888
+                            MATCH_NOMATCH) return rrc;
5889
+                        if (i >= max || !match_ref(offset, eptr, length, md, ims))
5890
+                            return MATCH_NOMATCH;
5891
+                        eptr += length;
5892
+                    }
5893
+                    /* Control never gets here */
5894
+                }
5895
+                
5896
+                /* If maximizing, find the longest string and work backwards */
5897
+                
5898
+                else
5899
+                {
5900
+                    const uschar *pp = eptr;
5901
+                    for (i = min; i < max; i++)
5902
+                    {
5903
+                        if (!match_ref(offset, eptr, length, md, ims)) break;
5904
+                        eptr += length;
5905
+                    }
5906
+                    while (eptr >= pp)
5907
+                    {
5908
+                        if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
5909
+                            MATCH_NOMATCH) return rrc;
5910
+                        eptr -= length;
5911
+                    }
5912
+                    return MATCH_NOMATCH;
7033 5913
                 }
7034
-              break;
7035
-              }
7036
-            else
7037
-              {
7038
-              c = max - min;
7039
-              if (c > md->end_subject - eptr) c = md->end_subject - eptr;
7040
-              eptr += c;
7041
-              }
7042
-            }
7043
-          break;
7044
-
7045
-          /* The byte case is the same as non-UTF8 */
7046
-
7047
-          case OP_ANYBYTE:
7048
-          c = max - min;
7049
-          if (c > md->end_subject - eptr) c = md->end_subject - eptr;
7050
-          eptr += c;
7051
-          break;
7052
-
7053
-          case OP_NOT_DIGIT:
7054
-          for (i = min; i < max; i++)
7055
-            {
7056
-            int len = 1;
7057
-            if (eptr >= md->end_subject) break;
7058
-            GETCHARLEN(c, eptr, len);
7059
-            if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
7060
-            eptr+= len;
7061
-            }
7062
-          break;
7063
-
7064
-          case OP_DIGIT:
7065
-          for (i = min; i < max; i++)
7066
-            {
7067
-            int len = 1;
7068
-            if (eptr >= md->end_subject) break;
7069
-            GETCHARLEN(c, eptr, len);
7070
-            if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
7071
-            eptr+= len;
7072
-            }
7073
-          break;
7074
-
7075
-          case OP_NOT_WHITESPACE:
7076
-          for (i = min; i < max; i++)
7077
-            {
7078
-            int len = 1;
7079
-            if (eptr >= md->end_subject) break;
7080
-            GETCHARLEN(c, eptr, len);
7081
-            if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
7082
-            eptr+= len;
7083
-            }
7084
-          break;
7085
-
7086
-          case OP_WHITESPACE:
7087
-          for (i = min; i < max; i++)
7088
-            {
7089
-            int len = 1;
7090
-            if (eptr >= md->end_subject) break;
7091
-            GETCHARLEN(c, eptr, len);
7092
-            if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
7093
-            eptr+= len;
7094
-            }
7095
-          break;
7096
-
7097
-          case OP_NOT_WORDCHAR:
7098
-          for (i = min; i < max; i++)
7099
-            {
7100
-            int len = 1;
7101
-            if (eptr >= md->end_subject) break;
7102
-            GETCHARLEN(c, eptr, len);
7103
-            if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
7104
-            eptr+= len;
7105 5914
             }
7106
-          break;
7107
-
7108
-          case OP_WORDCHAR:
7109
-          for (i = min; i < max; i++)
5915
+                /* Control never gets here */
5916
+                
5917
+                
5918
+                
5919
+                /* Match a bit-mapped character class, possibly repeatedly. This op code is
5920
+                 used when all the characters in the class have values in the range 0-255.
5921
+                 The only difference between OP_CLASS and OP_NCLASS occurs when a data
5922
+                 character outside the range is encountered.
5923
+                 
5924
+                 First, look past the end of the item to see if there is repeat information
5925
+                 following. Then obey similar code to character type repeats - written out
5926
+                 again for speed. */
5927
+                
5928
+            case OP_NCLASS:
5929
+            case OP_CLASS:
7110 5930
             {
7111
-            int len = 1;
7112
-            if (eptr >= md->end_subject) break;
7113
-            GETCHARLEN(c, eptr, len);
7114
-            if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
7115
-            eptr+= len;
7116
-            }
7117
-          break;
7118
-          }
7119
-
7120
-        /* eptr is now past the end of the maximum run */
7121
-
7122
-        for(;;)
7123
-          {
7124
-          if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
7125
-               MATCH_NOMATCH) return rrc;
7126
-          if (eptr-- == pp) break;        /* Stop if tried at original pos */
7127
-          BACKCHAR(eptr);
7128
-          }
7129
-        }
7130
-      else
5931
+                const uschar *data = ecode + 1;  /* Save for matching */
5932
+                ecode += 33;                     /* Advance past the item */
5933
+                
5934
+                switch (*ecode)
5935
+                {
5936
+                    case OP_CRSTAR:
5937
+                    case OP_CRMINSTAR:
5938
+                    case OP_CRPLUS:
5939
+                    case OP_CRMINPLUS:
5940
+                    case OP_CRQUERY:
5941
+                    case OP_CRMINQUERY:
5942
+                        c = *ecode++ - OP_CRSTAR;
5943
+                        minimize = (c & 1) != 0;
5944
+                        min = rep_min[c];                 /* Pick up values from tables; */
5945
+                        max = rep_max[c];                 /* zero for max => infinity */
5946
+                        if (max == 0) max = INT_MAX;
5947
+                        break;
5948
+                        
5949
+                    case OP_CRRANGE:
5950
+                    case OP_CRMINRANGE:
5951
+                        minimize = (*ecode == OP_CRMINRANGE);
5952
+                        min = GET2(ecode, 1);
5953
+                        max = GET2(ecode, 3);
5954
+                        if (max == 0) max = INT_MAX;
5955
+                        ecode += 5;
5956
+                        break;
5957
+                        
5958
+                    default:               /* No repeat follows */
5959
+                        min = max = 1;
5960
+                        break;
5961
+                }
5962
+                
5963
+                /* First, ensure the minimum number of matches are present. */
5964
+                
5965
+#ifdef SUPPORT_UTF8
5966
+                /* UTF-8 mode */
5967
+                if (md->utf8)
5968
+                {
5969
+                    for (i = 1; i <= min; i++)
5970
+                    {
5971
+                        if (eptr >= md->end_subject) return MATCH_NOMATCH;
5972
+                        GETCHARINC(c, eptr);
5973
+                        if (c > 255)
5974
+                        {
5975
+                            if (op == OP_CLASS) return MATCH_NOMATCH;
5976
+                        }
5977
+                        else
5978
+                        {
5979
+                            if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5980
+                        }
5981
+                    }
5982
+                }
5983
+                else
7131 5984
 #endif
7132
-
7133
-      /* Not UTF-8 mode */
7134
-        {
7135
-        switch(ctype)
7136
-          {
7137
-          case OP_ANY:
7138
-          if ((ims & PCRE_DOTALL) == 0)
7139
-            {
7140
-            for (i = min; i < max; i++)
7141
-              {
7142
-              if (eptr >= md->end_subject || *eptr == NEWLINE) break;
7143
-              eptr++;
7144
-              }
7145
-            break;
7146
-            }
7147
-          /* For DOTALL case, fall through and treat as \C */
7148
-
7149
-          case OP_ANYBYTE:
7150
-          c = max - min;
7151
-          if (c > md->end_subject - eptr) c = md->end_subject - eptr;
7152
-          eptr += c;
7153
-          break;
7154
-
7155
-          case OP_NOT_DIGIT:
7156
-          for (i = min; i < max; i++)
7157
-            {
7158
-            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
7159
-              break;
7160
-            eptr++;
7161
-            }
7162
-          break;
7163
-
7164
-          case OP_DIGIT:
7165
-          for (i = min; i < max; i++)
7166
-            {
7167
-            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
7168
-              break;
7169
-            eptr++;
7170
-            }
7171
-          break;
7172
-
7173
-          case OP_NOT_WHITESPACE:
7174
-          for (i = min; i < max; i++)
7175
-            {
7176
-            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
7177
-              break;
7178
-            eptr++;
7179
-            }
7180
-          break;
7181
-
7182
-          case OP_WHITESPACE:
7183
-          for (i = min; i < max; i++)
7184
-            {
7185
-            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
7186
-              break;
7187
-            eptr++;
5985
+                /* Not UTF-8 mode */
5986
+                {
5987
+                    for (i = 1; i <= min; i++)
5988
+                    {
5989
+                        if (eptr >= md->end_subject) return MATCH_NOMATCH;
5990
+                        c = *eptr++;
5991
+                        if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
5992
+                    }
5993
+                }
5994
+                
5995
+                /* If max == min we can continue with the main loop without the
5996
+                 need to recurse. */
5997
+                
5998
+                if (min == max) continue;
5999
+                
6000
+                /* If minimizing, keep testing the rest of the expression and advancing
6001
+                 the pointer while it matches the class. */
6002
+                
6003
+                if (minimize)
6004
+                {
6005
+#ifdef SUPPORT_UTF8
6006
+                    /* UTF-8 mode */
6007
+                    if (md->utf8)
6008
+                    {
6009
+                        for (i = min;; i++)
6010
+                        {
6011
+                            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6012
+                                MATCH_NOMATCH) return rrc;
6013
+                            if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6014
+                            GETCHARINC(c, eptr);
6015
+                            if (c > 255)
6016
+                            {
6017
+                                if (op == OP_CLASS) return MATCH_NOMATCH;
6018
+                            }
6019
+                            else
6020
+                            {
6021
+                                if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6022
+                            }
6023
+                        }
6024
+                    }
6025
+                    else
6026
+#endif
6027
+                    /* Not UTF-8 mode */
6028
+                    {
6029
+                        for (i = min;; i++)
6030
+                        {
6031
+                            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6032
+                                MATCH_NOMATCH) return rrc;
6033
+                            if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6034
+                            c = *eptr++;
6035
+                            if ((data[c/8] & (1 << (c&7))) == 0) return MATCH_NOMATCH;
6036
+                        }
6037
+                    }
6038
+                    /* Control never gets here */
6039
+                }
6040
+                
6041
+                /* If maximizing, find the longest possible run, then work backwards. */
6042
+                
6043
+                else
6044
+                {
6045
+                    const uschar *pp = eptr;
6046
+                    
6047
+#ifdef SUPPORT_UTF8
6048
+                    /* UTF-8 mode */
6049
+                    if (md->utf8)
6050
+                    {
6051
+                        for (i = min; i < max; i++)
6052
+                        {
6053
+                            int len = 1;
6054
+                            if (eptr >= md->end_subject) break;
6055
+                            GETCHARLEN(c, eptr, len);
6056
+                            if (c > 255)
6057
+                            {
6058
+                                if (op == OP_CLASS) break;
6059
+                            }
6060
+                            else
6061
+                            {
6062
+                                if ((data[c/8] & (1 << (c&7))) == 0) break;
6063
+                            }
6064
+                            eptr += len;
6065
+                        }
6066
+                        for (;;)
6067
+                        {
6068
+                            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6069
+                                MATCH_NOMATCH) return rrc;
6070
+                            if (eptr-- == pp) break;        /* Stop if tried at original pos */
6071
+                            BACKCHAR(eptr);
6072
+                        }
6073
+                    }
6074
+                    else
6075
+#endif
6076
+                    /* Not UTF-8 mode */
6077
+                    {
6078
+                        for (i = min; i < max; i++)
6079
+                        {
6080
+                            if (eptr >= md->end_subject) break;
6081
+                            c = *eptr;
6082
+                            if ((data[c/8] & (1 << (c&7))) == 0) break;
6083
+                            eptr++;
6084
+                        }
6085
+                        while (eptr >= pp)
6086
+                        {
6087
+                            if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6088
+                                MATCH_NOMATCH) return rrc;
6089
+                        }
6090
+                    }
6091
+                    
6092
+                    return MATCH_NOMATCH;
6093
+                }
7188 6094
             }
7189
-          break;
7190
-
7191
-          case OP_NOT_WORDCHAR:
7192
-          for (i = min; i < max; i++)
6095
+                /* Control never gets here */
6096
+                
6097
+                
6098
+                /* Match an extended character class. This opcode is encountered only
6099
+                 in UTF-8 mode, because that's the only time it is compiled. */
6100
+                
6101
+#ifdef SUPPORT_UTF8
6102
+            case OP_XCLASS:
7193 6103
             {
7194
-            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
7195
-              break;
7196
-            eptr++;
6104
+                const uschar *data = ecode + 1 + LINK_SIZE;  /* Save for matching */
6105
+                ecode += GET(ecode, 1);                      /* Advance past the item */
6106
+                
6107
+                switch (*ecode)
6108
+                {
6109
+                    case OP_CRSTAR:
6110
+                    case OP_CRMINSTAR:
6111
+                    case OP_CRPLUS:
6112
+                    case OP_CRMINPLUS:
6113
+                    case OP_CRQUERY:
6114
+                    case OP_CRMINQUERY:
6115
+                        c = *ecode++ - OP_CRSTAR;
6116
+                        minimize = (c & 1) != 0;
6117
+                        min = rep_min[c];                 /* Pick up values from tables; */
6118
+                        max = rep_max[c];                 /* zero for max => infinity */
6119
+                        if (max == 0) max = INT_MAX;
6120
+                        break;
6121
+                        
6122
+                    case OP_CRRANGE:
6123
+                    case OP_CRMINRANGE:
6124
+                        minimize = (*ecode == OP_CRMINRANGE);
6125
+                        min = GET2(ecode, 1);
6126
+                        max = GET2(ecode, 3);
6127
+                        if (max == 0) max = INT_MAX;
6128
+                        ecode += 5;
6129
+                        break;
6130
+                        
6131
+                    default:               /* No repeat follows */
6132
+                        min = max = 1;
6133
+                        break;
6134
+                }
6135
+                
6136
+                /* First, ensure the minimum number of matches are present. */
6137
+                
6138
+                for (i = 1; i <= min; i++)
6139
+                {
6140
+                    if (eptr >= md->end_subject) return MATCH_NOMATCH;
6141
+                    GETCHARINC(c, eptr);
6142
+                    if (!match_xclass(c, data)) return MATCH_NOMATCH;
6143
+                }
6144
+                
6145
+                /* If max == min we can continue with the main loop without the
6146
+                 need to recurse. */
6147
+                
6148
+                if (min == max) continue;
6149
+                
6150
+                /* If minimizing, keep testing the rest of the expression and advancing
6151
+                 the pointer while it matches the class. */
6152
+                
6153
+                if (minimize)
6154
+                {
6155
+                    for (i = min;; i++)
6156
+                    {
6157
+                        if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6158
+                            MATCH_NOMATCH) return rrc;
6159
+                        if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6160
+                        GETCHARINC(c, eptr);
6161
+                        if (!match_xclass(c, data)) return MATCH_NOMATCH;
6162
+                    }
6163
+                    /* Control never gets here */
6164
+                }
6165
+                
6166
+                /* If maximizing, find the longest possible run, then work backwards. */
6167
+                
6168
+                else
6169
+                {
6170
+                    const uschar *pp = eptr;
6171
+                    for (i = min; i < max; i++)
6172
+                    {
6173
+                        int len = 1;
6174
+                        if (eptr >= md->end_subject) break;
6175
+                        GETCHARLEN(c, eptr, len);
6176
+                        if (!match_xclass(c, data)) break;
6177
+                        eptr += len;
6178
+                    }
6179
+                    for(;;)
6180
+                    {
6181
+                        if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6182
+                            MATCH_NOMATCH) return rrc;
6183
+                        if (eptr-- == pp) break;        /* Stop if tried at original pos */
6184
+                        BACKCHAR(eptr)
6185
+                    }
6186
+                    return MATCH_NOMATCH;
6187
+                }
6188
+                
6189
+                /* Control never gets here */
7197 6190
             }
7198
-          break;
7199
-
7200
-          case OP_WORDCHAR:
7201
-          for (i = min; i < max; i++)
6191
+#endif    /* End of XCLASS */
6192
+                
6193
+                /* Match a run of characters */
6194
+                
6195
+            case OP_CHARS:
7202 6196
             {
7203
-            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
7204
-              break;
7205
-            eptr++;
6197
+                register int length = ecode[1];
6198
+                ecode += 2;
6199
+                
6200
+#ifdef DEBUG    /* Sigh. Some compilers never learn. */
6201
+                if (eptr >= md->end_subject)
6202
+                    printf("matching subject <null> against pattern ");
6203
+                else
6204
+                {
6205
+                    printf("matching subject ");
6206
+                    pchars(eptr, length, TRUE, md);
6207
+                    printf(" against pattern ");
6208
+                }
6209
+                pchars(ecode, length, FALSE, md);
6210
+                printf("\n");
6211
+#endif
6212
+                
6213
+                if (length > md->end_subject - eptr) return MATCH_NOMATCH;
6214
+                if ((ims & PCRE_CASELESS) != 0)
6215
+                {
6216
+                    while (length-- > 0)
6217
+                        if (md->lcc[*ecode++] != md->lcc[*eptr++])
6218
+                            return MATCH_NOMATCH;
6219
+                }
6220
+                else
6221
+                {
6222
+                    while (length-- > 0) if (*ecode++ != *eptr++) return MATCH_NOMATCH;
6223
+                }
7206 6224
             }
7207
-          break;
7208
-          }
7209
-
7210
-        /* eptr is now past the end of the maximum run */
7211
-
7212
-        while (eptr >= pp)
7213
-          {
7214
-          if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
7215
-               MATCH_NOMATCH) return rrc;
7216
-          }
6225
+                break;
6226
+                
6227
+                /* Match a single character repeatedly; different opcodes share code. */
6228
+                
6229
+            case OP_EXACT:
6230
+                min = max = GET2(ecode, 1);
6231
+                ecode += 3;
6232
+                goto REPEATCHAR;
6233
+                
6234
+            case OP_UPTO:
6235
+            case OP_MINUPTO:
6236
+                min = 0;
6237
+                max = GET2(ecode, 1);
6238
+                minimize = *ecode == OP_MINUPTO;
6239
+                ecode += 3;
6240
+                goto REPEATCHAR;
6241
+                
6242
+            case OP_STAR:
6243
+            case OP_MINSTAR:
6244
+            case OP_PLUS:
6245
+            case OP_MINPLUS:
6246
+            case OP_QUERY:
6247
+            case OP_MINQUERY:
6248
+                c = *ecode++ - OP_STAR;
6249
+                minimize = (c & 1) != 0;
6250
+                min = rep_min[c];                 /* Pick up values from tables; */
6251
+                max = rep_max[c];                 /* zero for max => infinity */
6252
+                if (max == 0) max = INT_MAX;
6253
+                
6254
+                /* Common code for all repeated single-character matches. We can give
6255
+                 up quickly if there are fewer than the minimum number of characters left in
6256
+                 the subject. */
6257
+                
6258
+            REPEATCHAR:
6259
+#ifdef SUPPORT_UTF8
6260
+                if (md->utf8)
6261
+                {
6262
+                    int len = 1;
6263
+                    const uschar *charptr = ecode;
6264
+                    GETCHARLEN(c, ecode, len);
6265
+                    if (min * len > md->end_subject - eptr) return MATCH_NOMATCH;
6266
+                    ecode += len;
6267
+                    
6268
+                    /* Handle multibyte character matching specially here. There is no
6269
+                     support for any kind of casing for multibyte characters. */
6270
+                    
6271
+                    if (len > 1)
6272
+                    {
6273
+                        for (i = 1; i <= min; i++)
6274
+                        {
6275
+                            if (memcmp(eptr, charptr, len) != 0) return MATCH_NOMATCH;
6276
+                            eptr += len;
6277
+                        }
6278
+                        
6279
+                        if (min == max) continue;
6280
+                        
6281
+                        if (minimize)
6282
+                        {
6283
+                            for (i = min;; i++)
6284
+                            {
6285
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6286
+                                    MATCH_NOMATCH) return rrc;
6287
+                                if (i >= max ||
6288
+                                    eptr >= md->end_subject ||
6289
+                                    memcmp(eptr, charptr, len) != 0)
6290
+                                    return MATCH_NOMATCH;
6291
+                                eptr += len;
6292
+                            }
6293
+                            /* Control never gets here */
6294
+                        }
6295
+                        else
6296
+                        {
6297
+                            const uschar *pp = eptr;
6298
+                            for (i = min; i < max; i++)
6299
+                            {
6300
+                                if (eptr > md->end_subject - len ||
6301
+                                    memcmp(eptr, charptr, len) != 0)
6302
+                                    break;
6303
+                                eptr += len;
6304
+                            }
6305
+                            while (eptr >= pp)
6306
+                            {
6307
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6308
+                                    MATCH_NOMATCH) return rrc;
6309
+                                eptr -= len;
6310
+                            }
6311
+                            return MATCH_NOMATCH;
6312
+                        }
6313
+                        /* Control never gets here */
6314
+                    }
6315
+                    
6316
+                    /* If the length of a UTF-8 character is 1, we fall through here, and
6317
+                     obey the code as for non-UTF-8 characters below, though in this case the
6318
+                     value of c will always be < 128. */
6319
+                }
6320
+                else
6321
+#endif
6322
+                    
6323
+                /* When not in UTF-8 mode, load a single-byte character. */
6324
+                {
6325
+                    if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6326
+                    c = *ecode++;
6327
+                }
6328
+                
6329
+                /* The value of c at this point is always less than 256, though we may or
6330
+                 may not be in UTF-8 mode. The code is duplicated for the caseless and
6331
+                 caseful cases, for speed, since matching characters is likely to be quite
6332
+                 common. First, ensure the minimum number of matches are present. If min =
6333
+                 max, continue at the same level without recursing. Otherwise, if
6334
+                 minimizing, keep trying the rest of the expression and advancing one
6335
+                 matching character if failing, up to the maximum. Alternatively, if
6336
+                 maximizing, find the maximum number of characters and work backwards. */
6337
+                
6338
+                DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
6339
+                         max, eptr));
6340
+                
6341
+                if ((ims & PCRE_CASELESS) != 0)
6342
+                {
6343
+                    c = md->lcc[c];
6344
+                    for (i = 1; i <= min; i++)
6345
+                        if (c != md->lcc[*eptr++]) return MATCH_NOMATCH;
6346
+                    if (min == max) continue;
6347
+                    if (minimize)
6348
+                    {
6349
+                        for (i = min;; i++)
6350
+                        {
6351
+                            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6352
+                                MATCH_NOMATCH) return rrc;
6353
+                            if (i >= max || eptr >= md->end_subject ||
6354
+                                c != md->lcc[*eptr++])
6355
+                                return MATCH_NOMATCH;
6356
+                        }
6357
+                        /* Control never gets here */
6358
+                    }
6359
+                    else
6360
+                    {
6361
+                        const uschar *pp = eptr;
6362
+                        for (i = min; i < max; i++)
6363
+                        {
6364
+                            if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
6365
+                            eptr++;
6366
+                        }
6367
+                        while (eptr >= pp)
6368
+                            if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6369
+                                MATCH_NOMATCH) return rrc;
6370
+                        return MATCH_NOMATCH;
6371
+                    }
6372
+                    /* Control never gets here */
6373
+                }
6374
+                
6375
+                /* Caseful comparisons (includes all multi-byte characters) */
6376
+                
6377
+                else
6378
+                {
6379
+                    for (i = 1; i <= min; i++) if (c != *eptr++) return MATCH_NOMATCH;
6380
+                    if (min == max) continue;
6381
+                    if (minimize)
6382
+                    {
6383
+                        for (i = min;; i++)
6384
+                        {
6385
+                            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6386
+                                MATCH_NOMATCH) return rrc;
6387
+                            if (i >= max || eptr >= md->end_subject || c != *eptr++)
6388
+                                return MATCH_NOMATCH;
6389
+                        }
6390
+                        /* Control never gets here */
6391
+                    }
6392
+                    else
6393
+                    {
6394
+                        const uschar *pp = eptr;
6395
+                        for (i = min; i < max; i++)
6396
+                        {
6397
+                            if (eptr >= md->end_subject || c != *eptr) break;
6398
+                            eptr++;
6399
+                        }
6400
+                        while (eptr >= pp)
6401
+                            if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
6402
+                                MATCH_NOMATCH) return rrc;
6403
+                        return MATCH_NOMATCH;
6404
+                    }
6405
+                }
6406
+                /* Control never gets here */
6407
+                
6408
+                /* Match a negated single one-byte character. The character we are
6409
+                 checking can be multibyte. */
6410
+                
6411
+            case OP_NOT:
6412
+                if (eptr >= md->end_subject) return MATCH_NOMATCH;
6413
+                ecode++;
6414
+                GETCHARINCTEST(c, eptr);
6415
+                if ((ims & PCRE_CASELESS) != 0)
6416
+                {
6417
+#ifdef SUPPORT_UTF8
6418
+                    if (c < 256)
6419
+#endif
6420
+                        c = md->lcc[c];
6421
+                    if (md->lcc[*ecode++] == c) return MATCH_NOMATCH;
6422
+                }
6423
+                else
6424
+                {
6425
+                    if (*ecode++ == c) return MATCH_NOMATCH;
6426
+                }
6427
+                break;
6428
+                
6429
+                /* Match a negated single one-byte character repeatedly. This is almost a
6430
+                 repeat of the code for a repeated single character, but I haven't found a
6431
+                 nice way of commoning these up that doesn't require a test of the
6432
+                 positive/negative option for each character match. Maybe that wouldn't add
6433
+                 very much to the time taken, but character matching *is* what this is all
6434
+                 about... */
6435
+                
6436
+            case OP_NOTEXACT:
6437
+                min = max = GET2(ecode, 1);
6438
+                ecode += 3;
6439
+                goto REPEATNOTCHAR;
6440
+                
6441
+            case OP_NOTUPTO:
6442
+            case OP_NOTMINUPTO:
6443
+                min = 0;
6444
+                max = GET2(ecode, 1);
6445
+                minimize = *ecode == OP_NOTMINUPTO;
6446
+                ecode += 3;
6447
+                goto REPEATNOTCHAR;
6448
+                
6449
+            case OP_NOTSTAR:
6450
+            case OP_NOTMINSTAR:
6451
+            case OP_NOTPLUS:
6452
+            case OP_NOTMINPLUS:
6453
+            case OP_NOTQUERY:
6454
+            case OP_NOTMINQUERY:
6455
+                c = *ecode++ - OP_NOTSTAR;
6456
+                minimize = (c & 1) != 0;
6457
+                min = rep_min[c];                 /* Pick up values from tables; */
6458
+                max = rep_max[c];                 /* zero for max => infinity */
6459
+                if (max == 0) max = INT_MAX;
6460
+                
6461
+                /* Common code for all repeated single-character (less than 255) matches.
6462
+                 We can give up quickly if there are fewer than the minimum number of
6463
+                 characters left in the subject. */
6464
+                
6465
+            REPEATNOTCHAR:
6466
+                if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6467
+                c = *ecode++;
6468
+                
6469
+                /* The code is duplicated for the caseless and caseful cases, for speed,
6470
+                 since matching characters is likely to be quite common. First, ensure the
6471
+                 minimum number of matches are present. If min = max, continue at the same
6472
+                 level without recursing. Otherwise, if minimizing, keep trying the rest of
6473
+                 the expression and advancing one matching character if failing, up to the
6474
+                 maximum. Alternatively, if maximizing, find the maximum number of
6475
+                 characters and work backwards. */
6476
+                
6477
+                DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
6478
+                         max, eptr));
6479
+                
6480
+                if ((ims & PCRE_CASELESS) != 0)
6481
+                {
6482
+                    c = md->lcc[c];
6483
+                    
6484
+#ifdef SUPPORT_UTF8
6485
+                    /* UTF-8 mode */
6486
+                    if (md->utf8)
6487
+                    {
6488
+                        register int d;
6489
+                        for (i = 1; i <= min; i++)
6490
+                        {
6491
+                            GETCHARINC(d, eptr);
6492
+                            if (d < 256) d = md->lcc[d];
6493
+                            if (c == d) return MATCH_NOMATCH;
6494
+                        }
6495
+                    }
6496
+                    else
6497
+#endif
6498
+                        
6499
+                    /* Not UTF-8 mode */
6500
+                    {
6501
+                        for (i = 1; i <= min; i++)
6502
+                            if (c == md->lcc[*eptr++]) return MATCH_NOMATCH;
6503
+                    }
6504
+                    
6505
+                    if (min == max) continue;
6506
+                    
6507
+                    if (minimize)
6508
+                    {
6509
+#ifdef SUPPORT_UTF8
6510
+                        /* UTF-8 mode */
6511
+                        if (md->utf8)
6512
+                        {
6513
+                            register int d;
6514
+                            for (i = min;; i++)
6515
+                            {
6516
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6517
+                                    MATCH_NOMATCH) return rrc;
6518
+                                GETCHARINC(d, eptr);
6519
+                                if (d < 256) d = md->lcc[d];
6520
+                                if (i >= max || eptr >= md->end_subject || c == d)
6521
+                                    return MATCH_NOMATCH;
6522
+                            }
6523
+                        }
6524
+                        else
6525
+#endif
6526
+                        /* Not UTF-8 mode */
6527
+                        {
6528
+                            for (i = min;; i++)
6529
+                            {
6530
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6531
+                                    MATCH_NOMATCH) return rrc;
6532
+                                if (i >= max || eptr >= md->end_subject || c == md->lcc[*eptr++])
6533
+                                    return MATCH_NOMATCH;
6534
+                            }
6535
+                        }
6536
+                        /* Control never gets here */
6537
+                    }
6538
+                    
6539
+                    /* Maximize case */
6540
+                    
6541
+                    else
6542
+                    {
6543
+                        const uschar *pp = eptr;
6544
+                        
6545
+#ifdef SUPPORT_UTF8
6546
+                        /* UTF-8 mode */
6547
+                        if (md->utf8)
6548
+                        {
6549
+                            register int d;
6550
+                            for (i = min; i < max; i++)
6551
+                            {
6552
+                                int len = 1;
6553
+                                if (eptr >= md->end_subject) break;
6554
+                                GETCHARLEN(d, eptr, len);
6555
+                                if (d < 256) d = md->lcc[d];
6556
+                                if (c == d) break;
6557
+                                eptr += len;
6558
+                            }
6559
+                            for(;;)
6560
+                            {
6561
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6562
+                                    MATCH_NOMATCH) return rrc;
6563
+                                if (eptr-- == pp) break;        /* Stop if tried at original pos */
6564
+                                BACKCHAR(eptr);
6565
+                            }
6566
+                        }
6567
+                        else
6568
+#endif
6569
+                        /* Not UTF-8 mode */
6570
+                        {
6571
+                            for (i = min; i < max; i++)
6572
+                            {
6573
+                                if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
6574
+                                eptr++;
6575
+                            }
6576
+                            while (eptr >= pp)
6577
+                            {
6578
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6579
+                                    MATCH_NOMATCH) return rrc;
6580
+                                eptr--;
6581
+                            }
6582
+                        }
6583
+                        
6584
+                        return MATCH_NOMATCH;
6585
+                    }
6586
+                    /* Control never gets here */
6587
+                }
6588
+                
6589
+                /* Caseful comparisons */
6590
+                
6591
+                else
6592
+                {
6593
+#ifdef SUPPORT_UTF8
6594
+                    /* UTF-8 mode */
6595
+                    if (md->utf8)
6596
+                    {
6597
+                        register int d;
6598
+                        for (i = 1; i <= min; i++)
6599
+                        {
6600
+                            GETCHARINC(d, eptr);
6601
+                            if (c == d) return MATCH_NOMATCH;
6602
+                        }
6603
+                    }
6604
+                    else
6605
+#endif
6606
+                    /* Not UTF-8 mode */
6607
+                    {
6608
+                        for (i = 1; i <= min; i++)
6609
+                            if (c == *eptr++) return MATCH_NOMATCH;
6610
+                    }
6611
+                    
6612
+                    if (min == max) continue;
6613
+                    
6614
+                    if (minimize)
6615
+                    {
6616
+#ifdef SUPPORT_UTF8
6617
+                        /* UTF-8 mode */
6618
+                        if (md->utf8)
6619
+                        {
6620
+                            register int d;
6621
+                            for (i = min;; i++)
6622
+                            {
6623
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6624
+                                    MATCH_NOMATCH) return rrc;
6625
+                                GETCHARINC(d, eptr);
6626
+                                if (i >= max || eptr >= md->end_subject || c == d)
6627
+                                    return MATCH_NOMATCH;
6628
+                            }
6629
+                        }
6630
+                        else
6631
+#endif
6632
+                        /* Not UTF-8 mode */
6633
+                        {
6634
+                            for (i = min;; i++)
6635
+                            {
6636
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6637
+                                    MATCH_NOMATCH) return rrc;
6638
+                                if (i >= max || eptr >= md->end_subject || c == *eptr++)
6639
+                                    return MATCH_NOMATCH;
6640
+                            }
6641
+                        }
6642
+                        /* Control never gets here */
6643
+                    }
6644
+                    
6645
+                    /* Maximize case */
6646
+                    
6647
+                    else
6648
+                    {
6649
+                        const uschar *pp = eptr;
6650
+                        
6651
+#ifdef SUPPORT_UTF8
6652
+                        /* UTF-8 mode */
6653
+                        if (md->utf8)
6654
+                        {
6655
+                            register int d;
6656
+                            for (i = min; i < max; i++)
6657
+                            {
6658
+                                int len = 1;
6659
+                                if (eptr >= md->end_subject) break;
6660
+                                GETCHARLEN(d, eptr, len);
6661
+                                if (c == d) break;
6662
+                                eptr += len;
6663
+                            }
6664
+                            for(;;)
6665
+                            {
6666
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6667
+                                    MATCH_NOMATCH) return rrc;
6668
+                                if (eptr-- == pp) break;        /* Stop if tried at original pos */
6669
+                                BACKCHAR(eptr);
6670
+                            }
6671
+                        }
6672
+                        else
6673
+#endif
6674
+                        /* Not UTF-8 mode */
6675
+                        {
6676
+                            for (i = min; i < max; i++)
6677
+                            {
6678
+                                if (eptr >= md->end_subject || c == *eptr) break;
6679
+                                eptr++;
6680
+                            }
6681
+                            while (eptr >= pp)
6682
+                            {
6683
+                                if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6684
+                                    MATCH_NOMATCH) return rrc;
6685
+                                eptr--;
6686
+                            }
6687
+                        }
6688
+                        
6689
+                        return MATCH_NOMATCH;
6690
+                    }
6691
+                }
6692
+                /* Control never gets here */
6693
+                
6694
+                /* Match a single character type repeatedly; several different opcodes
6695
+                 share code. This is very similar to the code for single characters, but we
6696
+                 repeat it in the interests of efficiency. */
6697
+                
6698
+            case OP_TYPEEXACT:
6699
+                min = max = GET2(ecode, 1);
6700
+                minimize = TRUE;
6701
+                ecode += 3;
6702
+                goto REPEATTYPE;
6703
+                
6704
+            case OP_TYPEUPTO:
6705
+            case OP_TYPEMINUPTO:
6706
+                min = 0;
6707
+                max = GET2(ecode, 1);
6708
+                minimize = *ecode == OP_TYPEMINUPTO;
6709
+                ecode += 3;
6710
+                goto REPEATTYPE;
6711
+                
6712
+            case OP_TYPESTAR:
6713
+            case OP_TYPEMINSTAR:
6714
+            case OP_TYPEPLUS:
6715
+            case OP_TYPEMINPLUS:
6716
+            case OP_TYPEQUERY:
6717
+            case OP_TYPEMINQUERY:
6718
+                c = *ecode++ - OP_TYPESTAR;
6719
+                minimize = (c & 1) != 0;
6720
+                min = rep_min[c];                 /* Pick up values from tables; */
6721
+                max = rep_max[c];                 /* zero for max => infinity */
6722
+                if (max == 0) max = INT_MAX;
6723
+                
6724
+                /* Common code for all repeated single character type matches. Note that
6725
+                 in UTF-8 mode, '.' matches a character of any length, but for the other
6726
+                 character types, the valid characters are all one-byte long. */
6727
+                
6728
+            REPEATTYPE:
6729
+                ctype = *ecode++;      /* Code for the character type */
6730
+                
6731
+                /* First, ensure the minimum number of matches are present. Use inline
6732
+                 code for maximizing the speed, and do the type test once at the start
6733
+                 (i.e. keep it out of the loop). Also we can test that there are at least
6734
+                 the minimum number of bytes before we start. This isn't as effective in
6735
+                 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
6736
+                 is tidier. */
6737
+                
6738
+                if (min > md->end_subject - eptr) return MATCH_NOMATCH;
6739
+                if (min > 0)
6740
+                {
6741
+#ifdef SUPPORT_UTF8
6742
+                    if (md->utf8) switch(ctype)
6743
+                    {
6744
+                        case OP_ANY:
6745
+                            for (i = 1; i <= min; i++)
6746
+                            {
6747
+                                if (eptr >= md->end_subject ||
6748
+                                    (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
6749
+                                    return MATCH_NOMATCH;
6750
+                                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6751
+                            }
6752
+                            break;
6753
+                            
6754
+                        case OP_ANYBYTE:
6755
+                            eptr += min;
6756
+                            break;
6757
+                            
6758
+                        case OP_NOT_DIGIT:
6759
+                            for (i = 1; i <= min; i++)
6760
+                            {
6761
+                                if (eptr >= md->end_subject) return MATCH_NOMATCH;
6762
+                                GETCHARINC(c, eptr);
6763
+                                if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
6764
+                                    return MATCH_NOMATCH;
6765
+                            }
6766
+                            break;
6767
+                            
6768
+                        case OP_DIGIT:
6769
+                            for (i = 1; i <= min; i++)
6770
+                            {
6771
+                                if (eptr >= md->end_subject ||
6772
+                                    *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
6773
+                                    return MATCH_NOMATCH;
6774
+                                /* No need to skip more bytes - we know it's a 1-byte character */
6775
+                            }
6776
+                            break;
6777
+                            
6778
+                        case OP_NOT_WHITESPACE:
6779
+                            for (i = 1; i <= min; i++)
6780
+                            {
6781
+                                if (eptr >= md->end_subject ||
6782
+                                    (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
6783
+                                    return MATCH_NOMATCH;
6784
+                                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6785
+                            }
6786
+                            break;
6787
+                            
6788
+                        case OP_WHITESPACE:
6789
+                            for (i = 1; i <= min; i++)
6790
+                            {
6791
+                                if (eptr >= md->end_subject ||
6792
+                                    *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
6793
+                                    return MATCH_NOMATCH;
6794
+                                /* No need to skip more bytes - we know it's a 1-byte character */
6795
+                            }
6796
+                            break;
6797
+                            
6798
+                        case OP_NOT_WORDCHAR:
6799
+                            for (i = 1; i <= min; i++)
6800
+                            {
6801
+                                if (eptr >= md->end_subject ||
6802
+                                    (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
6803
+                                    return MATCH_NOMATCH;
6804
+                                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6805
+                            }
6806
+                            break;
6807
+                            
6808
+                        case OP_WORDCHAR:
6809
+                            for (i = 1; i <= min; i++)
6810
+                            {
6811
+                                if (eptr >= md->end_subject ||
6812
+                                    *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
6813
+                                    return MATCH_NOMATCH;
6814
+                                /* No need to skip more bytes - we know it's a 1-byte character */
6815
+                            }
6816
+                            break;
6817
+                    }
6818
+                    else
6819
+#endif
6820
+                        
6821
+                    /* Code for the non-UTF-8 case for minimum matching */
6822
+                        
6823
+                        switch(ctype)
6824
+                    {
6825
+                        case OP_ANY:
6826
+                            if ((ims & PCRE_DOTALL) == 0)
6827
+                            {
6828
+                                for (i = 1; i <= min; i++)
6829
+                                    if (*eptr++ == NEWLINE) return MATCH_NOMATCH;
6830
+                            }
6831
+                            else eptr += min;
6832
+                            break;
6833
+                            
6834
+                        case OP_ANYBYTE:
6835
+                            eptr += min;
6836
+                            break;
6837
+                            
6838
+                        case OP_NOT_DIGIT:
6839
+                            for (i = 1; i <= min; i++)
6840
+                                if ((md->ctypes[*eptr++] & ctype_digit) != 0) return MATCH_NOMATCH;
6841
+                            break;
6842
+                            
6843
+                        case OP_DIGIT:
6844
+                            for (i = 1; i <= min; i++)
6845
+                                if ((md->ctypes[*eptr++] & ctype_digit) == 0) return MATCH_NOMATCH;
6846
+                            break;
6847
+                            
6848
+                        case OP_NOT_WHITESPACE:
6849
+                            for (i = 1; i <= min; i++)
6850
+                                if ((md->ctypes[*eptr++] & ctype_space) != 0) return MATCH_NOMATCH;
6851
+                            break;
6852
+                            
6853
+                        case OP_WHITESPACE:
6854
+                            for (i = 1; i <= min; i++)
6855
+                                if ((md->ctypes[*eptr++] & ctype_space) == 0) return MATCH_NOMATCH;
6856
+                            break;
6857
+                            
6858
+                        case OP_NOT_WORDCHAR:
6859
+                            for (i = 1; i <= min; i++)
6860
+                                if ((md->ctypes[*eptr++] & ctype_word) != 0)
6861
+                                    return MATCH_NOMATCH;
6862
+                            break;
6863
+                            
6864
+                        case OP_WORDCHAR:
6865
+                            for (i = 1; i <= min; i++)
6866
+                                if ((md->ctypes[*eptr++] & ctype_word) == 0)
6867
+                                    return MATCH_NOMATCH;
6868
+                            break;
6869
+                    }
6870
+                }
6871
+                
6872
+                /* If min = max, continue at the same level without recursing */
6873
+                
6874
+                if (min == max) continue;
6875
+                
6876
+                /* If minimizing, we have to test the rest of the pattern before each
6877
+                 subsequent match. Again, separate the UTF-8 case for speed. */
6878
+                
6879
+                if (minimize)
6880
+                {
6881
+#ifdef SUPPORT_UTF8
6882
+                    /* UTF-8 mode */
6883
+                    if (md->utf8)
6884
+                    {
6885
+                        for (i = min;; i++)
6886
+                        {
6887
+                            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6888
+                                MATCH_NOMATCH) return rrc;
6889
+                            if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6890
+                            
6891
+                            GETCHARINC(c, eptr);
6892
+                            switch(ctype)
6893
+                            {
6894
+                                case OP_ANY:
6895
+                                    if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return MATCH_NOMATCH;
6896
+                                    break;
6897
+                                    
6898
+                                case OP_ANYBYTE:
6899
+                                    break;
6900
+                                    
6901
+                                case OP_NOT_DIGIT:
6902
+                                    if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
6903
+                                        return MATCH_NOMATCH;
6904
+                                    break;
6905
+                                    
6906
+                                case OP_DIGIT:
6907
+                                    if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
6908
+                                        return MATCH_NOMATCH;
6909
+                                    break;
6910
+                                    
6911
+                                case OP_NOT_WHITESPACE:
6912
+                                    if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
6913
+                                        return MATCH_NOMATCH;
6914
+                                    break;
6915
+                                    
6916
+                                case OP_WHITESPACE:
6917
+                                    if  (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
6918
+                                        return MATCH_NOMATCH;
6919
+                                    break;
6920
+                                    
6921
+                                case OP_NOT_WORDCHAR:
6922
+                                    if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
6923
+                                        return MATCH_NOMATCH;
6924
+                                    break;
6925
+                                    
6926
+                                case OP_WORDCHAR:
6927
+                                    if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
6928
+                                        return MATCH_NOMATCH;
6929
+                                    break;
6930
+                            }
6931
+                        }
6932
+                    }
6933
+                    else
6934
+#endif
6935
+                    /* Not UTF-8 mode */
6936
+                    {
6937
+                        for (i = min;; i++)
6938
+                        {
6939
+                            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
6940
+                                MATCH_NOMATCH) return rrc;
6941
+                            if (i >= max || eptr >= md->end_subject) return MATCH_NOMATCH;
6942
+                            c = *eptr++;
6943
+                            switch(ctype)
6944
+                            {
6945
+                                case OP_ANY:
6946
+                                    if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return MATCH_NOMATCH;
6947
+                                    break;
6948
+                                    
6949
+                                case OP_ANYBYTE:
6950
+                                    break;
6951
+                                    
6952
+                                case OP_NOT_DIGIT:
6953
+                                    if ((md->ctypes[c] & ctype_digit) != 0) return MATCH_NOMATCH;
6954
+                                    break;
6955
+                                    
6956
+                                case OP_DIGIT:
6957
+                                    if ((md->ctypes[c] & ctype_digit) == 0) return MATCH_NOMATCH;
6958
+                                    break;
6959
+                                    
6960
+                                case OP_NOT_WHITESPACE:
6961
+                                    if ((md->ctypes[c] & ctype_space) != 0) return MATCH_NOMATCH;
6962
+                                    break;
6963
+                                    
6964
+                                case OP_WHITESPACE:
6965
+                                    if  ((md->ctypes[c] & ctype_space) == 0) return MATCH_NOMATCH;
6966
+                                    break;
6967
+                                    
6968
+                                case OP_NOT_WORDCHAR:
6969
+                                    if ((md->ctypes[c] & ctype_word) != 0) return MATCH_NOMATCH;
6970
+                                    break;
6971
+                                    
6972
+                                case OP_WORDCHAR:
6973
+                                    if ((md->ctypes[c] & ctype_word) == 0) return MATCH_NOMATCH;
6974
+                                    break;
6975
+                            }
6976
+                        }
6977
+                    }
6978
+                    /* Control never gets here */
6979
+                }
6980
+                
6981
+                /* If maximizing it is worth using inline code for speed, doing the type
6982
+                 test once at the start (i.e. keep it out of the loop). Again, keep the
6983
+                 UTF-8 stuff separate. */
6984
+                
6985
+                else
6986
+                {
6987
+                    const uschar *pp = eptr;
6988
+                    
6989
+#ifdef SUPPORT_UTF8
6990
+                    /* UTF-8 mode */
6991
+                    
6992
+                    if (md->utf8)
6993
+                    {
6994
+                        switch(ctype)
6995
+                        {
6996
+                            case OP_ANY:
6997
+                                
6998
+                                /* Special code is required for UTF8, but when the maximum is unlimited
6999
+                                 we don't need it, so we repeat the non-UTF8 code. This is probably
7000
+                                 worth it, because .* is quite a common idiom. */
7001
+                                
7002
+                                if (max < INT_MAX)
7003
+                                {
7004
+                                    if ((ims & PCRE_DOTALL) == 0)
7005
+                                    {
7006
+                                        for (i = min; i < max; i++)
7007
+                                        {
7008
+                                            if (eptr >= md->end_subject || *eptr == NEWLINE) break;
7009
+                                            eptr++;
7010
+                                            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
7011
+                                        }
7012
+                                    }
7013
+                                    else
7014
+                                    {
7015
+                                        for (i = min; i < max; i++)
7016
+                                        {
7017
+                                            eptr++;
7018
+                                            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
7019
+                                        }
7020
+                                    }
7021
+                                }
7022
+                                
7023
+                                /* Handle unlimited UTF-8 repeat */
7024
+                                
7025
+                                else
7026
+                                {
7027
+                                    if ((ims & PCRE_DOTALL) == 0)
7028
+                                    {
7029
+                                        for (i = min; i < max; i++)
7030
+                                        {
7031
+                                            if (eptr >= md->end_subject || *eptr == NEWLINE) break;
7032
+                                            eptr++;
7033
+                                        }
7034
+                                        break;
7035
+                                    }
7036
+                                    else
7037
+                                    {
7038
+                                        c = max - min;
7039
+                                        if (c > md->end_subject - eptr) c = md->end_subject - eptr;
7040
+                                        eptr += c;
7041
+                                    }
7042
+                                }
7043
+                                break;
7044
+                                
7045
+                                /* The byte case is the same as non-UTF8 */
7046
+                                
7047
+                            case OP_ANYBYTE:
7048
+                                c = max - min;
7049
+                                if (c > md->end_subject - eptr) c = md->end_subject - eptr;
7050
+                                eptr += c;
7051
+                                break;
7052
+                                
7053
+                            case OP_NOT_DIGIT:
7054
+                                for (i = min; i < max; i++)
7055
+                                {
7056
+                                    int len = 1;
7057
+                                    if (eptr >= md->end_subject) break;
7058
+                                    GETCHARLEN(c, eptr, len);
7059
+                                    if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
7060
+                                    eptr+= len;
7061
+                                }
7062
+                                break;
7063
+                                
7064
+                            case OP_DIGIT:
7065
+                                for (i = min; i < max; i++)
7066
+                                {
7067
+                                    int len = 1;
7068
+                                    if (eptr >= md->end_subject) break;
7069
+                                    GETCHARLEN(c, eptr, len);
7070
+                                    if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
7071
+                                    eptr+= len;
7072
+                                }
7073
+                                break;
7074
+                                
7075
+                            case OP_NOT_WHITESPACE:
7076
+                                for (i = min; i < max; i++)
7077
+                                {
7078
+                                    int len = 1;
7079
+                                    if (eptr >= md->end_subject) break;
7080
+                                    GETCHARLEN(c, eptr, len);
7081
+                                    if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
7082
+                                    eptr+= len;
7083
+                                }
7084
+                                break;
7085
+                                
7086
+                            case OP_WHITESPACE:
7087
+                                for (i = min; i < max; i++)
7088
+                                {
7089
+                                    int len = 1;
7090
+                                    if (eptr >= md->end_subject) break;
7091
+                                    GETCHARLEN(c, eptr, len);
7092
+                                    if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
7093
+                                    eptr+= len;
7094
+                                }
7095
+                                break;
7096
+                                
7097
+                            case OP_NOT_WORDCHAR:
7098
+                                for (i = min; i < max; i++)
7099
+                                {
7100
+                                    int len = 1;
7101
+                                    if (eptr >= md->end_subject) break;
7102
+                                    GETCHARLEN(c, eptr, len);
7103
+                                    if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
7104
+                                    eptr+= len;
7105
+                                }
7106
+                                break;
7107
+                                
7108
+                            case OP_WORDCHAR:
7109
+                                for (i = min; i < max; i++)
7110
+                                {
7111
+                                    int len = 1;
7112
+                                    if (eptr >= md->end_subject) break;
7113
+                                    GETCHARLEN(c, eptr, len);
7114
+                                    if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
7115
+                                    eptr+= len;
7116
+                                }
7117
+                                break;
7118
+                        }
7119
+                        
7120
+                        /* eptr is now past the end of the maximum run */
7121
+                        
7122
+                        for(;;)
7123
+                        {
7124
+                            if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
7125
+                                MATCH_NOMATCH) return rrc;
7126
+                            if (eptr-- == pp) break;        /* Stop if tried at original pos */
7127
+                            BACKCHAR(eptr);
7128
+                        }
7129
+                    }
7130
+                    else
7131
+#endif
7132
+                        
7133
+                    /* Not UTF-8 mode */
7134
+                    {
7135
+                        switch(ctype)
7136
+                        {
7137
+                            case OP_ANY:
7138
+                                if ((ims & PCRE_DOTALL) == 0)
7139
+                                {
7140
+                                    for (i = min; i < max; i++)
7141
+                                    {
7142
+                                        if (eptr >= md->end_subject || *eptr == NEWLINE) break;
7143
+                                        eptr++;
7144
+                                    }
7145
+                                    break;
7146
+                                }
7147
+                                /* For DOTALL case, fall through and treat as \C */
7148
+                                
7149
+                            case OP_ANYBYTE:
7150
+                                c = max - min;
7151
+                                if (c > md->end_subject - eptr) c = md->end_subject - eptr;
7152
+                                eptr += c;
7153
+                                break;
7154
+                                
7155
+                            case OP_NOT_DIGIT:
7156
+                                for (i = min; i < max; i++)
7157
+                                {
7158
+                                    if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
7159
+                                        break;
7160
+                                    eptr++;
7161
+                                }
7162
+                                break;
7163
+                                
7164
+                            case OP_DIGIT:
7165
+                                for (i = min; i < max; i++)
7166
+                                {
7167
+                                    if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
7168
+                                        break;
7169
+                                    eptr++;
7170
+                                }
7171
+                                break;
7172
+                                
7173
+                            case OP_NOT_WHITESPACE:
7174
+                                for (i = min; i < max; i++)
7175
+                                {
7176
+                                    if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
7177
+                                        break;
7178
+                                    eptr++;
7179
+                                }
7180
+                                break;
7181
+                                
7182
+                            case OP_WHITESPACE:
7183
+                                for (i = min; i < max; i++)
7184
+                                {
7185
+                                    if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
7186
+                                        break;
7187
+                                    eptr++;
7188
+                                }
7189
+                                break;
7190
+                                
7191
+                            case OP_NOT_WORDCHAR:
7192
+                                for (i = min; i < max; i++)
7193
+                                {
7194
+                                    if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
7195
+                                        break;
7196
+                                    eptr++;
7197
+                                }
7198
+                                break;
7199
+                                
7200
+                            case OP_WORDCHAR:
7201
+                                for (i = min; i < max; i++)
7202
+                                {
7203
+                                    if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
7204
+                                        break;
7205
+                                    eptr++;
7206
+                                }
7207
+                                break;
7208
+                        }
7209
+                        
7210
+                        /* eptr is now past the end of the maximum run */
7211
+                        
7212
+                        while (eptr >= pp)
7213
+                        {
7214
+                            if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
7215
+                                MATCH_NOMATCH) return rrc;
7216
+                        }
7217
+                    }
7218
+                    
7219
+                    /* Get here if we can't make it match with any permitted repetitions */
7220
+                    
7221
+                    return MATCH_NOMATCH;
7222
+                }
7223
+                /* Control never gets here */
7224
+                
7225
+                /* There's been some horrible disaster. Since all codes > OP_BRA are
7226
+                 for capturing brackets, and there shouldn't be any gaps between 0 and
7227
+                 OP_BRA, arrival here can only mean there is something seriously wrong
7228
+                 in the code above or the OP_xxx definitions. */
7229
+                
7230
+            default:
7231
+                DPRINTF(("Unknown opcode %d\n", *ecode));
7232
+                return PCRE_ERROR_UNKNOWN_NODE;
7217 7233
         }
7218
-
7219
-      /* Get here if we can't make it match with any permitted repetitions */
7220
-
7221
-      return MATCH_NOMATCH;
7222
-      }
7223
-    /* Control never gets here */
7224
-
7225
-    /* There's been some horrible disaster. Since all codes > OP_BRA are
7226
-    for capturing brackets, and there shouldn't be any gaps between 0 and
7227
-    OP_BRA, arrival here can only mean there is something seriously wrong
7228
-    in the code above or the OP_xxx definitions. */
7229
-
7230
-    default:
7231
-    DPRINTF(("Unknown opcode %d\n", *ecode));
7232
-    return PCRE_ERROR_UNKNOWN_NODE;
7233
-    }
7234
-
7235
-  /* Do not stick any code in here without much thought; it is assumed
7236
-  that "continue" in the code above comes out to here to repeat the main
7237
-  loop. */
7238
-
7239
-  }             /* End of main loop */
7240
-/* Control never reaches here */
7234
+        
7235
+        /* Do not stick any code in here without much thought; it is assumed
7236
+         that "continue" in the code above comes out to here to repeat the main
7237
+         loop. */
7238
+        
7239
+    }             /* End of main loop */
7240
+    /* Control never reaches here */
7241 7241
 }
7242 7242
 
7243 7243
 
7244 7244
 
7245 7245
 
7246 7246
 /*************************************************
7247
-*         Execute a Regular Expression           *
7248
-*************************************************/
7247
+ *         Execute a Regular Expression           *
7248
+ *************************************************/
7249 7249
 
7250 7250
 /* This function applies a compiled re to a subject string and picks out
7251
-portions of the string if it matches. Two elements in the vector are set for
7252
-each substring: the offsets to the start and end of the substring.
7253
-
7254
-Arguments:
7255
-  external_re     points to the compiled expression
7256
-  extra_data      points to extra data or is NULL
7257
-  subject         points to the subject string
7258
-  length          length of subject string (may contain binary zeros)
7259
-  start_offset    where to start in the subject string
7260
-  options         option bits
7261
-  offsets         points to a vector of ints to be filled in with offsets
7262
-  offsetcount     the number of elements in the vector
7263
-
7264
-Returns:          > 0 => success; value is the number of elements filled in
7265
-                  = 0 => success, but offsets is not big enough
7266
-                   -1 => failed to match
7267
-                 < -1 => some kind of unexpected problem
7268
-*/
7251
+ portions of the string if it matches. Two elements in the vector are set for
7252
+ each substring: the offsets to the start and end of the substring.
7253
+ 
7254
+ Arguments:
7255
+ external_re     points to the compiled expression
7256
+ extra_data      points to extra data or is NULL
7257
+ subject         points to the subject string
7258
+ length          length of subject string (may contain binary zeros)
7259
+ start_offset    where to start in the subject string
7260
+ options         option bits
7261
+ offsets         points to a vector of ints to be filled in with offsets
7262
+ offsetcount     the number of elements in the vector
7263
+ 
7264
+ Returns:          > 0 => success; value is the number of elements filled in
7265
+                   = 0 => success, but offsets is not big enough
7266
+                    -1 => failed to match
7267
+                  < -1 => some kind of unexpected problem
7268
+ */
7269 7269
 
7270 7270
 int
7271 7271
 pcre_exec(const pcre *external_re, const pcre_extra *extra_data,
7272
-  const char *subject, int length, int start_offset, int options, int *offsets,
7273
-  int offsetcount)
7272
+          const char *subject, int length, int start_offset, int options, int *offsets,
7273
+          int offsetcount)
7274 7274
 {
7275
-int rc, resetcount, ocount;
7276
-int first_byte = -1;
7277
-int req_byte = -1;
7278
-int req_byte2 = -1;
7279
-unsigned long int ims = 0;
7280
-BOOL using_temporary_offsets = FALSE;
7281
-BOOL anchored;
7282
-BOOL startline;
7283
-BOOL first_byte_caseless = FALSE;
7284
-BOOL req_byte_caseless = FALSE;
7285
-match_data match_block;
7286
-const uschar *start_bits = NULL;
7287
-const uschar *start_match = (const uschar *)subject + start_offset;
7288
-const uschar *end_subject;
7289
-const uschar *req_byte_ptr = start_match - 1;
7290
-const pcre_study_data *study;
7291
-const real_pcre *re = (const real_pcre *)external_re;
7292
-
7293
-/* Plausibility checks */
7294
-
7295
-if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
7296
-if (re == NULL || subject == NULL ||
7297
-   (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
7298
-
7299
-/* Fish out the optional data from the extra_data structure, first setting
7300
-the default values. */
7301
-
7302
-study = NULL;
7303
-match_block.match_limit = MATCH_LIMIT;
7304
-match_block.callout_data = NULL;
7305
-
7306
-if (extra_data != NULL)
7307
-  {
7308
-  register unsigned int flags = extra_data->flags;
7309
-  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
7310
-    study = extra_data->study_data;
7311
-  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
7312
-    match_block.match_limit = extra_data->match_limit;
7313
-  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
7314
-    match_block.callout_data = extra_data->callout_data;
7315
-  }
7316
-
7317
-/* Now we have re supposedly pointing to the regex */
7318
-
7319
-if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
7320
-
7321
-anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
7322
-startline = (re->options & PCRE_STARTLINE) != 0;
7323
-
7324
-match_block.start_code =
7325
-  (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size;
7326
-match_block.start_subject = (const uschar *)subject;
7327
-match_block.start_offset = start_offset;
7328
-match_block.end_subject = match_block.start_subject + length;
7329
-end_subject = match_block.end_subject;
7330
-
7331
-match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
7332
-match_block.utf8 = (re->options & PCRE_UTF8) != 0;
7333
-
7334
-match_block.notbol = (options & PCRE_NOTBOL) != 0;
7335
-match_block.noteol = (options & PCRE_NOTEOL) != 0;
7336
-match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
7337
-
7338
-match_block.recursive = NULL;                   /* No recursion at top level */
7339
-
7340
-match_block.lcc = re->tables + lcc_offset;
7341
-match_block.ctypes = re->tables + ctypes_offset;
7342
-
7343
-/* The ims options can vary during the matching as a result of the presence
7344
-of (?ims) items in the pattern. They are kept in a local variable so that
7345
-restoring at the exit of a group is easy. */
7346
-
7347
-ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
7348
-
7349
-/* If the expression has got more back references than the offsets supplied can
7350
-hold, we get a temporary bit of working store to use during the matching.
7351
-Otherwise, we can use the vector supplied, rounding down its size to a multiple
7352
-of 3. */
7353
-
7354
-ocount = offsetcount - (offsetcount % 3);
7355
-
7356
-if (re->top_backref > 0 && re->top_backref >= ocount/3)
7357
-  {
7358
-  ocount = re->top_backref * 3 + 3;
7359
-  match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
7360
-  if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
7361
-  using_temporary_offsets = TRUE;
7362
-  DPRINTF(("Got memory to hold back references\n"));
7363
-  }
7364
-else match_block.offset_vector = offsets;
7365
-
7366
-match_block.offset_end = ocount;
7367
-match_block.offset_max = (2*ocount)/3;
7368
-match_block.offset_overflow = FALSE;
7369
-match_block.capture_last = -1;
7370
-
7371
-/* Compute the minimum number of offsets that we need to reset each time. Doing
7372
-this makes a huge difference to execution time when there aren't many brackets
7373
-in the pattern. */
7374
-
7375
-resetcount = 2 + re->top_bracket * 2;
7376
-if (resetcount > offsetcount) resetcount = ocount;
7377
-
7378
-/* Reset the working variable associated with each extraction. These should
7379
-never be used unless previously set, but they get saved and restored, and so we
7380
-initialize them to avoid reading uninitialized locations. */
7381
-
7382
-if (match_block.offset_vector != NULL)
7383
-  {
7384
-  register int *iptr = match_block.offset_vector + ocount;
7385
-  register int *iend = iptr - resetcount/2 + 1;
7386
-  while (--iptr >= iend) *iptr = -1;
7387
-  }
7388
-
7389
-/* Set up the first character to match, if available. The first_byte value is
7390
-never set for an anchored regular expression, but the anchoring may be forced
7391
-at run time, so we have to test for anchoring. The first char may be unset for
7392
-an unanchored pattern, of course. If there's no first char and the pattern was
7393
-studied, there may be a bitmap of possible first characters. */
7394
-
7395
-if (!anchored)
7396
-  {
7397
-  if ((re->options & PCRE_FIRSTSET) != 0)
7275
+    int rc, resetcount, ocount;
7276
+    int first_byte = -1;
7277
+    int req_byte = -1;
7278
+    int req_byte2 = -1;
7279
+    unsigned long int ims = 0;
7280
+    BOOL using_temporary_offsets = FALSE;
7281
+    BOOL anchored;
7282
+    BOOL startline;
7283
+    BOOL first_byte_caseless = FALSE;
7284
+    BOOL req_byte_caseless = FALSE;
7285
+    match_data match_block;
7286
+    const uschar *start_bits = NULL;
7287
+    const uschar *start_match = (const uschar *)subject + start_offset;
7288
+    const uschar *end_subject;
7289
+    const uschar *req_byte_ptr = start_match - 1;
7290
+    const pcre_study_data *study;
7291
+    const real_pcre *re = (const real_pcre *)external_re;
7292
+    
7293
+    /* Plausibility checks */
7294
+    
7295
+    if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
7296
+    if (re == NULL || subject == NULL ||
7297
+        (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
7298
+    
7299
+    /* Fish out the optional data from the extra_data structure, first setting
7300
+     the default values. */
7301
+    
7302
+    study = NULL;
7303
+    match_block.match_limit = MATCH_LIMIT;
7304
+    match_block.callout_data = NULL;
7305
+    
7306
+    if (extra_data != NULL)
7398 7307
     {
7399
-    first_byte = re->first_byte & 255;
7400
-    if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
7401
-      first_byte = match_block.lcc[first_byte];
7308
+        register unsigned int flags = extra_data->flags;
7309
+        if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
7310
+            study = extra_data->study_data;
7311
+        if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
7312
+            match_block.match_limit = extra_data->match_limit;
7313
+        if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
7314
+            match_block.callout_data = extra_data->callout_data;
7402 7315
     }
7403
-  else
7404
-    if (!startline && study != NULL &&
7405
-      (study->options & PCRE_STUDY_MAPPED) != 0)
7406
-        start_bits = study->start_bits;
7407
-  }
7408
-
7409
-/* For anchored or unanchored matches, there may be a "last known required
7410
-character" set. */
7411
-
7412
-if ((re->options & PCRE_REQCHSET) != 0)
7413
-  {
7414
-  req_byte = re->req_byte & 255;
7415
-  req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
7416
-  req_byte2 = (re->tables + fcc_offset)[req_byte];  /* case flipped */
7417
-  }
7418
-
7419
-/* Loop for handling unanchored repeated matching attempts; for anchored regexs
7420
-the loop runs just once. */
7421
-
7422
-do
7423
-  {
7424
-  register int *iptr = match_block.offset_vector;
7425
-  register int *iend = iptr + resetcount;
7426
-
7427
-  /* Reset the maximum number of extractions we might see. */
7428
-
7429
-  while (iptr < iend) *iptr++ = -1;
7430
-
7431
-  /* Advance to a unique first char if possible */
7432
-
7433
-  if (first_byte >= 0)
7316
+    
7317
+    /* Now we have re supposedly pointing to the regex */
7318
+    
7319
+    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
7320
+    
7321
+    anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
7322
+    startline = (re->options & PCRE_STARTLINE) != 0;
7323
+    
7324
+    match_block.start_code =
7325
+    (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size;
7326
+    match_block.start_subject = (const uschar *)subject;
7327
+    match_block.start_offset = start_offset;
7328
+    match_block.end_subject = match_block.start_subject + length;
7329
+    end_subject = match_block.end_subject;
7330
+    
7331
+    match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
7332
+    match_block.utf8 = (re->options & PCRE_UTF8) != 0;
7333
+    
7334
+    match_block.notbol = (options & PCRE_NOTBOL) != 0;
7335
+    match_block.noteol = (options & PCRE_NOTEOL) != 0;
7336
+    match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
7337
+    
7338
+    match_block.recursive = NULL;                   /* No recursion at top level */
7339
+    
7340
+    match_block.lcc = re->tables + lcc_offset;
7341
+    match_block.ctypes = re->tables + ctypes_offset;
7342
+    
7343
+    /* The ims options can vary during the matching as a result of the presence
7344
+     of (?ims) items in the pattern. They are kept in a local variable so that
7345
+     restoring at the exit of a group is easy. */
7346
+    
7347
+    ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
7348
+    
7349
+    /* If the expression has got more back references than the offsets supplied can
7350
+     hold, we get a temporary bit of working store to use during the matching.
7351
+     Otherwise, we can use the vector supplied, rounding down its size to a multiple
7352
+     of 3. */
7353
+    
7354
+    ocount = offsetcount - (offsetcount % 3);
7355
+    
7356
+    if (re->top_backref > 0 && re->top_backref >= ocount/3)
7434 7357
     {
7435
-    if (first_byte_caseless)
7436
-      while (start_match < end_subject &&
7437
-             match_block.lcc[*start_match] != first_byte)
7438
-        start_match++;
7439
-    else
7440
-      while (start_match < end_subject && *start_match != first_byte)
7441
-        start_match++;
7358
+        ocount = re->top_backref * 3 + 3;
7359
+        match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
7360
+        if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
7361
+        using_temporary_offsets = TRUE;
7362
+        DPRINTF(("Got memory to hold back references\n"));
7442 7363
     }
7443
-
7444
-  /* Or to just after \n for a multiline match if possible */
7445
-
7446
-  else if (startline)
7364
+    else match_block.offset_vector = offsets;
7365
+    
7366
+    match_block.offset_end = ocount;
7367
+    match_block.offset_max = (2*ocount)/3;
7368
+    match_block.offset_overflow = FALSE;
7369
+    match_block.capture_last = -1;
7370
+    
7371
+    /* Compute the minimum number of offsets that we need to reset each time. Doing
7372
+     this makes a huge difference to execution time when there aren't many brackets
7373
+     in the pattern. */
7374
+    
7375
+    resetcount = 2 + re->top_bracket * 2;
7376
+    if (resetcount > offsetcount) resetcount = ocount;
7377
+    
7378
+    /* Reset the working variable associated with each extraction. These should
7379
+     never be used unless previously set, but they get saved and restored, and so we
7380
+     initialize them to avoid reading uninitialized locations. */
7381
+    
7382
+    if (match_block.offset_vector != NULL)
7447 7383
     {
7448
-    if (start_match > match_block.start_subject + start_offset)
7449
-      {
7450
-      while (start_match < end_subject && start_match[-1] != NEWLINE)
7451
-        start_match++;
7452
-      }
7384
+        register int *iptr = match_block.offset_vector + ocount;
7385
+        register int *iend = iptr - resetcount/2 + 1;
7386
+        while (--iptr >= iend) *iptr = -1;
7453 7387
     }
7454
-
7455
-  /* Or to a non-unique first char after study */
7456
-
7457
-  else if (start_bits != NULL)
7388
+    
7389
+    /* Set up the first character to match, if available. The first_byte value is
7390
+     never set for an anchored regular expression, but the anchoring may be forced
7391
+     at run time, so we have to test for anchoring. The first char may be unset for
7392
+     an unanchored pattern, of course. If there's no first char and the pattern was
7393
+     studied, there may be a bitmap of possible first characters. */
7394
+    
7395
+    if (!anchored)
7458 7396
     {
7459
-    while (start_match < end_subject)
7460
-      {
7461
-      register int c = *start_match;
7462
-      if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
7463
-      }
7397
+        if ((re->options & PCRE_FIRSTSET) != 0)
7398
+        {
7399
+            first_byte = re->first_byte & 255;
7400
+            if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
7401
+                first_byte = match_block.lcc[first_byte];
7402
+        }
7403
+        else
7404
+            if (!startline && study != NULL &&
7405
+                (study->options & PCRE_STUDY_MAPPED) != 0)
7406
+                start_bits = study->start_bits;
7464 7407
     }
7465
-
7466
-#ifdef DEBUG  /* Sigh. Some compilers never learn. */
7467
-  printf(">>>> Match against: ");
7468
-  pchars(start_match, end_subject - start_match, TRUE, &match_block);
7469
-  printf("\n");
7470
-#endif
7471
-
7472
-  /* If req_byte is set, we know that that character must appear in the subject
7473
-  for the match to succeed. If the first character is set, req_byte must be
7474
-  later in the subject; otherwise the test starts at the match point. This
7475
-  optimization can save a huge amount of backtracking in patterns with nested
7476
-  unlimited repeats that aren't going to match. Writing separate code for
7477
-  cased/caseless versions makes it go faster, as does using an autoincrement
7478
-  and backing off on a match.
7479
-
7480
-  HOWEVER: when the subject string is very, very long, searching to its end can
7481
-  take a long time, and give bad performance on quite ordinary patterns. This
7482
-  showed up when somebody was matching /^C/ on a 32-megabyte string... so we
7483
-  don't do this when the string is sufficiently long. */
7484
-
7485
-  if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
7408
+    
7409
+    /* For anchored or unanchored matches, there may be a "last known required
7410
+     character" set. */
7411
+    
7412
+    if ((re->options & PCRE_REQCHSET) != 0)
7486 7413
     {
7487
-    register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
7488
-
7489
-    /* We don't need to repeat the search if we haven't yet reached the
7490
-    place we found it at last time. */
7491
-
7492
-    if (p > req_byte_ptr)
7493
-      {
7494
-      if (req_byte_caseless)
7414
+        req_byte = re->req_byte & 255;
7415
+        req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
7416
+        req_byte2 = (re->tables + fcc_offset)[req_byte];  /* case flipped */
7417
+    }
7418
+    
7419
+    /* Loop for handling unanchored repeated matching attempts; for anchored regexs
7420
+     the loop runs just once. */
7421
+    
7422
+    do
7423
+    {
7424
+        register int *iptr = match_block.offset_vector;
7425
+        register int *iend = iptr + resetcount;
7426
+        
7427
+        /* Reset the maximum number of extractions we might see. */
7428
+        
7429
+        while (iptr < iend) *iptr++ = -1;
7430
+        
7431
+        /* Advance to a unique first char if possible */
7432
+        
7433
+        if (first_byte >= 0)
7495 7434
         {
7496
-        while (p < end_subject)
7497
-          {
7498
-          register int pp = *p++;
7499
-          if (pp == req_byte || pp == req_byte2) { p--; break; }
7500
-          }
7435
+            if (first_byte_caseless)
7436
+                while (start_match < end_subject &&
7437
+                       match_block.lcc[*start_match] != first_byte)
7438
+                    start_match++;
7439
+            else
7440
+                while (start_match < end_subject && *start_match != first_byte)
7441
+                    start_match++;
7501 7442
         }
7502
-      else
7443
+        
7444
+        /* Or to just after \n for a multiline match if possible */
7445
+        
7446
+        else if (startline)
7503 7447
         {
7504
-        while (p < end_subject)
7505
-          {
7506
-          if (*p++ == req_byte) { p--; break; }
7507
-          }
7448
+            if (start_match > match_block.start_subject + start_offset)
7449
+            {
7450
+                while (start_match < end_subject && start_match[-1] != NEWLINE)
7451
+                    start_match++;
7452
+            }
7508 7453
         }
7509
-
7510
-      /* If we can't find the required character, break the matching loop */
7511
-
7512
-      if (p >= end_subject) break;
7513
-
7514
-      /* If we have found the required character, save the point where we
7515
-      found it, so that we don't search again next time round the loop if
7516
-      the start hasn't passed this character yet. */
7517
-
7518
-      req_byte_ptr = p;
7519
-      }
7520
-    }
7521
-
7522
-  /* When a match occurs, substrings will be set for all internal extractions;
7523
-  we just need to set up the whole thing as substring 0 before returning. If
7524
-  there were too many extractions, set the return code to zero. In the case
7525
-  where we had to get some local store to hold offsets for backreferences, copy
7526
-  those back references that we can. In this case there need not be overflow
7527
-  if certain parts of the pattern were not used. */
7528
-
7529
-  match_block.start_match = start_match;
7530
-  match_block.match_call_count = 0;
7531
-
7532
-  rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
7533
-    match_isgroup);
7534
-
7535
-  if (rc == MATCH_NOMATCH)
7536
-    {
7537
-    start_match++;
7454
+        
7455
+        /* Or to a non-unique first char after study */
7456
+        
7457
+        else if (start_bits != NULL)
7458
+        {
7459
+            while (start_match < end_subject)
7460
+            {
7461
+                register int c = *start_match;
7462
+                if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
7463
+            }
7464
+        }
7465
+        
7466
+#ifdef DEBUG  /* Sigh. Some compilers never learn. */
7467
+        printf(">>>> Match against: ");
7468
+        pchars(start_match, end_subject - start_match, TRUE, &match_block);
7469
+        printf("\n");
7470
+#endif
7471
+        
7472
+        /* If req_byte is set, we know that that character must appear in the subject
7473
+         for the match to succeed. If the first character is set, req_byte must be
7474
+         later in the subject; otherwise the test starts at the match point. This
7475
+         optimization can save a huge amount of backtracking in patterns with nested
7476
+         unlimited repeats that aren't going to match. Writing separate code for
7477
+         cased/caseless versions makes it go faster, as does using an autoincrement
7478
+         and backing off on a match.
7479
+         
7480
+         HOWEVER: when the subject string is very, very long, searching to its end can
7481
+         take a long time, and give bad performance on quite ordinary patterns. This
7482
+         showed up when somebody was matching /^C/ on a 32-megabyte string... so we
7483
+         don't do this when the string is sufficiently long. */
7484
+        
7485
+        if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
7486
+        {
7487
+            register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
7488
+            
7489
+            /* We don't need to repeat the search if we haven't yet reached the
7490
+             place we found it at last time. */
7491
+            
7492
+            if (p > req_byte_ptr)
7493
+            {
7494
+                if (req_byte_caseless)
7495
+                {
7496
+                    while (p < end_subject)
7497
+                    {
7498
+                        register int pp = *p++;
7499
+                        if (pp == req_byte || pp == req_byte2) { p--; break; }
7500
+                    }
7501
+                }
7502
+                else
7503
+                {
7504
+                    while (p < end_subject)
7505
+                    {
7506
+                        if (*p++ == req_byte) { p--; break; }
7507
+                    }
7508
+                }
7509
+                
7510
+                /* If we can't find the required character, break the matching loop */
7511
+                
7512
+                if (p >= end_subject) break;
7513
+                
7514
+                /* If we have found the required character, save the point where we
7515
+                 found it, so that we don't search again next time round the loop if
7516
+                 the start hasn't passed this character yet. */
7517
+                
7518
+                req_byte_ptr = p;
7519
+            }
7520
+        }
7521
+        
7522
+        /* When a match occurs, substrings will be set for all internal extractions;
7523
+         we just need to set up the whole thing as substring 0 before returning. If
7524
+         there were too many extractions, set the return code to zero. In the case
7525
+         where we had to get some local store to hold offsets for backreferences, copy
7526
+         those back references that we can. In this case there need not be overflow
7527
+         if certain parts of the pattern were not used. */
7528
+        
7529
+        match_block.start_match = start_match;
7530
+        match_block.match_call_count = 0;
7531
+        
7532
+        rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
7533
+                   match_isgroup);
7534
+        
7535
+        if (rc == MATCH_NOMATCH)
7536
+        {
7537
+            start_match++;
7538 7538
 #ifdef SUPPORT_UTF8
7539
-    if (match_block.utf8)
7540
-      while((*start_match & 0xc0) == 0x80) start_match++;
7539
+            if (match_block.utf8)
7540
+                while((*start_match & 0xc0) == 0x80) start_match++;
7541 7541
 #endif
7542
-    continue;
7543
-    }
7544
-
7545
-  if (rc != MATCH_MATCH)
7546
-    {
7547
-    DPRINTF((">>>> error: returning %d\n", rc));
7548
-    return rc;
7549
-    }
7550
-
7551
-  /* We have a match! Copy the offset information from temporary store if
7552
-  necessary */
7553
-
7554
-  if (using_temporary_offsets)
7555
-    {
7556
-    if (offsetcount >= 4)
7557
-      {
7558
-      memcpy(offsets + 2, match_block.offset_vector + 2,
7559
-        (offsetcount - 2) * sizeof(int));
7560
-      DPRINTF(("Copied offsets from temporary memory\n"));
7561
-      }
7562
-    if (match_block.end_offset_top > offsetcount)
7563
-      match_block.offset_overflow = TRUE;
7564
-
7565
-    DPRINTF(("Freeing temporary memory\n"));
7566
-    (pcre_free)(match_block.offset_vector);
7542
+            continue;
7543
+        }
7544
+        
7545
+        if (rc != MATCH_MATCH)
7546
+        {
7547
+            DPRINTF((">>>> error: returning %d\n", rc));
7548
+            return rc;
7549
+        }
7550
+        
7551
+        /* We have a match! Copy the offset information from temporary store if
7552
+         necessary */
7553
+        
7554
+        if (using_temporary_offsets)
7555
+        {
7556
+            if (offsetcount >= 4)
7557
+            {
7558
+                memcpy(offsets + 2, match_block.offset_vector + 2,
7559
+                       (offsetcount - 2) * sizeof(int));
7560
+                DPRINTF(("Copied offsets from temporary memory\n"));
7561
+            }
7562
+            if (match_block.end_offset_top > offsetcount)
7563
+                match_block.offset_overflow = TRUE;
7564
+            
7565
+            DPRINTF(("Freeing temporary memory\n"));
7566
+            (pcre_free)(match_block.offset_vector);
7567
+        }
7568
+        
7569
+        rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
7570
+        
7571
+        if (offsetcount < 2) rc = 0; else
7572
+        {
7573
+            offsets[0] = start_match - match_block.start_subject;
7574
+            offsets[1] = match_block.end_match_ptr - match_block.start_subject;
7575
+        }
7576
+        
7577
+        DPRINTF((">>>> returning %d\n", rc));
7578
+        return rc;
7567 7579
     }
7568
-
7569
-  rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
7570
-
7571
-  if (offsetcount < 2) rc = 0; else
7580
+    
7581
+    /* This "while" is the end of the "do" above */
7582
+    
7583
+    while (!anchored && start_match <= end_subject);
7584
+    
7585
+    if (using_temporary_offsets)
7572 7586
     {
7573
-    offsets[0] = start_match - match_block.start_subject;
7574
-    offsets[1] = match_block.end_match_ptr - match_block.start_subject;
7587
+        DPRINTF(("Freeing temporary memory\n"));
7588
+        (pcre_free)(match_block.offset_vector);
7575 7589
     }
7576
-
7577
-  DPRINTF((">>>> returning %d\n", rc));
7578
-  return rc;
7579
-  }
7580
-
7581
-/* This "while" is the end of the "do" above */
7582
-
7583
-while (!anchored && start_match <= end_subject);
7584
-
7585
-if (using_temporary_offsets)
7586
-  {
7587
-  DPRINTF(("Freeing temporary memory\n"));
7588
-  (pcre_free)(match_block.offset_vector);
7589
-  }
7590
-
7591
-DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7592
-
7593
-return PCRE_ERROR_NOMATCH;
7590
+    
7591
+    DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7592
+    
7593
+    return PCRE_ERROR_NOMATCH;
7594 7594
 }
7595 7595
 
7596 7596
 /* End of pcre.c */
... ...
@@ -1,6 +1,6 @@
1 1
 /*************************************************
2
-*       Perl-Compatible Regular Expressions      *
3
-*************************************************/
2
+ *       Perl-Compatible Regular Expressions      *
3
+ *************************************************/
4 4
 
5 5
 /* Copyright (c) 1997-2003 University of Cambridge */
6 6
 
... ...
@@ -8,7 +8,7 @@
8 8
 #define _PCRE_H
9 9
 
10 10
 /* The file pcre.h is built by "configure". Do not edit it; instead
11
-make changes to pcre.in. */
11
+ make changes to pcre.in. */
12 12
 
13 13
 #define PCRE_MAJOR          4
14 14
 #define PCRE_MINOR          3
... ...
@@ -32,7 +32,7 @@ make changes to pcre.in. */
32 32
 #endif
33 33
 
34 34
 /* Have to include stdlib.h in order to ensure that size_t is defined;
35
-it is needed here for malloc. */
35
+ it is needed here for malloc. */
36 36
 
37 37
 #include <stdlib.h>
38 38
 
... ...
@@ -41,9 +41,9 @@ it is needed here for malloc. */
41 41
 #ifdef __cplusplus
42 42
 extern "C" {
43 43
 #endif
44
-
45
-/* Options */
46
-
44
+    
45
+    /* Options */
46
+    
47 47
 #define PCRE_CASELESS           0x0001
48 48
 #define PCRE_MULTILINE          0x0002
49 49
 #define PCRE_DOTALL             0x0004
... ...
@@ -57,9 +57,9 @@ extern "C" {
57 57
 #define PCRE_NOTEMPTY           0x0400
58 58
 #define PCRE_UTF8               0x0800
59 59
 #define PCRE_NO_AUTO_CAPTURE    0x1000
60
-
61
-/* Exec-time and get/set-time error codes */
62
-
60
+    
61
+    /* Exec-time and get/set-time error codes */
62
+    
63 63
 #define PCRE_ERROR_NOMATCH        (-1)
64 64
 #define PCRE_ERROR_NULL           (-2)
65 65
 #define PCRE_ERROR_BADOPTION      (-3)
... ...
@@ -69,9 +69,9 @@ extern "C" {
69 69
 #define PCRE_ERROR_NOSUBSTRING    (-7)
70 70
 #define PCRE_ERROR_MATCHLIMIT     (-8)
71 71
 #define PCRE_ERROR_CALLOUT        (-9)  /* Never used by PCRE itself */
72
-
73
-/* Request types for pcre_fullinfo() */
74
-
72
+    
73
+    /* Request types for pcre_fullinfo() */
74
+    
75 75
 #define PCRE_INFO_OPTIONS            0
76 76
 #define PCRE_INFO_SIZE               1
77 77
 #define PCRE_INFO_CAPTURECOUNT       2
... ...
@@ -84,99 +84,99 @@ extern "C" {
84 84
 #define PCRE_INFO_NAMECOUNT          8
85 85
 #define PCRE_INFO_NAMETABLE          9
86 86
 #define PCRE_INFO_STUDYSIZE         10
87
-
88
-/* Request types for pcre_config() */
89
-
87
+    
88
+    /* Request types for pcre_config() */
89
+    
90 90
 #define PCRE_CONFIG_UTF8                    0
91 91
 #define PCRE_CONFIG_NEWLINE                 1
92 92
 #define PCRE_CONFIG_LINK_SIZE               2
93 93
 #define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD  3
94 94
 #define PCRE_CONFIG_MATCH_LIMIT             4
95
-
96
-/* Bit flags for the pcre_extra structure */
97
-
95
+    
96
+    /* Bit flags for the pcre_extra structure */
97
+    
98 98
 #define PCRE_EXTRA_STUDY_DATA          0x0001
99 99
 #define PCRE_EXTRA_MATCH_LIMIT         0x0002
100 100
 #define PCRE_EXTRA_CALLOUT_DATA        0x0004
101
-
102
-/* Types */
103
-
104
-struct real_pcre;                 /* declaration; the definition is private  */
105
-typedef struct real_pcre pcre;
106
-
107
-/* The structure for passing additional data to pcre_exec(). This is defined in
108
-such as way as to be extensible. */
109
-
110
-typedef struct pcre_extra {
111
-  unsigned long int flags;        /* Bits for which fields are set */
112
-  void *study_data;               /* Opaque data from pcre_study() */
113
-  unsigned long int match_limit;  /* Maximum number of calls to match() */
114
-  void *callout_data;             /* Data passed back in callouts */
115
-} pcre_extra;
116
-
117
-/* The structure for passing out data via the pcre_callout_function. We use a
118
-structure so that new fields can be added on the end in future versions,
119
-without changing the API of the function, thereby allowing old clients to work
120
-without modification. */
121
-
122
-typedef struct pcre_callout_block {
123
-  int          version;           /* Identifies version of block */
124
-  /* ------------------------ Version 0 ------------------------------- */
125
-  int          callout_number;    /* Number compiled into pattern */
126
-  int         *offset_vector;     /* The offset vector */
127
-  const char  *subject;           /* The subject being matched */
128
-  int          subject_length;    /* The length of the subject */
129
-  int          start_match;       /* Offset to start of this match attempt */
130
-  int          current_position;  /* Where we currently are */
131
-  int          capture_top;       /* Max current capture */
132
-  int          capture_last;      /* Most recently closed capture */
133
-  void        *callout_data;      /* Data passed in with the call */
134
-  /* ------------------------------------------------------------------ */
135
-} pcre_callout_block;
136
-
137
-/* Indirection for store get and free functions. These can be set to
138
-alternative malloc/free functions if required. There is also an optional
139
-callout function that is triggered by the (?) regex item. Some magic is
140
-required for Win32 DLL; it is null on other OS. For Virtual Pascal, these
141
-have to be different again. */
142
-
101
+    
102
+    /* Types */
103
+    
104
+    struct real_pcre;                 /* declaration; the definition is private  */
105
+    typedef struct real_pcre pcre;
106
+    
107
+    /* The structure for passing additional data to pcre_exec(). This is defined in
108
+     such a way as to be extensible. */
109
+    
110
+    typedef struct pcre_extra {
111
+        unsigned long int flags;        /* Bits for which fields are set */
112
+        void *study_data;               /* Opaque data from pcre_study() */
113
+        unsigned long int match_limit;  /* Maximum number of calls to match() */
114
+        void *callout_data;             /* Data passed back in callouts */
115
+    } pcre_extra;
116
+    
117
+    /* The structure for passing out data via the pcre_callout_function. We use a
118
+     structure so that new fields can be added on the end in future versions,
119
+     without changing the API of the function, thereby allowing old clients to work
120
+     without modification. */
121
+    
122
+    typedef struct pcre_callout_block {
123
+        int          version;           /* Identifies version of block */
124
+        /* ------------------------ Version 0 ------------------------------- */
125
+        int          callout_number;    /* Number compiled into pattern */
126
+        int         *offset_vector;     /* The offset vector */
127
+        const char  *subject;           /* The subject being matched */
128
+        int          subject_length;    /* The length of the subject */
129
+        int          start_match;       /* Offset to start of this match attempt */
130
+        int          current_position;  /* Where we currently are */
131
+        int          capture_top;       /* Max current capture */
132
+        int          capture_last;      /* Most recently closed capture */
133
+        void        *callout_data;      /* Data passed in with the call */
134
+        /* ------------------------------------------------------------------ */
135
+    } pcre_callout_block;
136
+    
137
+    /* Indirection for store get and free functions. These can be set to
138
+     alternative malloc/free functions if required. There is also an optional
139
+     callout function that is triggered by the (?) regex item. Some magic is
140
+     required for Win32 DLL; it is null on other OS. For Virtual Pascal, these
141
+     have to be different again. */
142
+    
143 143
 #ifndef VPCOMPAT
144
-PCRE_DATA_SCOPE void *(*pcre_malloc)(size_t);
145
-PCRE_DATA_SCOPE void  (*pcre_free)(void *);
146
-PCRE_DATA_SCOPE int   (*pcre_callout)(pcre_callout_block *);
144
+    PCRE_DATA_SCOPE void *(*pcre_malloc)(size_t);
145
+    PCRE_DATA_SCOPE void  (*pcre_free)(void *);
146
+    PCRE_DATA_SCOPE int   (*pcre_callout)(pcre_callout_block *);
147 147
 #else   /* VPCOMPAT */
148
-extern void *pcre_malloc(size_t);
149
-extern void  pcre_free(void *);
150
-extern int   pcre_callout(pcre_callout_block *);
148
+    extern void *pcre_malloc(size_t);
149
+    extern void  pcre_free(void *);
150
+    extern int   pcre_callout(pcre_callout_block *);
151 151
 #endif  /* VPCOMPAT */
152
-
153
-/* Exported PCRE functions */
154
-
155
-extern pcre *pcre_compile(const char *, int, const char **,
156
-              int *, const unsigned char *);
157
-extern int  pcre_config(int, void *);
158
-extern int  pcre_copy_named_substring(const pcre *, const char *,
159
-              int *, int, const char *, char *, int);
160
-extern int  pcre_copy_substring(const char *, int *, int, int,
161
-              char *, int);
162
-extern int  pcre_exec(const pcre *, const pcre_extra *,
163
-              const char *, int, int, int, int *, int);
164
-extern void pcre_free_substring(const char *);
165
-extern void pcre_free_substring_list(const char **);
166
-extern int  pcre_fullinfo(const pcre *, const pcre_extra *, int,
167
-              void *);
168
-extern int  pcre_get_named_substring(const pcre *, const char *,
169
-              int *, int,  const char *, const char **);
170
-extern int  pcre_get_stringnumber(const pcre *, const char *);
171
-extern int  pcre_get_substring(const char *, int *, int, int,
172
-              const char **);
173
-extern int  pcre_get_substring_list(const char *, int *, int,
174
-              const char ***);
175
-extern int  pcre_info(const pcre *, int *, int *);
176
-extern const unsigned char *pcre_maketables(void);
177
-extern pcre_extra *pcre_study(const pcre *, int, const char **);
178
-extern const char *pcre_version(void);
179
-
152
+    
153
+    /* Exported PCRE functions */
154
+    
155
+    extern pcre *pcre_compile(const char *, int, const char **,
156
+                              int *, const unsigned char *);
157
+    extern int  pcre_config(int, void *);
158
+    extern int  pcre_copy_named_substring(const pcre *, const char *,
159
+                                          int *, int, const char *, char *, int);
160
+    extern int  pcre_copy_substring(const char *, int *, int, int,
161
+                                    char *, int);
162
+    extern int  pcre_exec(const pcre *, const pcre_extra *,
163
+                          const char *, int, int, int, int *, int);
164
+    extern void pcre_free_substring(const char *);
165
+    extern void pcre_free_substring_list(const char **);
166
+    extern int  pcre_fullinfo(const pcre *, const pcre_extra *, int,
167
+                              void *);
168
+    extern int  pcre_get_named_substring(const pcre *, const char *,
169
+                                         int *, int,  const char *, const char **);
170
+    extern int  pcre_get_stringnumber(const pcre *, const char *);
171
+    extern int  pcre_get_substring(const char *, int *, int, int,
172
+                                   const char **);
173
+    extern int  pcre_get_substring_list(const char *, int *, int,
174
+                                        const char ***);
175
+    extern int  pcre_info(const pcre *, int *, int *);
176
+    extern const unsigned char *pcre_maketables(void);
177
+    extern pcre_extra *pcre_study(const pcre *, int, const char **);
178
+    extern const char *pcre_version(void);
179
+    
180 180
 #ifdef __cplusplus
181 181
 }  /* extern "C" */
182 182
 #endif
... ...
@@ -1,438 +1,438 @@
1 1
 /*************************************************
2
-*      Perl-Compatible Regular Expressions       *
3
-*************************************************/
2
+ *      Perl-Compatible Regular Expressions       *
3
+ *************************************************/
4 4
 
5 5
 /*
6
-This is a library of functions to support regular expressions whose syntax
7
-and semantics are as close as possible to those of the Perl 5 language. See
8
-the file Tech.Notes for some information on the internals.
9
-
10
-Written by: Philip Hazel <ph10@cam.ac.uk>
11
-
12
-           Copyright (c) 1997-2002 University of Cambridge
13
-
14
-Permission is granted to anyone to use this software for any purpose on any
15
-computer system, and to redistribute it freely, subject to the following
16
-restrictions:
17
-
18
-1. This software is distributed in the hope that it will be useful,
19
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
20
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
21
-
22
-2. The origin of this software must not be misrepresented, either by
23
-   explicit claim or by omission.
24
-
25
-3. Altered versions must be plainly marked as such, and must not be
26
-   misrepresented as being the original software.
27
-
28
-4. If PCRE is embedded in any software that is released under the GNU
29
-   General Purpose Licence (GPL), then the terms of that licence shall
30
-   supersede any condition above with which it is incompatible.
31
-*/
6
+ This is a library of functions to support regular expressions whose syntax
7
+ and semantics are as close as possible to those of the Perl 5 language. See
8
+ the file Tech.Notes for some information on the internals.
9
+ 
10
+ Written by: Philip Hazel <ph10@cam.ac.uk>
11
+ 
12
+ Copyright (c) 1997-2002 University of Cambridge
13
+ 
14
+ -----------------------------------------------------------------------------
15
+ Permission is granted to anyone to use this software for any purpose on any
16
+ computer system, and to redistribute it freely, subject to the following
17
+ restrictions:
18
+ 
19
+ 1. This software is distributed in the hope that it will be useful,
20
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
+ 
23
+ 2. The origin of this software must not be misrepresented, either by
24
+ explicit claim or by omission.
25
+ 
26
+ 3. Altered versions must be plainly marked as such, and must not be
27
+ misrepresented as being the original software.
28
+ 
29
+ 4. If PCRE is embedded in any software that is released under the GNU
30
+ General Purpose Licence (GPL), then the terms of that licence shall
31
+ supersede any condition above with which it is incompatible.
32
+ -----------------------------------------------------------------------------
33
+ */
32 34
 
33 35
 
34 36
 /* Include the internals header, which itself includes Standard C headers plus
35
-the external pcre header. */
37
+ the external pcre header. */
36 38
 
37 39
 #include "internal.h"
38 40
 
39 41
 
40 42
 
41 43
 /*************************************************
42
-*      Set a bit and maybe its alternate case    *
43
-*************************************************/
44
+ *      Set a bit and maybe its alternate case    *
45
+ *************************************************/
44 46
 
45 47
 /* Given a character, set its bit in the table, and also the bit for the other
46
-version of a letter if we are caseless.
47
-
48
-Arguments:
49
-  start_bits    points to the bit map
50
-  c             is the character
51
-  caseless      the caseless flag
52
-  cd            the block with char table pointers
53
-
54
-Returns:        nothing
55
-*/
48
+ version of a letter if we are caseless.
49
+ 
50
+ Arguments:
51
+ start_bits    points to the bit map
52
+ c             is the character
53
+ caseless      the caseless flag
54
+ cd            the block with char table pointers
55
+ 
56
+ Returns:        nothing
57
+ */
56 58
 
57 59
 static void
58 60
 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
59 61
 {
60
-start_bits[c/8] |= (1 << (c&7));
61
-if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
62
-  start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
62
+    start_bits[c/8] |= (1 << (c&7));
63
+    if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64
+        start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
63 65
 }
64 66
 
65 67
 
66 68
 
67 69
 /*************************************************
68
-*          Create bitmap of starting chars       *
69
-*************************************************/
70
+ *          Create bitmap of starting chars       *
71
+ *************************************************/
70 72
 
71 73
 /* This function scans a compiled unanchored expression and attempts to build a
72
-bitmap of the set of initial characters. If it can't, it returns FALSE. As time
73
-goes by, we may be able to get more clever at doing this.
74
-
75
-Arguments:
76
-  code         points to an expression
77
-  start_bits   points to a 32-byte table, initialized to 0
78
-  caseless     the current state of the caseless flag
79
-  utf8         TRUE if in UTF-8 mode
80
-  cd           the block with char table pointers
81
-
82
-Returns:       TRUE if table built, FALSE otherwise
83
-*/
74
+ bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75
+ goes by, we may be able to get more clever at doing this.
76
+ 
77
+ Arguments:
78
+ code         points to an expression
79
+ start_bits   points to a 32-byte table, initialized to 0
80
+ caseless     the current state of the caseless flag
81
+ utf8         TRUE if in UTF-8 mode
82
+ cd           the block with char table pointers
83
+ 
84
+ Returns:       TRUE if table built, FALSE otherwise
85
+ */
84 86
 
85 87
 static BOOL
86 88
 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
87
-  BOOL utf8, compile_data *cd)
89
+               BOOL utf8, compile_data *cd)
88 90
 {
89
-register int c;
90
-
91
-/* This next statement and the later reference to dummy are here in order to
92
-trick the optimizer of the IBM C compiler for OS/2 into generating correct
93
-code. Apparently IBM isn't going to fix the problem, and we would rather not
94
-disable optimization (in this module it actually makes a big difference, and
95
-the pcre module can use all the optimization it can get). */
96
-
97
-volatile int dummy;
98
-
99
-do
100
-  {
101
-  const uschar *tcode = code + 1 + LINK_SIZE;
102
-  BOOL try_next = TRUE;
103
-
104
-  while (try_next)
91
+    register int c;
92
+    
93
+    /* This next statement and the later reference to dummy are here in order to
94
+     trick the optimizer of the IBM C compiler for OS/2 into generating correct
95
+     code. Apparently IBM isn't going to fix the problem, and we would rather not
96
+     disable optimization (in this module it actually makes a big difference, and
97
+     the pcre module can use all the optimization it can get). */
98
+    
99
+    volatile int dummy;
100
+    
101
+    do
105 102
     {
106
-    /* If a branch starts with a bracket or a positive lookahead assertion,
107
-    recurse to set bits from within them. That's all for this branch. */
108
-
109
-    if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
110
-      {
111
-      if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
112
-        return FALSE;
113
-      try_next = FALSE;
114
-      }
115
-
116
-    else switch(*tcode)
117
-      {
118
-      default:
119
-      return FALSE;
120
-
121
-      /* Skip over callout */
122
-
123
-      case OP_CALLOUT:
124
-      tcode += 2;
125
-      break;
126
-
127
-      /* Skip over extended extraction bracket number */
128
-
129
-      case OP_BRANUMBER:
130
-      tcode += 3;
131
-      break;
132
-
133
-      /* Skip over lookbehind and negative lookahead assertions */
134
-
135
-      case OP_ASSERT_NOT:
136
-      case OP_ASSERTBACK:
137
-      case OP_ASSERTBACK_NOT:
138
-      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
139
-      tcode += 1+LINK_SIZE;
140
-      break;
141
-
142
-      /* Skip over an option setting, changing the caseless flag */
143
-
144
-      case OP_OPT:
145
-      caseless = (tcode[1] & PCRE_CASELESS) != 0;
146
-      tcode += 2;
147
-      break;
148
-
149
-      /* BRAZERO does the bracket, but carries on. */
150
-
151
-      case OP_BRAZERO:
152
-      case OP_BRAMINZERO:
153
-      if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
154
-        return FALSE;
155
-      dummy = 1;
156
-      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
157
-      tcode += 1+LINK_SIZE;
158
-      break;
159
-
160
-      /* Single-char * or ? sets the bit and tries the next item */
161
-
162
-      case OP_STAR:
163
-      case OP_MINSTAR:
164
-      case OP_QUERY:
165
-      case OP_MINQUERY:
166
-      set_bit(start_bits, tcode[1], caseless, cd);
167
-      tcode += 2;
103
+        const uschar *tcode = code + 1 + LINK_SIZE;
104
+        BOOL try_next = TRUE;
105
+        
106
+        while (try_next)
107
+        {
108
+            /* If a branch starts with a bracket or a positive lookahead assertion,
109
+             recurse to set bits from within them. That's all for this branch. */
110
+            
111
+            if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
112
+            {
113
+                if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
114
+                    return FALSE;
115
+                try_next = FALSE;
116
+            }
117
+            
118
+            else switch(*tcode)
119
+            {
120
+                default:
121
+                    return FALSE;
122
+                    
123
+                    /* Skip over callout */
124
+                    
125
+                case OP_CALLOUT:
126
+                    tcode += 2;
127
+                    break;
128
+                    
129
+                    /* Skip over extended extraction bracket number */
130
+                    
131
+                case OP_BRANUMBER:
132
+                    tcode += 3;
133
+                    break;
134
+                    
135
+                    /* Skip over lookbehind and negative lookahead assertions */
136
+                    
137
+                case OP_ASSERT_NOT:
138
+                case OP_ASSERTBACK:
139
+                case OP_ASSERTBACK_NOT:
140
+                    do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
141
+                    tcode += 1+LINK_SIZE;
142
+                    break;
143
+                    
144
+                    /* Skip over an option setting, changing the caseless flag */
145
+                    
146
+                case OP_OPT:
147
+                    caseless = (tcode[1] & PCRE_CASELESS) != 0;
148
+                    tcode += 2;
149
+                    break;
150
+                    
151
+                    /* BRAZERO does the bracket, but carries on. */
152
+                    
153
+                case OP_BRAZERO:
154
+                case OP_BRAMINZERO:
155
+                    if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
156
+                        return FALSE;
157
+                    dummy = 1;
158
+                    do tcode += GET(tcode,1); while (*tcode == OP_ALT);
159
+                    tcode += 1+LINK_SIZE;
160
+                    break;
161
+                    
162
+                    /* Single-char * or ? sets the bit and tries the next item */
163
+                    
164
+                case OP_STAR:
165
+                case OP_MINSTAR:
166
+                case OP_QUERY:
167
+                case OP_MINQUERY:
168
+                    set_bit(start_bits, tcode[1], caseless, cd);
169
+                    tcode += 2;
168 170
 #ifdef SUPPORT_UTF8
169
-      if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
171
+                    if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
170 172
 #endif
171
-      break;
172
-
173
-      /* Single-char upto sets the bit and tries the next */
174
-
175
-      case OP_UPTO:
176
-      case OP_MINUPTO:
177
-      set_bit(start_bits, tcode[3], caseless, cd);
178
-      tcode += 4;
173
+                    break;
174
+                    
175
+                    /* Single-char upto sets the bit and tries the next */
176
+                    
177
+                case OP_UPTO:
178
+                case OP_MINUPTO:
179
+                    set_bit(start_bits, tcode[3], caseless, cd);
180
+                    tcode += 4;
179 181
 #ifdef SUPPORT_UTF8
180
-      if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
182
+                    if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
181 183
 #endif
182
-      break;
183
-
184
-      /* At least one single char sets the bit and stops */
185
-
186
-      case OP_EXACT:       /* Fall through */
187
-      tcode++;
188
-
189
-      case OP_CHARS:       /* Fall through */
190
-      tcode++;
191
-
192
-      case OP_PLUS:
193
-      case OP_MINPLUS:
194
-      set_bit(start_bits, tcode[1], caseless, cd);
195
-      try_next = FALSE;
196
-      break;
197
-
198
-      /* Single character type sets the bits and stops */
199
-
200
-      case OP_NOT_DIGIT:
201
-      for (c = 0; c < 32; c++)
202
-        start_bits[c] |= ~cd->cbits[c+cbit_digit];
203
-      try_next = FALSE;
204
-      break;
205
-
206
-      case OP_DIGIT:
207
-      for (c = 0; c < 32; c++)
208
-        start_bits[c] |= cd->cbits[c+cbit_digit];
209
-      try_next = FALSE;
210
-      break;
211
-
212
-      case OP_NOT_WHITESPACE:
213
-      for (c = 0; c < 32; c++)
214
-        start_bits[c] |= ~cd->cbits[c+cbit_space];
215
-      try_next = FALSE;
216
-      break;
217
-
218
-      case OP_WHITESPACE:
219
-      for (c = 0; c < 32; c++)
220
-        start_bits[c] |= cd->cbits[c+cbit_space];
221
-      try_next = FALSE;
222
-      break;
223
-
224
-      case OP_NOT_WORDCHAR:
225
-      for (c = 0; c < 32; c++)
226
-        start_bits[c] |= ~cd->cbits[c+cbit_word];
227
-      try_next = FALSE;
228
-      break;
229
-
230
-      case OP_WORDCHAR:
231
-      for (c = 0; c < 32; c++)
232
-        start_bits[c] |= cd->cbits[c+cbit_word];
233
-      try_next = FALSE;
234
-      break;
235
-
236
-      /* One or more character type fudges the pointer and restarts, knowing
237
-      it will hit a single character type and stop there. */
238
-
239
-      case OP_TYPEPLUS:
240
-      case OP_TYPEMINPLUS:
241
-      tcode++;
242
-      break;
243
-
244
-      case OP_TYPEEXACT:
245
-      tcode += 3;
246
-      break;
247
-
248
-      /* Zero or more repeats of character types set the bits and then
249
-      try again. */
250
-
251
-      case OP_TYPEUPTO:
252
-      case OP_TYPEMINUPTO:
253
-      tcode += 2;               /* Fall through */
254
-
255
-      case OP_TYPESTAR:
256
-      case OP_TYPEMINSTAR:
257
-      case OP_TYPEQUERY:
258
-      case OP_TYPEMINQUERY:
259
-      switch(tcode[1])
260
-        {
261
-        case OP_NOT_DIGIT:
262
-        for (c = 0; c < 32; c++)
263
-          start_bits[c] |= ~cd->cbits[c+cbit_digit];
264
-        break;
265
-
266
-        case OP_DIGIT:
267
-        for (c = 0; c < 32; c++)
268
-          start_bits[c] |= cd->cbits[c+cbit_digit];
269
-        break;
270
-
271
-        case OP_NOT_WHITESPACE:
272
-        for (c = 0; c < 32; c++)
273
-          start_bits[c] |= ~cd->cbits[c+cbit_space];
274
-        break;
275
-
276
-        case OP_WHITESPACE:
277
-        for (c = 0; c < 32; c++)
278
-          start_bits[c] |= cd->cbits[c+cbit_space];
279
-        break;
280
-
281
-        case OP_NOT_WORDCHAR:
282
-        for (c = 0; c < 32; c++)
283
-          start_bits[c] |= ~cd->cbits[c+cbit_word];
284
-        break;
285
-
286
-        case OP_WORDCHAR:
287
-        for (c = 0; c < 32; c++)
288
-          start_bits[c] |= cd->cbits[c+cbit_word];
289
-        break;
290
-        }
291
-
292
-      tcode += 2;
293
-      break;
294
-
295
-      /* Character class where all the information is in a bit map: set the
296
-      bits and either carry on or not, according to the repeat count. If it was
297
-      a negative class, and we are operating with UTF-8 characters, any byte
298
-      with the top-bit set is a potentially valid starter because it may start
299
-      a character with a value > 255. (This is sub-optimal in that the
300
-      character may be in the range 128-255, and those characters might be
301
-      unwanted, but that's as far as we go for the moment.) */
302
-
303
-      case OP_NCLASS:
304
-      if (utf8) memset(start_bits+16, 0xff, 16);
305
-      /* Fall through */
306
-
307
-      case OP_CLASS:
308
-        {
309
-        tcode++;
310
-        for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
311
-        tcode += 32;
312
-        switch (*tcode)
313
-          {
314
-          case OP_CRSTAR:
315
-          case OP_CRMINSTAR:
316
-          case OP_CRQUERY:
317
-          case OP_CRMINQUERY:
318
-          tcode++;
319
-          break;
320
-
321
-          case OP_CRRANGE:
322
-          case OP_CRMINRANGE:
323
-          if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
324
-            else try_next = FALSE;
325
-          break;
326
-
327
-          default:
328
-          try_next = FALSE;
329
-          break;
330
-          }
331
-        }
332
-      break; /* End of bitmap class handling */
333
-
334
-      }      /* End of switch */
335
-    }        /* End of try_next loop */
336
-
337
-  code += GET(code, 1);   /* Advance to next branch */
338
-  }
339
-while (*code == OP_ALT);
340
-return TRUE;
184
+                    break;
185
+                    
186
+                    /* At least one single char sets the bit and stops */
187
+                    
188
+                case OP_EXACT:       /* Fall through */
189
+                    tcode++;
190
+                    
191
+                case OP_CHARS:       /* Fall through */
192
+                    tcode++;
193
+                    
194
+                case OP_PLUS:
195
+                case OP_MINPLUS:
196
+                    set_bit(start_bits, tcode[1], caseless, cd);
197
+                    try_next = FALSE;
198
+                    break;
199
+                    
200
+                    /* Single character type sets the bits and stops */
201
+                    
202
+                case OP_NOT_DIGIT:
203
+                    for (c = 0; c < 32; c++)
204
+                        start_bits[c] |= ~cd->cbits[c+cbit_digit];
205
+                    try_next = FALSE;
206
+                    break;
207
+                    
208
+                case OP_DIGIT:
209
+                    for (c = 0; c < 32; c++)
210
+                        start_bits[c] |= cd->cbits[c+cbit_digit];
211
+                    try_next = FALSE;
212
+                    break;
213
+                    
214
+                case OP_NOT_WHITESPACE:
215
+                    for (c = 0; c < 32; c++)
216
+                        start_bits[c] |= ~cd->cbits[c+cbit_space];
217
+                    try_next = FALSE;
218
+                    break;
219
+                    
220
+                case OP_WHITESPACE:
221
+                    for (c = 0; c < 32; c++)
222
+                        start_bits[c] |= cd->cbits[c+cbit_space];
223
+                    try_next = FALSE;
224
+                    break;
225
+                    
226
+                case OP_NOT_WORDCHAR:
227
+                    for (c = 0; c < 32; c++)
228
+                        start_bits[c] |= ~cd->cbits[c+cbit_word];
229
+                    try_next = FALSE;
230
+                    break;
231
+                    
232
+                case OP_WORDCHAR:
233
+                    for (c = 0; c < 32; c++)
234
+                        start_bits[c] |= cd->cbits[c+cbit_word];
235
+                    try_next = FALSE;
236
+                    break;
237
+                    
238
+                    /* One or more character type fudges the pointer and restarts, knowing
239
+                     it will hit a single character type and stop there. */
240
+                    
241
+                case OP_TYPEPLUS:
242
+                case OP_TYPEMINPLUS:
243
+                    tcode++;
244
+                    break;
245
+                    
246
+                case OP_TYPEEXACT:
247
+                    tcode += 3;
248
+                    break;
249
+                    
250
+                    /* Zero or more repeats of character types set the bits and then
251
+                     try again. */
252
+                    
253
+                case OP_TYPEUPTO:
254
+                case OP_TYPEMINUPTO:
255
+                    tcode += 2;               /* Fall through */
256
+                    
257
+                case OP_TYPESTAR:
258
+                case OP_TYPEMINSTAR:
259
+                case OP_TYPEQUERY:
260
+                case OP_TYPEMINQUERY:
261
+                    switch(tcode[1])
262
+                {
263
+                    case OP_NOT_DIGIT:
264
+                        for (c = 0; c < 32; c++)
265
+                            start_bits[c] |= ~cd->cbits[c+cbit_digit];
266
+                        break;
267
+                        
268
+                    case OP_DIGIT:
269
+                        for (c = 0; c < 32; c++)
270
+                            start_bits[c] |= cd->cbits[c+cbit_digit];
271
+                        break;
272
+                        
273
+                    case OP_NOT_WHITESPACE:
274
+                        for (c = 0; c < 32; c++)
275
+                            start_bits[c] |= ~cd->cbits[c+cbit_space];
276
+                        break;
277
+                        
278
+                    case OP_WHITESPACE:
279
+                        for (c = 0; c < 32; c++)
280
+                            start_bits[c] |= cd->cbits[c+cbit_space];
281
+                        break;
282
+                        
283
+                    case OP_NOT_WORDCHAR:
284
+                        for (c = 0; c < 32; c++)
285
+                            start_bits[c] |= ~cd->cbits[c+cbit_word];
286
+                        break;
287
+                        
288
+                    case OP_WORDCHAR:
289
+                        for (c = 0; c < 32; c++)
290
+                            start_bits[c] |= cd->cbits[c+cbit_word];
291
+                        break;
292
+                }
293
+                    
294
+                    tcode += 2;
295
+                    break;
296
+                    
297
+                    /* Character class where all the information is in a bit map: set the
298
+                     bits and either carry on or not, according to the repeat count. If it was
299
+                     a negative class, and we are operating with UTF-8 characters, any byte
300
+                     with the top-bit set is a potentially valid starter because it may start
301
+                     a character with a value > 255. (This is sub-optimal in that the
302
+                     character may be in the range 128-255, and those characters might be
303
+                     unwanted, but that's as far as we go for the moment.) */
304
+                    
305
+                case OP_NCLASS:
306
+                    if (utf8) memset(start_bits+16, 0xff, 16);
307
+                    /* Fall through */
308
+                    
309
+                case OP_CLASS:
310
+                {
311
+                    tcode++;
312
+                    for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
313
+                    tcode += 32;
314
+                    switch (*tcode)
315
+                    {
316
+                        case OP_CRSTAR:
317
+                        case OP_CRMINSTAR:
318
+                        case OP_CRQUERY:
319
+                        case OP_CRMINQUERY:
320
+                            tcode++;
321
+                            break;
322
+                            
323
+                        case OP_CRRANGE:
324
+                        case OP_CRMINRANGE:
325
+                            if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
326
+                            else try_next = FALSE;
327
+                            break;
328
+                            
329
+                        default:
330
+                            try_next = FALSE;
331
+                            break;
332
+                    }
333
+                }
334
+                    break; /* End of bitmap class handling */
335
+                    
336
+            }      /* End of switch */
337
+        }        /* End of try_next loop */
338
+        
339
+        code += GET(code, 1);   /* Advance to next branch */
340
+    }
341
+    while (*code == OP_ALT);
342
+    return TRUE;
341 343
 }
342 344
 
343 345
 
344 346
 
345 347
 /*************************************************
346
-*          Study a compiled expression           *
347
-*************************************************/
348
+ *          Study a compiled expression           *
349
+ *************************************************/
348 350
 
349 351
 /* This function is handed a compiled expression that it must study to produce
350
-information that will speed up the matching. It returns a pcre_extra block
351
-which then gets handed back to pcre_exec().
352
-
353
-Arguments:
354
-  re        points to the compiled expression
355
-  options   contains option bits
356
-  errorptr  points to where to place error messages;
357
-            set NULL unless error
358
-
359
-Returns:    pointer to a pcre_extra block, with study_data filled in and the
360
-              appropriate flag set;
361
-            NULL on error or if no optimization possible
362
-*/
352
+ information that will speed up the matching. It returns a pcre_extra block
353
+ which then gets handed back to pcre_exec().
354
+ 
355
+ Arguments:
356
+ re        points to the compiled expression
357
+ options   contains option bits
358
+ errorptr  points to where to place error messages;
359
+ set NULL unless error
360
+ 
361
+ Returns:    pointer to a pcre_extra block, with study_data filled in and the
362
+ appropriate flag set;
363
+ NULL on error or if no optimization possible
364
+ */
363 365
 
364 366
 pcre_extra *
365 367
 pcre_study(const pcre *external_re, int options, const char **errorptr)
366 368
 {
367
-uschar start_bits[32];
368
-pcre_extra *extra;
369
-pcre_study_data *study;
370
-const real_pcre *re = (const real_pcre *)external_re;
371
-uschar *code = (uschar *)re + sizeof(real_pcre) +
372
-  (re->name_count * re->name_entry_size);
373
-compile_data compile_block;
374
-
375
-*errorptr = NULL;
376
-
377
-if (re == NULL || re->magic_number != MAGIC_NUMBER)
378
-  {
379
-  *errorptr = "argument is not a compiled regular expression";
380
-  return NULL;
381
-  }
382
-
383
-if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
384
-  {
385
-  *errorptr = "unknown or incorrect option bit(s) set";
386
-  return NULL;
387
-  }
388
-
389
-/* For an anchored pattern, or an unanchored pattern that has a first char, or
390
-a multiline pattern that matches only at "line starts", no further processing
391
-at present. */
392
-
393
-if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
394
-  return NULL;
395
-
396
-/* Set the character tables in the block which is passed around */
397
-
398
-compile_block.lcc = re->tables + lcc_offset;
399
-compile_block.fcc = re->tables + fcc_offset;
400
-compile_block.cbits = re->tables + cbits_offset;
401
-compile_block.ctypes = re->tables + ctypes_offset;
402
-
403
-/* See if we can find a fixed set of initial characters for the pattern. */
404
-
405
-memset(start_bits, 0, 32 * sizeof(uschar));
406
-if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
407
-  (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
408
-
409
-/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
410
-the latter, which is pointed to by the former, which may also get additional
411
-data set later by the calling program. At the moment, the size of
412
-pcre_study_data is fixed. We nevertheless save it in a field for returning via
413
-the pcre_fullinfo() function so that if it becomes variable in the future, we
414
-don't have to change that code. */
415
-
416
-extra = (pcre_extra *)(pcre_malloc)
417
-  (sizeof(pcre_extra) + sizeof(pcre_study_data));
418
-
419
-if (extra == NULL)
420
-  {
421
-  *errorptr = "failed to get memory";
422
-  return NULL;
423
-  }
424
-
425
-study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
426
-extra->flags = PCRE_EXTRA_STUDY_DATA;
427
-extra->study_data = study;
428
-
429
-study->size = sizeof(pcre_study_data);
430
-study->options = PCRE_STUDY_MAPPED;
431
-memcpy(study->start_bits, start_bits, sizeof(start_bits));
432
-
433
-return extra;
369
+    uschar start_bits[32];
370
+    pcre_extra *extra;
371
+    pcre_study_data *study;
372
+    const real_pcre *re = (const real_pcre *)external_re;
373
+    uschar *code = (uschar *)re + sizeof(real_pcre) +
374
+    (re->name_count * re->name_entry_size);
375
+    compile_data compile_block;
376
+    
377
+    *errorptr = NULL;
378
+    
379
+    if (re == NULL || re->magic_number != MAGIC_NUMBER)
380
+    {
381
+        *errorptr = "argument is not a compiled regular expression";
382
+        return NULL;
383
+    }
384
+    
385
+    if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
386
+    {
387
+        *errorptr = "unknown or incorrect option bit(s) set";
388
+        return NULL;
389
+    }
390
+    
391
+    /* For an anchored pattern, or an unanchored pattern that has a first char, or
392
+     a multiline pattern that matches only at "line starts", no further processing
393
+     at present. */
394
+    
395
+    if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
396
+        return NULL;
397
+    
398
+    /* Set the character tables in the block which is passed around */
399
+    
400
+    compile_block.lcc = re->tables + lcc_offset;
401
+    compile_block.fcc = re->tables + fcc_offset;
402
+    compile_block.cbits = re->tables + cbits_offset;
403
+    compile_block.ctypes = re->tables + ctypes_offset;
404
+    
405
+    /* See if we can find a fixed set of initial characters for the pattern. */
406
+    
407
+    memset(start_bits, 0, 32 * sizeof(uschar));
408
+    if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
409
+                        (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
410
+    
411
+    /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
412
+     the latter, which is pointed to by the former, which may also get additional
413
+     data set later by the calling program. At the moment, the size of
414
+     pcre_study_data is fixed. We nevertheless save it in a field for returning via
415
+     the pcre_fullinfo() function so that if it becomes variable in the future, we
416
+     don't have to change that code. */
417
+    
418
+    extra = (pcre_extra *)(pcre_malloc)
419
+    (sizeof(pcre_extra) + sizeof(pcre_study_data));
420
+    
421
+    if (extra == NULL)
422
+    {
423
+        *errorptr = "failed to get memory";
424
+        return NULL;
425
+    }
426
+    
427
+    study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
428
+    extra->flags = PCRE_EXTRA_STUDY_DATA;
429
+    extra->study_data = study;
430
+    
431
+    study->size = sizeof(pcre_study_data);
432
+    study->options = PCRE_STUDY_MAPPED;
433
+    memcpy(study->start_bits, start_bits, sizeof(start_bits));
434
+    
435
+    return extra;
434 436
 }
435 437
 
436 438
 /* End of study.c */
... ...
@@ -13,7 +13,7 @@
13 13
 #import "AGRegex.h"
14 14
 
15 15
 @interface CMetadataImporter : NSObject {
16
-
16
+    
17 17
 }
18 18
 
19 19
 // "Public" methods.
... ...
@@ -68,58 +68,58 @@ static AGRegex *LispDefvar_RE = nil;
68 68
 // One-time setup of the importer's file-level state: reads the DebugLevel and
 // MaxSourceSizeToIndex entries from the bundle's Info.plist and precompiles
 // the Lisp definition regexes. Calls after the first return immediately
 // (after logging), guarded by StaticDataIsInitialized.
 - (void)initStaticData
 {
     NSLog(@"Import Lisp");
     if (StaticDataIsInitialized)
     {
         return;
     }

     StaticDataIsInitialized = YES;

     // Find the bundle, and Info.plist.  Set the debug level specified
     // there, as well as the maximum file length to index.
     NSBundle *theBundle = [NSBundle bundleForClass:[self class]];

     // Info.plist values can be any property-list type, so check the type
     // before handing the value to code that expects a string.
     id debugLevelObj = [theBundle objectForInfoDictionaryKey:@"DebugLevel"];
     if ([debugLevelObj isKindOfClass:[NSString class]])
     {
         SetDebugLogLevel(DebugLevelNameToValue((NSString *)debugLevelObj));
     }
     else if (debugLevelObj != nil)
     {
         NSLog(@"Ignoring DebugLevel: expected a string, got %@", [debugLevelObj class]);
     }

     // Accept anything that can produce an int (NSNumber and NSString both
     // respond to -intValue); a missing or unusable entry leaves max at 0 and
     // keeps the current MaxSourceSize.
     id maxSourceSizeObj = [theBundle objectForInfoDictionaryKey:@"MaxSourceSizeToIndex"];
     int max = 0;
     if ([maxSourceSizeObj respondsToSelector:@selector(intValue)])
     {
         max = [(NSNumber *)maxSourceSizeObj intValue];
     }

     if (max != 0)
     {
         DebugLog(DEBUG_LEVEL_DEBUG, @"Using MaxSourceSize=%d", max);
         MaxSourceSize = max;
     }
     else
     {
         NSLog(@"Error parsing MaxSourceSizeToIndex, using %d", MaxSourceSize);
     }

     // Precompile our regexes.
     LispDef1_RE = [[AGRegex alloc] initWithPattern:LispDef1_pat];
     LispDef2_RE = [[AGRegex alloc] initWithPattern:LispDef2_pat];
     LispDefun_RE = [[AGRegex alloc] initWithPattern:LispDefun_pat];
     LispDefunsetf_RE = [[AGRegex alloc] initWithPattern:LispDefunsetf_pat];
     LispDefmethod_RE = [[AGRegex alloc] initWithPattern:LispDefmethod_pat];
     LispDefmethodsetf_RE = [[AGRegex alloc] initWithPattern:LispDefmethodsetf_pat];
     LispDefgeneric_RE = [[AGRegex alloc] initWithPattern:LispDefgeneric_pat];
     LispDefgenericsetf_RE = [[AGRegex alloc] initWithPattern:LispDefgenericsetf_pat];
     LispDefclass_RE = [[AGRegex alloc] initWithPattern:LispDefclass_pat];
     LispDefstruct_RE = [[AGRegex alloc] initWithPattern:LispDefstruct_pat];
     LispDefvar_RE = [[AGRegex alloc] initWithPattern:LispDefvar_pat];
     LispDefmacro_RE = [[AGRegex alloc] initWithPattern:LispDefmacro_pat];

     DebugLog(DEBUG_LEVEL_DEBUG, @"Static data has been initialized.");
 }
116 116
 
117 117
 
118 118
 
119 119
 // Candidate text encodings for source files, in the order they are attempted.
 static NSStringEncoding PossibleSourceTextEncodings[] = {
     NSUTF8StringEncoding,
     NSMacOSRomanStringEncoding,
     NSISOLatin1StringEncoding,
     NSWindowsCP1252StringEncoding
 };
123 123
 
124 124
 // Tries to read the file using the encodings specified in
125 125
 // PossibleSourceTextEncodings, in order, until one succeeds.
... ...
@@ -131,51 +131,51 @@ static NSStringEncoding PossibleSourceTextEncodings[] = {	NSUTF8StringEncoding,
131 131
 
132 132
 // Reads pathToFile (at most MaxSourceSize bytes unless MaxSourceSize is
 // NO_MAXIMUM) and decodes it with the first entry of
 // PossibleSourceTextEncodings that succeeds.
 //
 // Returns an autoreleased string, or nil when the file could not be read
 // (theError is passed to the NSData read call) or when no candidate encoding
 // could decode the data (theError is not touched in that case).
 - (NSString*)readContentsOfFile:(NSString*)pathToFile error:(NSError**)theError
 {
     // Number of entries in the table. sizeof on the array alone yields its
     // size in bytes, which would run the loop past the last entry.
     const size_t encodingCount =
         sizeof(PossibleSourceTextEncodings) / sizeof(PossibleSourceTextEncodings[0]);
     size_t i;
     NSStringEncoding theEncoding;
     NSString *theSource = nil;
     NSData *data;

     DebugLog(DEBUG_LEVEL_DEBUG, @"Indexing %@", pathToFile);

     // Read the file.
     if (MaxSourceSize == NO_MAXIMUM)
     {
         data = [NSData dataWithContentsOfFile:pathToFile options:0 error:theError];
     }
     else
     {
         data = [NSData dataWithContentsOfFile:pathToFile maxSize:MaxSourceSize error:theError];
         if ([data length] == MaxSourceSize)
         {
             // This is not absolutely certain to be correct, since the file might just have been
             // MaxSourceSize bytes long.
             DebugLog(DEBUG_LEVEL_DEBUG, @"Truncated indexing of '%@' to %d bytes", pathToFile, MaxSourceSize);
         }
     }

     if (data == nil)
     {
         return nil;
     }

     // Try to convert the file contents to a string by trying the candidate
     // encodings, in order.
     for (i = 0; i < encodingCount; i++)
     {
         theEncoding = PossibleSourceTextEncodings[i];
         // NSStringEncoding is an unsigned long-sized integer; cast so the
         // format specifier matches on every architecture.
         DebugLog(DEBUG_LEVEL_VERBOSE, @"Trying encoding %lu", (unsigned long)theEncoding);
         theSource = [[[NSString alloc] initWithData:data encoding:theEncoding] autorelease];
         if (theSource != nil)
         {
             break;
         }
         else
         {
             DebugLog(DEBUG_LEVEL_DEBUG, @"Reading with encoding %lu failed.", (unsigned long)theEncoding);
         }
     }
     return theSource;
 }
180 180
 
181 181
 
... ...
@@ -184,17 +184,17 @@ static NSStringEncoding PossibleSourceTextEncodings[] = {	NSUTF8StringEncoding,
184 184
 
185 185
 - (BOOL)addMatchesTo:(NSMutableDictionary *)attributes fromLine:(NSString *)line usingRE:(AGRegex *)regex forKey:(NSString *)key
186 186
 {
187
-	AGRegexMatch *match = [regex findInString:line];
188
-	if (match)
189
-	{
190
-		NSString *name = [match groupAtIndex: 1];
191
-		[[attributes objectForKey:key] addObject:name];
192
-		return YES;
193
-	}
194
-	else
195
-	{
196
-		return NO;
197
-	}
187
+    AGRegexMatch *match = [regex findInString:line];
188
+    if (match)
189
+    {
190
+        NSString *name = [match groupAtIndex: 1];
191
+        [[attributes objectForKey:key] addObject:name];
192
+        return YES;
193
+    }
194
+    else
195
+    {
196
+        return NO;
197
+    }
198 198
 }
199 199
 
200 200
 
... ...
@@ -203,94 +203,94 @@ static NSStringEncoding PossibleSourceTextEncodings[] = {	NSUTF8StringEncoding,
203 203
 
204 204
 - (BOOL)importFile:(NSString *)inPathToFile contentType:(NSString *)inContentType attributes:(NSMutableDictionary *)inAttributes
205 205
 {
206
-	BOOL theResult = NO;
207
-	
208
-	@try
209
-	{
210
-		NSAutoreleasePool *theAutoreleasePool = [[NSAutoreleasePool alloc] init];
211
-		NSError *error = nil;
212
-		NSString *source;
213
-		
214
-		[self initStaticData];
215
-		
216
-		source = [self readContentsOfFile:inPathToFile error:&error];
217
-		if (source == nil)
218
-		{
219
-			if (error)
220
-			{
221
-				NSLog(@"Lisp Metadata Importer: Could not process file '%@': %@", inPathToFile, error);
222
-			}
223
-			else
224
-			{
225
-				NSLog(@"Lisp Metadata Importer: Could not process file '%@': unknown error", inPathToFile);
226
-			}	
227
-			return NO;
228
-		}
229
-		
230
-		// Only process the first MaxSourceSize of the file.  To try to do more
231
-		// invites the swapping death.
232
-		if ([source length] > MaxSourceSize)
233
-		{
234
-			source = [source substringToIndex:MaxSourceSize];
235
-		}
236
-		
237
-		NSMutableDictionary *moreAttributes = [[[NSMutableDictionary alloc] initWithCapacity:10] autorelease];
238
-		[moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_definitions"];
239
-		[moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defuns"];
240
-		[moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defmethods"];
241
-		[moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defgenerics"];
242
-		[moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defmacros"];
243
-		[moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defvars"];
244
-		[moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defclasses"];
245
-		[moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defstructs"];
246
-		
247
-		
248
-		// Divide the file contents into lines, using either CR or LF to end a line.
249
-		NSCharacterSet *eol = [NSCharacterSet characterSetWithCharactersInString:@"\n\r"];
250
-		NSArray *lines = [source componentsSeparatedByCharacterFromSet:eol];
251
-		
252
-		NSEnumerator *theEnum = [lines objectEnumerator];
253
-		NSString *theLine;
254
-		
255
-		while (nil != (theLine = [theEnum nextObject]))
256
-		{
257
-			// The following check speeds the indexer up by roughly 6x.
258
-			if (([theLine length] > 0) && ([theLine characterAtIndex:0] == '('))
259
-			{
260
-				if (![self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDef1_RE forKey:@"org_lisp_definitions"])
261
-				{
262
-					// The first expression didn't fire, try the second one.
263
-					[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDef2_RE forKey:@"org_lisp_definitions"];
264
-				}
265
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefun_RE forKey:@"org_lisp_defuns"];
266
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefunsetf_RE forKey:@"org_lisp_defuns"];
267
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefmethod_RE forKey:@"org_lisp_defmethods"];
268
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefmethodsetf_RE forKey:@"org_lisp_defmethods"];
269
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefgeneric_RE forKey:@"org_lisp_defgenerics"];
270
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefgenericsetf_RE forKey:@"org_lisp_defgenerics"];
271
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefmacro_RE forKey:@"org_lisp_defmacros"];
272
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefvar_RE forKey:@"org_lisp_defvars"];
273
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefclass_RE forKey:@"org_lisp_defclasses"];
274
-				[self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefstruct_RE forKey:@"org_lisp_defstructs"];
275
-			}
276
-			
277
-		}
278
-		
279
-		// Add the complete source code as metadata.
280
-		[moreAttributes setObject:source forKey:@"kMDItemTextContent"];
281
-		
282
-		[inAttributes addEntriesFromDictionary:moreAttributes];
283
-		theResult = YES;
284
-		[theAutoreleasePool release];
285
-	}
286
-	@catch (NSException *localException)
287
-	{
288
-		NSLog(@"Lisp Metadata Importer: Could not process file '%@' (Exception: %@)", inPathToFile, localException);
289
-	}
290
-	@finally
291
-	{
292
-	}
293
-	return(theResult);
206
+    BOOL theResult = NO;
207
+    
208
+    @try
209
+    {
210
+        NSAutoreleasePool *theAutoreleasePool = [[NSAutoreleasePool alloc] init];
211
+        NSError *error = nil;
212
+        NSString *source;
213
+        
214
+        [self initStaticData];
215
+        
216
+        source = [self readContentsOfFile:inPathToFile error:&error];
217
+        if (source == nil)
218
+        {
219
+            if (error)
220
+            {
221
+                NSLog(@"Lisp Metadata Importer: Could not process file '%@': %@", inPathToFile, error);
222
+            }
223
+            else
224
+            {
225
+                NSLog(@"Lisp Metadata Importer: Could not process file '%@': unknown error", inPathToFile);
226
+            }	
227
+            return NO;
228
+        }
229
+        
230
+        // Only process the first MaxSourceSize of the file.  To try to do more
231
+        // invites the swapping death.
232
+        if ([source length] > MaxSourceSize)
233
+        {
234
+            source = [source substringToIndex:MaxSourceSize];
235
+        }
236
+        
237
+        NSMutableDictionary *moreAttributes = [[[NSMutableDictionary alloc] initWithCapacity:10] autorelease];
238
+        [moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_definitions"];
239
+        [moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defuns"];
240
+        [moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defmethods"];
241
+        [moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defgenerics"];
242
+        [moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defmacros"];
243
+        [moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defvars"];
244
+        [moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defclasses"];
245
+        [moreAttributes setObject:[NSMutableArray arrayWithCapacity:100] forKey:@"org_lisp_defstructs"];
246
+        
247
+        
248
+        // Divide the file contents into lines, using either CR or LF to end a line.
249
+        NSCharacterSet *eol = [NSCharacterSet characterSetWithCharactersInString:@"\n\r"];
250
+        NSArray *lines = [source componentsSeparatedByCharacterFromSet:eol];
251
+        
252
+        NSEnumerator *theEnum = [lines objectEnumerator];
253
+        NSString *theLine;
254
+        
255
+        while (nil != (theLine = [theEnum nextObject]))
256
+        {
257
+            // The following check speeds the indexer up by roughly 6x.
258
+            if (([theLine length] > 0) && ([theLine characterAtIndex:0] == '('))
259
+            {
260
+                if (![self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDef1_RE forKey:@"org_lisp_definitions"])
261
+                {
262
+                    // The first expression didn't fire, try the second one.
263
+                    [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDef2_RE forKey:@"org_lisp_definitions"];
264
+                }
265
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefun_RE forKey:@"org_lisp_defuns"];
266
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefunsetf_RE forKey:@"org_lisp_defuns"];
267
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefmethod_RE forKey:@"org_lisp_defmethods"];
268
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefmethodsetf_RE forKey:@"org_lisp_defmethods"];
269
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefgeneric_RE forKey:@"org_lisp_defgenerics"];
270
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefgenericsetf_RE forKey:@"org_lisp_defgenerics"];
271
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefmacro_RE forKey:@"org_lisp_defmacros"];
272
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefvar_RE forKey:@"org_lisp_defvars"];
273
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefclass_RE forKey:@"org_lisp_defclasses"];
274
+                [self addMatchesTo:moreAttributes fromLine:theLine usingRE:LispDefstruct_RE forKey:@"org_lisp_defstructs"];
275
+            }
276
+            
277
+        }
278
+        
279
+        // Add the complete source code as metadata.
280
+        [moreAttributes setObject:source forKey:@"kMDItemTextContent"];
281
+        
282
+        [inAttributes addEntriesFromDictionary:moreAttributes];
283
+        theResult = YES;
284
+        [theAutoreleasePool release];
285
+    }
286
+    @catch (NSException *localException)
287
+    {
288
+        NSLog(@"Lisp Metadata Importer: Could not process file '%@' (Exception: %@)", inPathToFile, localException);
289
+    }
290
+    @finally
291
+    {
292
+    }
293
+    return(theResult);
294 294
 }
295 295
 
296 296
 @end
... ...
@@ -7,9 +7,9 @@
7 7
 
8 8
 
9 9
 typedef enum {
10
-	DEBUG_LEVEL_VERBOSE = 0,
11
-	DEBUG_LEVEL_DEBUG = 1,
12
-	DEBUG_LEVEL_OFF = 2,
10
+    DEBUG_LEVEL_VERBOSE = 0,
11
+    DEBUG_LEVEL_DEBUG = 1,
12
+    DEBUG_LEVEL_OFF = 2,
13 13
 } DebugLevel;
14 14
 
15 15
 DebugLevel DebugLevelNameToValue(NSString *name);
... ...
@@ -13,48 +13,48 @@ static DebugLevel GlobalDebugLevel = DEBUG_LEVEL_OFF;
13 13
 
14 14
 DebugLevel DebugLevelNameToValue(NSString *name)
15 15
 {
16
-	DebugLevel level = DEBUG_LEVEL_OFF;
17
-	
18
-	if ([name isEqualToString:@"DEBUG_LEVEL_VERBOSE"])
19
-	{
20
-		level = DEBUG_LEVEL_VERBOSE;
21
-	}
22
-	else if ([name isEqualToString:@"DEBUG_LEVEL_DEBUG"])
23
-	{
24
-		level = DEBUG_LEVEL_DEBUG;
25
-	}
26
-	else if ([name isEqualToString:@"DEBUG_LEVEL_OFF"])
27
-	{
28
-		level = DEBUG_LEVEL_OFF;
29
-	}
30
-	return level;
16
+    DebugLevel level = DEBUG_LEVEL_OFF;
17
+    
18
+    if ([name isEqualToString:@"DEBUG_LEVEL_VERBOSE"])
19
+    {
20
+        level = DEBUG_LEVEL_VERBOSE;
21
+    }
22
+    else if ([name isEqualToString:@"DEBUG_LEVEL_DEBUG"])
23
+    {
24
+        level = DEBUG_LEVEL_DEBUG;
25
+    }
26
+    else if ([name isEqualToString:@"DEBUG_LEVEL_OFF"])
27
+    {
28
+        level = DEBUG_LEVEL_OFF;
29
+    }
30
+    return level;
31 31
 }
32 32
 
33 33
 void SetDebugLogLevel(DebugLevel theLevel)
34 34
 {
35
-	GlobalDebugLevel = theLevel;
35
+    GlobalDebugLevel = theLevel;
36 36
 }
37 37
 
38 38
 void DebugLog(DebugLevel level, NSString *format, ...)
39 39
 {
40
-	if (level >= GlobalDebugLevel)
41
-	{
42
-		// get a reference to the arguments on the stack that follow
43
-		// the format paramter
44
-		va_list argList;
45
-		va_start (argList, format);
46
-		
47
-		// NSString luckily provides us with this handy method which
48
-		// will do all the work for us, including %@
49
-		NSString *string;
50
-		string = [[NSString alloc] initWithFormat: format
51
-										arguments: argList];
52
-		va_end  (argList);
53
-		
54
-		// Log it.
55
-		NSLog(@"%@", string);
56
-		
57
-		[string release];
58
-	}
40
+    if (level >= GlobalDebugLevel)
41
+    {
42
+        // get a reference to the arguments on the stack that follow
43
+        // the format paramter
44
+        va_list argList;
45
+        va_start (argList, format);
46
+        
47
+        // NSString luckily provides us with this handy method which
48
+        // will do all the work for us, including %@
49
+        NSString *string;
50
+        string = [[NSString alloc] initWithFormat: format
51
+                                        arguments: argList];
52
+        va_end  (argList);
53
+        
54
+        // Log it.
55
+        NSLog(@"%@", string);
56
+        
57
+        [string release];
58
+    }
59 59
 }
60 60
 
... ...
@@ -14,23 +14,23 @@
14 14
 
15 15
 Boolean GetMetadataForFile(void* thisInterface, NSMutableDictionary *attributes, NSString *contentTypeUTI, NSString *pathToFile)
16 16
 {
17
-	BOOL theResult = NO;
18
-	NSAutoreleasePool *theAutoreleasePool = [[NSAutoreleasePool alloc] init];
19
-	
20
-	@try
21
-	{
22
-		CMetadataImporter *theImporter = [[[CMetadataImporter alloc] init] autorelease];
23
-		theResult = [theImporter importFile:pathToFile contentType:contentTypeUTI attributes:attributes];
24
-	}
25
-	@catch (NSException *localException)
26
-	{
27
-		NSLog(@"Exception caught during import operation: %@", localException);
28
-	}
29
-	@finally
30
-	{
31
-	}
32
-	
33
-	[theAutoreleasePool release];
34
-	
35
-	return(theResult);;
17
+    BOOL theResult = NO;
18
+    NSAutoreleasePool *theAutoreleasePool = [[NSAutoreleasePool alloc] init];
19
+    
20
+    @try
21
+    {
22
+        CMetadataImporter *theImporter = [[[CMetadataImporter alloc] init] autorelease];
23
+        theResult = [theImporter importFile:pathToFile contentType:contentTypeUTI attributes:attributes];
24
+    }
25
+    @catch (NSException *localException)
26
+    {
27
+        NSLog(@"Exception caught during import operation: %@", localException);
28
+    }
29
+    @finally
30
+    {
31
+    }
32
+    
33
+    [theAutoreleasePool release];
34
+    
35
+    return(theResult);;
36 36
 }
... ...
@@ -19,140 +19,140 @@
19 19
 
20 20
 static BOOL readContentsOfFile(NSString* path, void** buf, unsigned int maxLen, unsigned int* len, NSZone* zone)
21 21
 {
22
-	const char	*thePath = 0;
23
-	FILE		*theFile = 0;
24
-	void		*tmp = 0;
25
-	int			c;
26
-	long		fileLength;
27
-	
28
-	thePath = [path fileSystemRepresentation];
29
-	if (thePath == 0)
22
+    const char	*thePath = 0;
23
+    FILE		*theFile = 0;
24
+    void		*tmp = 0;
25
+    int			c;
26
+    long		fileLength;
27
+    
28
+    thePath = [path fileSystemRepresentation];
29
+    if (thePath == 0)
30 30
     {
31
-		//      NSWarnFLog(@"Open (%@) attempt failed - bad path", path);
32
-		return NO;
31
+        //      NSWarnFLog(@"Open (%@) attempt failed - bad path", path);
32
+        return NO;
33 33
     }
34
-	
35
-	theFile = fopen(thePath, "rb");
36
-	
37
-	if (theFile == 0)		/* We failed to open the file. */
34
+    
35
+    theFile = fopen(thePath, "rb");
36
+    
37
+    if (theFile == 0)		/* We failed to open the file. */
38 38
     {
39
-		//      NSWarnFLog(@"Open (%@) attempt failed - %s", path,
40
-		//      GSLastErrorStr(errno));
41
-		goto failure;
39
+        //      NSWarnFLog(@"Open (%@) attempt failed - %s", path,
40
+        //      GSLastErrorStr(errno));
41
+        goto failure;
42 42
     }
43
-	
44
-	/*
45
-	 *	Seek to the end of the file.
46
-	 */
47
-	c = fseek(theFile, 0L, SEEK_END);
48
-	if (c != 0)
43
+    
44
+    /*
45
+     *	Seek to the end of the file.
46
+     */
47
+    c = fseek(theFile, 0L, SEEK_END);
48
+    if (c != 0)
49 49
     {
50
-		//      NSWarnFLog(@"Seek to end of file (%@) failed - %s", path,
51
-		//      GSLastErrorStr(errno));
52
-		goto failure;
50
+        //      NSWarnFLog(@"Seek to end of file (%@) failed - %s", path,
51
+        //      GSLastErrorStr(errno));
52
+        goto failure;
53 53
     }
54
-	
55
-	/*
56
-	 *	Determine the length of the file (having seeked to the end of the
57
-										  *	file) by calling ftell().
58
-	 */
59
-	fileLength = ftell(theFile);
60
-	if (fileLength == -1)
54
+    
55
+    /*
56
+     *	Determine the length of the file (having seeked to the end of the
57
+     *	file) by calling ftell().
58
+     */
59
+    fileLength = ftell(theFile);
60
+    if (fileLength == -1)
61 61
     {
62
-		//      NSWarnFLog(@"Ftell on %@ failed - %s", path,
63
-		//      GSLastErrorStr(errno));
64
-		goto failure;
62
+        //      NSWarnFLog(@"Ftell on %@ failed - %s", path,
63
+        //      GSLastErrorStr(errno));
64
+        goto failure;
65 65
     }
66
-	
67
-	/*
68
-	 *	Rewind the file pointer to the beginning, preparing to read in
69
-	 *	the file.
70
-	 */
71
-	c = fseek(theFile, 0L, SEEK_SET);
72
-	if (c != 0)
66
+    
67
+    /*
68
+     *	Rewind the file pointer to the beginning, preparing to read in
69
+     *	the file.
70
+     */
71
+    c = fseek(theFile, 0L, SEEK_SET);
72
+    if (c != 0)
73 73
     {
74
-		//      NSWarnFLog(@"Fseek to start of file (%@) failed - %s", path,
75
-		//      GSLastErrorStr(errno));
76
-		goto failure;
74
+        //      NSWarnFLog(@"Fseek to start of file (%@) failed - %s", path,
75
+        //      GSLastErrorStr(errno));
76
+        goto failure;
77 77
     }
78
-	
79
-	if (fileLength == 0)
78
+    
79
+    if (fileLength == 0)
80 80
     {
81
-		unsigned char	buf[BUFSIZ];
82
-		unsigned bytesToRead = maxLen;
83
-		/*
84
-		 * Special case ... a file of length zero may be a named pipe or some
85
-		 * file in the /proc filesystem, which will return us data if we read
86
-		 * from it ... so we try reading as much as we can, up to the specified
87
-		 * limit.
88
-		 */
89
-		while ((c = fread(buf, 1, (bytesToRead < BUFSIZ) ? bytesToRead : BUFSIZ, theFile)) != 0)
90
-		{
91
-			if (tmp == 0)
92
-			{
93
-				tmp = NSZoneMalloc(zone, c);
94
-			}
95
-			else
96
-			{
97
-				tmp = NSZoneRealloc(zone, tmp, fileLength + c);
98
-			}
99
-			if (tmp == 0)
100
-			{
101
-				//	      NSLog(@"Malloc failed for file (%@) of length %d - %s", path,
102
-				//		fileLength + c, GSLastErrorStr(errno));
103
-				goto failure;
104
-			}
105
-			memcpy(tmp + fileLength, buf, c);
106
-			fileLength += c;
107
-			bytesToRead -= c;
108
-		}
109
-		if (fileLength == maxLen)
110
-		{
111
-			DebugLog(DEBUG_LEVEL_DEBUG, @"Truncated indexing of %s to %d bytes", thePath, maxLen);
112
-		}
81
+        unsigned char	buf[BUFSIZ];
82
+        unsigned bytesToRead = maxLen;
83
+        /*
84
+         * Special case ... a file of length zero may be a named pipe or some
85
+         * file in the /proc filesystem, which will return us data if we read
86
+         * from it ... so we try reading as much as we can, up to the specified
87
+         * limit.
88
+         */
89
+        while ((c = fread(buf, 1, (bytesToRead < BUFSIZ) ? bytesToRead : BUFSIZ, theFile)) != 0)
90
+        {
91
+            if (tmp == 0)
92
+            {
93
+                tmp = NSZoneMalloc(zone, c);
94
+            }
95
+            else
96
+            {
97
+                tmp = NSZoneRealloc(zone, tmp, fileLength + c);
98
+            }
99
+            if (tmp == 0)
100
+            {
101
+                //	      NSLog(@"Malloc failed for file (%@) of length %d - %s", path,
102
+                //		fileLength + c, GSLastErrorStr(errno));
103
+                goto failure;
104
+            }
105
+            memcpy(tmp + fileLength, buf, c);
106
+            fileLength += c;
107
+            bytesToRead -= c;
108
+        }
109
+        if (fileLength == maxLen)
110
+        {
111
+            DebugLog(DEBUG_LEVEL_DEBUG, @"Truncated indexing of %s to %d bytes", thePath, maxLen);
112
+        }
113 113
     }
114
-	else
114
+    else
115 115
     {
116
-		if (fileLength > maxLen)
117
-		{
118
-			fileLength = maxLen;
119
-			DebugLog(DEBUG_LEVEL_DEBUG, @"Truncated indexing of %s to %d bytes", thePath, maxLen);
120
-		}
121
-		tmp = NSZoneMalloc(zone, fileLength);
122
-		if (tmp == 0)
123
-		{
124
-			//	  NSLog(@"Malloc failed for file (%@) of length %d - %s", path,
125
-			//	  fileLength, GSLastErrorStr(errno));
126
-			goto failure;
127
-		}
128
-	    
129
-		c = fread(tmp, 1, fileLength, theFile);
130
-		if (c != (int)fileLength)
131
-		{
132
-			//	  NSWarnFLog(@"read of file (%@) contents failed - %s", path,
133
-			//	  GSLastErrorStr(errno));
134
-			goto failure;
135
-		}
116
+        if (fileLength > maxLen)
117
+        {
118
+            fileLength = maxLen;
119
+            DebugLog(DEBUG_LEVEL_DEBUG, @"Truncated indexing of %s to %d bytes", thePath, maxLen);
120
+        }
121
+        tmp = NSZoneMalloc(zone, fileLength);
122
+        if (tmp == 0)
123
+        {
124
+            //	  NSLog(@"Malloc failed for file (%@) of length %d - %s", path,
125
+            //	  fileLength, GSLastErrorStr(errno));
126
+            goto failure;
127
+        }
128
+        
129
+        c = fread(tmp, 1, fileLength, theFile);
130
+        if (c != (int)fileLength)
131
+        {
132
+            //	  NSWarnFLog(@"read of file (%@) contents failed - %s", path,
133
+            //	  GSLastErrorStr(errno));
134
+            goto failure;
135
+        }
136 136
     }
137
-	
138
-	*buf = tmp;
139
-	*len = fileLength;
140
-	fclose(theFile);
141
-	return YES;
142
-	
143
-	/*
144
-	 *	Just in case the failure action needs to be changed.
145
-	 */
137
+    
138
+    *buf = tmp;
139
+    *len = fileLength;
140
+    fclose(theFile);
141
+    return YES;
142
+    
143
+    /*
144
+     *	Just in case the failure action needs to be changed.
145
+     */
146 146
 failure:
147
-		if (tmp != 0)
148
-		{
149
-			NSZoneFree(zone, tmp);
150
-		}
151
-	if (theFile != 0)
147
+    if (tmp != 0)
152 148
     {
153
-		fclose(theFile);
149
+        NSZoneFree(zone, tmp);
154 150
     }
155
-	return NO;
151
+    if (theFile != 0)
152
+    {
153
+        fclose(theFile);
154
+    }
155
+    return NO;
156 156
 }
157 157
 
158 158
 @implementation NSData (NSData_Extensions)
... ...
@@ -167,11 +167,11 @@ failure:
167 167
  */
168 168
 + (id) dataWithContentsOfFile: (NSString*)path maxSize:(int)theMaxSize error:(NSError**)error
169 169
 {
170
-	NSData	*d;
171
-	
172
-	d = [NSData allocWithZone: NSDefaultMallocZone()];
173
-	d = [d initWithContentsOfFile: path maxSize:theMaxSize error:error];
174
-	return [d autorelease];
170
+    NSData	*d;
171
+    
172
+    d = [NSData allocWithZone: NSDefaultMallocZone()];
173
+    d = [d initWithContentsOfFile: path maxSize:theMaxSize error:error];
174
+    return [d autorelease];
175 175
 }
176 176
 
177 177
 
... ...
@@ -185,32 +185,32 @@ failure:
185 185
  */
186 186
 - (id) initWithContentsOfFile: (NSString*)path maxSize:(int)theMaxSize error:(NSError**)error
187 187
 {
188
-	void		*fileBytes = NULL;
189
-	unsigned	fileLength = 0;
190
-	NSZone	*zone;
191
-	
192
-	zone = NSDefaultMallocZone();
193
-	if (readContentsOfFile(path, &fileBytes, theMaxSize, &fileLength, zone) == NO)
188
+    void		*fileBytes = NULL;
189
+    unsigned	fileLength = 0;
190
+    NSZone	*zone;
191
+    
192
+    zone = NSDefaultMallocZone();
193
+    if (readContentsOfFile(path, &fileBytes, theMaxSize, &fileLength, zone) == NO)
194 194
     {
195
-		if (error)
196
-		{
197
-			NSNumber *errorCode = [NSNumber numberWithInt:errno];
198
-			NSString *errorDescription = [NSString stringWithCString:strerror(errno)];
199
-			NSString* errorPath = path;
200
-			NSMutableDictionary *errorAttribs = [NSMutableDictionary dictionaryWithCapacity:2];
201
-			[errorAttribs setObject:errorCode forKey:@"Errno"];
202
-			[errorAttribs setObject:errorDescription forKey:@"Description"];
203
-			[errorAttribs setObject:errorPath forKey:@"Path"];
204
-			*error = [NSError errorWithDomain:NSPOSIXErrorDomain code:errno userInfo:errorAttribs];
205
-		}
206
-		[self dealloc];
207
-		return nil;
195
+        if (error)
196
+        {
197
+            NSNumber *errorCode = [NSNumber numberWithInt:errno];
198
+            NSString *errorDescription = [NSString stringWithCString:strerror(errno)];
199
+            NSString* errorPath = path;
200
+            NSMutableDictionary *errorAttribs = [NSMutableDictionary dictionaryWithCapacity:2];
201
+            [errorAttribs setObject:errorCode forKey:@"Errno"];
202
+            [errorAttribs setObject:errorDescription forKey:@"Description"];
203
+            [errorAttribs setObject:errorPath forKey:@"Path"];
204
+            *error = [NSError errorWithDomain:NSPOSIXErrorDomain code:errno userInfo:errorAttribs];
205
+        }
206
+        [self dealloc];
207
+        return nil;
208 208
     }
209
-	else
209
+    else
210 210
     {
211
-		self = [self initWithBytesNoCopy:fileBytes length:fileLength freeWhenDone:YES];
211
+        self = [self initWithBytesNoCopy:fileBytes length:fileLength freeWhenDone:YES];
212 212
     }
213
-	return self;
213
+    return self;
214 214
 }
215 215
 
216 216
 @end
... ...
@@ -20,28 +20,28 @@
20 20
  */
21 21
 - (NSArray*) componentsSeparatedByCharacterFromSet: (NSCharacterSet*)separatorSet
22 22
 {
23
-	NSRange search;
24
-	NSRange complete;
25
-	NSRange found;
26
-	NSMutableArray *array = [NSMutableArray array];
27
-
28
-	search = NSMakeRange(0, [self length]);
29
-	complete = search;
30
-	found = [self rangeOfCharacterFromSet:separatorSet];
31
-	while (found.length != 0)
23
+    NSRange search;
24
+    NSRange complete;
25
+    NSRange found;
26
+    NSMutableArray *array = [NSMutableArray array];
27
+    
28
+    search = NSMakeRange(0, [self length]);
29
+    complete = search;
30
+    found = [self rangeOfCharacterFromSet:separatorSet];
31
+    while (found.length != 0)
32 32
     {
33
-		NSRange current;
34
-
35
-		current = NSMakeRange(search.location, found.location - search.location);
36
-		[array addObject:[self substringWithRange:current]];
37
-
38
-		search = NSMakeRange(found.location + found.length, complete.length - found.location - found.length);
39
-		found = [self rangeOfCharacterFromSet:separatorSet options:0 range:search];
33
+        NSRange current;
34
+        
35
+        current = NSMakeRange(search.location, found.location - search.location);
36
+        [array addObject:[self substringWithRange:current]];
37
+        
38
+        search = NSMakeRange(found.location + found.length, complete.length - found.location - found.length);
39
+        found = [self rangeOfCharacterFromSet:separatorSet options:0 range:search];
40 40
     }
41
-	// Add the last search string range
42
-	[array addObject: [self substringWithRange: search]];
43
-
44
-	return array;
41
+    // Add the last search string range
42
+    [array addObject: [self substringWithRange: search]];
43
+    
44
+    return array;
45 45
 }
46 46
 
47 47
 /**
... ...
@@ -49,11 +49,11 @@
49 49
  */
50 50
 + (NSString*)stringWithContentsOfFile:(NSString*)pathToFile maxSize:(int)theMaxSize encoding:(NSStringEncoding)theEncoding error:(NSError**)theError
51 51
 {
52
-	NSString	*obj;
53
-	
54
-	obj = [self allocWithZone:NSDefaultMallocZone()];
55
-	obj = [obj initWithContentsOfFile:pathToFile maxSize:theMaxSize encoding:theEncoding error:theError];
56
-	return [obj autorelease];
52
+    NSString	*obj;
53
+    
54
+    obj = [self allocWithZone:NSDefaultMallocZone()];
55
+    obj = [obj initWithContentsOfFile:pathToFile maxSize:theMaxSize encoding:theEncoding error:theError];
56
+    return [obj autorelease];
57 57
 }
58 58
 
59 59
 /**
... ...
@@ -70,28 +70,28 @@
70 70
  */
71 71
 - (NSString*)initWithContentsOfFile:(NSString*)path maxSize:(int)theMaxSize encoding:(NSStringEncoding)theEncoding error:(NSError**)theError
72 72
 {
73
-	NSData		*d;
74
-	unsigned int		len;
75
-	const unsigned char	*data_bytes;
76
-	
77
-	d = [[NSData alloc] initWithContentsOfFile:path maxSize:theMaxSize error:theError];
78
-	if (d == nil)
73
+    NSData		*d;
74
+    unsigned int		len;
75
+    const unsigned char	*data_bytes;
76
+    
77
+    d = [[NSData alloc] initWithContentsOfFile:path maxSize:theMaxSize error:theError];
78
+    if (d == nil)
79 79
     {
80
-		[self dealloc];
81
-		return nil;
80
+        [self dealloc];
81
+        return nil;
82 82
     }
83
-	len = [d length];
84
-	if (len == 0)
83
+    len = [d length];
84
+    if (len == 0)
85 85
     {
86
-		[d release];
87
-		[self dealloc];
88
-		return @"";
86
+        [d release];
87
+        [self dealloc];
88
+        return @"";
89 89
     }
90
-	data_bytes = [d bytes];
91
-	
92
-	self = [self initWithData:d encoding:theEncoding];
93
-	[d release];
94
-	return self;
90
+    data_bytes = [d bytes];
91
+    
92
+    self = [self initWithData:d encoding:theEncoding];
93
+    [d release];
94
+    return self;
95 95
 }
96 96
 
97 97
 @end
... ...
@@ -31,10 +31,10 @@
31 31
 
32 32
 // The import function to be implemented in GetMetadataForFile.c
33 33
 Boolean GetMetadataForFile(void *thisInterface, 
34
-			   CFMutableDictionaryRef attributes, 
35
-			   CFStringRef contentTypeUTI,
36
-			   CFStringRef pathToFile);
37
-			   
34
+                           CFMutableDictionaryRef attributes, 
35
+                           CFStringRef contentTypeUTI,
36
+                           CFStringRef pathToFile);
37
+
38 38
 // The layout for an instance of MetaDataImporterPlugIn 
39 39
 typedef struct __MetadataImporterPluginType
40 40
 {
... ...
@@ -80,18 +80,18 @@ static MDImporterInterfaceStruct testInterfaceFtbl = {
80 80
 MetadataImporterPluginType *AllocMetadataImporterPluginType(CFUUIDRef inFactoryID)
81 81
 {
82 82
     MetadataImporterPluginType *theNewInstance;
83
-
83
+    
84 84
     theNewInstance = (MetadataImporterPluginType *)malloc(sizeof(MetadataImporterPluginType));
85 85
     memset(theNewInstance,0,sizeof(MetadataImporterPluginType));
86
-
87
-        /* Point to the function table */
86
+    
87
+    /* Point to the function table */
88 88
     theNewInstance->conduitInterface = &testInterfaceFtbl;
89
-
90
-        /*  Retain and keep an open instance refcount for each factory. */
89
+    
90
+    /*  Retain and keep an open instance refcount for each factory. */
91 91
     theNewInstance->factoryID = CFRetain(inFactoryID);
92 92
     CFPlugInAddInstanceForFactory(inFactoryID);
93
-
94
-        /* This function returns the IUnknown interface so set the refCount to one. */
93
+    
94
+    /* This function returns the IUnknown interface so set the refCount to one. */
95 95
     theNewInstance->refCount = 1;
96 96
     return theNewInstance;
97 97
 }
... ...
@@ -107,7 +107,7 @@ MetadataImporterPluginType *AllocMetadataImporterPluginType(CFUUIDRef inFactoryI
107 107
 void DeallocMetadataImporterPluginType(MetadataImporterPluginType *thisInstance)
108 108
 {
109 109
     CFUUIDRef theFactoryID;
110
-
110
+    
111 111
     theFactoryID = thisInstance->factoryID;
112 112
     free(thisInstance);
113 113
     if (theFactoryID){
... ...
@@ -124,27 +124,27 @@ void DeallocMetadataImporterPluginType(MetadataImporterPluginType *thisInstance)
124 124
 HRESULT MetadataImporterQueryInterface(void *thisInstance,REFIID iid,LPVOID *ppv)
125 125
 {
126 126
     CFUUIDRef interfaceID;
127
-
127
+    
128 128
     interfaceID = CFUUIDCreateFromUUIDBytes(kCFAllocatorDefault,iid);
129
-
129
+    
130 130
     if (CFEqual(interfaceID,kMDImporterInterfaceID)){
131
-            /* If the Right interface was requested, bump the ref count,
132
-             * set the ppv parameter equal to the instance, and
133
-             * return good status.
134
-             */
131
+        /* If the Right interface was requested, bump the ref count,
132
+         * set the ppv parameter equal to the instance, and
133
+         * return good status.
134
+         */
135 135
         ((MetadataImporterPluginType*)thisInstance)->conduitInterface->AddRef(thisInstance);
136 136
         *ppv = thisInstance;
137 137
         CFRelease(interfaceID);
138 138
         return S_OK;
139 139
     }else{
140 140
         if (CFEqual(interfaceID,IUnknownUUID)){
141
-                /* If the IUnknown interface was requested, same as above. */
141
+            /* If the IUnknown interface was requested, same as above. */
142 142
             ((MetadataImporterPluginType*)thisInstance )->conduitInterface->AddRef(thisInstance);
143 143
             *ppv = thisInstance;
144 144
             CFRelease(interfaceID);
145 145
             return S_OK;
146 146
         }else{
147
-                /* Requested interface unknown, bail with error. */
147
+            /* Requested interface unknown, bail with error. */
148 148
             *ppv = NULL;
149 149
             CFRelease(interfaceID);
150 150
             return E_NOINTERFACE;
... ...
@@ -191,18 +191,18 @@ void *MetadataImporterPluginFactory(CFAllocatorRef allocator,CFUUIDRef typeID)
191 191
 {
192 192
     MetadataImporterPluginType *result;
193 193
     CFUUIDRef                 uuid;
194
-
195 194
     
196
-        /* If correct type is being requested, allocate an
197
-         * instance of TestType and return the IUnknown interface.
198
-         */
195
+    
196
+    /* If correct type is being requested, allocate an
197
+     * instance of TestType and return the IUnknown interface.
198
+     */
199 199
     if (CFEqual(typeID,kMDImporterTypeID)){
200 200
         uuid = CFUUIDCreateFromString(kCFAllocatorDefault,CFSTR(PLUGIN_ID));
201 201
         result = AllocMetadataImporterPluginType(uuid);
202 202
         CFRelease(uuid);
203 203
         return result;
204 204
     }
205
-        /* If the requested type is incorrect, return NULL. */
205
+    /* If the requested type is incorrect, return NULL. */
206 206
     return NULL;
207 207
 }
208 208