root/trunk/meta/regex.d

Revision 84, 16.6 kB (checked in by pragma, 9 years ago)

Updated the bejezus out of this thing.


Currently Supported

  • character classes (including inverse char classes via [...])
  • match one or more (+)
  • match zero or more (*)
  • match zero or one (?)
  • escape sequences
  • whitespace matching (ws chars are treated literally right now)
  • {n} and {n,m} predicates
  • at the start of an expression
  • $ at the end of an expression
  • grouping via ()
  • most standard escape sequences
  • union operator (outside of parens)
Line 
1 /+
2     Copyright (c) 2005 Eric Anderton
3         
4     Permission is hereby granted, free of charge, to any person
5     obtaining a copy of this software and associated documentation
6     files (the "Software"), to deal in the Software without
7     restriction, including without limitation the rights to use,
8     copy, modify, merge, publish, distribute, sublicense, and/or
9     sell copies of the Software, and to permit persons to whom the
10     Software is furnished to do so, subject to the following
11     conditions:
12
13     The above copyright notice and this permission notice shall be
14     included in all copies or substantial portions of the Software.
15
16     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18     OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19     NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20     HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21     WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23     OTHER DEALINGS IN THE SOFTWARE.
24 +/
25 /**
26     Compile-Time regular expression engine.
27     
28     To use, simply call regexMatch!() and use the function pointer to perform matches
29     for that expression.  The match routine returns a set of strings for all matches;
30     char[][].
31     
32     --------------------
33     auto exp = &regexMatch!("[a-z]*\s*\w*");
34     writefln("matches: %s",exp("hello    world");
35     --------------------
36     
37     The compile-time regex parser generates a series of functions that match specific
38     portions of your expression, which make up the runtime expression engine.  The
39     resulting generated code set and call tree are absolutely minimalistic and match
40     the original expression's logic with as few productions as possible.
41         
42     Currently Supported
43     - character classes (including inverse char classes via [^...])
44     - match one or more (+)
45     - match zero or more (*)
46     - match zero or one (?)
47     - escape sequences
48     - whitespace matching (ws chars are treated literally right now)
49     - {n} and {n,m} predicates
50     - ^ at the start of an expression
51     - $ at the end of an expression
52     - grouping via ()
53     - most standard escape sequences
54     - union operator (outside of parens)
55
56     Planned
57     - {,m} and {n,} predicates
58     - different match operations other than first match
59     - move pipe to low parsing precedence rather than highest (support inside parens)
60     
61     Possible?!
62     - \d (decimal)
63     - \o (octal)
64     - multi-line matching semantics (like ^ and $ matching \n and such)
65     - \b word boundary
66     - \B non word boundary
67     - lazy matching (current implementation is greedy)
68     - replacement expressions
69     - named groups
70 */
71 module meta.regex;
72
73 import meta.conv;
74 import meta.strhacks;
75 import meta.string;
76
77 import meta.regexPredicate;
78
79 template getAt(char[] str,uint index){
80     const char getAt = str[index];
81 }
82
83 template first(char[] str){
84     const char first = str[0];
85 }
86
87 template last(char[] str){
88     const char last = getAt!(str,strlen!(str)-1);
89 }
90
91 template isSpecial(char[] pattern){
92     static if(
93         pattern[0] == '*' ||
94         pattern[0] == '+' ||
95         pattern[0] == '?' ||
96         pattern[0] == '.' ||
97         pattern[0] == '[' ||
98         pattern[0] == '{' ||
99         pattern[0] == '(' ||
100         //pattern[0] == ')' ||
101         //pattern[0] == '}' ||
102         //pattern[0] == ']' ||
103         pattern[0] == '$' ||
104         pattern[0] == '^' ||
105         pattern[0] == '\\' 
106     ){
107         const bit isSpecial = true;
108     }
109     else{
110         const bit isSpecial = false;
111     }
112 }
113
114 template parseTextToken(char[] pattern){
115     static if(strlen!(pattern) > 0){
116         static if(isSpecial!(pattern)){
117             const char[] parseTextToken="";
118         }
119         else{
120             const char[] parseTextToken = slice!(pattern,0,1) ~ .parseTextToken!(slice!(pattern,1,strlen!(pattern)));
121         }
122     }   
123     else{
124         const char[] parseTextToken="";
125     }           
126 }
127
128 /// parses up to and including terminator.  Returns everything up to terminator.
129 template parseUntil(char[] pattern,char terminator,bit fuzzy=false){
130     static if(strlen!(pattern) > 0){
131         static if(pattern[0] == '\\'){
132             static if(strlen!(pattern) > 1){
133                 const char[] nextSlice = sliceheadoff!(pattern,2);
134                 alias .parseUntil!(nextSlice,terminator,fuzzy) next;
135                 const char[] token = slice!(pattern,0,2) ~ next.token; 
136                 const uint consumed = next.consumed+2;
137             }
138             else{
139                 pragma(msg,"Error: exptected character to follow \\");
140                 static assert(false);
141             }
142         }
143         else static if(pattern[0] == terminator){
144             const char[] token="";
145             const uint consumed = 1;
146         }
147         else{
148             const char[] nextSlice = sliceheadoff!(pattern,1);
149             alias .parseUntil!(nextSlice,terminator,fuzzy) next;
150             const char[] token = slice!(pattern,0,1) ~ next.token;
151             const uint consumed = next.consumed+1;
152         }
153     }
154     else static if(fuzzy){
155         const char[] token = "";
156         const uint consumed = 0;
157     }
158     else{
159         pragma(msg,"Error: exptected " ~ makechar!(terminator) ~ " to terminate group expression");
160         static assert(false);
161     }           
162 }
163
164 // shim for parseUint
165 template charToUint(char[] value){
166     const uint charToUint = value[0];
167 }
168
169 template parseUint(char[] pattern,uint prev=0){
170     static if(strlen!(pattern) > 0){
171         static if(pattern[0] >= '0' && pattern[0] <= '9'){
172             const uint thisValue = (charToUint!(pattern)-'0') + prev*10;
173             alias .parseUint!(sliceheadoff!(pattern,1),thisValue) next;
174             const uint consumed = next.consumed+1;
175             const uint value = next.value;
176         }
177         else{
178             const uint consumed = 0;
179             const uint value = prev;
180         }
181     }
182     else{
183         const uint consumed = 0;
184         const uint value = prev;
185     }           
186 }
187
188 template regexCompileCharClassRecurse(alias termFn,char[] pattern){
189     static if(strlen!(pattern) > 0){
190         static if(pattern[0] != ']'){
191             debug pragma(msg,"REMAINING: " ~ pattern);
192            
193             alias regexCompileCharClass2!(pattern) next;
194             alias testOr!(termFn,next.fn,pattern) fn;
195             const uint consumed = next.consumed;
196         }
197         else{
198             alias termFn fn;
199             const uint consumed = 0;
200         }
201     }
202     else{
203         alias termFn fn;
204         const uint consumed = 0;
205     }
206     debug pragma(msg,"regexCompileCharClassRecurse consumed:" ~ itoa!(consumed));
207 }
208
209 template regexCompileCharClass2(char[] pattern){
210     static if(strlen!(pattern) > 0){
211         static if(pattern[0] == '\\'){
212             static if(strlen!(pattern) == 1){
213                 pragma(msg,"Error: expected character following \\ in character class");
214                 static assert(false);
215             }
216             else static if(pattern[1] == ']'){
217                 alias testChar!("]") fn;
218                 const uint thisConsumed = 2;
219             }
220             else static if(pattern[1] == '^'){
221                 alias testChar!("^") fn;
222                 const uint thisConsumed = 2;
223             }
224             else static if(pattern[1] == '-'){
225                 alias testChar!("-") fn;
226                 const uint thisConsumed = 2;
227             }
228             else{
229                 alias regexCompileEscape!(sliceheadoff!(pattern,1)) term;
230                 alias term.fn termFn;
231                 const uint thisConsumed = term.consumed+1;
232             }
233            
234             const char[] remaining = slice!(pattern,thisConsumed,strlen!(pattern));
235         }
236         else{
237             //NOTE: read ahead up to two chars for a range expression.
238             //NOTE: should probably be refactored off to something else
239             static if(strlen!(pattern) > 1){
240                 static if(pattern[1] == '-'){
241                     static if(strlen!(pattern) > 2){
242                         alias testRange!(slice!(pattern,0,1),slice!(pattern,2,3)) termFn;
243                         const uint thisConsumed = 3;
244                         const char[] remaining = slice!(pattern,3,strlen!(pattern));
245                     }
246                     else{ // length is 2
247                         pragma(msg,"Error: expected character following '-' in character class");
248                         static assert(false);   
249                     }
250                 }
251                 else{ // not '-'
252                     alias testChar!(slice!(pattern,0,1)) termFn;
253                     const uint thisConsumed = 1;
254                     const char[] remaining = slice!(pattern,1,strlen!(pattern));                       
255                 }
256             }
257             else{
258                 alias testChar!(slice!(pattern,0,1)) termFn;
259                 const uint thisConsumed = 1;
260                 const char[] remaining = slice!(pattern,1,strlen!(pattern));
261             }
262         }
263         alias regexCompileCharClassRecurse!(termFn,remaining) recurse;
264         alias recurse.fn fn;
265         const uint consumed = recurse.consumed + thisConsumed;
266     }
267     else{
268         //TODO: trigger error
269         alias testEmpty!() fn;
270         const uint consumed = 0;
271     }
272     debug pragma(msg,"regexCompileCharClass2 consumed:" ~ itoa!(consumed));
273 }
274
275 template regexCompileCharClass(char[] pattern){
276     static if(strlen!(pattern) > 0){
277         static if(pattern[0] == ']'){
278             alias testEmpty!() fn;
279             const uint consumed = 0;
280         }
281         else static if(pattern[0] == '^'){
282             pragma(msg,"foobar");
283             alias regexCompileCharClass2!(sliceheadoff!(pattern,1)) charClass;
284             alias testCharInverse!(charClass.fn,pattern) inverseCharClass;
285             alias inverseCharClass fn;
286             const uint consumed = charClass.consumed + 1;
287         }
288         else{
289             alias regexCompileCharClass2!(pattern) charClass;
290             alias charClass.fn fn;
291             const uint consumed = charClass.consumed;
292         }
293     }
294     else{
295         pragma(msg,"Error: expected closing ']' for character class");
296         static assert(false);   
297     }
298 }
299
300 // shim to assist with {n,m} notation
301 template validateMaxToken(uint tokenLength,uint consumed,uint value){
302     static if(consumed == 0 || consumed < tokenLength){
303         pragma(msg,"Error: expected expression in the format of {n,m}");
304         static assert(false);
305     }
306     const uint max = value;
307 }
308
309 // shim to assist with {n,m} notation
310 template parseMaxPredicate(uint min,char[] token){
311     static if(strlen!(token) > 0){
312         static if(getAt!(token,0) == ',' && strlen!(token) > 1){
313             alias parseUint!(sliceheadoff!(token,1)) maxToken;
314             const uint max = validateMaxToken!(strlen!(token),maxToken.consumed+1,maxToken.value).max;
315         }
316         else{
317             pragma(msg,"Error: expected expression in the format of {n,m}");
318             static assert(false);
319         }
320     }
321     else{
322         const uint max = min;
323     }
324 }
325
326 template regexCompilePredicate(alias test,char[] token,char[] pattern){
327     static if(strlen!(pattern) > 0){
328         static if(pattern[0] == '*'){
329             alias testZeroOrMore!(test,token) fn;
330             const uint consumed = 1;
331         }
332         else static if(pattern[0] == '+'){
333             alias testOneOrMore!(test,token) fn;
334             const uint consumed = 1;
335         }
336         else static if(pattern[0] == '?'){
337             alias testZeroOrMore!(test,token) fn;
338             const uint consumed = 1;
339         }
340         else static if(pattern[0] == '{'){
341             const char[] token = parseUntil!(sliceheadoff!(pattern,1),'}').token;
342             static if(strlen!(token) == 0){
343                 pragma(msg,"Error: expected number inside {n} expression");
344                 static assert(false);
345             }
346            
347             alias parseUint!(token) minToken;
348             const uint min = minToken.value;   
349             uint max = parseMaxPredicate!(min,sliceheadoff!(token,minToken.consumed)).max;         
350
351             alias testTimes!(min,max,test,token) fn;
352             const uint consumed = strlen!(token)+2;
353             debug pragma(msg,"consumed: " ~ itoa!(consumed));
354         }
355         else{
356             alias test fn;
357             const uint consumed = 0;           
358         }
359     }
360     else{
361         alias test fn;
362         const uint consumed = 0;
363     }   
364 }
365
366 template regexCompileEscape(char[] pattern){
367     static if(strlen!(pattern) > 0){
368         static if(pattern[0] == '\\'){
369             alias testChar!("\\") fn;
370         }       
371         //TODO: word boundary (/b) and non-word boundary (/B)
372         //TODO: /d (decimal) and /o (octal)?
373         else static if(pattern[0] == 'd'){
374             // numeric chars
375             alias testRange!("0","9") fn;
376         }
377         else static if(pattern[0] == 'D'){
378             // non numeric chars
379             alias testCharInverse!(testRange!("0","9"),pattern) fn;
380         }
381         else static if(pattern[0] == 'f'){
382             // form feed
383             alias testChar!("\f") fn;
384         }       
385         else static if(pattern[0] == 'n'){
386             // newline
387             alias testChar!("\n") fn;
388         }
389         else static if(pattern[0] == 'r'){
390             // carriage return
391             alias testChar!("\r") fn;
392         }
393         else static if(pattern[0] == 's'){
394             // whitespace char
395             alias testRange!("\x00","\x20") fn;
396         }
397         else static if(pattern[0] == 'S'){
398             //non-whitespace char
399             alias testCharInverse!(testRange!("\x00","\x20"),pattern) fn;
400         }
401         else static if(pattern[0] == 't'){
402             //tab   
403             alias testChar!("\t") fn;
404         }
405         else static if(pattern[0] == 'v'){
406             //vertical tab
407             alias testChar!("\v") fn;
408         }
409         else static if(pattern[0] == 'w'){
410             //word char
411             alias testWordChar!() fn;
412         }
413         else static if(pattern[0] == 'W'){
414             alias testCharInverse!(testWordChar!()) fn;
415         }
416         else{
417             alias testChar!(slice!(pattern,0,1)) fn;
418         }
419         const uint consumed = 1;
420     }
421     else{
422         pragma(msg,"Error: expected char following '\\'");
423         static assert(false);
424     }
425 }
426
427 /// recursive portion of regexCompile - shim to work around alias scope issue
428 template regexCompileRecurse(alias term,char[] pattern){
429     static if(strlen!(pattern) > 0){
430         debug pragma(msg,"REMAINING: " ~ pattern);
431        
432         alias regexCompile!(pattern) next;
433         alias testUnion!(term.fn,next.fn,pattern) fn;
434     }
435     else{
436         alias term.fn fn;
437     }
438 }
439
440 /// recursive descent parser for regex strings
441 //TODO: install pipe operator here and give regexCompile the 'consumed' protocol to make
442 // partial passes of the pattern possible
443 template regexCompile(char[] pattern){
444     debug pragma(msg,"PATTERN: " ~ pattern);
445     static if(strlen!(pattern) > 0){
446         static if(pattern[0] == '['){
447             const char[] charClassToken = parseUntil!(slice!(pattern,1,strlen!(pattern)),']').token;           
448             alias regexCompileCharClass!(charClassToken) charClass;
449             const char[] token = slice!(pattern,0,charClass.consumed+2);
450             const char[] next = slice!(pattern,charClass.consumed+2,strlen!(pattern));
451             alias charClass.fn test;
452         }
453         else static if(pattern[0] == '('){
454             const char[] groupToken = parseUntil!(slice!(pattern,1,strlen!(pattern)),')').token;
455             alias regexCompile!(groupToken) groupExpression;
456             const char[] token = slice!(pattern,0,strlen!(groupToken)+2);
457             const char[] next = slice!(pattern,strlen!(groupToken)+2,strlen!(pattern));
458             alias groupExpression.fn test;
459         }
460         else static if(pattern[0] == '.'){
461             const char[] token = ".";
462             const char[] next = sliceheadoff!(pattern,1);
463             alias testAny!() test;
464         }
465         else static if(pattern[0] == '\\'){
466             alias regexCompileEscape!(sliceheadoff!(pattern,1)) escapeSequence;
467             const char[] token = slice!(pattern,0,escapeSequence.consumed+1);
468             const char[] next = sliceheadoff!(pattern,escapeSequence.consumed+1);
469             alias escapeSequence.fn test;
470         }
471         else static if(pattern[0] == '$'){
472             pragma(msg,"Error: $ not allowed inside an expression (use \\$ instead)");
473             static assert(false);
474         }
475         else static if(pattern[0] == '^'){
476             pragma(msg,"Error: ^ not allowed inside an expression (use \\^ instead)");
477             static assert(false);
478         }       
479         else{
480             const char[] token = parseTextToken!(pattern);
481             static assert(strlen!(token) > 0);
482             const char[] next = slice!(pattern,strlen!(token),strlen!(pattern));
483             alias testText!(token) test;
484         }
485        
486         debug pragma(msg,"TOKEN: " ~ token);
487         debug pragma(msg,"NEXT: " ~ next);
488        
489         alias regexCompilePredicate!(test,token,next) term;
490         const char[] remaining = slice!(next,term.consumed,strlen!(next));
491        
492         alias regexCompileRecurse!(term,remaining).fn fn;
493     }
494     else{
495         alias testEmpty!() fn;
496     }   
497 }
498
499
500 //TODO: at this level, check each tokenized sub expression for starting with '^' or ending with '$', and
501 // apply the appropriate matching algorithm.
502
503 /// recursive portion of regexCompile - shim to work around alias scope issue
504 template compileMatchRecurse(alias termFn,char[] pattern){
505     static if(strlen!(pattern) > 0){
506         debug pragma(msg,"REMAINING: " ~ pattern);
507        
508         alias compileMatch!(pattern) next;
509         alias matchUnion!(termFn,next.fn,pattern) fn;
510     }
511     else{
512         alias termFn fn;
513     }
514 }
515
516 // shim
517 template compileMatch2(char[] token,char[] remaining){
518     static if(last!(token) == '$'){
519         static if(first!(token) == '^'){
520             alias matchTestPerfect!(regexCompile!(slicetailoff!(token,1)).fn,remaining) termFn;
521         }
522         else{
523             //TODO: refactor by reversing the string (should make for a faster match)
524             alias matchTestFromEnd!(regexCompile!(slicetailoff!(token,1)).fn,remaining) termFn;
525         }
526     }
527     else static if(first!(token) == '^'){
528         alias matchTestFromStart!(regexCompile!(sliceheadoff!(token,1)).fn,remaining) termFn;
529     }
530     else{
531         alias matchTest!(regexCompile!(token).fn,remaining) termFn;
532     }           
533     alias compileMatchRecurse!(termFn,remaining).fn fn;
534 }
535
536 template compileMatch(char[] pattern){
537     static if(strlen!(pattern) > 0){
538         alias parseUntil!(pattern,'|',true) unionToken;
539        
540         const char[] token = unionToken.token;
541         const char[] remaining = sliceheadoff!(pattern,unionToken.consumed);
542        
543         debug pragma(msg,"TOKEN: " ~ token);
544         debug pragma(msg,"REMAINING: " ~ remaining);   
545        
546         alias compileMatch2!(token,remaining).fn fn;
547     }
548     else{
549         alias matchTest!(testEmpty!(),pattern) fn;
550     }
551     //TODO: parse support for unions (another parser layer most likely)
552 }
553
554 template regexMatch(char[] pattern){   
555     //alias matchTest!(regexCompile!(pattern).fn,pattern) regexMatch;
556     alias compileMatch!(pattern).fn regexMatch;
557 }
Note: See TracBrowser for help on using the browser.