Changeset 155

Show
Ignore:
Timestamp:
02/25/06 09:05:10 (3 years ago)
Author:
Don Clugston
Message:

Regexp: Lazy matching now works properly. Yee-haw! Even in the complex union cases. Also added support for [] and [] character classes.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/meta/regexp2.d

    r154 r155  
    99// A "naked term" is either a "sequence" or an "atom". 
    1010 
    11  
    12 // Features currently supported: 
    13 //   Non-greedy quantifiers:  * + ? {m} {m,} {m,n} 
    14 //   match character 
    15 //   .  match any character 
    16 //   unions 
    17 //   uncaptured grouping via ( ) 
    18 //   ^   start of line 
    19 //   @n  match string variables passed into the functions as extra parameters. (this is a non-standard extension). 
     11/* 
     12Features currently supported: 
     13   * match previous expression 0 or more times 
     14   + match previous expression 1 or more times 
     15   ? match previous expression 0 or 1 times 
     16   {m,n}  match previous expression between m and n times 
     17   {m,} match previous expression m or more times 
     18   {,n} match previous expression between 0 and m times 
     19   {n} match previous expression exactly n times. 
     20   . match any character 
     21   other characters match themselves 
     22   a|b   match regular expression a or b 
     23   uncaptured grouping via ( ) 
     24   ^   start of line 
     25   $   end of line 
     26   [abc]  match any character in character class abc 
     27   [^abc] match any character not in character class abc 
     28   @n  match string variables passed into the functions as extra parameters. (this is a non-standard extension). 
     29    
     30   All matches are lazy, rather than greedy. 
     31    
     32Planned, but not yet supported: 
     33  captured grouping 
     34  escape characters 
     35  word matching 
     36  greedy matching 
     37  full diagnostic error messages 
     38   
     39Possible: 
     40  an optimisation pass, including features such as searching the pattern in reverse order. 
     41  \1..\9 to match previously captured subsequences 
     42    
     43*/ 
    2044 
    2145// Points of interest: 
     
    268292 
    269293// Evaluate one term (without quantifier). 
    270 // The ONLY purpose of this is to restore the 'p' pointer when we return. 
     294// This helper class has two purposes: 
     295// (1) to restore the 'p' pointer when we return. 
     296// (2) ensure that at least one character was consumed 
    271297template regSequenceDontUpdateP(char [] regstr) 
    272298{ 
    273299    bool fn() { 
    274300        mixin regSequence!(regstr, endSequence) x; 
     301        // It's only a successful match if _something_ was consumed 
     302        if (p==theinitialp) return false;         
    275303        int oldp = p; 
    276304        if (!x.fn()) return false; 
     
    286314        bool fn() 
    287315        { 
     316            // While evaluating this first sequence, if this is a sequence 
     317            // that potentially has zero length (ie, everything is a *, ? or {m,} term), 
     318            // each term should attempt to consume at least one character if possible. 
     319            int theinitialp = p; 
    288320            mixin regSequenceDontUpdateP!(regstr[1..t-1]) suddendeath; 
    289321            mixin regSequence!(regstr[1..t-1], suddendeath) a; 
     
    292324    } else { 
    293325            bool fn() { 
     326                // It's easy with atoms, because we know they always eat something. 
     327                // BUG: Maybe this will fail when null @n strings are passed in?                 
    294328                mixin regAtom!(regstr) a; 
    295329                if (!a.fn()) return false; 
     
    316350            const uint repmax = quantifierMax!(regstr[t..$]);           
    317351             
    318             // HORRENDOUSLY inefficient! In some cases, we generate the quantified term TWICE. 
     352            // HORRENDOUSLY inefficient! In some cases, we generate the quantified term THREE TIMES! 
    319353            // The first one contains the rest of the search tree. 
    320354            // This is used when we think we can do (atom).(next) for an early exit 
     
    377411    static if (regstr[0]=='[') { 
    378412        static if (regstr[1]=='^') 
    379             mixin regInvertedRange!(regstr[2..$-1]); 
    380         else 
    381             mixin regRange!(regstr[1..$-1]); 
    382     } else static if (regstr[0]=='.') { 
     413        { 
     414            bool fn() { // inverse character class             
     415                return (p<searchstr.length && !charMatches!(regstr[2..$-1])(searchstr[p++])); 
     416            } 
     417        } else { 
     418            bool fn() { // character class 
     419                return (p<searchstr.length && charMatches!(regstr[1..$-1])(searchstr[p++])); 
     420            } 
     421        }             
     422    } else static if (regstr[0]=='.') { // match any 
    383423        bool fn() { 
    384424            if (p==searchstr.length) return false; 
     
    413453} 
    414454 
    415 template regRange(char [] regstr) 
    416 
    417     bool fn() { 
    418         pragma(msg, "Parsing range: " ~ regstr ~ "   NOT YET IMPLEMENTED"); 
    419         return true;         
    420     } 
    421 
    422  
    423 template regInvertedRange(char [] regstr) 
    424 
    425     bool fn() { 
    426         pragma(msg, "Parsing inverted range: " ~ regstr ~ "   NOT YET IMPLEMENTED"); 
    427         return true;         
     455//"a-zA-Z0-9_" 
     456 
     457// return true if char ch is matched by the character class regstr. 
     458template charMatches(char [] regstr) 
     459
     460    bool charMatches(char ch) { 
     461        static if (regstr.length==0) return false; 
     462        else static if (regstr.length>=3 && regstr[1]=='-') { 
     463            return (ch>=regstr[0] && ch<=regstr[2]) || charMatches!(regstr[3..$])(ch); 
     464        } else return (ch==regstr[0]) || charMatches!(regstr[1..$])(ch); 
    428465    } 
    429466} 
     
    466503void main() 
    467504{ 
    468 char [] qq; 
    469 writefln("UNIT TESTS\n"); 
     505writefln("BEGINNING UNIT TESTS\n"); 
    470506assert(search!("ab")("aaab")=="ab");  
    471507assert(search!("a*b")("aaab")=="aaab");  
     
    474510assert(search!("(a*)b")("aaab")=="aaab");  
    475511assert(search!("(b*a*)*b")("aaab")=="aaab");  
    476 assert(search!("((a*b*)|da)*b")("fasdaaab")=="aaab"); 
    477512assert(search!("b+cd")("acdbbcabbcdaaab")=="bbcd"); 
    478513assert(search!("b?cd")("abcacbacdb")=="cd"); 
     
    482517assert(search!("(ab)*(abb)")("bababb")=="ababb");  
    483518assert(search!("e?(ab)*b+")("eaaababbbbaac")=="ababb"); 
    484  
     519assert(search!("(ab*)*c")("bbbababbaaabaaaabbbbc") == "ababbaaabaaaabbbbc"); 
     520char [] quasistatic="m"; 
     521assert(search!("(@1.*@1)")("they said D can't do metaprogramming?", quasistatic)=="metaprogram"); 
     522assert(search!("[h-za]*g")("metaprogramming")=="taprog"); 
     523assert(search!("(a*)*b")("cacaaab")=="aaab"); 
     524assert(search!("(a*b*)*c")("dababdaabababbaaabbbcab")=="aabababbaaabbbc"); 
     525 
     526assert(search!("((a*b*)|da)*b")("fasdaaab")=="daaab"); 
     527 
     528char [] qq; 
    485529writefln("========="); 
    486     char [] quasistatic="m"; 
    487 qq = search!("(@1.{10}@1)")("they said D can't do metaprogramming?", quasistatic); 
     530 
     531writefln("This example fails"); 
     532qq = search!("((a*b*)|da)*b")("fasdaaab"); 
    488533writefln("Result: ----",qq, "---"); 
    489534writefln("========="); 
    490535 
    491 /+    
    492     writefln("Test 1: The match is: ", search!("ac?b+d*c?a*")("dsafjabbbbcaac"));     
    493     writefln("Test 2: The match is: ", search!("e?(ab)*b+")("eaaababbbbaac")); 
    494     writefln("Test 3: The match is: ", search!("(abc)*abb")("abbbbaac")); 
    495     writefln("Test 3a: The match is: ", search!("(abc)*abb")("abcabcabbbaac")); 
    496     writefln("Test 4: The match is: ", search!("a{3}b+")("aaabbbbcaac")); 
    497     writefln("Test 5: The match is: ", search!("(z|a{3})b+|m.*")("aaabbbbcaac")); 
    498      
    499     char [] quasistatic; 
    500     quasistatic = "x"; 
    501     for (int i=0; i<3; ++i) { 
    502         if (i==1) quasistatic = "a"; 
    503         if (i==2) quasistatic = "m"; 
    504         writefln("Test 6: The match is: ",  
    505             search!("($1.{10}$1.*n)|g.*")("they said D can't do metaprogramming? ", quasistatic)); 
    506     } 
    507     writefln("Test 7: The match is: ", search!("(z|a{3})b+|M.*")("zand this is Metaprogramming in D")); 
    508 +/ 
    509 writefln("All Tests Pass"); 
    510536} 
    511537 
     
    530556// unit tests 
    531557//------------- 
    532 static assert(quantifierConsumed!("{456}345")==5); 
    533 static assert(parenConsumed!("(45(6)4)5")==8); 
    534 static assert(parenConsumed!(`(45\(6)45`)==7); 
     558version (testmeta) { 
     559    static assert(quantifierConsumed!("{456}345")==5); 
     560    static assert(parenConsumed!("(45(6)4)5")==8); 
     561    static assert(parenConsumed!(`(45\(6)45`)==7); 
     562