| 1 |
/+ |
|---|
| 2 |
Copyright (c) 2005 Eric Anderton |
|---|
| 3 |
|
|---|
| 4 |
Permission is hereby granted, free of charge, to any person |
|---|
| 5 |
obtaining a copy of this software and associated documentation |
|---|
| 6 |
files (the "Software"), to deal in the Software without |
|---|
| 7 |
restriction, including without limitation the rights to use, |
|---|
| 8 |
copy, modify, merge, publish, distribute, sublicense, and/or |
|---|
| 9 |
sell copies of the Software, and to permit persons to whom the |
|---|
| 10 |
Software is furnished to do so, subject to the following |
|---|
| 11 |
conditions: |
|---|
| 12 |
|
|---|
| 13 |
The above copyright notice and this permission notice shall be |
|---|
| 14 |
included in all copies or substantial portions of the Software. |
|---|
| 15 |
|
|---|
| 16 |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|---|
| 17 |
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
|---|
| 18 |
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|---|
| 19 |
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
|---|
| 20 |
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
|---|
| 21 |
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|---|
| 22 |
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
|---|
| 23 |
OTHER DEALINGS IN THE SOFTWARE. |
|---|
| 24 |
+/ |
|---|
| 25 |
/** |
|---|
| 26 |
Compile-Time regular expression engine. |
|---|
| 27 |
|
|---|
| 28 |
To use, simply call regexMatch!() and use the function pointer to perform matches |
|---|
| 29 |
for that expression. The match routine returns a set of strings for all matches; |
|---|
| 30 |
char[][]. |
|---|
| 31 |
|
|---|
| 32 |
-------------------- |
|---|
| 33 |
auto exp = ®exMatch!("[a-z]*\s*\w*"); |
|---|
| 34 |
writefln("matches: %s",exp("hello world"); |
|---|
| 35 |
-------------------- |
|---|
| 36 |
|
|---|
| 37 |
The compile-time regex parser generates a series of functions that match specific |
|---|
| 38 |
portions of your expression, which make up the runtime expression engine. The |
|---|
| 39 |
resulting generated code set and call tree are absolutely minimalistic and match |
|---|
| 40 |
the original expression's logic with as few productions as possible. |
|---|
| 41 |
|
|---|
| 42 |
Currently Supported |
|---|
| 43 |
- character classes (including inverse char classes via [^...]) |
|---|
| 44 |
- match one or more (+) |
|---|
| 45 |
- match zero or more (*) |
|---|
| 46 |
- match zero or one (?) |
|---|
| 47 |
- escape sequences |
|---|
| 48 |
- whitespace matching (ws chars are treated literally right now) |
|---|
| 49 |
- {n} and {n,m} predicates |
|---|
| 50 |
- ^ at the start of an expression |
|---|
| 51 |
- $ at the end of an expression |
|---|
| 52 |
- grouping via () |
|---|
| 53 |
- most standard escape sequences |
|---|
| 54 |
- union operator (outside of parens) |
|---|
| 55 |
|
|---|
| 56 |
Planned |
|---|
| 57 |
- {,m} and {n,} predicates |
|---|
| 58 |
- different match operations other than first match |
|---|
| 59 |
- move pipe to low parsing precedence rather than highest (support inside parens) |
|---|
| 60 |
|
|---|
| 61 |
Possible?! |
|---|
| 62 |
- \d (decimal) |
|---|
| 63 |
- \o (octal) |
|---|
| 64 |
- multi-line matching semantics (like ^ and $ matching \n and such) |
|---|
| 65 |
- \b word boundary |
|---|
| 66 |
- \B non word boundary |
|---|
| 67 |
- lazy matching (current implementation is greedy) |
|---|
| 68 |
- replacement expressions |
|---|
| 69 |
- named groups |
|---|
| 70 |
*/ |
|---|
| 71 |
module meta.regex; |
|---|
| 72 |
|
|---|
| 73 |
import meta.conv; |
|---|
| 74 |
import meta.strhacks; |
|---|
| 75 |
import meta.string; |
|---|
| 76 |
|
|---|
| 77 |
import meta.regexPredicate; |
|---|
| 78 |
|
|---|
| 79 |
template getAt(char[] str,uint index){ |
|---|
| 80 |
const char getAt = str[index]; |
|---|
| 81 |
} |
|---|
| 82 |
|
|---|
| 83 |
template first(char[] str){ |
|---|
| 84 |
const char first = str[0]; |
|---|
| 85 |
} |
|---|
| 86 |
|
|---|
| 87 |
template last(char[] str){ |
|---|
| 88 |
const char last = getAt!(str,strlen!(str)-1); |
|---|
| 89 |
} |
|---|
| 90 |
|
|---|
| 91 |
template isSpecial(char[] pattern){ |
|---|
| 92 |
static if( |
|---|
| 93 |
pattern[0] == '*' || |
|---|
| 94 |
pattern[0] == '+' || |
|---|
| 95 |
pattern[0] == '?' || |
|---|
| 96 |
pattern[0] == '.' || |
|---|
| 97 |
pattern[0] == '[' || |
|---|
| 98 |
pattern[0] == '{' || |
|---|
| 99 |
pattern[0] == '(' || |
|---|
| 100 |
//pattern[0] == ')' || |
|---|
| 101 |
//pattern[0] == '}' || |
|---|
| 102 |
//pattern[0] == ']' || |
|---|
| 103 |
pattern[0] == '$' || |
|---|
| 104 |
pattern[0] == '^' || |
|---|
| 105 |
pattern[0] == '\\' |
|---|
| 106 |
){ |
|---|
| 107 |
const bit isSpecial = true; |
|---|
| 108 |
} |
|---|
| 109 |
else{ |
|---|
| 110 |
const bit isSpecial = false; |
|---|
| 111 |
} |
|---|
| 112 |
} |
|---|
| 113 |
|
|---|
| 114 |
template parseTextToken(char[] pattern){ |
|---|
| 115 |
static if(strlen!(pattern) > 0){ |
|---|
| 116 |
static if(isSpecial!(pattern)){ |
|---|
| 117 |
const char[] parseTextToken=""; |
|---|
| 118 |
} |
|---|
| 119 |
else{ |
|---|
| 120 |
const char[] parseTextToken = slice!(pattern,0,1) ~ .parseTextToken!(slice!(pattern,1,strlen!(pattern))); |
|---|
| 121 |
} |
|---|
| 122 |
} |
|---|
| 123 |
else{ |
|---|
| 124 |
const char[] parseTextToken=""; |
|---|
| 125 |
} |
|---|
| 126 |
} |
|---|
| 127 |
|
|---|
| 128 |
/// parses up to and including terminator. Returns everything up to terminator. |
|---|
| 129 |
template parseUntil(char[] pattern,char terminator,bit fuzzy=false){ |
|---|
| 130 |
static if(strlen!(pattern) > 0){ |
|---|
| 131 |
static if(pattern[0] == '\\'){ |
|---|
| 132 |
static if(strlen!(pattern) > 1){ |
|---|
| 133 |
const char[] nextSlice = sliceheadoff!(pattern,2); |
|---|
| 134 |
alias .parseUntil!(nextSlice,terminator,fuzzy) next; |
|---|
| 135 |
const char[] token = slice!(pattern,0,2) ~ next.token; |
|---|
| 136 |
const uint consumed = next.consumed+2; |
|---|
| 137 |
} |
|---|
| 138 |
else{ |
|---|
| 139 |
pragma(msg,"Error: exptected character to follow \\"); |
|---|
| 140 |
static assert(false); |
|---|
| 141 |
} |
|---|
| 142 |
} |
|---|
| 143 |
else static if(pattern[0] == terminator){ |
|---|
| 144 |
const char[] token=""; |
|---|
| 145 |
const uint consumed = 1; |
|---|
| 146 |
} |
|---|
| 147 |
else{ |
|---|
| 148 |
const char[] nextSlice = sliceheadoff!(pattern,1); |
|---|
| 149 |
alias .parseUntil!(nextSlice,terminator,fuzzy) next; |
|---|
| 150 |
const char[] token = slice!(pattern,0,1) ~ next.token; |
|---|
| 151 |
const uint consumed = next.consumed+1; |
|---|
| 152 |
} |
|---|
| 153 |
} |
|---|
| 154 |
else static if(fuzzy){ |
|---|
| 155 |
const char[] token = ""; |
|---|
| 156 |
const uint consumed = 0; |
|---|
| 157 |
} |
|---|
| 158 |
else{ |
|---|
| 159 |
pragma(msg,"Error: exptected " ~ makechar!(terminator) ~ " to terminate group expression"); |
|---|
| 160 |
static assert(false); |
|---|
| 161 |
} |
|---|
| 162 |
} |
|---|
| 163 |
|
|---|
| 164 |
// shim for parseUint |
|---|
| 165 |
template charToUint(char[] value){ |
|---|
| 166 |
const uint charToUint = value[0]; |
|---|
| 167 |
} |
|---|
| 168 |
|
|---|
| 169 |
template parseUint(char[] pattern,uint prev=0){ |
|---|
| 170 |
static if(strlen!(pattern) > 0){ |
|---|
| 171 |
static if(pattern[0] >= '0' && pattern[0] <= '9'){ |
|---|
| 172 |
const uint thisValue = (charToUint!(pattern)-'0') + prev*10; |
|---|
| 173 |
alias .parseUint!(sliceheadoff!(pattern,1),thisValue) next; |
|---|
| 174 |
const uint consumed = next.consumed+1; |
|---|
| 175 |
const uint value = next.value; |
|---|
| 176 |
} |
|---|
| 177 |
else{ |
|---|
| 178 |
const uint consumed = 0; |
|---|
| 179 |
const uint value = prev; |
|---|
| 180 |
} |
|---|
| 181 |
} |
|---|
| 182 |
else{ |
|---|
| 183 |
const uint consumed = 0; |
|---|
| 184 |
const uint value = prev; |
|---|
| 185 |
} |
|---|
| 186 |
} |
|---|
| 187 |
|
|---|
| 188 |
template regexCompileCharClassRecurse(alias termFn,char[] pattern){ |
|---|
| 189 |
static if(strlen!(pattern) > 0){ |
|---|
| 190 |
static if(pattern[0] != ']'){ |
|---|
| 191 |
debug pragma(msg,"REMAINING: " ~ pattern); |
|---|
| 192 |
|
|---|
| 193 |
alias regexCompileCharClass2!(pattern) next; |
|---|
| 194 |
alias testOr!(termFn,next.fn,pattern) fn; |
|---|
| 195 |
const uint consumed = next.consumed; |
|---|
| 196 |
} |
|---|
| 197 |
else{ |
|---|
| 198 |
alias termFn fn; |
|---|
| 199 |
const uint consumed = 0; |
|---|
| 200 |
} |
|---|
| 201 |
} |
|---|
| 202 |
else{ |
|---|
| 203 |
alias termFn fn; |
|---|
| 204 |
const uint consumed = 0; |
|---|
| 205 |
} |
|---|
| 206 |
debug pragma(msg,"regexCompileCharClassRecurse consumed:" ~ itoa!(consumed)); |
|---|
| 207 |
} |
|---|
| 208 |
|
|---|
| 209 |
template regexCompileCharClass2(char[] pattern){ |
|---|
| 210 |
static if(strlen!(pattern) > 0){ |
|---|
| 211 |
static if(pattern[0] == '\\'){ |
|---|
| 212 |
static if(strlen!(pattern) == 1){ |
|---|
| 213 |
pragma(msg,"Error: expected character following \\ in character class"); |
|---|
| 214 |
static assert(false); |
|---|
| 215 |
} |
|---|
| 216 |
else static if(pattern[1] == ']'){ |
|---|
| 217 |
alias testChar!("]") fn; |
|---|
| 218 |
const uint thisConsumed = 2; |
|---|
| 219 |
} |
|---|
| 220 |
else static if(pattern[1] == '^'){ |
|---|
| 221 |
alias testChar!("^") fn; |
|---|
| 222 |
const uint thisConsumed = 2; |
|---|
| 223 |
} |
|---|
| 224 |
else static if(pattern[1] == '-'){ |
|---|
| 225 |
alias testChar!("-") fn; |
|---|
| 226 |
const uint thisConsumed = 2; |
|---|
| 227 |
} |
|---|
| 228 |
else{ |
|---|
| 229 |
alias regexCompileEscape!(sliceheadoff!(pattern,1)) term; |
|---|
| 230 |
alias term.fn termFn; |
|---|
| 231 |
const uint thisConsumed = term.consumed+1; |
|---|
| 232 |
} |
|---|
| 233 |
|
|---|
| 234 |
const char[] remaining = slice!(pattern,thisConsumed,strlen!(pattern)); |
|---|
| 235 |
} |
|---|
| 236 |
else{ |
|---|
| 237 |
//NOTE: read ahead up to two chars for a range expression. |
|---|
| 238 |
//NOTE: should probably be refactored off to something else |
|---|
| 239 |
static if(strlen!(pattern) > 1){ |
|---|
| 240 |
static if(pattern[1] == '-'){ |
|---|
| 241 |
static if(strlen!(pattern) > 2){ |
|---|
| 242 |
alias testRange!(slice!(pattern,0,1),slice!(pattern,2,3)) termFn; |
|---|
| 243 |
const uint thisConsumed = 3; |
|---|
| 244 |
const char[] remaining = slice!(pattern,3,strlen!(pattern)); |
|---|
| 245 |
} |
|---|
| 246 |
else{ // length is 2 |
|---|
| 247 |
pragma(msg,"Error: expected character following '-' in character class"); |
|---|
| 248 |
static assert(false); |
|---|
| 249 |
} |
|---|
| 250 |
} |
|---|
| 251 |
else{ // not '-' |
|---|
| 252 |
alias testChar!(slice!(pattern,0,1)) termFn; |
|---|
| 253 |
const uint thisConsumed = 1; |
|---|
| 254 |
const char[] remaining = slice!(pattern,1,strlen!(pattern)); |
|---|
| 255 |
} |
|---|
| 256 |
} |
|---|
| 257 |
else{ |
|---|
| 258 |
alias testChar!(slice!(pattern,0,1)) termFn; |
|---|
| 259 |
const uint thisConsumed = 1; |
|---|
| 260 |
const char[] remaining = slice!(pattern,1,strlen!(pattern)); |
|---|
| 261 |
} |
|---|
| 262 |
} |
|---|
| 263 |
alias regexCompileCharClassRecurse!(termFn,remaining) recurse; |
|---|
| 264 |
alias recurse.fn fn; |
|---|
| 265 |
const uint consumed = recurse.consumed + thisConsumed; |
|---|
| 266 |
} |
|---|
| 267 |
else{ |
|---|
| 268 |
//TODO: trigger error |
|---|
| 269 |
alias testEmpty!() fn; |
|---|
| 270 |
const uint consumed = 0; |
|---|
| 271 |
} |
|---|
| 272 |
debug pragma(msg,"regexCompileCharClass2 consumed:" ~ itoa!(consumed)); |
|---|
| 273 |
} |
|---|
| 274 |
|
|---|
| 275 |
template regexCompileCharClass(char[] pattern){ |
|---|
| 276 |
static if(strlen!(pattern) > 0){ |
|---|
| 277 |
static if(pattern[0] == ']'){ |
|---|
| 278 |
alias testEmpty!() fn; |
|---|
| 279 |
const uint consumed = 0; |
|---|
| 280 |
} |
|---|
| 281 |
else static if(pattern[0] == '^'){ |
|---|
| 282 |
pragma(msg,"foobar"); |
|---|
| 283 |
alias regexCompileCharClass2!(sliceheadoff!(pattern,1)) charClass; |
|---|
| 284 |
alias testCharInverse!(charClass.fn,pattern) inverseCharClass; |
|---|
| 285 |
alias inverseCharClass fn; |
|---|
| 286 |
const uint consumed = charClass.consumed + 1; |
|---|
| 287 |
} |
|---|
| 288 |
else{ |
|---|
| 289 |
alias regexCompileCharClass2!(pattern) charClass; |
|---|
| 290 |
alias charClass.fn fn; |
|---|
| 291 |
const uint consumed = charClass.consumed; |
|---|
| 292 |
} |
|---|
| 293 |
} |
|---|
| 294 |
else{ |
|---|
| 295 |
pragma(msg,"Error: expected closing ']' for character class"); |
|---|
| 296 |
static assert(false); |
|---|
| 297 |
} |
|---|
| 298 |
} |
|---|
| 299 |
|
|---|
| 300 |
// shim to assist with {n,m} notation |
|---|
| 301 |
template validateMaxToken(uint tokenLength,uint consumed,uint value){ |
|---|
| 302 |
static if(consumed == 0 || consumed < tokenLength){ |
|---|
| 303 |
pragma(msg,"Error: expected expression in the format of {n,m}"); |
|---|
| 304 |
static assert(false); |
|---|
| 305 |
} |
|---|
| 306 |
const uint max = value; |
|---|
| 307 |
} |
|---|
| 308 |
|
|---|
| 309 |
// shim to assist with {n,m} notation |
|---|
| 310 |
template parseMaxPredicate(uint min,char[] token){ |
|---|
| 311 |
static if(strlen!(token) > 0){ |
|---|
| 312 |
static if(getAt!(token,0) == ',' && strlen!(token) > 1){ |
|---|
| 313 |
alias parseUint!(sliceheadoff!(token,1)) maxToken; |
|---|
| 314 |
const uint max = validateMaxToken!(strlen!(token),maxToken.consumed+1,maxToken.value).max; |
|---|
| 315 |
} |
|---|
| 316 |
else{ |
|---|
| 317 |
pragma(msg,"Error: expected expression in the format of {n,m}"); |
|---|
| 318 |
static assert(false); |
|---|
| 319 |
} |
|---|
| 320 |
} |
|---|
| 321 |
else{ |
|---|
| 322 |
const uint max = min; |
|---|
| 323 |
} |
|---|
| 324 |
} |
|---|
| 325 |
|
|---|
| 326 |
template regexCompilePredicate(alias test,char[] token,char[] pattern){ |
|---|
| 327 |
static if(strlen!(pattern) > 0){ |
|---|
| 328 |
static if(pattern[0] == '*'){ |
|---|
| 329 |
alias testZeroOrMore!(test,token) fn; |
|---|
| 330 |
const uint consumed = 1; |
|---|
| 331 |
} |
|---|
| 332 |
else static if(pattern[0] == '+'){ |
|---|
| 333 |
alias testOneOrMore!(test,token) fn; |
|---|
| 334 |
const uint consumed = 1; |
|---|
| 335 |
} |
|---|
| 336 |
else static if(pattern[0] == '?'){ |
|---|
| 337 |
alias testZeroOrMore!(test,token) fn; |
|---|
| 338 |
const uint consumed = 1; |
|---|
| 339 |
} |
|---|
| 340 |
else static if(pattern[0] == '{'){ |
|---|
| 341 |
const char[] token = parseUntil!(sliceheadoff!(pattern,1),'}').token; |
|---|
| 342 |
static if(strlen!(token) == 0){ |
|---|
| 343 |
pragma(msg,"Error: expected number inside {n} expression"); |
|---|
| 344 |
static assert(false); |
|---|
| 345 |
} |
|---|
| 346 |
|
|---|
| 347 |
alias parseUint!(token) minToken; |
|---|
| 348 |
const uint min = minToken.value; |
|---|
| 349 |
uint max = parseMaxPredicate!(min,sliceheadoff!(token,minToken.consumed)).max; |
|---|
| 350 |
|
|---|
| 351 |
alias testTimes!(min,max,test,token) fn; |
|---|
| 352 |
const uint consumed = strlen!(token)+2; |
|---|
| 353 |
debug pragma(msg,"consumed: " ~ itoa!(consumed)); |
|---|
| 354 |
} |
|---|
| 355 |
else{ |
|---|
| 356 |
alias test fn; |
|---|
| 357 |
const uint consumed = 0; |
|---|
| 358 |
} |
|---|
| 359 |
} |
|---|
| 360 |
else{ |
|---|
| 361 |
alias test fn; |
|---|
| 362 |
const uint consumed = 0; |
|---|
| 363 |
} |
|---|
| 364 |
} |
|---|
| 365 |
|
|---|
| 366 |
template regexCompileEscape(char[] pattern){ |
|---|
| 367 |
static if(strlen!(pattern) > 0){ |
|---|
| 368 |
static if(pattern[0] == '\\'){ |
|---|
| 369 |
alias testChar!("\\") fn; |
|---|
| 370 |
} |
|---|
| 371 |
//TODO: word boundary (/b) and non-word boundary (/B) |
|---|
| 372 |
//TODO: /d (decimal) and /o (octal)? |
|---|
| 373 |
else static if(pattern[0] == 'd'){ |
|---|
| 374 |
// numeric chars |
|---|
| 375 |
alias testRange!("0","9") fn; |
|---|
| 376 |
} |
|---|
| 377 |
else static if(pattern[0] == 'D'){ |
|---|
| 378 |
// non numeric chars |
|---|
| 379 |
alias testCharInverse!(testRange!("0","9"),pattern) fn; |
|---|
| 380 |
} |
|---|
| 381 |
else static if(pattern[0] == 'f'){ |
|---|
| 382 |
// form feed |
|---|
| 383 |
alias testChar!("\f") fn; |
|---|
| 384 |
} |
|---|
| 385 |
else static if(pattern[0] == 'n'){ |
|---|
| 386 |
// newline |
|---|
| 387 |
alias testChar!("\n") fn; |
|---|
| 388 |
} |
|---|
| 389 |
else static if(pattern[0] == 'r'){ |
|---|
| 390 |
// carriage return |
|---|
| 391 |
alias testChar!("\r") fn; |
|---|
| 392 |
} |
|---|
| 393 |
else static if(pattern[0] == 's'){ |
|---|
| 394 |
// whitespace char |
|---|
| 395 |
alias testRange!("\x00","\x20") fn; |
|---|
| 396 |
} |
|---|
| 397 |
else static if(pattern[0] == 'S'){ |
|---|
| 398 |
//non-whitespace char |
|---|
| 399 |
alias testCharInverse!(testRange!("\x00","\x20"),pattern) fn; |
|---|
| 400 |
} |
|---|
| 401 |
else static if(pattern[0] == 't'){ |
|---|
| 402 |
//tab |
|---|
| 403 |
alias testChar!("\t") fn; |
|---|
| 404 |
} |
|---|
| 405 |
else static if(pattern[0] == 'v'){ |
|---|
| 406 |
//vertical tab |
|---|
| 407 |
alias testChar!("\v") fn; |
|---|
| 408 |
} |
|---|
| 409 |
else static if(pattern[0] == 'w'){ |
|---|
| 410 |
//word char |
|---|
| 411 |
alias testWordChar!() fn; |
|---|
| 412 |
} |
|---|
| 413 |
else static if(pattern[0] == 'W'){ |
|---|
| 414 |
alias testCharInverse!(testWordChar!()) fn; |
|---|
| 415 |
} |
|---|
| 416 |
else{ |
|---|
| 417 |
alias testChar!(slice!(pattern,0,1)) fn; |
|---|
| 418 |
} |
|---|
| 419 |
const uint consumed = 1; |
|---|
| 420 |
} |
|---|
| 421 |
else{ |
|---|
| 422 |
pragma(msg,"Error: expected char following '\\'"); |
|---|
| 423 |
static assert(false); |
|---|
| 424 |
} |
|---|
| 425 |
} |
|---|
| 426 |
|
|---|
| 427 |
/// recursive portion of regexCompile - shim to work around alias scope issue |
|---|
| 428 |
template regexCompileRecurse(alias term,char[] pattern){ |
|---|
| 429 |
static if(strlen!(pattern) > 0){ |
|---|
| 430 |
debug pragma(msg,"REMAINING: " ~ pattern); |
|---|
| 431 |
|
|---|
| 432 |
alias regexCompile!(pattern) next; |
|---|
| 433 |
alias testUnion!(term.fn,next.fn,pattern) fn; |
|---|
| 434 |
} |
|---|
| 435 |
else{ |
|---|
| 436 |
alias term.fn fn; |
|---|
| 437 |
} |
|---|
| 438 |
} |
|---|
| 439 |
|
|---|
| 440 |
/// recursive descent parser for regex strings |
|---|
| 441 |
//TODO: install pipe operator here and give regexCompile the 'consumed' protocol to make |
|---|
| 442 |
// partial passes of the pattern possible |
|---|
| 443 |
template regexCompile(char[] pattern){ |
|---|
| 444 |
debug pragma(msg,"PATTERN: " ~ pattern); |
|---|
| 445 |
static if(strlen!(pattern) > 0){ |
|---|
| 446 |
static if(pattern[0] == '['){ |
|---|
| 447 |
const char[] charClassToken = parseUntil!(slice!(pattern,1,strlen!(pattern)),']').token; |
|---|
| 448 |
alias regexCompileCharClass!(charClassToken) charClass; |
|---|
| 449 |
const char[] token = slice!(pattern,0,charClass.consumed+2); |
|---|
| 450 |
const char[] next = slice!(pattern,charClass.consumed+2,strlen!(pattern)); |
|---|
| 451 |
alias charClass.fn test; |
|---|
| 452 |
} |
|---|
| 453 |
else static if(pattern[0] == '('){ |
|---|
| 454 |
const char[] groupToken = parseUntil!(slice!(pattern,1,strlen!(pattern)),')').token; |
|---|
| 455 |
alias regexCompile!(groupToken) groupExpression; |
|---|
| 456 |
const char[] token = slice!(pattern,0,strlen!(groupToken)+2); |
|---|
| 457 |
const char[] next = slice!(pattern,strlen!(groupToken)+2,strlen!(pattern)); |
|---|
| 458 |
alias groupExpression.fn test; |
|---|
| 459 |
} |
|---|
| 460 |
else static if(pattern[0] == '.'){ |
|---|
| 461 |
const char[] token = "."; |
|---|
| 462 |
const char[] next = sliceheadoff!(pattern,1); |
|---|
| 463 |
alias testAny!() test; |
|---|
| 464 |
} |
|---|
| 465 |
else static if(pattern[0] == '\\'){ |
|---|
| 466 |
alias regexCompileEscape!(sliceheadoff!(pattern,1)) escapeSequence; |
|---|
| 467 |
const char[] token = slice!(pattern,0,escapeSequence.consumed+1); |
|---|
| 468 |
const char[] next = sliceheadoff!(pattern,escapeSequence.consumed+1); |
|---|
| 469 |
alias escapeSequence.fn test; |
|---|
| 470 |
} |
|---|
| 471 |
else static if(pattern[0] == '$'){ |
|---|
| 472 |
pragma(msg,"Error: $ not allowed inside an expression (use \\$ instead)"); |
|---|
| 473 |
static assert(false); |
|---|
| 474 |
} |
|---|
| 475 |
else static if(pattern[0] == '^'){ |
|---|
| 476 |
pragma(msg,"Error: ^ not allowed inside an expression (use \\^ instead)"); |
|---|
| 477 |
static assert(false); |
|---|
| 478 |
} |
|---|
| 479 |
else{ |
|---|
| 480 |
const char[] token = parseTextToken!(pattern); |
|---|
| 481 |
static assert(strlen!(token) > 0); |
|---|
| 482 |
const char[] next = slice!(pattern,strlen!(token),strlen!(pattern)); |
|---|
| 483 |
alias testText!(token) test; |
|---|
| 484 |
} |
|---|
| 485 |
|
|---|
| 486 |
debug pragma(msg,"TOKEN: " ~ token); |
|---|
| 487 |
debug pragma(msg,"NEXT: " ~ next); |
|---|
| 488 |
|
|---|
| 489 |
alias regexCompilePredicate!(test,token,next) term; |
|---|
| 490 |
const char[] remaining = slice!(next,term.consumed,strlen!(next)); |
|---|
| 491 |
|
|---|
| 492 |
alias regexCompileRecurse!(term,remaining).fn fn; |
|---|
| 493 |
} |
|---|
| 494 |
else{ |
|---|
| 495 |
alias testEmpty!() fn; |
|---|
| 496 |
} |
|---|
| 497 |
} |
|---|
| 498 |
|
|---|
| 499 |
|
|---|
| 500 |
//TODO: at this level, check each tokenized sub expression for starting with '^' or ending with '$', and |
|---|
| 501 |
// apply the appropriate matching algorithm. |
|---|
| 502 |
|
|---|
| 503 |
/// recursive portion of regexCompile - shim to work around alias scope issue |
|---|
| 504 |
template compileMatchRecurse(alias termFn,char[] pattern){ |
|---|
| 505 |
static if(strlen!(pattern) > 0){ |
|---|
| 506 |
debug pragma(msg,"REMAINING: " ~ pattern); |
|---|
| 507 |
|
|---|
| 508 |
alias compileMatch!(pattern) next; |
|---|
| 509 |
alias matchUnion!(termFn,next.fn,pattern) fn; |
|---|
| 510 |
} |
|---|
| 511 |
else{ |
|---|
| 512 |
alias termFn fn; |
|---|
| 513 |
} |
|---|
| 514 |
} |
|---|
| 515 |
|
|---|
| 516 |
// shim |
|---|
| 517 |
template compileMatch2(char[] token,char[] remaining){ |
|---|
| 518 |
static if(last!(token) == '$'){ |
|---|
| 519 |
static if(first!(token) == '^'){ |
|---|
| 520 |
alias matchTestPerfect!(regexCompile!(slicetailoff!(token,1)).fn,remaining) termFn; |
|---|
| 521 |
} |
|---|
| 522 |
else{ |
|---|
| 523 |
//TODO: refactor by reversing the string (should make for a faster match) |
|---|
| 524 |
alias matchTestFromEnd!(regexCompile!(slicetailoff!(token,1)).fn,remaining) termFn; |
|---|
| 525 |
} |
|---|
| 526 |
} |
|---|
| 527 |
else static if(first!(token) == '^'){ |
|---|
| 528 |
alias matchTestFromStart!(regexCompile!(sliceheadoff!(token,1)).fn,remaining) termFn; |
|---|
| 529 |
} |
|---|
| 530 |
else{ |
|---|
| 531 |
alias matchTest!(regexCompile!(token).fn,remaining) termFn; |
|---|
| 532 |
} |
|---|
| 533 |
alias compileMatchRecurse!(termFn,remaining).fn fn; |
|---|
| 534 |
} |
|---|
| 535 |
|
|---|
| 536 |
template compileMatch(char[] pattern){ |
|---|
| 537 |
static if(strlen!(pattern) > 0){ |
|---|
| 538 |
alias parseUntil!(pattern,'|',true) unionToken; |
|---|
| 539 |
|
|---|
| 540 |
const char[] token = unionToken.token; |
|---|
| 541 |
const char[] remaining = sliceheadoff!(pattern,unionToken.consumed); |
|---|
| 542 |
|
|---|
| 543 |
debug pragma(msg,"TOKEN: " ~ token); |
|---|
| 544 |
debug pragma(msg,"REMAINING: " ~ remaining); |
|---|
| 545 |
|
|---|
| 546 |
alias compileMatch2!(token,remaining).fn fn; |
|---|
| 547 |
} |
|---|
| 548 |
else{ |
|---|
| 549 |
alias matchTest!(testEmpty!(),pattern) fn; |
|---|
| 550 |
} |
|---|
| 551 |
//TODO: parse support for unions (another parser layer most likely) |
|---|
| 552 |
} |
|---|
| 553 |
|
|---|
| 554 |
template regexMatch(char[] pattern){ |
|---|
| 555 |
//alias matchTest!(regexCompile!(pattern).fn,pattern) regexMatch; |
|---|
| 556 |
alias compileMatch!(pattern).fn regexMatch; |
|---|
| 557 |
} |
|---|