| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757375837593760376137623763376437653766376737683769377037713772377337743775377637773778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427742784279428042814282428342844285428642874288428942904291429242934294429542964297429842994300430143024303430443054306430743084309431043114312431343144315431643174318431943204321432243234324432543264327432843294330433143324333433443354336433743384339434043414342434343444345434643474348434943504351435243534354435543564357435843594360436143624363436443654366436743684369437043714372437343744375437643774378437943804381438243834384438543864387438843894390439143924393439443954396439743984399440044014402440344044405440644074408440944104411441244134414441544164417441844194420442144224423442444254426442744284429443044314432443344344435443644374438443944404441444244434444444544464447444844494450445144524453445444554456445744584459446044614462446344644465446644674468446944704471447244734474447544764477447844794480448144824483448444854486448744884489449044914492449344944495449644974498449945004501450245034504450545064507450845094510451145124513451445154516451745184519452045214522452345244525452645274528452945304531453245334534453545364537453845394540454145424543454445454546454745484549455045514552455345544555455645574558455945604561456245634564456545664567456845694570457145724573457445754576457745784579458045814582458345844585458645874588458945904591459245934594459545964597459845994600460146024603460446054606460746084609461046114612461346144615461646174618461946204621462246234624462546264627462846294630463146324633463446354636463746384639464046414642464346444645464646474648464946504651465246534654465546564657465846594660466146624663466446654666466746684669467046714672467346744675467646774678467946804681468246834684468546864687468846894690469146924693469446954696469746984699470047014702470347044705470647074708470947104711471247134714471547164717471847194720472147224723472447254726472747284729473047314732473347344735473647374738473947404741474247434744474547464747474847494750475147524753475447554756475747584759476047614762476347644765476647674768476947704771477247734774477547764777 | /******************************************************************************* copyright: Copyright (c) 2007-2008 Jascha Wetzel. All rights reserved. license: BSD style: $(LICENSE) version: Initial release: Jan 2008 authors: Jascha Wetzel This is a regular expression compiler and interpreter based on the Tagged NFA/DFA method. The Regex class is not thread safe. See <a href="http://en.wikipedia.org/wiki/Regular_expression">Wikpedia's article on regular expressions</a> for details on regular expressions in general. The used method implies, that the expressions are <i>regular</i>, in the way language theory defines it, as opposed to what "regular expression" means in most implementations (e.g. PCRE or those from the standard libraries of Perl, Java or Python). The advantage of this method is it's performance, it's disadvantage is the inability to realize some features that Perl-like regular expressions have (e.g. back-references). See <a href="http://swtch.com/~rsc/regexp/regexp1.html">"Regular Expression Matching Can Be Simple And Fast"</a> for details on the differences. The time for matching a regular expression against an input string of length N is in O(M*N), where M depends on the number of matching brackets and the complexity of the expression. That is, M is constant wrt. the input and therefore matching is a linear-time process. The syntax of a regular expressions is as follows. <i>X</i> and <i>Y</i> stand for an arbitrary regular expression. <table border=1 cellspacing=0 cellpadding=5> <caption>Operators</caption> $(TR $(TD X|Y) $(TD alternation, i.e. X or Y) ) $(TR $(TD (X)) $(TD matching brackets - creates a sub-match) ) $(TR $(TD (?X)) $(TD non-matching brackets - only groups X, no sub-match is created) ) $(TR $(TD [Z]) $(TD character class specification, Z is a string of characters or character ranges, e.g. [a-zA-Z0-9_.\-]) ) $(TR $(TD [^Z]) $(TD negated character class specification) ) $(TR $(TD <X) $(TD lookbehind, X may be a single character or a character class) ) $(TR $(TD >X) $(TD lookahead, X may be a single character or a character class) ) $(TR $(TD ^) $(TD start of input or start of line) ) $(TR $(TD $) $(TD end of input or end of line) ) $(TR $(TD \b) $(TD start or end of word, equals (?<\s>\S|<\S>\s)) ) $(TR $(TD \B) $(TD opposite of \b, equals (?<\S>\S|<\s>\s)) ) </table> <table border=1 cellspacing=0 cellpadding=5> <caption>Quantifiers</caption> $(TR $(TD X?) $(TD zero or one) ) $(TR $(TD X*) $(TD zero or more) ) $(TR $(TD X+) $(TD one or more) ) $(TR $(TD X{n,m}) $(TD at least n, at most m instances of X.<br>If n is missing, it's set to 0.<br>If m is missing, it is set to infinity.) ) $(TR $(TD X??) $(TD non-greedy version of the above operators) ) $(TR $(TD X*?) $(TD see above) ) $(TR $(TD X+?) $(TD see above) ) $(TR $(TD X{n,m}?) $(TD see above) ) </table> <table border=1 cellspacing=0 cellpadding=5> <caption>Pre-defined character classes</caption> $(TR $(TD .) $(TD any printable character) ) $(TR $(TD \s) $(TD whitespace) ) $(TR $(TD \S) $(TD non-whitespace) ) $(TR $(TD \w) $(TD alpha-numeric characters or underscore) ) $(TR $(TD \W) $(TD opposite of \w) ) $(TR $(TD \d) $(TD digits) ) $(TR $(TD \D) $(TD non-digit) ) </table> Note that "alphanumeric" only applies to Latin-1. *******************************************************************************/ module tango.text.Regex; debug(TangoRegex) import tango.io.Stdout; /* ***************************************************************************** A simple pair *******************************************************************************/ private struct Pair(T) { static Pair opCall(T a, T b) { Pair p; p.a = a; p.b = b; return p; } union { struct { T first, second; } struct { T a, b; } } } /* ***************************************************************************** Double linked list *******************************************************************************/ private class List(T) { class Element { T value; Element prev, next; this(T v) { value = v; } } size_t len; Element head, tail; List opCatAssign(T v) { if ( tail is null ) head = tail = new Element(v); else { tail.next = new Element(v); tail.next.prev = tail; tail = tail.next; } ++len; return this; } List insertAfter(T w, T v) { foreach ( e; &this.elements ) { if ( e.value is w ) return insertAfter(e, v); } return null; } List insertAfter(Element e, T v) { auto tmp = new Element(v); tmp.prev = e; tmp.next = e.next; e.next.prev = tmp; e.next = tmp; if ( e is tail ) tail = tmp; ++len; return this; } List opCatAssign(List l) { if ( l.empty ) return this; if ( tail is null ) { head = l.head; tail = l.tail; } else { tail.next = l.head; tail.next.prev = tail; tail = l.tail; } len += l.len; return this; } List pushFront(T v) { if ( head is null ) head = tail = new Element(v); else { head.prev = new Element(v); head.prev.next = head; head = head.prev; } ++len; return this; } List insertBefore(T w, T v) { foreach ( e; &this.elements ) { if ( e.value is w ) return insertBefore(e, v); } return null; } List insertBefore(Element e, T v) { auto tmp = new Element(v); tmp.prev = e.prev; tmp.next = e; e.prev.next = tmp; e.prev = tmp; if ( e is head ) head = tmp; ++len; return this; } List pushFront(List l) { if ( l.empty ) return this; if ( head is null ) { head = l.head; tail = l.tail; } else { head.prev = l.tail; head.prev.next = head; head = l.head; } len += l.len; return this; } size_t length() { return len; } bool empty() { return head is null; } void clear() { head = null; tail = null; len = 0; } void pop() { remove(tail); } void remove(Element e) { if ( e is null ) return; if ( e.prev is null ) head = e.next; else e.prev.next = e.next; if ( e.next is null ) tail = e.prev; else e.next.prev = e.prev; --len; } int elements(int delegate(ref Element) dg) { for ( Element e=head; e !is null; e = e.next ) { int ret = dg(e); if ( ret ) return ret; } return 0; } int elements_reverse(int delegate(ref Element) dg) { for ( Element e=tail; e !is null; e = e.prev ) { int ret = dg(e); if ( ret ) return ret; } return 0; } int opApply(int delegate(ref T) dg) { for ( Element e=head; e !is null; e = e.next ) { int ret = dg(e.value); if ( ret ) return ret; } return 0; } int opApplyReverse(int delegate(ref T) dg) { for ( Element e=tail; e !is null; e = e.prev ) { int ret = dg(e.value); if ( ret ) return ret; } return 0; } } /* ***************************************************************************** Stack based on dynamic array *******************************************************************************/ private struct Stack(T) { size_t _top; T[] stack; void push(T v) { if ( _top >= stack.length ) stack.length = stack.length*2+1; stack[_top] = v; ++_top; } alias push opCatAssign; void opCatAssign(T[] vs) { size_t end = _top+vs.length; if ( end > stack.length ) stack.length = end*2; stack[_top..end] = vs; _top = end; } void pop(size_t num) { assert(_top>=num); _top -= num; } T pop() { assert(_top>0); return stack[--_top]; } T top() { assert(_top>0); return stack[_top-1]; } T* topPtr() { assert(_top>0); return &stack[_top-1]; } bool empty() { return _top == 0; } void clear() { _top = 0; } size_t length() { return _top; } T[] array() { return stack[0.._top]; } T opIndex(size_t i) { return stack[i]; } Stack dup() { Stack s; s._top = _top; s.stack = stack.dup; return s; } } /* ************************************************************************************************ Set container based on assoc array **************************************************************************************************/ private struct Set(T) { bool[T] data; static Set opCall() { Set s; return s; } static Set opCall(T v) { Set s; s ~= v; return s; } void opAddAssign(T v) { data[v] = true; } void opAddAssign(Set s) { foreach ( v; s.elements ) data[v] = true; } alias opAddAssign opCatAssign; size_t length() { return data.length; } T[] elements() { return data.keys; } bool remove(T v) { if ( (v in data) is null ) return false; data.remove(v); return true; } bool contains(T v) { return (v in data) !is null; } bool contains(Set s) { Set tmp = s - *this; return tmp.empty; } bool empty() { return data.length==0; } Set opSub(Set s) { Set res = dup; foreach ( v; s.elements ) res.remove(v); return res; } Set dup() { Set s; foreach ( v; data.keys ) s.data[v] = true; return s; } } /* ************************************************************************************************ **************************************************************************************************/ void quickSort(T)(T[] a) { quickSort(a,cast(size_t)0,a.length); } void quickSort(T)(T[] a, size_t l, size_t r) { T t; auto i = r-l; if ( i < 3 ) { if ( i < 2 ) return; if ( a[l] < a[l+1] ) return; t = a[l]; a[l] = a[l+1]; a[l+1] = t; return; } auto p = a[l]; i = l; auto j = r; while ( true ) { ++i; for ( ; i < j && a[i] < p; ++i ) {} --j; for ( ; i < j && a[j] >= p; --j ) {} if ( i >= j ) break; t = a[i]; a[i] = a[j]; a[j] = t; } --i; a[l] = a[i]; a[i] = p; quickSort(a, l, i); quickSort(a, i+1, r); } import tango.math.Math; /* ************************************************************************************************ A range of characters **************************************************************************************************/ struct CharRange(char_t) { char_t l_, r_; static CharRange opCall(char_t c) { CharRange cr; cr.l_ = c; cr.r_ = c; return cr; } static CharRange opCall(char_t a, char_t b) { CharRange cr; cr.l_ = min(a,b); cr.r_ = max(a,b); return cr; } char_t l() { return l_; } char_t r() { return r_; } /* ******************************************************************************************** Compares the ranges according to their beginning. **********************************************************************************************/ int opCmp(CharRange cr) { if ( l_ == cr.l_ ) return 0; if ( l_ < cr.l_ ) return -1; return 1; } int opEquals(CharRange cr) { if ( l_ == cr.l_ && r_ == cr.r_ ) return 1; return 0; } bool contains(char_t c) { return c >= l_ && c <= r_; } bool contains(CharRange cr) { return l_ <= cr.l_ && r_ >= cr.r_; } bool intersects(CharRange cr) { return r_ >= cr.l_ && l_ <= cr.r_; } CharRange intersect(CharRange cr) { assert(intersects(cr)); CharRange ir; ir.l_ = max(l_, cr.l_); ir.r_ = min(r_, cr.r_); if ( ir.l_ > ir.r_ ) ir.l_ = ir.r_ = char_t.min; return ir; } CharRange[] subtract(CharRange cr) { CharRange[] sr; if ( cr.contains(*this) ) return sr; if ( !intersects(cr) ) sr ~= *this; else { CharRange d; if ( contains(cr) ) { d.l_ = l_; d.r_ = cr.l_-1; if ( d.l_ <= d.r_ ) sr ~= d; d.l_ = cr.r_+1; d.r_ = r_; if ( d.l_ <= d.r_ ) sr ~= d; } else if ( cr.r_ > l_ ) { d.l_ = cr.r_+1; d.r_ = r_; if ( d.l_ <= d.r_ ) sr ~= d; } else if ( cr.l_ < r_ ) { d.l_ = l_; d.r_ = cr.l_-1; if ( d.l_ <= d.r_ ) sr ~= d; } } return sr; } string toString() { string str; if ( l_ == r_ ) { if ( l_ > 0x20 && l_ < 0x7f ) str = Format.convert("'{}'", l_); else str = Format.convert("({:x})", cast(int)l_); } else { if ( l_ > 0x20 && l_ < 0x7f ) str = Format.convert("'{}'", l_); else str = Format.convert("({:x})", cast(int)l_); str ~= "-"; if ( r_ > 0x20 && r_ < 0x7f ) str ~= Format.convert("'{}'", r_); else str ~= Format.convert("({:x})", cast(int)r_); } return str; } } /* ************************************************************************************************ Represents a class of characters as used in regular expressions (e.g. [0-9a-z], etc.) **************************************************************************************************/ struct CharClass(char_t) { alias CharRange!(char_t) range_t; //--------------------------------------------------------------------------------------------- // pre-defined character classes static const CharClass!(char_t) line_startend = {parts: [ {l_:0x00, r_:0x00}, {l_:0x0a, r_:0x0a}, {l_:0x13, r_:0x13} ]}, digit = {parts: [ {l_:0x30, r_:0x39} ]}, whitespace = {parts: [ {l_:0x09, r_:0x09}, {l_:0x0a, r_:0x0a}, {l_:0x0b, r_:0x0b}, {l_:0x13, r_:0x13}, {l_:0x14, r_:0x14}, {l_:0x20, r_:0x20} ]}; // 8bit classes static if ( is(char_t == char) ) { static const CharClass!(char_t) any_char = {parts: [ {l_:0x01, r_:0xff} ]}, dot_oper = {parts: [ {l_:0x01, r_:0xff} ]}, alphanum_ = {parts: [ {l_:0x30, r_:0x39}, {l_:0x41, r_:0x5a}, {l_:0x5f, r_:0x5f}, {l_:0x61, r_:0x7a} ]}; } // 16bit and 32bit classes static if ( is(char_t == wchar) || is(char_t == dchar) ) { static const CharClass!(char_t) any_char = {parts: [ {l_:0x0001, r_:0xffff} ]}, dot_oper = {parts: [ {l_:0x0001, r_:0xffff} ]}, alphanum_ = {parts: [ {l_:0x30, r_:0x39}, {l_:0x41, r_:0x5a}, {l_:0x5f, r_:0x5f}, {l_:0x61, r_:0x7a} ]}; } //--------------------------------------------------------------------------------------------- range_t[] parts; invariant() { // foreach ( i, p; parts ) // assert(p.l_<=p.r_, Int.toString(i)~": "~Int.toString(p.l_)~" > "~Int.toString(p.r_)); } static CharClass opCall(CharClass cc) { CharClass ncc; ncc.parts = cc.parts.dup; return ncc; } int opCmp(CharClass cc) { if ( parts.length < cc.parts.length ) return -1; if ( parts.length > cc.parts.length ) return 1; foreach ( i, p; cc.parts ) { if ( p.l_ != parts[i].l_ || p.r_ != parts[i].r_ ) return 1; } return 0; } bool empty() { return parts.length <= 0; } bool matches(char_t c) { foreach ( p; parts ) { if ( p.contains(c) ) return true; } return false; } CharClass intersect(CharClass cc) { CharClass ic; foreach ( p; parts ) { foreach ( cp; cc.parts ) { if ( p.intersects(cp) ) ic.parts ~= p.intersect(cp); } } return ic; } // requires the class to be optimized bool contains(range_t cr) { foreach ( p; parts ) { if ( p.contains(cr) ) return true; } return false; } // requires the class to be optimized bool contains(CharClass cc) { Louter: foreach ( p; cc.parts ) { foreach ( p2; parts ) { if ( p2.contains(p) ) continue Louter; } return false; } return true; } void subtract(CharClass cc) { negate; add(cc); negate; } void add(CharClass cc) { parts ~= cc.parts; } void add(range_t cr) { parts ~= cr; } void add(char_t c) { parts ~= CharRange!(char_t)(c); } /* ******************************************************************************************** Requires the CharClass to be optimized. **********************************************************************************************/ void negate() { optimize; char_t start = char_t.min; // first part touches left boundary of value range if ( parts.length > 0 && parts[0].l_ == start ) { start = parts[0].r_; if ( start < char_t.max ) ++start; foreach ( i, ref cr; parts[0 .. $-1] ) { cr.l_ = start; cr.r_ = parts[i+1].l_-1; start = parts[i+1].r_; if ( start < char_t.max ) ++start; } if ( start != char_t.max ) { parts[$-1].l_ = start; parts[$-1].r_ = char_t.max; } else parts.length = parts.length-1; return; } foreach ( i, ref cr; parts ) { char_t tmp = cr.l_-1; cr.l_ = start; start = cr.r_; if ( start < char_t.max ) ++start; cr.r_ = tmp; } // last part does not touch right boundary if ( start != char_t.max ) parts ~= range_t(start, char_t.max); } void optimize() { if ( empty ) return; parts.sort; size_t i = 0; foreach ( p; parts[1 .. $] ) { if ( p.l_ > parts[i].r_+1 ) { ++i; parts[i].l_ = p.l_; parts[i].r_ = p.r_; continue; } parts[i].r_ = max(p.r_, parts[i].r_); if ( parts[i].r_ >= char_t.max ) break; } parts.length = i+1; } char[] toString() { char[] str; str ~= "["; foreach ( p; parts ) str ~= p.toString; str ~= "]"; return str; } } debug(UnitTest) { unittest { static CharClass!(char) cc = { parts: [{l_:0,r_:10},{l_:0,r_:6},{l_:5,r_:12},{l_:12,r_:17},{l_:20,r_:100}] }; assert(cc.toString, "[(0)-(a)(0)-(6)(5)-(c)(c)-(11)(14)-'d']"); cc.optimize; assert(cc.toString, "[(0)-(11)(14)-'d']"); cc.negate; assert(cc.toString, " [(12)-(13)'e'-(ff)]"); cc.optimize; assert(cc.toString, "[(0)-(11)(14)-'d']"); cc.negate; assert(cc.toString, "[(12)-(13)'e'-(ff)]"); static CharClass!(char) cc2 = { parts: [] }; assert(cc.toString, "[]"); cc2.optimize; assert(cc.toString, "[]"); cc2.negate; assert(cc.toString, "[(0)-(ff)]"); cc2.optimize; assert(cc.toString, "[(0)-(ff)]"); cc2.negate; assert(cc.toString, "[]"); static CharClass!(char) cc3 = { parts: [{l_:0,r_:100},{l_:200,r_:0xff},] }; assert(cc3.toString, "[(0)-'d'(c8)-(ff)]"); cc3.negate; assert(cc.toString, "['e'-(c7)]"); cc3.negate; assert(cc.toString, "[(0)-'d'(c8)-(ff)]"); static CharClass!(char) cc4 = { parts: [{l_:0,r_:200},{l_:100,r_:0xff},] }; assert(cc.toString, "[(0)-(c8)'d'-(ff)]"); cc4.optimize; assert(cc.toString, "[(9)-(13)(20)-'~'(a0)-(ff)(100)-(17f)(180)-(24f)(20a3)-(20b5)]"); static CharClass!(dchar) cc5 = { parts: [{l_:0x9,r_:0x13},{0x20,r_:'~'},{l_:0xa0,r_:0xff},{l_:0x100,r_:0x17f},{l_:0x180,r_:0x24f},{l_:0x20a3,r_:0x20b5}] }; cc5.optimize; assert(cc.toString, "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]"); cc5.negate; assert(cc.toString, "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]"); cc5.optimize; assert(cc.toString, "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]"); cc5.negate; assert(cc.toString, "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]"); } } /* ************************************************************************************************ **************************************************************************************************/ private struct Predicate(char_t) { alias char_t[] string_t; alias CharClass!(char_t) cc_t; alias CharRange!(char_t) cr_t; // generic data enum Type { consume, epsilon, lookahead, lookbehind } cc_t input; Type type; // data for compiled predicates const uint MAX_BITMAP_LENGTH = 256, MAX_SEARCH_LENGTH = 256; enum MatchMode { generic, generic_l, single_char, bitmap, string_search, // consume single_char_l, bitmap_l, string_search_l // lookahead } MatchMode mode; // data_chr had to be pulled out of the union due to // http://d.puremagic.com/issues/show_bug.cgi?id=2632 --- don't put it back // in until this is resolved! // // Keep in mind that data_str.length can't be modified directly unless the // new length is strictly greater than the old length. This is essentially // because ubyte.sizeof can be less than char_t.sizeof. If you set // data_str.length to anything less than or equal to data_bmp.length, // data_str will not be reallocated: only the length value will change but // nothing will realize that not that much space has actually been // allocated. data_str will be too small and you'll likely get segfaults or // such. // // In short: don't mess with data_str.length. If you have to, remove the // union entirely. // // -- Deewiant union { ubyte[] data_bmp; string_t data_str; }; char_t data_chr; void compile() { assert(input.parts.length > 0); // single char? if ( input.parts.length == 1 && input.parts[0].l_ == input.parts[0].r_ ) { mode = type==Type.consume ? MatchMode.single_char : MatchMode.single_char_l; data_chr = input.parts[0].l_; return; } // check whether we can use a bitmap foreach ( p; input.parts ) { if ( p.l_ > MAX_BITMAP_LENGTH || p.r_ > MAX_BITMAP_LENGTH ) goto LnoBitmap; } // setup bitmap data_bmp.length = MAX_BITMAP_LENGTH/8; foreach ( p; input.parts ) { for ( char_t c = p.l_; c <= p.r_; ++c ) data_bmp[c/8] |= 1 << (c&7); } mode = type==Type.consume ? MatchMode.bitmap : MatchMode.bitmap_l; return; LnoBitmap: /* // check whether the class is small enough to justify a string-search // TODO: consider inverse class for 8bit chars? uint class_size; foreach ( p; input.parts ) class_size += cast(uint)p.r_+1-p.l_; if ( class_size > MAX_SEARCH_LENGTH ) goto Lgeneric; data_str.length = class_size; size_t ind; foreach ( p; input.parts ) { for ( char_t c = p.l_; c <= p.r_; ++c ) data_str[ind++] = c; } mode = type==Type.consume ? MatchMode.string_search : MatchMode.string_search_l; return; */ Lgeneric: data_str = cast(char_t[])input.parts; mode = type==Type.consume ? MatchMode.generic : MatchMode.generic_l; } bool matches(char_t c) { if ( type == Type.consume || type == Type.lookahead ) return input.matches(c); assert(0); } Predicate intersect(Predicate p) { Predicate p2; if ( type != Type.epsilon && p.type != Type.epsilon ) p2.input = input.intersect(p.input); return p2; } bool intersects(Predicate p) { if ( type != p.type ) return false; foreach ( cr; input.parts ) { foreach ( cr2; p.input.parts ) { if ( cr.intersects(cr2) ) return true; } } return false; } bool exceedsMax(uint maxc) { foreach ( p; input.parts ) { if ( p.l_ > maxc || p.r_ > maxc ) return true; } return false; } bool empty() { return type != Type.epsilon && input.empty; } void subtract(Predicate p) { if ( type != Type.epsilon && p.type != Type.epsilon ) input.subtract(p.input); } void negate() { assert(type != Type.epsilon); input.negate; } void optimize() { assert(type != Type.epsilon); input.optimize; } int opCmp(Predicate p) { return input.opCmp(p.input); } int opEquals(Predicate p) { if ( type != p.type ) return 0; if ( input.opCmp(p.input) != 0 ) return 0; return 1; } cc_t getInput() { return input; } void setInput(cc_t cc) { input = cc; } void appendInput(cr_t cr) { input.add(cr); } void appendInput(cc_t cc) { input.add(cc); } void appendInput(Predicate p) { input.add(p.input); } string toString() { string str; switch ( type ) { case Type.consume: str = input.toString; break; case Type.epsilon: str = "eps"; break; case Type.lookahead: str = "la:"~input.toString; break; case Type.lookbehind: str = "lb:"~input.toString; break; default: assert(0); } return str; } } import Utf = tango.text.convert.Utf; import tango.text.convert.Format; /* ************************************************************************************************ **************************************************************************************************/ class RegExpException : Exception { this(string msg) { super("RegExp: "~msg); } } /* ************************************************************************************************ TNFA state **************************************************************************************************/ private class TNFAState(char_t) { bool accept = false, visited = false; uint index; List!(TNFATransition!(char_t)) transitions; this() { transitions = new List!(TNFATransition!(char_t)); } } /* ************************************************************************************************ Priority classes used to linearize priorities after non-linear transition creation. **************************************************************************************************/ private enum PriorityClass { greedy=0, normal=1, reluctant=2, extraReluctant=3 } /* ******************************************************************************** TNFA tagged transition ***********************************************************************************/ private class TNFATransition(char_t) { TNFAState!(char_t) target; Predicate!(char_t) predicate; uint priority, tag; /// one-based tag number, 0 = untagged PriorityClass priorityClass; this(PriorityClass pc) { priorityClass = pc; } /****************************************************************************** Move through states only going via epsilon transitions, and only choosing the one with highest priority. If the highest priority transition from a state isn't an epsilon transition, false is returned. If the accepting NFA state can be reached in this manner, true is returned. NOTE: This method does not look for cycles which should be kept in mind for later. larsivi 20090827 *******************************************************************************/ bool canFinish() { TNFAState!(char_t) t = target; while (!t.accept) { TNFATransition!(char_t) highestPriTrans; foreach (trans; t.transitions) { if (!highestPriTrans || highestPriTrans.priority > trans.priority) highestPriTrans = trans; } if (!(highestPriTrans.predicate.type == Predicate!(char_t).Type.epsilon)) return false; t = highestPriTrans.target; } return true; } } /* ************************************************************************************************ Fragments of TNFAs as used in the Thompson method **************************************************************************************************/ private class TNFAFragment(char_t) { alias TNFAState!(char_t) state_t; alias TNFATransition!(char_t) trans_t; List!(trans_t) entries, /// transitions to be added to the entry state exits, /// transitions to be added to the exit state entry_state, /// transitions to write the entry state to exit_state; /// transitions to write the exit state to bool swapMatchingBracketSyntax; this() { entries = new List!(trans_t); exits = new List!(trans_t); entry_state = new List!(trans_t); exit_state = new List!(trans_t); } /* ******************************************************************************************** Write the given state as entry state to this fragment. **********************************************************************************************/ void setEntry(state_t state) { state.transitions ~= entries; foreach ( t; entry_state ) t.target = state; } /* ******************************************************************************************** Write the given state as exit state to this fragment. **********************************************************************************************/ void setExit(state_t state) { state.transitions ~= exits; foreach ( t; exit_state ) t.target = state; } } /* ************************************************************************************************ Tagged NFA **************************************************************************************************/ private final class TNFA(char_t) { alias TNFATransition!(char_t) trans_t; alias TNFAFragment!(char_t) frag_t; alias TNFAState!(char_t) state_t; alias Predicate!(char_t) predicate_t; alias char_t[] string_t; alias CharRange!(char_t) range_t; alias CharClass!(char_t) cc_t; string_t pattern; state_t[] states; state_t start; bool swapMatchingBracketSyntax; /// whether to make (?...) matching and (...) non-matching /* ******************************************************************************************** Creates the TNFA from the given regex pattern **********************************************************************************************/ this(string_t regex) { next_tag = 1; transitions = new List!(trans_t); pattern = regex; } /* ******************************************************************************************** Print the TNFA (tabular representation of the delta function) **********************************************************************************************/ debug(TangoRegex) void print() { foreach ( int i, s; states ) { Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" "); bool first=true; Stdout(" {"); foreach ( t; s.transitions ) { Stdout.format("{}{}{}:{}->{}", first?"":", ", t.priority, "gnrx"[t.priorityClass], t.predicate.toString, t.target is null?-1:t.target.index); if ( t.tag > 0 ) { Stdout.format(" t{}", t.tag); } first = false; } Stdout("}").newline; } } uint tagCount() { return next_tag-1; } /* ******************************************************************************************** Constructs the TNFA using extended Thompson method. Uses a slightly extended version of Dijkstra's shunting yard algorithm to convert the regexp from infix notation. **********************************************************************************************/ void parse(bool unanchored) { List!(frag_t) frags = new List!(frag_t); Stack!(Operator) opStack; Stack!(uint) tagStack; Stack!(Pair!(uint)) occurStack; opStack ~= Operator.eos; /* **************************************************************************************** Perform action on operator stack ******************************************************************************************/ bool perform(Operator next_op, bool explicit_operator=true) { // calculate index in action matrix int index = cast(int)opStack.top*(Operator.max+1); index += cast(int)next_op; debug(tnfa) Stdout.formatln("\t{}:{} -> {} {} frag(s)", operator_names[opStack.top], operator_names[next_op], action_names[action_lookup[index]], frags.length ); switch ( action_lookup[index] ) { case Act.pua: opStack ~= next_op; if ( next_op == Operator.open_par ) { tagStack ~= next_tag; next_tag += 2; } break; case Act.poc: switch ( opStack.top ) { case Operator.concat: constructConcat(frags); break; case Operator.altern: constructAltern(frags); break; case Operator.zero_one_g: constructZeroOne(frags, PriorityClass.greedy); break; case Operator.zero_one_ng: constructZeroOne(frags, PriorityClass.reluctant); break; case Operator.zero_one_xr: constructZeroOne(frags, PriorityClass.extraReluctant); break; case Operator.zero_more_g: constructZeroMore(frags, PriorityClass.greedy); break; case Operator.zero_more_ng: constructZeroMore(frags, PriorityClass.reluctant); break; case Operator.zero_more_xr: constructZeroMore(frags, PriorityClass.extraReluctant); break; case Operator.one_more_g: constructOneMore(frags, PriorityClass.greedy); break; case Operator.one_more_ng: constructOneMore(frags, PriorityClass.reluctant); break; case Operator.one_more_xr: constructOneMore(frags, PriorityClass.extraReluctant); break; case Operator.occur_g: Pair!(uint) occur = occurStack.pop; constructOccur(frags, occur.a, occur.b, PriorityClass.greedy); break; case Operator.occur_ng: Pair!(uint) occur = occurStack.pop; constructOccur(frags, occur.a, occur.b, PriorityClass.reluctant); break; default: throw new RegExpException("cannot process operand at \""~Utf.toString(pattern[cursor..$])~"\""); } opStack.pop; perform(next_op, false); break; case Act.poa: opStack.pop; break; case Act.pca: if ( opStack.top == Operator.open_par ) { if ( tagStack.empty ) throw new RegExpException(Format.convert("Missing opening parentheses for closing parentheses at char {} \"{}\"", cursor, Utf.toString(pattern[cursor..$]))); constructBracket(frags, tagStack.top); tagStack.pop; } else { assert(opStack.top == Operator.open_par_nm); constructBracket(frags); } opStack.pop; break; case Act.don: return true; case Act.err: default: throw new RegExpException(Format.convert("Unexpected operand at char {} \"{}\" in \"{}\"", cursor, Utf.toString(pattern[cursor..$]), Utf.toString(pattern))); } return false; } // add implicit extra reluctant .* (with . == any_char) at the beginning for unanchored matches // and matching bracket for total match group if ( unanchored ) { frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume); perform(Operator.zero_more_xr, false); perform(Operator.concat, false); perform(Operator.open_par, false); } // convert regex to postfix and create TNFA bool implicit_concat; predicate_t.Type pred_type; while ( !endOfPattern ) { pred_type = predicate_t.Type.consume; dchar c = readPattern; switch ( c ) { case '|': perform(Operator.altern); implicit_concat = false; break; case '(': if ( implicit_concat ) perform(Operator.concat, false); implicit_concat = false; if ( peekPattern == '?' ) { readPattern; perform(swapMatchingBracketSyntax?Operator.open_par:Operator.open_par_nm); } else perform(swapMatchingBracketSyntax?Operator.open_par_nm:Operator.open_par); break; case ')': perform(Operator.close_par); break; case '?': if ( peekPattern == '?' ) { readPattern; perform(Operator.zero_one_ng); } else perform(Operator.zero_one_g); break; case '*': if ( peekPattern == '?' ) { readPattern; perform(Operator.zero_more_ng); } else perform(Operator.zero_more_g); break; case '+': if ( peekPattern == '?' ) { readPattern; perform(Operator.one_more_ng); } else perform(Operator.one_more_g); break; case '{': Pair!(uint) occur; parseOccurCount(occur.a, occur.b); occurStack ~= occur; if ( peekPattern == '?' ) { readPattern; perform(Operator.occur_ng); } else perform(Operator.occur_g); break; case '[': if ( implicit_concat ) perform(Operator.concat, false); implicit_concat = true; frags ~= constructCharClass(pred_type); break; case '.': if ( implicit_concat ) perform(Operator.concat, false); implicit_concat = true; frags ~= constructChars(cc_t.dot_oper, pred_type); break; case '$': if ( implicit_concat ) perform(Operator.concat, false); implicit_concat = true; frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookahead); break; case '^': if ( implicit_concat ) perform(Operator.concat, false); implicit_concat = true; frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookbehind); break; case '>': c = readPattern; pred_type = predicate_t.Type.lookahead; if ( c == '[' ) goto case '['; else if ( c == '\\' ) goto case '\\'; else if ( c == '.' ) goto case '.'; else goto default; case '<': c = readPattern; pred_type = predicate_t.Type.lookbehind; if ( c == '[' ) goto case '['; else if ( c == '\\' ) goto case '\\'; else if ( c == '.' ) goto case '.'; else goto default; case '\\': c = readPattern; if ( implicit_concat ) perform(Operator.concat, false); implicit_concat = true; switch ( c ) { case 't': frags ~= constructSingleChar('\t', pred_type); break; case 'n': frags ~= constructSingleChar('\n', pred_type); break; case 'r': frags ~= constructSingleChar('\r', pred_type); break; case 'w': // alphanumeric and _ frags ~= constructChars(cc_t.alphanum_, pred_type); break; case 'W': // non-(alphanum and _) auto cc = cc_t(cc_t.alphanum_); cc.negate; frags ~= constructChars(cc, pred_type); break; case 's': // whitespace frags ~= constructChars(cc_t.whitespace, pred_type); break; case 'S': // non-whitespace auto cc = cc_t(cc_t.whitespace); cc.negate; frags ~= constructChars(cc, pred_type); break; case 'd': // digit frags ~= constructChars(cc_t.digit, pred_type); break; case 'D': // non-digit auto cc = cc_t(cc_t.digit); cc.negate; frags ~= constructChars(cc, pred_type); break; case 'b': // either end of word if ( pred_type != predicate_t.Type.consume ) throw new RegExpException("Escape sequence \\b not allowed in look-ahead or -behind"); // create (?<\S>\s|<\s>\S) auto cc = cc_t(cc_t.whitespace); cc.negate; perform(Operator.open_par_nm); frags ~= constructChars(cc, predicate_t.Type.lookbehind); perform(Operator.concat, false); frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); perform(Operator.altern, false); frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); perform(Operator.concat, false); frags ~= constructChars(cc, predicate_t.Type.lookahead); perform(Operator.close_par, false); break; case 'B': // neither end of word if ( pred_type != predicate_t.Type.consume ) throw new RegExpException("Escape sequence \\B not allowed in look-ahead or -behind"); // create (?<\S>\S|<\s>\s) auto cc = cc_t(cc_t.whitespace); cc.negate; perform(Operator.open_par_nm); frags ~= constructChars(cc, predicate_t.Type.lookbehind); perform(Operator.concat, false); frags ~= constructChars(cc, predicate_t.Type.lookahead); perform(Operator.altern, false); frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); perform(Operator.concat, false); frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); perform(Operator.close_par, false); break; case '(': case ')': case '[': case ']': case '{': case '}': case '*': case '+': case '?': case '.': case '\\': case '^': case '$': case '|': case '<': case '>': default: frags ~= constructSingleChar(c, pred_type); break; // throw new RegExpException(Format.convert("Unknown escape sequence \\{}", c)); } break; default: if ( implicit_concat ) perform(Operator.concat, false); implicit_concat = true; frags ~= constructSingleChar(c, pred_type); } } // add implicit reluctant .* (with . == any_char) at the end for unanchored matches if ( unanchored ) { perform(Operator.close_par, false); if ( implicit_concat ) perform(Operator.concat, false); frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume); perform(Operator.zero_more_ng, false); } // empty operator stack while ( !perform(Operator.eos) ) {} // set start and finish states start = addState; state_t finish = addState; finish.accept = true; foreach ( f; frags ) { f.setExit(finish); f.setEntry(start); } // set transition priorities List!(trans_t)[PriorityClass.max+1] trans; foreach ( ref t; trans ) t = new List!(trans_t); Stack!(trans_t) todo; state_t state = start; while ( !todo.empty || !state.visited ) { if ( !state.visited ) { state.visited = true; foreach_reverse ( t; state.transitions ) todo ~= t; } if ( todo.empty ) break; trans_t t = todo.top; todo.pop; assert(t.priorityClass<=PriorityClass.max); trans[t.priorityClass] ~= t; state = t.target; } uint nextPrio; foreach ( ts; trans ) { foreach ( t; ts ) t.priority = nextPrio++; } } private: uint next_tag; size_t cursor, next_cursor; List!(trans_t) transitions; state_t[state_t] clonedStates; trans_t[trans_t] clonedTransitions; /// RegEx operators enum Operator { eos, concat, altern, open_par, close_par, zero_one_g, zero_more_g, one_more_g, // greedy zero_one_ng, zero_more_ng, one_more_ng, // non-greedy/reluctant zero_one_xr, zero_more_xr, one_more_xr, // extra-reluctant open_par_nm, occur_g, occur_ng } const char[][] operator_names = ["EOS", "concat", "|", "(", ")", "?", "*", "+", "??", "*?", "+?", "??x", "*?x", "+?x", "(?", "{x,y}", "{x,y}?"]; /// Actions for to-postfix transformation enum Act { pua, poc, poa, pca, don, err } const char[][] action_names = ["push+advance", "pop+copy", "pop+advance", "pop+copy+advance", "done", "error"]; /// Action lookup for to-postfix transformation const Act[] action_lookup = [ // eos concat | ( ) ? * + ?? *? +? ??extra *?extra +?extra (? {x,y} {x,y}? Act.don, Act.pua, Act.pua, Act.pua, Act.err, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.poc, Act.pua, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc ]; final dchar peekPattern() { auto tmp = next_cursor; if ( tmp < pattern.length ) return decode(pattern, tmp); return 0; } final dchar readPattern() { cursor = next_cursor; if ( next_cursor < pattern.length ) return decode(pattern, next_cursor); return 0; } final bool endOfPattern() { return next_cursor >= pattern.length; } state_t addState() { state_t s = new state_t; s.index = states.length; states ~= s; return s; } trans_t addTransition(PriorityClass pc = PriorityClass.normal) { trans_t trans = new trans_t(pc); transitions ~= trans; return trans; } uint parseNumber() { uint res; while ( !endOfPattern ) { auto c = peekPattern; if ( c < '0' || c > '9' ) break; res = res*10+(c-'0'); readPattern; } return res; } void parseOccurCount(out uint minOccur, out uint maxOccur) { assert(pattern[cursor] == '{'); |