If anyone has spare time to try it out and give feedback, here's a beta-test version of the RegEx functions:
http://www.autohotke...eyRegExTest.exe
You can rename it to overwrite your existing AutoHotkey.exe if you want, since only the RegEx functions are beta.
Here's the syntax for the two functions:
FoundPos := RegExMatch(Haystack, NeedleRegEx, OutputVarOrArray = "", StartingPos = 1)
NewStr := RegExReplace(Haystack, NeedleRegEx, Replacement = "", OutputVarCount = "", Limit = -1, StartingPos = 1)
Upon failure (other than "no match"):
- Both functions return "".
- ErrorLevel is set to something non-zero (a message or code, some of which might change).
OutputVar/Array should be unquoted to work (or pass "" to explicitly indicate no output). RegExMatch() creates an array only when there are subpatterns (i.e. parts in parentheses) in the RegEx. Regardless of whether an array is created, the function also stores the substring that matched the entire pattern in ArrayName itself.
The RegEx options (if any) are included at the start of the RegEx/Needle parameter, followed by a close-parenthesis. For example, the RegEx "i)abc" would search for abc in caseless mode. Options consist of zero or more of the letters from
http://php.net/manua... ... ifiers.php
In addition, the option `n switches from the default end-of-line character (`r`n) to `n.
In addition, RegExMatch() supports the letter P, which causes the output array (if specified) to be split into two arrays: ArrayNamePos and ArrayNameLen (but if there are no subpatterns, no arrays are created). Regardless of whether arrays are created, the function also stores the length of the substring that matched the entire pattern in ArrayName itself.
RegExReplace()'s Replacement parameter supports backreferences in the form $1, ${11}, or ${named}. To use a literal $, specify $$.
Naming: The function names could still be changed to use a prefix of RE_ or Reg instead of RegEx (which might make them easier to type). However, as PhiLho pointed out, those are more ambiguous (for example, Reg could be confused with the registry commands). There's a new poll for this here.
There's still a lot of testing to do. In addition, the performance will be improved via a cache (currently there's no caching).
Finally, here's a test script that runs through some simple RegExes. It's by no means comprehensive, and many more tests will be done before the final release (probably in 3 to 5 days).
Comments and bug reports are welcome. Thanks to everyone for your advice.
; LARGE-STRING TEST: build a haystack of 10000 three-line records and check that
; RegExReplace() agrees with the plain StringReplace command on both the
; resulting text and the number of replacements.
VarSetCapacity(bigstr, 200000) ; For performance
Loop 10000
{
    bigstr = %bigstr%0123`r`nabc`r`n789`r`n
}

; Reference result and reference count, produced without any regex.
StringReplace, bigstr_repl, bigstr, abc`r`n, XYZ456`r`n, UseErrorLevel
ReplCount := ErrorLevel ; Used later below.

; Without the multiline option the anchors cannot match the inner lines,
; so nothing should be replaced and the text should come back unchanged.
newstr := RegExReplace(bigstr, "^a.c$", "XYZ456", count)
if (count <> 0)
{
    MsgBox Count %count% should have been 0.
}
if (newstr <> bigstr)
{
    MsgBox newstr should have been the same as bigstr.
}

; With the 'm' option every "abc" line matches, mirroring StringReplace.
newstr := RegExReplace(bigstr, "m)^a.c$", "XYZ456", count)
if (count <> ReplCount)
{
    MsgBox Count %count% should have been %ReplCount%.
}
if (newstr <> bigstr_repl)
{
    MsgBox newstr should have been the same as bigstr_repl.
}

; Remove every complete record: the result should be empty after 10000 replacements.
newstr := RegExReplace(bigstr, "m)^[0-3]*`r`na.c`r`n[7-9]*`r`n", "", count)
if (newstr <> "")
{
    msgbox newstr was supposed to be empty.
}
if (count <> 10000)
{
    msgbox count was supposed to be 10000
}
; Each call below is one RegExReplace() test vector, in the argument order of testR():
;   testR(ExpectedCount, ExpectedResult, Haystack, Needle, Replacement [, Limit])
; testR() numbers the tests in call order, so do not reorder these lines casually:
; the "Test #N" shown in a failure message refers to the position in this list.
; TEST REPLACE():
testR(1, "", "", "", "") ; Pretty obscure, but 1 does seem to be the correct number of replacements.
testR(3, "xxx", "abc", ".", "x")
testR(2, "xx", "abc", ".*", "x") ; Confirmed correct by http://www.regextester.com. Explanation? Replaces abc by x, then the empty string at the end with x.
testR(1, "x", "abc", ".*", "x", 1)
testR(5, "bbbbbbbbbb", "aaaaa", "a", "bb") ; Replace small with larger.
testR(5, "aaaaa", "bbbbbbbbbb", "bb", "a") ; Converse.
testR(5, "bbbbb", "aaaaa", "a", "b")
testR(3, "bbbaa", "aaaaa", "a", "b", 3) ; Limit the number of replacements.
testR(0, "aaaaa", "aaaaa", "a", "b", 0)
testR(4, "aaabaca", "abc", "", "a") ; Confirmed correct by http://www.regextester.com
testR(1, "azc", "abc", "b", "z")
; TEST PCRE_NEWLINE_LF, PCRE_MULTILINE, and related
testR(0, "123`r`nabc`r`n789", "123`r`nabc`r`n789", "^[0-9]*$", "xxx") ; Not found due to anchoring.
testR(2, "xxx`r`nabc`r`nxxx", "123`r`nabc`r`n789", "m)^[0-9]*$", "xxx") ; Found because now the anchoring sees the newlines.
testR(2, "xxx`nabc`nxxx", "123`nabc`n789", "m`n)^[0-9]*$", "xxx") ; Same but with LF vs. CRLF.
testR(2, "xxx`rabc`rxxx", "123`rabc`r789", "m`r)^[0-9]*$", "xxx") ; Same but with CR vs. CRLF.
; TEST THINGS THAT AREN'T QUITE BACKREFERENCES
testR(1, "abc$", "abc", "abc", "abc$")
testR(1, "abc$", "abc", "abc", "abc$$")
testR(1, "abc${}${5}${", "abc", "abc", "abc$${}${$}$${5}${")
testR(1, "a$xbc${xx", "abc", "abc", "a$xbc${xx") ; Unclosed braces are transcribed literally.
testR(1, "abc", "abc", "abc", "abc${-5}") ; Negative or out of bounds treated as blank.
testR(1, "abc", "abc", "abc", "abc${99}") ; Same.
; TEST NUMBERED BACKREFERENCES
testR(1, "abcabc", "abc", "abc", "abc${0}")
testR(1, "xyz123abcx", "abc123xyz", "([a-z]+)([0-9]+)([a-z]+)", "$3${2}$1$9${77}x")
testR(1, "PhiLho", "Philippe Lhoste", "^(\w{3})\w*\s+\b(\w{3})\w*$", "$1$2")
; TEST NAMED BACKREFERENCES
testR(1, "123|badc|89", "123abcd89", "(a)(?P<x>b)(c)(?P<y>d)", "|${x}$1${y}$3${bogus}|")
; TEST OPTION 's' (PCRE_DOTALL) (note: CRLF requires two dots to match when dotall is in effect).
testR(1, "abc123x", "abc123`r`ndef", "s)..def", "x") ; With dot-all
testR(0, "abc123`r`ndef", "abc123`r`ndef", "..def", "x") ; Without it (not found, so no replacement).
; Each call below is one RegExMatch() test vector, in the argument order of testM():
;   testM(ExpectedPos, ExpectedFoundStr, Haystack, Needle [, StartingPos, Sub1, Sub2, Sub3, Sub4])
; When the needle uses the P option, ExpectedFoundStr is the expected LENGTH of the overall match,
; and Sub1..Sub4 are checked as Pos1, Len1, Pos2, Len2 rather than as subpattern strings.
; testM() numbers the tests in call order, so "Test #N" in a failure message refers to the position in this list.
; TEST MATCH()
; TEST BORDERLINE/SPECIAL CASES
testM(1, "", "", "") ; Empty string found in itself at pos 1.
testM(1, "", "abc", "") ; Empty string found at pos 1.
testM(3, "c", "abc", "[a-z]+", 3)
testM(0, "", "abc", "[a-z]+", 5) ; Test StartingPos greater than length of string.
testM(4, "", "abc", "", 5) ; Finds empty string though?
testM(1, 0, "abc", "P)") ; Position mode, which should yield 0 for length of main pattern.
; TEST OPTIONS PARSING, BORDERLINE CASES
testM(1, "abc", "abc", ")abc") ; Empty options list: the leading ")" is expected to be taken as the options delimiter, so abc is found at pos 1.
testM(4, "i)abc", "abci)abc", "i[)]abc") ; The ")" is inside a character class, so "i" is expected NOT to be parsed as an option; the literal text i)abc is found at pos 4.
testM(4, "i)abc", "abci)abc", "i\)abc") ; Same, but the ")" is made literal with a backslash instead of a character class.
; TEST OPTION 'i' (PCRE_CASELESS)
testM(4, "aBc", "123aBc789", "i)abc")
testM(0, "", "123aBc789", "abc") ; Counterpoint to above.
testM(4, "3", "123aBc789", "iP)abc") ; Position mode.
; TEST OPTION 'm' (PCRE_MULTILINE)
testM(6, "abc", "123`r`nabc`r`n789", "m)^abc$")
testM(0, "", "123`nabc`r`n789", "^abc$") ; Counterpoint to above (i.e. no multiline)
testM(6, "abc", "123`r`nabc`r`n789", "m)^abc$", 6)
testM(0, "", "123`r`nabc`r`n789", "m)^abc$", 7) ; Not found if StartingPos a little too far right.
testM(6, 3, "123`r`nabc`r`n789", "mP)^abc$") ; Position mode
; TEST OPTION 's' (PCRE_DOTALL) (note: CRLF requires two dots to match when dotall is in effect).
testM(0, "", "123`r`nabc`r`n789", "123..abc") ; First with no dot-all.
testM(1, "123`r`nabc", "123`r`nabc`r`n789", "s)123..abc") ; Same but with dot-all.
testM(1, "123`nabc", "123`nabc`n789", "123.abc") ; Now with dot-all & LF (works with or without the s) because `n isn't a valid newline char.
testM(1, "7", "123`nabc`n789", "sP)123.abc") ; Position mode.
; TEST OPTION 'A' Anchored.
testM(0, "", "123aBc789", "A)aBc")
testM(1, "123", "123aBc789", "A)123")
; TEST OPTION `n (PCRE_NEWLINE_LF) and related:
testM(1, "123", "123`r`nabc`r`n789", "m)^123$")
testM(1, "123", "123`nabc`n789", "m`n)^123$")
testM(0, "", "123`nabc`n789", "m)^123$") ; Not found because wrong NEWLINE chars.
testM(0, "", "123`r`nabc`r`n789", "`nm)^123$") ; Same.
testM(1, "123", "123`r`nabc`r`n789", "m)^123$") ; `r`n now in haystack too
testM(1, "123`t`r", "123`t`r`nabc`r`n789", "m`n)^123`t`r$") ; Variation.
; TEST OPTION 'x' (PCRE_EXTENDED) (NO CURRENT TESTS FOR THIS AND OTHER OPTIONS)
; TEST GENERAL STUFF:
testM(7, "abc`t`r`n789", "123`t`r`nabc`t`r`n789", "abc`t`r`n.*$")
testM(7, "abc`t`r`n789", "123`t`r`nabc`t`r`n789", "abc\t\r\n.*$") ; Same as above but let PCRE escape needle via backslash.
testM(0, "", "123aBc789", "xyz")
testM(10, "aBc", "123aBc789aBc", "aBc$")
testM(6, "c789", "123aBc789", "(xyz)|([a-z]+)7(.)(x*)9", 1, "", "c", "8", "")
testM(6, "4", "123aBc789", "P)(xyz)|([a-z]+)7(.)(x*)9", 1, "0", "0", "6", "1") ; Position mode.
; Reaching this point means every test call above has run; failures (if any) were already reported via their own message boxes.
MsgBox Done
; testR: run one RegExReplace() test case and report the first discrepancy via MsgBox.
;
; Parameters:
;   aExpectedCount  - number of replacements RegExReplace() is expected to report.
;   aExpectedResult - string RegExReplace() is expected to return.
;   aHaystack       - text to search.
;   aNeedle         - RegEx (with optional leading options) to search for.
;   aRepl           - replacement text (may contain backreferences).
;   aLimit          - maximum number of replacements to request (default -1, passed through as-is).
;
; Returns nothing. At most one message box is shown per call; checks run in this order:
; ErrorLevel, result text, replacement count, result length.
testR(aExpectedCount, aExpectedResult, aHaystack, aNeedle, aRepl, aLimit = -1)
{
    static test_number  ; Persists across calls so each failure message can identify its test.
    ++test_number
    ErrorLevel = Not Initialized ; To catch bugs where it wasn't properly set by the command.
    ; NOTE: the forum paste had BBCode ([color=red]...[/color]) around this call; it is not part of the script.
    actual_result := RegExReplace(aHaystack, aNeedle, aRepl, actual_count, aLimit)
    if ErrorLevel
    {
        MsgBox Replace() Test #%test_number%`nErrorLevel = "%ErrorLevel%"`nHaystack = "%aHaystack%"`nNeedle = "%aNeedle%"`nReplacement = "%aRepl%"
        return ; Show just one error per test.
    }
    if (actual_result <> aExpectedResult)
    {
        MsgBox Replace() Test #%test_number%`nActual result (%actual_result%) <> expected (%aExpectedResult%).`nHaystack = "%aHaystack%"`nNeedle = "%aNeedle%"`nReplacement = "%aRepl%"
        return ; Show just one error per test.
    }
    if (actual_count <> aExpectedCount)
    {
        MsgBox Replace() Test #%test_number%`nActual replacement count (%actual_count%) <> expected (%aExpectedCount%).`nHaystack = "%aHaystack%"`nNeedle = "%aNeedle%"`nReplacement = "%aRepl%"
        return ; Show just one error per test.
    }
    ; THIS CHECKS INTERNALLY-STORED LENGTH FOR CORRUPTION. It comes after the equality test above
    ; so that a length discrepancy caused merely by the two strings differing is reported as a
    ; result mismatch rather than as a length mismatch.
    if (strlen(actual_result) <> strlen(aExpectedResult))
    {
        MsgBox Replace() Test #%test_number%`nActual length <> expected length.`nHaystack = "%aHaystack%"`nNeedle = "%aNeedle%"`nReplacement = "%aRepl%"
        return ; Show just one error per test.
    }
}
; testM: run one RegExMatch() test case and report discrepancies via MsgBox.
;
; Parameters:
;   aExpectedPos      - position RegExMatch() is expected to return (0 when no match is expected).
;   aExpectedFoundStr - expected content of the output variable: the matched text, or (when the
;                       needle uses the P option) the length of the overall match.
;   aHaystack         - text to search.
;   aNeedle           - RegEx (with optional leading options) to search for.
;   aOffset           - StartingPos passed to RegExMatch() (default 1).
;   aSub1..aSub4      - expected subpattern results; -1 (the default) means "do not check".
;                       Normal mode: compared against match1..match4.
;                       Position mode: compared against matchPos1, matchLen1, matchPos2, matchLen2.
;
; Returns nothing. An ErrorLevel, position, or found-string failure shows one message box and
; returns immediately; subpattern failures are each reported (no early return).
testM(aExpectedPos, aExpectedFoundStr, aHaystack, aNeedle, aOffset = 1
    , aSub1 = -1, aSub2 = -1, aSub3 = -1, aSub4 = -1)
{
    static test_number  ; Persists across calls so each failure message can identify its test.
    ++test_number
    ErrorLevel = Not Initialized ; To catch bugs where it wasn't properly set by the command.
    ; NOTE: the forum paste had BBCode ([color=red]...[/color]) around this call; it is not part of the script.
    FoundPos := RegExMatch(aHaystack, aNeedle, match, aOffset)
    if ErrorLevel
    {
        MsgBox Test #%test_number%`nErrorLevel = "%ErrorLevel%"`nHaystack = "%aHaystack%"`nNeedle = "%aNeedle%"
        return ; Show just one error per test.
    }
    if (FoundPos <> aExpectedPos)
    {
        MsgBox Test #%test_number%`nFoundPos actual (%FoundPos%) <> expected (%aExpectedPos%).`nHaystack = "%aHaystack%"`nNeedle = "%aNeedle%"
        return ; Show just one error per test.
    }
    if not (aExpectedFoundStr == match) ; == is the case-sensitive comparison.
    {
        MsgBox Test #%test_number%`nFoundStr actual (%match%) <> expected (%aExpectedFoundStr%).`nHaystack = "%aHaystack%"`nNeedle = "%aNeedle%"
        return ; Show just one error per test.
    }
    ; Detect position mode by looking for P only inside the options prefix, i.e. a run of option
    ; characters at the very start of the needle that ends with ")". The previous pattern was
    ; unanchored and used the range A-z (which also spans several punctuation characters), so a
    ; capital P anywhere in the needle, such as in (?P<name>...), was mistaken for the P option.
    if RegExMatch(aNeedle, "^[a-zA-Z`r`n]*P[a-zA-Z`r`n]*\)") ; Verify the SubN items as though they contain positions.
    {
        v = 1 ; Index of the subpattern whose Pos/Len pair is currently being checked.
        Loop 4
        {
            expected := aSub%A_Index%
            if (expected = -1)
                continue
            if mod(A_Index, 2) ; Odd items (aSub1, aSub3) hold positions ...
                actual := matchPos%v%
            else ; ... even items (aSub2, aSub4) hold lengths and complete the pair.
            {
                actual := matchLen%v%
                ++v
            }
            if (actual <> expected)
                MsgBox Test #%test_number%`nSubstring #%A_Index% actual (%actual%) <> expected (%expected%).`nHaystack = "%aHaystack%"`nNeedle = "%aNeedle%"
        }
    }
    else ; Verify the SubN items as though they contain substrings that matched the subpatterns.
    {
        Loop 4
        {
            expected := aSub%A_Index%
            if (expected = -1)
                continue
            actual := match%A_Index%
            if (actual <> expected)
                MsgBox Test #%test_number%`nSubstring #%A_Index% actual (%actual%) <> expected (%expected%).`nHaystack = "%aHaystack%"`nNeedle = "%aNeedle%"
        }
    }
}