Unidecode port for AHK
Posted: 15 Jun 2015, 11:54
What is it for:
It tries to transliterate Unicode characters to ASCII.
Why:
When I saw the topic about removing letter accents, I remembered that I had done something similar: I needed to transliterate names to ASCII. Swedish names, Russian names and others.
Credits:
After searching the web I found a Perl module named Unidecode. Mr. Burke has done the whole transliteration of Unicode symbols to ASCII.
Not all of course but a lot.
What i did:
I took his translation files and merged them into one big text file of ~400 kB.
You can load this file into an array and then easily translate a lot of Unicode characters to ASCII characters.
First you have to download his files and extract them to a directory. You can delete everything except the unidecode directory.
Put the following script in the directory that contains this unidecode directory and run it to create the text file unidecode.tbl.
And here is an example how to use it:
(put the file unidecode.tbl in the same dir)
Try it in the Chinese forum. I don't know whether the resulting ASCII characters mean anything.
Edit: the unidecode.tbl file is now loaded only once.
⠁⠥⠞⠕⠓⠕⠞⠅⠑⠽ makes it possible!
It tries to transliterate Unicode characters to ASCII.
Why:
When I saw the topic about removing letter accents, I remembered that I had done something similar: I needed to transliterate names to ASCII. Swedish names, Russian names and others.
Credits:
After searching the web I found a Perl module named Unidecode. Mr. Burke has done the whole transliteration of Unicode symbols to ASCII.
Not all of course but a lot.
What i did:
I took his translation files and merged them into one big text file of ~400 kB.
You can load this file into an array and then easily translate a lot of Unicode characters to ASCII characters.
First you have to download his files and extract them to a directory. You can delete everything except the unidecode directory.
Put the following script in the directory that contains this unidecode directory and run it to create the text file unidecode.tbl.
Code: Select all
; Auto-execute section: build the table file once, using the function's
; default arguments (source directory "unidecode", output "unidecode.tbl"),
; then end the auto-execute thread.
makeUniDecodetablefile()
return
; Builds the table file from the Perl Unidecode data files.
;
; pathToUnidecodeDir - directory that contains the *.pm block files.
; tablename          - file to create; any existing file of that name is
;                      replaced.
;
; Every *.pm file is reduced by PerlfilePMToVar() to its list of entries and
; stored under "block number + 1", so the rows are written in block order no
; matter in which order the file system enumerates the files. Each row is
; written as "<entries> , <newline>".
makeUniDecodetablefile(pathToUnidecodeDir="unidecode",tablename="unidecode.tbl"){
    blocks := []

    ; PerlfilePMToVar() returns the block number as written in the file
    ; (e.g. 0x1f); make sure arithmetic on it yields decimal integer keys.
    SetFormat, IntegerFast, d

    Loop, Files, %pathToUnidecodeDir%\*.pm
    {
        FileRead, content, %A_LoopFileFullPath%
        blockId := PerlfilePMToVar(content)   ; content now holds only the entries

        ; A file whose header was not recognised yields no block number;
        ; skip it instead of injecting a blank row into the table.
        if (blockId = "")
            continue

        blocks[blockId + 1] := content
    }

    table := ""
    for slot, entries in blocks
        table .= entries . " , `n"

    ; Nothing usable found (e.g. wrong directory): leave an existing table alone.
    if (table = "")
        return

    ; FileAppend only ever appends, so remove a table left over from an
    ; earlier run first; otherwise a re-run would double the number of rows.
    FileDelete, %tablename%
    FileAppend, %table%, %tablename%
}
; Reduces the text of one Perl block file to its bare list of entries.
;
; haystack (ByRef) - in:  full text of the file
;                    out: the comma-separated entries found between
;                         "[...] = [" and ",]"; for a file that contains
;                         "make_placeholder_map", 256 empty quoted entries
; Returns the text found inside the first pair of square brackets of the
; assignment (the block number as written in the file), or an empty string
; when the expected pattern is not found.
PerlfilePMToVar(ByRef haystack){
    stripped := haystack

    ; Pass 1: drop "# ..." comments that follow a comma (up to the line end).
    ; Matches are located in the untouched input and removed from the copy.
    trailingCommentRe := "i),\s+(#.*?`n)"
    at := 1
    Loop
    {
        at := RegExMatch(haystack, trailingCommentRe, hit, at + StrLen(hit))
        stripped := StrReplace(stripped, hit1, "")
        if !at
            break
    }

    ; Pass 2: drop "# BLOCK ..." comment lines the same way.
    blockCommentRe := "i)(#\s+BLOCK.*?`n)"
    at := 1
    Loop
    {
        at := RegExMatch(haystack, blockCommentRe, blockHit, at + StrLen(blockHit))
        stripped := StrReplace(stripped, blockHit1, "")
        if !at
            break
    }

    ; Join everything into one line so the greedy patterns below can span it.
    stripped := StrReplace(stripped, "`n")

    if InStr(stripped, "make_placeholder_map")
    {
        ; Placeholder block: only the block number can be read from the file;
        ; the entries are synthesised as 256 empty quoted strings.
        RegExMatch(stripped, "i)\[(.*)\]\s+=\s+Text", head)
        entries := ""
        Loop, 255
            entries .= """"","
        entries .= """"""
    }
    else
    {
        RegExMatch(stripped, "i)\[(.*)\]\s+=\s+\[(.*),\]", head)
        entries := head2
    }

    haystack := StrReplace(entries, "`n")
    return head1
}
; Debugging aid: returns the number of comma-separated entries in Text.
; Entries that themselves consist of commas (qq{,} qq{,,} qq{, }) are masked
; first so that each of them is counted as a single entry.
count(Text){
    masked := StrReplace(Text, "qq{,,}", "qq{zweikomma}")
    masked := StrReplace(masked, "qq{,}", "qq{einkomma}")
    masked := StrReplace(masked, "qq{, }", "qq{dreikomma}")

    ; Every field counts exactly once, whatever kind of entry it is.
    total := 0
    Loop, Parse, masked, `,
        total++
    return total
}
(put the file unidecode.tbl in the same dir)
Try it in the Chinese forum. I don't know whether the resulting ASCII characters mean anything.
Code: Select all
; Quick manual check (uncomment both lines to run it at start-up):
;text= €€€€€@@@ßäÄüÜö
;msgbox, % unidecode(text)

; Ctrl+Alt+U: show the clipboard text converted by unidecode(), leaving the
; characters ä and Ä untouched.
!^u::
    MsgBox, % unidecode(Clipboard, "äÄ")
return
; Replaces the non-ASCII characters of text by the entries of the table
; stored in unidecode.tbl.
;
; text        - string to convert
; donotdecode - characters that must be kept as they are (e.g. "äÄ")
; Returns the converted string. ASCII characters (code below 128) are always
; returned unchanged.
;
; The table is read from unidecode.tbl on the first call and kept in a static
; array afterwards. If it cannot be loaded with the expected size, an error
; box is shown and the script exits.
;
; Note: earlier revisions went through "Transform, HTML", which also rewrites
; the ASCII characters " & < > and line feeds into HTML markup that was never
; turned back, so plain text came out HTML-escaped. Characters are now looked
; up directly by their code instead.
unidecode(text, donotdecode=""){
    static a

    if !(a.length()=65536){
        a:=[]
        FileRead, tbl, unidecode.tbl
        a:=unidecodeTable2Array(tbl,a)
    }
    if !(a.length()=65536){
        msgbox, % "Error loading unidecode.tbl. Array length is " a.length() " instead of 65536."
        exitapp
    }

    converted := ""
    Loop, Parse, text
    {
        code := Ord(A_LoopField)
        ; Case-sensitive search so that listing "ä" does not also protect "Ä".
        if (code < 128 || InStr(donotdecode, A_LoopField, true))
            converted .= A_LoopField
        else
            converted .= a[code]
    }
    return converted
}
; Fills array with the entries of the table text and returns it.
;
; Text  - table contents, one row per line, entries separated by commas
; array - object that receives the entries
;
; Row N (1-based) starts at key (N-1)*256 and its fields are stored under
; consecutive keys. Quoted entries ("...") and qq{...} entries are stored
; without their delimiters; qq{,} qq{,,} and qq{, } are masked before the row
; is split so that their commas are not taken for separators.
unidecodeTable2Array(Text,array){
    Loop, Parse, Text, `n
    {
        firstKey := (A_Index - 1) * 256

        row := StrReplace(A_LoopField, "qq{,,}", "qq{zweikomma}")
        row := StrReplace(row, "qq{,}", "qq{einkomma}")
        row := StrReplace(row, "qq{, }", "qq{dreikomma}")

        Loop, Parse, row, `,
        {
            ; A_Index is the field number within this row here.
            key := firstKey + A_Index - 1
            field := Trim(A_LoopField)

            if (field = "qq{zweikomma}")
                value := ",,"
            else if (field = "qq{einkomma}")
                value := ","
            else if (field = "qq{dreikomma}")
                value := ", "
            else if (field = """""")
                value := ""
            else if (InStr(field, "qq{"))
                value := SubStr(field, 4, StrLen(field) - 4)   ; strip qq{ and }
            else
                value := SubStr(field, 2, StrLen(field) - 2)   ; strip the quotes

            array[key] := value
        }
    }
    return array
}
; Collects the numbers of all numeric entities ("&#1234;") in haystack.
; Returns them in order of appearance as a comma-separated list (duplicates
; included), or an empty string if there are none. Entities without digits
; ("&#;") are ignored.
getDecUnicode(haystack){
    entityRe := "i)&#(\d+)?;"
    codes := ""
    at := 1
    Loop
    {
        ; Continue right behind the previous match.
        at := RegExMatch(haystack, entityRe, hit, at + StrLen(hit))
        if !at
            break
        if (hit1 = "")
            continue
        codes .= (codes = "" ? "" : ",") . hit1
    }
    return codes
}
⠁⠥⠞⠕⠓⠕⠞⠅⠑⠽ makes it possible!