Each function returns one value from the table below: FileGetFormat returns the Format name, FileGetEncoding returns the CodePage.
- Format CodePage
- UTF-8 CP65001
- UTF-16 (BE) CP1201
- UTF-16 (LE) CP1200
- UTF-32 (BE) CP12001
- UTF-32 (LE) CP12000
- UTF-7 CP65000
- GB 18030 CP54936
Please note that there is no way to determine a file's encoding with 100% certainty, even if the file contains a BOM.
The result of these functions is simply a best guess, assuming that UTF-8 is the more common encoding when the BOM is missing.
FileGetFormat.ahk returns the format of the given file, e.g. "UTF-8 no BOM".
; FileGetFormat(file)
;   Best-guess text format of a file, based on its leading bytes (BOM).
;
;   file    - path of the file to inspect.
;   returns - one of "UTF-8", "UTF-16 BE", "UTF-16 LE", "UTF-32 BE", "UTF-32 LE",
;             "UTF-7", "UTF-EBCDIC", "GB 18030" when a matching BOM is found;
;             otherwise "ANSI" or "UTF-8 no BOM" (see the heuristic below);
;             an empty string when the file cannot be opened for reading.
FileGetFormat(file){
  ; Keys are the first 2-4 bytes of the file, in decimal, joined with "_".
  static BOM:={254_255:"UTF-16 BE",255_254:"UTF-16 LE",239_187_191:"UTF-8",0_0_254_255:"UTF-32 BE"
    ,255_254_0_0:"UTF-32 LE",43_47_118_43:"UTF-7",43_47_118_47:"UTF-7",43_47_118_56:"UTF-7"
    ,43_47_118_57:"UTF-7",221_115_102_115:"UTF-EBCDIC",132_49_149_51:"GB 18030"}

  ; Open read-only. "rw" would create the file when it does not exist and
  ; would fail on write-protected files, although nothing is ever written.
  f:=FileOpen(file,"r")
  If !IsObject(f)
    return ""

  ; The byte values are concatenated into lookup keys, so they must be
  ; rendered in decimal regardless of the caller's integer format.
  aFormat:=A_FormatInteger
  If (aFormat!="D")
    SetFormat,Integer,D

  ; Rewind to the very first byte (FileOpen may have positioned the pointer
  ; after a BOM it recognised), then build the 2-, 3- and 4-byte signatures.
  ; ReadUChar yields an empty string past end of file, so short files simply
  ; produce keys that match nothing.
  f.Pos:=0
  BOM2:=f.ReadUChar() "_" f.ReadUChar()
  BOM3:=BOM2 "_" f.ReadUChar()
  BOM4:=BOM3 "_" f.ReadUChar()
  f.Close()

  ; Restore the caller's integer format.
  If (aFormat!="D")
    SetFormat,Integer,%aFormat%

  ; Longest signature first, so that e.g. UTF-32 LE (255_254_0_0) wins over
  ; UTF-16 LE (255_254).
  If BOM.HasKey(BOM4)
    return BOM[BOM4]
  If BOM.HasKey(BOM3)
    return BOM[BOM3]
  If BOM.HasKey(BOM2)
    return BOM[BOM2]

  ; No BOM: decode as UTF-8 and compare the character count with the byte
  ; count. Equal counts mean no multi-byte sequence was decoded -> "ANSI".
  FileRead,text,*P65001 %file%
  FileGetSize,size,%file%
  return (StrLen(text)=size) ? "ANSI" : "UTF-8 no BOM"
}
FileGetEncoding.ahk returns the code page of the given file, e.g. "CP65001".
; FileGetEncoding(file)
;   Best-guess code page of a file, based on its leading bytes (BOM).
;
;   file    - path of the file to inspect.
;   returns - "CP65001", "CP1201", "CP1200", "CP12001", "CP12000", "CP65000",
;             "CP500" or "CP54936" when a matching BOM is found; otherwise
;             "CP0" or "CP65001" (see the heuristic below); an empty string
;             when the file cannot be opened for reading.
FileGetEncoding(file){
  ; Keys are the first 2-4 bytes of the file, in decimal, joined with "_".
  static BOM:={254_255:"CP1201",255_254:"CP1200",239_187_191:"CP65001",0_0_254_255:"CP12001"
    ,255_254_0_0:"CP12000",43_47_118_43:"CP65000",43_47_118_47:"CP65000",43_47_118_56:"CP65000"
    ,43_47_118_57:"CP65000",221_115_102_115:"CP500",132_49_149_51:"CP54936"}

  ; Open read-only. "rw" would create the file when it does not exist and
  ; would fail on write-protected files, although nothing is ever written.
  f:=FileOpen(file,"r")
  If !IsObject(f)
    return ""

  ; The byte values are concatenated into lookup keys, so they must be
  ; rendered in decimal regardless of the caller's integer format.
  aFormat:=A_FormatInteger
  If (aFormat!="D")
    SetFormat,Integer,D

  ; Rewind to the very first byte (FileOpen may have positioned the pointer
  ; after a BOM it recognised), then build the 2-, 3- and 4-byte signatures.
  ; ReadUChar yields an empty string past end of file, so short files simply
  ; produce keys that match nothing.
  f.Pos:=0
  BOM2:=f.ReadUChar() "_" f.ReadUChar()
  BOM3:=BOM2 "_" f.ReadUChar()
  BOM4:=BOM3 "_" f.ReadUChar()
  f.Close()

  ; Restore the caller's integer format.
  If (aFormat!="D")
    SetFormat,Integer,%aFormat%

  ; Longest signature first, so that e.g. UTF-32 LE (255_254_0_0) wins over
  ; UTF-16 LE (255_254).
  If BOM.HasKey(BOM4)
    return BOM[BOM4]
  If BOM.HasKey(BOM3)
    return BOM[BOM3]
  If BOM.HasKey(BOM2)
    return BOM[BOM2]

  ; No BOM: decode as UTF-8 and compare the character count with the byte
  ; count. Equal counts mean no multi-byte sequence was decoded -> "CP0"
  ; (system default ANSI code page); otherwise treat it as BOM-less UTF-8.
  FileRead,text,*P65001 %file%
  FileGetSize,size,%file%
  return (StrLen(text)=size) ? "CP0" : "CP65001"
}