Optical character recognition (OCR) with UWP API

Post your working scripts, libraries and tools for AHK v1.1 and older
byzod
Posts: 87
Joined: 21 Jun 2021, 06:46

Re: Optical character recognition (OCR) with UWP API

23 Jul 2021, 22:25

Tried to merge them in one file
But it's still scattered functions because I don't have the skill and time to wrap them in a class :(

Now you can use it like this

Code: Select all

#NoEnv  ; Recommended for performance and compatibility with future AutoHotkey releases.
SendMode Input  ; Recommended for new scripts due to its superior speed and reliability.
SetWorkingDir %A_ScriptDir%  ; Ensures a consistent starting directory.
#Include OCR.ahk


img := "1.png"
text1 := OcrFile(img)
text2 := OcrArea(277, 77, 200, 50)
Msgbox % "File ocr:`n" . text1 . "`n`nScreen ocr:`n" . text2
ExitApp

Merged ocr.ahk:

Code: Select all

; Optical character recognition (OCR) with UWP API
; Contributor: malcev, teadrinker
; https://www.autohotkey.com/boards/viewtopic.php?f=6&t=72674



; X,Y [int] coordinate of upper left corner of area on screen to ocr
; W,H [int] width and height of ocr area
; lang [str] ocr with given language
; (return) text [str] ocr result text
OcrArea(X, Y, W, H, lang := "FirstFromAvailableLanguages"){
	hBitmap := HBitmapFromScreen(X, Y, W, H)
	pIRandomAccessStream := HBitmapToRandomAccessStream(hBitmap)
	DllCall("DeleteObject", "Ptr", hBitmap)
	text := ocr(pIRandomAccessStream, lang)
	return text
}

; filename [str] filename or fullpath of image to ocr
; lang [str] ocr with given language
; (return) text [str] ocr result text
OcrFile(filename, lang := "FirstFromAvailableLanguages"){
   if (SubStr(filename, 2, 1) != ":")
      filename := A_ScriptDir "\" filename
   if !FileExist(filename) or InStr(FileExist(filename), "D")
   {
      throw { Message: "File Error"
             ,What: "File " . filename . " does not exist"
             ,File: filename}
   }
   VarSetCapacity(GUID, 16)
   DllCall("ole32\CLSIDFromString", "wstr", IID_RandomAccessStream := "{905A0FE1-BC53-11DF-8C49-001E4FC686DA}", "ptr", &GUID)
   DllCall("ShCore\CreateRandomAccessStreamOnFile", "wstr", filename, "uint", Read := 0, "ptr", &GUID, "ptr*", IRandomAccessStream)
   text := ocr(IRandomAccessStream, lang)
   return text
}

GetArea() {
   area := []
   StartSelection(area)
   while !area.w
      Sleep, 100
   Return area
}
   
StartSelection(area) {
   handler := Func("Select").Bind(area)
   Hotkey, LButton, % handler, On
   ReplaceSystemCursors("IDC_CROSS")
}

Select(area) {
   static hGui := CreateSelectionGui()
   Hook := new WindowsHook(WH_MOUSE_LL := 14, "LowLevelMouseProc", hGui)
   Loop {
      KeyWait, LButton
      WinGetPos, X, Y, W, H, ahk_id %hGui%
   } until w > 0
   ReplaceSystemCursors("")
   Hotkey, LButton, Off
   Hook := ""
   Gui, %hGui%:Show, Hide
   for k, v in ["x", "y", "w", "h"]
      area[v] := %v%
}

ReplaceSystemCursors(IDC = "")
{
   static IMAGE_CURSOR := 2, SPI_SETCURSORS := 0x57
        , exitFunc := Func("ReplaceSystemCursors").Bind("")
        , SysCursors := { IDC_APPSTARTING: 32650
                        , IDC_ARROW      : 32512
                        , IDC_CROSS      : 32515
                        , IDC_HAND       : 32649
                        , IDC_HELP       : 32651
                        , IDC_IBEAM      : 32513
                        , IDC_NO         : 32648
                        , IDC_SIZEALL    : 32646
                        , IDC_SIZENESW   : 32643
                        , IDC_SIZENWSE   : 32642
                        , IDC_SIZEWE     : 32644
                        , IDC_SIZENS     : 32645 
                        , IDC_UPARROW    : 32516
                        , IDC_WAIT       : 32514 }
   if !IDC {
      DllCall("SystemParametersInfo", UInt, SPI_SETCURSORS, UInt, 0, UInt, 0, UInt, 0)
      OnExit(exitFunc, 0)
   }
   else  {
      hCursor := DllCall("LoadCursor", Ptr, 0, UInt, SysCursors[IDC], Ptr)
      for k, v in SysCursors  {
         hCopy := DllCall("CopyImage", Ptr, hCursor, UInt, IMAGE_CURSOR, Int, 0, Int, 0, UInt, 0, Ptr)
         DllCall("SetSystemCursor", Ptr, hCopy, UInt, v)
      }
      OnExit(exitFunc)
   }
}

CreateSelectionGui() {
   Gui, New, +hwndhGui +Alwaysontop -Caption +LastFound +ToolWindow +E0x20 -DPIScale
   WinSet, Transparent, 130
   Gui, Color, FFC800
   Return hGui
}

LowLevelMouseProc(nCode, wParam, lParam) {
   static WM_MOUSEMOVE := 0x200, WM_LBUTTONUP := 0x202
        , coords := [], startMouseX, startMouseY, hGui
        , timer := Func("LowLevelMouseProc").Bind("timer", "", "")
   
   if (nCode = "timer") {
      while coords[1] {
         point := coords.RemoveAt(1)
         mouseX := point[1], mouseY := point[2]
         x := startMouseX < mouseX ? startMouseX : mouseX
         y := startMouseY < mouseY ? startMouseY : mouseY
         w := Abs(mouseX - startMouseX)
         h := Abs(mouseY - startMouseY)
         try Gui, %hGUi%: Show, x%x% y%y% w%w% h%h% NA
      }
   }
   else {
      (!hGui && hGui := A_EventInfo)
      if (wParam = WM_LBUTTONUP)
         startMouseX := startMouseY := ""
      if (wParam = WM_MOUSEMOVE)  {
         mouseX := NumGet(lParam + 0, "Int")
         mouseY := NumGet(lParam + 4, "Int")
         if (startMouseX = "") {
            startMouseX := mouseX
            startMouseY := mouseY
         }
         coords.Push([mouseX, mouseY])
         SetTimer, % timer, -10
      }
      Return DllCall("CallNextHookEx", Ptr, 0, Int, nCode, UInt, wParam, Ptr, lParam)
   }
}

class WindowsHook {
   __New(type, callback, eventInfo := "", isGlobal := true) {
      this.callbackPtr := RegisterCallback(callback, "Fast", 3, eventInfo)
      this.hHook := DllCall("SetWindowsHookEx", "Int", type, "Ptr", this.callbackPtr
                                              , "Ptr", !isGlobal ? 0 : DllCall("GetModuleHandle", "UInt", 0, "Ptr")
                                              , "UInt", isGlobal ? 0 : DllCall("GetCurrentThreadId"), "Ptr")
   }
   __Delete() {
      DllCall("UnhookWindowsHookEx", "Ptr", this.hHook)
      DllCall("GlobalFree", "Ptr", this.callBackPtr, "Ptr")
   }
}

HBitmapFromScreen(X, Y, W, H) {
   HDC := DllCall("GetDC", "Ptr", 0, "UPtr")
   HBM := DllCall("CreateCompatibleBitmap", "Ptr", HDC, "Int", W, "Int", H, "UPtr")
   PDC := DllCall("CreateCompatibleDC", "Ptr", HDC, "UPtr")
   DllCall("SelectObject", "Ptr", PDC, "Ptr", HBM)
   DllCall("BitBlt", "Ptr", PDC, "Int", 0, "Int", 0, "Int", W, "Int", H
                   , "Ptr", HDC, "Int", X, "Int", Y, "UInt", 0x00CC0020)
   DllCall("DeleteDC", "Ptr", PDC)
   DllCall("ReleaseDC", "Ptr", 0, "Ptr", HDC)
   Return HBM
}

HBitmapToRandomAccessStream(hBitmap) {
   static IID_IRandomAccessStream := "{905A0FE1-BC53-11DF-8C49-001E4FC686DA}"
        , IID_IPicture            := "{7BF80980-BF32-101A-8BBB-00AA00300CAB}"
        , PICTYPE_BITMAP := 1
        , BSOS_DEFAULT   := 0
        
   DllCall("Ole32\CreateStreamOnHGlobal", "Ptr", 0, "UInt", true, "PtrP", pIStream, "UInt")
   
   VarSetCapacity(PICTDESC, sz := 8 + A_PtrSize*2, 0)
   NumPut(sz, PICTDESC)
   NumPut(PICTYPE_BITMAP, PICTDESC, 4)
   NumPut(hBitmap, PICTDESC, 8)
   riid := CLSIDFromString(IID_IPicture, GUID1)
   DllCall("OleAut32\OleCreatePictureIndirect", "Ptr", &PICTDESC, "Ptr", riid, "UInt", false, "PtrP", pIPicture, "UInt")
   ; IPicture::SaveAsFile
   DllCall(NumGet(NumGet(pIPicture+0) + A_PtrSize*15), "Ptr", pIPicture, "Ptr", pIStream, "UInt", true, "UIntP", size, "UInt")
   riid := CLSIDFromString(IID_IRandomAccessStream, GUID2)
   DllCall("ShCore\CreateRandomAccessStreamOverStream", "Ptr", pIStream, "UInt", BSOS_DEFAULT, "Ptr", riid, "PtrP", pIRandomAccessStream, "UInt")
   ObjRelease(pIPicture)
   ObjRelease(pIStream)
   Return pIRandomAccessStream
}

CLSIDFromString(IID, ByRef CLSID) {
   VarSetCapacity(CLSID, 16, 0)
   if res := DllCall("ole32\CLSIDFromString", "WStr", IID, "Ptr", &CLSID, "UInt")
      throw Exception("CLSIDFromString failed. Error: " . Format("{:#x}", res))
   Return &CLSID
}


ocr(file, lang := "FirstFromAvailableLanguages")
{
   static OcrEngineStatics, OcrEngine, MaxDimension, LanguageFactory, Language, CurrentLanguage, BitmapDecoderStatics, GlobalizationPreferencesStatics
   if (OcrEngineStatics = "")
   {
      CreateClass("Windows.Globalization.Language", ILanguageFactory := "{9B0252AC-0C27-44F8-B792-9793FB66C63E}", LanguageFactory)
      CreateClass("Windows.Graphics.Imaging.BitmapDecoder", IBitmapDecoderStatics := "{438CCB26-BCEF-4E95-BAD6-23A822E58D01}", BitmapDecoderStatics)
      CreateClass("Windows.Media.Ocr.OcrEngine", IOcrEngineStatics := "{5BFFA85A-3384-3540-9940-699120D428A8}", OcrEngineStatics)
      DllCall(NumGet(NumGet(OcrEngineStatics+0)+6*A_PtrSize), "ptr", OcrEngineStatics, "uint*", MaxDimension)   ; MaxImageDimension
   }
   if (file = "ShowAvailableLanguages")
   {
      if (GlobalizationPreferencesStatics = "")
         CreateClass("Windows.System.UserProfile.GlobalizationPreferences", IGlobalizationPreferencesStatics := "{01BF4326-ED37-4E96-B0E9-C1340D1EA158}", GlobalizationPreferencesStatics)
      DllCall(NumGet(NumGet(GlobalizationPreferencesStatics+0)+9*A_PtrSize), "ptr", GlobalizationPreferencesStatics, "ptr*", LanguageList)   ; get_Languages
      DllCall(NumGet(NumGet(LanguageList+0)+7*A_PtrSize), "ptr", LanguageList, "int*", count)   ; count
      loop % count
      {
         DllCall(NumGet(NumGet(LanguageList+0)+6*A_PtrSize), "ptr", LanguageList, "int", A_Index-1, "ptr*", hString)   ; get_Item
         DllCall(NumGet(NumGet(LanguageFactory+0)+6*A_PtrSize), "ptr", LanguageFactory, "ptr", hString, "ptr*", LanguageTest)   ; CreateLanguage
         DllCall(NumGet(NumGet(OcrEngineStatics+0)+8*A_PtrSize), "ptr", OcrEngineStatics, "ptr", LanguageTest, "int*", bool)   ; IsLanguageSupported
         if (bool = 1)
         {
            DllCall(NumGet(NumGet(LanguageTest+0)+6*A_PtrSize), "ptr", LanguageTest, "ptr*", hText)
            buffer := DllCall("Combase.dll\WindowsGetStringRawBuffer", "ptr", hText, "uint*", length, "ptr")
            text .= StrGet(buffer, "UTF-16") "`n"
         }
         ObjRelease(LanguageTest)
      }
      ObjRelease(LanguageList)
      return text
   }
   if (lang != CurrentLanguage) or (lang = "FirstFromAvailableLanguages")
   {
      if (OcrEngine != "")
      {
         ObjRelease(OcrEngine)
         if (CurrentLanguage != "FirstFromAvailableLanguages")
            ObjRelease(Language)
      }
      if (lang = "FirstFromAvailableLanguages")
         DllCall(NumGet(NumGet(OcrEngineStatics+0)+10*A_PtrSize), "ptr", OcrEngineStatics, "ptr*", OcrEngine)   ; TryCreateFromUserProfileLanguages
      else
      {
         CreateHString(lang, hString)
         DllCall(NumGet(NumGet(LanguageFactory+0)+6*A_PtrSize), "ptr", LanguageFactory, "ptr", hString, "ptr*", Language)   ; CreateLanguage
         DeleteHString(hString)
         DllCall(NumGet(NumGet(OcrEngineStatics+0)+9*A_PtrSize), "ptr", OcrEngineStatics, ptr, Language, "ptr*", OcrEngine)   ; TryCreateFromLanguage
      }
      if (OcrEngine = 0)
      {
         msgbox Can not use language "%lang%" for OCR, please install language pack.
         ExitApp
      }
      CurrentLanguage := lang
   }
   IRandomAccessStream := file
   DllCall(NumGet(NumGet(BitmapDecoderStatics+0)+14*A_PtrSize), "ptr", BitmapDecoderStatics, "ptr", IRandomAccessStream, "ptr*", BitmapDecoder)   ; CreateAsync
   WaitForAsync(BitmapDecoder)
   BitmapFrame := ComObjQuery(BitmapDecoder, IBitmapFrame := "{72A49A1C-8081-438D-91BC-94ECFC8185C6}")
   DllCall(NumGet(NumGet(BitmapFrame+0)+12*A_PtrSize), "ptr", BitmapFrame, "uint*", width)   ; get_PixelWidth
   DllCall(NumGet(NumGet(BitmapFrame+0)+13*A_PtrSize), "ptr", BitmapFrame, "uint*", height)   ; get_PixelHeight
   if (width > MaxDimension) or (height > MaxDimension)
   {
      msgbox Image is to big - %width%x%height%.`nIt should be maximum - %MaxDimension% pixels
      ExitApp
   }
   BitmapFrameWithSoftwareBitmap := ComObjQuery(BitmapDecoder, IBitmapFrameWithSoftwareBitmap := "{FE287C9A-420C-4963-87AD-691436E08383}")
   DllCall(NumGet(NumGet(BitmapFrameWithSoftwareBitmap+0)+6*A_PtrSize), "ptr", BitmapFrameWithSoftwareBitmap, "ptr*", SoftwareBitmap)   ; GetSoftwareBitmapAsync
   WaitForAsync(SoftwareBitmap)
   DllCall(NumGet(NumGet(OcrEngine+0)+6*A_PtrSize), "ptr", OcrEngine, ptr, SoftwareBitmap, "ptr*", OcrResult)   ; RecognizeAsync
   WaitForAsync(OcrResult)
   DllCall(NumGet(NumGet(OcrResult+0)+6*A_PtrSize), "ptr", OcrResult, "ptr*", LinesList)   ; get_Lines
   DllCall(NumGet(NumGet(LinesList+0)+7*A_PtrSize), "ptr", LinesList, "int*", count)   ; count
   loop % count
   {
      DllCall(NumGet(NumGet(LinesList+0)+6*A_PtrSize), "ptr", LinesList, "int", A_Index-1, "ptr*", OcrLine)
      DllCall(NumGet(NumGet(OcrLine+0)+7*A_PtrSize), "ptr", OcrLine, "ptr*", hText) 
      buffer := DllCall("Combase.dll\WindowsGetStringRawBuffer", "ptr", hText, "uint*", length, "ptr")
      text .= StrGet(buffer, "UTF-16") "`n"
      ObjRelease(OcrLine)
   }
   Close := ComObjQuery(IRandomAccessStream, IClosable := "{30D5A829-7FA4-4026-83BB-D75BAE4EA99E}")
   DllCall(NumGet(NumGet(Close+0)+6*A_PtrSize), "ptr", Close)   ; Close
   ObjRelease(Close)
   Close := ComObjQuery(SoftwareBitmap, IClosable := "{30D5A829-7FA4-4026-83BB-D75BAE4EA99E}")
   DllCall(NumGet(NumGet(Close+0)+6*A_PtrSize), "ptr", Close)   ; Close
   ObjRelease(Close)
   ObjRelease(IRandomAccessStream)
   ObjRelease(BitmapDecoder)
   ObjRelease(BitmapFrame)
   ObjRelease(BitmapFrameWithSoftwareBitmap)
   ObjRelease(SoftwareBitmap)
   ObjRelease(OcrResult)
   ObjRelease(LinesList)
   return text
}



CreateClass(string, interface, ByRef Class)
{
   CreateHString(string, hString)
   VarSetCapacity(GUID, 16)
   DllCall("ole32\CLSIDFromString", "wstr", interface, "ptr", &GUID)
   result := DllCall("Combase.dll\RoGetActivationFactory", "ptr", hString, "ptr", &GUID, "ptr*", Class)
   if (result != 0)
   {
      if (result = 0x80004002)
         msgbox No such interface supported
      else if (result = 0x80040154)
         msgbox Class not registered
      else
         msgbox error: %result%
      ExitApp
   }
   DeleteHString(hString)
}

CreateHString(string, ByRef hString)
{
    DllCall("Combase.dll\WindowsCreateString", "wstr", string, "uint", StrLen(string), "ptr*", hString)
}

DeleteHString(hString)
{
   DllCall("Combase.dll\WindowsDeleteString", "ptr", hString)
}

WaitForAsync(ByRef Object)
{
   AsyncInfo := ComObjQuery(Object, IAsyncInfo := "{00000036-0000-0000-C000-000000000046}")
   loop
   {
      DllCall(NumGet(NumGet(AsyncInfo+0)+7*A_PtrSize), "ptr", AsyncInfo, "uint*", status)   ; IAsyncInfo.Status
      if (status != 0)
      {
         if (status != 1)
         {
            DllCall(NumGet(NumGet(AsyncInfo+0)+8*A_PtrSize), "ptr", AsyncInfo, "uint*", ErrorCode)   ; IAsyncInfo.ErrorCode
            msgbox AsyncInfo status error: %ErrorCode%
            ExitApp
         }
         ObjRelease(AsyncInfo)
         break
      }
      sleep 10
   }
   DllCall(NumGet(NumGet(Object+0)+8*A_PtrSize), "ptr", Object, "ptr*", ObjectResult)   ; GetResults
   ObjRelease(Object)
   Object := ObjectResult
}
doubledave22
Posts: 343
Joined: 08 Jun 2019, 17:36

Re: Optical character recognition (OCR) with UWP API

13 Aug 2021, 10:57

Hi, every so often (maybe once every 50-100 scans) I get a full script crash when using OCR. I was finally able to capture the exit code: 3221226356 which is 0xc0000374 in hex or STATUS_HEAP_CORRUPTION. Do you have any ideas what might cause this and how I could help to debug/prevent it?
User avatar
submeg
Posts: 326
Joined: 14 Apr 2017, 20:39
Contact:

Re: Optical character recognition (OCR) with UWP API

05 Sep 2021, 06:10

Joe Glines wrote:
24 Mar 2020, 08:56
I incorporated the OCR to my screen clipping tool! Thanks for the work everyone!
https://the-automator.com/update-to-screen-clipping-tool-with-built-in-ocr-on-windows-10-machines/
Now I know which one you are using Joe! I was about to ask, which one is most accurate OCR...seems to be this one.
____________________________________
Check out my site, submeg.com
Connect with me on LinkedIn
Courses on AutoHotkey :ugeek:
User avatar
Joe Glines
Posts: 770
Joined: 30 Sep 2013, 20:49
Location: Dallas
Contact:

Re: Optical character recognition (OCR) with UWP API

05 Sep 2021, 07:40

@submeg My current script uses the built-in Windows 10 OCR. We also added that the user can select their language and choose a default translation language (using Google Translate) I think the link you provided goes to the same tool but this one has a better explanation of what's available. https://www.the-automator.com/window-snipping/

We're working on some additions like: Being able to: 1) draw an arrow, circle, square 2) being able to type text 3) being able to change the size of the clip. It should be pretty amazing!
Sign-up for the 🅰️HK Newsletter

ImageImageImageImage:clap:
AHK Tutorials:Web Scraping | | Webservice APIs | AHK and Excel | Chrome | RegEx | Functions
Training: AHK Webinars Courses on AutoHotkey :ugeek:
YouTube

:thumbup: Quick Access Popup, the powerful Windows folders, apps and documents launcher!
Janusz
Posts: 89
Joined: 18 Dec 2020, 17:47

Re: Optical character recognition (OCR) with UWP API

21 Sep 2021, 14:15

I would like to publicly congratulate MR @malcev on this post from February 2020. His code is still working like A charm and his code can help many visually impaired developers or advanced users. Why? Because code can be used inside revolutionary OCR app which uses WIA directly. For what? For scanning books and to read them and convert graphical input to The clean ASCII .txt file. So I can only publicly write. Dear MR malcev. Very very well done.
Albert Schenning
Posts: 23
Joined: 03 Jan 2022, 14:40

Re: Optical character recognition (OCR) with UWP API

03 Jan 2022, 16:36

Lines with 1 character are not recognized. Any idea why ??

I have tested it with both the scripts from malcev and teadrinker. Both fail.
test.jpg
test.jpg (18.71 KiB) Viewed 4105 times
result.jpg
result.jpg (7.33 KiB) Viewed 4105 times
User avatar
labrint
Posts: 379
Joined: 14 Jun 2017, 05:06
Location: Malta

Re: Optical character recognition (OCR) with UWP API

04 Jan 2022, 06:09

Is there a way to keep the table structure in an image using this API? I'm getting line breaks for each cell.
malcev
Posts: 1769
Joined: 12 Aug 2014, 12:37

Re: Optical character recognition (OCR) with UWP API

04 Jan 2022, 06:33

labrint, You can get coordinates of founded word and depending of them construct output text.
viewtopic.php?p=318651#p318651
User avatar
labrint
Posts: 379
Joined: 14 Jun 2017, 05:06
Location: Malta

Re: Optical character recognition (OCR) with UWP API

04 Jan 2022, 06:50

I get what you mean, but I have invoices with lots of columns @malcev . This is really really good. Its a pity there is no easy way to structure the data.

I would suggest:
1) autorotation of the image till OCR is possible
2) table format
3) something to deal with italicized text as italics "l" becomes backslash etc.

Let me give you the background story of what I'm doing.

1. I needed to work on a project for a wholesaler who needs his invoices printed and sent via email to the client.
2. His invoicing software doesn't combine these functions.
3. I used Bullzip PDF printer to print to PDF from their software and then built a system that watches a folder for PDFs and physically prints them.
4. Then I could not extract the text easily from PDF because it was an Image PDF so I made bullzip print an image and use your code to get the text.
5. The text file generated is checked for emails that are not the wholesalers (client emails) and using this, the images are sent to the client.

So far so good. Some clients need an excel output not an image. This makes the text output structure important to also attach to the email.
malcev
Posts: 1769
Joined: 12 Aug 2014, 12:37

Re: Optical character recognition (OCR) with UWP API

04 Jan 2022, 07:23

May be for You is better to use tesseract and play with TessPageIteratorLevel.
https://tesseract.patagames.com/help/html/T_Patagames_Ocr_Enums_PageSegMode.htm
Also You can get html table with words and their position with their api.
https://tesseract.patagames.com/help/html/M_Patagames_Ocr_OcrApi_GetHOCRText.htm
User avatar
labrint
Posts: 379
Joined: 14 Jun 2017, 05:06
Location: Malta

Re: Optical character recognition (OCR) with UWP API

04 Jan 2022, 09:55

@malcev I tried using Vis2OCR which uses tesseract, put all libraries inside the AHK file, but it requires some external files like leptonica and tesseract. Could not get it to work.

Yours is super easy.. just paste the code in the mother AHK script and call the function and its done. Pity.
stretchymantis
Posts: 10
Joined: 13 Nov 2021, 22:24

Re: Optical character recognition (OCR) with UWP API

08 Jan 2022, 17:33

Hi everyone,
Pretty new to Autohotkey and programming in general, but been snooping around here a bit to learn. It's an awesome community here & nice to see so many knowledgeable people selflessly helping a constant influx of stumbling-drunk newbies (points to self) with often zany problems and requests. Here's hoping mine's a simple one:

This OCR code is just incredible. When using it, I'll sometimes have results that mistakenly return diacritic characters instead of standard English alphanumerical characters such as: Ø, Ä, ¯, etc... I've managed to mostly fix them through code and learning about RegExMatch/RegExReplace, but get stragglers that slip through occasionally.

So I was wondering if there is a way to tell the OCR script to not return diacritic characters in the results? And if not, then to dissuade from or otherwise minimize their frequency from the results somehow?

Thank You!!!
Jason
malcev
Posts: 1769
Joined: 12 Aug 2014, 12:37

Re: Optical character recognition (OCR) with UWP API

08 Jan 2022, 18:51

It should not return diacritic characters.
Do You set English language in ocr function, like this?

Code: Select all

ocr("test.jpg", "en")
stretchymantis
Posts: 10
Joined: 13 Nov 2021, 22:24

Re: Optical character recognition (OCR) with UWP API

09 Jan 2022, 18:29

Thanks for replying. I do have the language set to "en". To be sure, I double-checked and then added msgbox, OCR(showavailablelanguages) which resulted in "en-US" as the only listing.
That got me to thinking though about how the OCR may be 'seeing' the characters. I use VSCode and am primarily testing the OCR in the editor on a commented out line with a specific string. But I'm using a dark theme and Consolas font, so for kicks I changed to a light theme and DejaVu Sans Mono font. So far, it has not returned any diacritics. And your response directly led me to that, so thank you. I also found a script by @j-t-r and @lexikos (thank you) on this forum that removes accented characters, so ultimately I think I can consider my issue resolved mostly.

That said, I am still curious if it's not supposed to return any diacritics, why it does. If I run the OCR on, say, this "ẨẩẬậẮắẰằ", then I will still get exactly that back. Just if you (or anyone) happen to know and feel like satiating my curiosity. :)
malcev
Posts: 1769
Joined: 12 Aug 2014, 12:37

Re: Optical character recognition (OCR) with UWP API

10 Jan 2022, 15:44

Now I understand, I thought that You get diacritics characters from standard English alphanumerical characters (instead of A You get Ẩ).
User avatar
SteveMylo
Posts: 233
Joined: 22 Jun 2021, 00:50
Location: Australia
Contact:

Re: Optical character recognition (OCR) with UWP API

07 Feb 2022, 03:49

Need HELP, Hi OCR crew....
I have made slight adjustments where I type my word in the script if (text = "loop") for OCR to find , which is what I want. ( I didn't want search boxes) & then the mouse moves there. GREAT!

Code: Select all

if (text = "loop") 
         {
              MouseMove, X, Y, 5
	 }
But Can anyone please help?? I really need to just make my mouse move to one of the instances, say... the 2nd instance found only. :crazy:
There are 2 words containing "loop" found and I only want to move it to the 2nd "loop" word found for example.


Anyways..., I'll paste my full OCR script that ya'll are familiar with below which has my adjustments that just incorporates a mousemove.
If anyone can find a way to choose where the mouse moves if the OCR finds multiple instances that would be swell!
*oh, and if anyone is super keen, adjusting the search range does not work at all in any Coordmode. It'll find the Text, but the mousemoves in a completely different area of the screen.

Code: Select all

#NoEnv
#SingleInstance,Force
Coordmode, Mouse, Screen   ;by default it is relative to window
SetBatchLines, -1

v::

;~ InputBox, Find_Text,Text to search, Enter Text to find,,300,130 

hBitmap := HBitmapFromScreen(0, 0, A_ScreenWidth, A_ScreenHeight)   ;  if I change the search range, the mouse moves in a completely different area which is wired. 
pIRandomAccessStream := HBitmapToRandomAccessStream(hBitmap)
ocr(pIRandomAccessStream, "en", Find_Text)

return


HBitmapFromScreen(X, Y, W, H) {
   HDC := DllCall("GetDC", "Ptr", 0, "UPtr")
   HBM := DllCall("CreateCompatibleBitmap", "Ptr", HDC, "Int", W, "Int", H, "UPtr")
   PDC := DllCall("CreateCompatibleDC", "Ptr", HDC, "UPtr")
   DllCall("SelectObject", "Ptr", PDC, "Ptr", HBM)
   DllCall("BitBlt", "Ptr", PDC, "Int", 0, "Int", 0, "Int", W, "Int", H
                   , "Ptr", HDC, "Int", X, "Int", Y, "UInt", 0x00CC0020)
   DllCall("DeleteDC", "Ptr", PDC)
   DllCall("ReleaseDC", "Ptr", 0, "Ptr", HDC)
   Return HBM
}

HBitmapToRandomAccessStream(hBitmap) {
   static IID_IRandomAccessStream := "{905A0FE1-BC53-11DF-8C49-001E4FC686DA}"
        , IID_IPicture            := "{7BF80980-BF32-101A-8BBB-00AA00300CAB}"
        , PICTYPE_BITMAP := 1
        , BSOS_DEFAULT   := 0
        
   DllCall("Ole32\CreateStreamOnHGlobal", "Ptr", 0, "UInt", true, "PtrP", pIStream, "UInt")
   
   VarSetCapacity(PICTDESC, sz := 8 + A_PtrSize*2, 0)
   NumPut(sz, PICTDESC)
   NumPut(PICTYPE_BITMAP, PICTDESC, 4)
   NumPut(hBitmap, PICTDESC, 8)
   riid := CLSIDFromString(IID_IPicture, GUID1)
   DllCall("OleAut32\OleCreatePictureIndirect", "Ptr", &PICTDESC, "Ptr", riid, "UInt", false, "PtrP", pIPicture, "UInt")
   ; IPicture::SaveAsFile
   DllCall(NumGet(NumGet(pIPicture+0) + A_PtrSize*15), "Ptr", pIPicture, "Ptr", pIStream, "UInt", true, "UIntP", size, "UInt")
   riid := CLSIDFromString(IID_IRandomAccessStream, GUID2)
   DllCall("ShCore\CreateRandomAccessStreamOverStream", "Ptr", pIStream, "UInt", BSOS_DEFAULT, "Ptr", riid, "PtrP", pIRandomAccessStream, "UInt")
   ObjRelease(pIPicture)
   ObjRelease(pIStream)
   Return pIRandomAccessStream
}

CLSIDFromString(IID, ByRef CLSID) {
   VarSetCapacity(CLSID, 16, 0)
   if res := DllCall("ole32\CLSIDFromString", "WStr", IID, "Ptr", &CLSID, "UInt")
      throw Exception("CLSIDFromString failed. Error: " . Format("{:#x}", res))
   Return &CLSID
}







ocr(file, lang := "en", mytext := "")
{
   static OcrEngineStatics, OcrEngine, MaxDimension, LanguageFactory, Language, CurrentLanguage, BitmapDecoderStatics, GlobalizationPreferencesStatics
   if (OcrEngineStatics = "")
   {
      CreateClass("Windows.Globalization.Language", ILanguageFactory := "{9B0252AC-0C27-44F8-B792-9793FB66C63E}", LanguageFactory)
      CreateClass("Windows.Graphics.Imaging.BitmapDecoder", IBitmapDecoderStatics := "{438CCB26-BCEF-4E95-BAD6-23A822E58D01}", BitmapDecoderStatics)
      CreateClass("Windows.Media.Ocr.OcrEngine", IOcrEngineStatics := "{5BFFA85A-3384-3540-9940-699120D428A8}", OcrEngineStatics)
      DllCall(NumGet(NumGet(OcrEngineStatics+0)+6*A_PtrSize), "ptr", OcrEngineStatics, "uint*", MaxDimension)   ; MaxImageDimension
   }
   if (file = "ShowAvailableLanguages")
   {
      if (GlobalizationPreferencesStatics = "")
         CreateClass("Windows.System.UserProfile.GlobalizationPreferences", IGlobalizationPreferencesStatics := "{01BF4326-ED37-4E96-B0E9-C1340D1EA158}", GlobalizationPreferencesStatics)
      DllCall(NumGet(NumGet(GlobalizationPreferencesStatics+0)+9*A_PtrSize), "ptr", GlobalizationPreferencesStatics, "ptr*", LanguageList)   ; get_Languages
      DllCall(NumGet(NumGet(LanguageList+0)+7*A_PtrSize), "ptr", LanguageList, "int*", count)   ; count
      loop % count
      {
         DllCall(NumGet(NumGet(LanguageList+0)+6*A_PtrSize), "ptr", LanguageList, "int", A_Index-1, "ptr*", hString)   ; get_Item
         DllCall(NumGet(NumGet(LanguageFactory+0)+6*A_PtrSize), "ptr", LanguageFactory, "ptr", hString, "ptr*", LanguageTest)   ; CreateLanguage
         DllCall(NumGet(NumGet(OcrEngineStatics+0)+8*A_PtrSize), "ptr", OcrEngineStatics, "ptr", LanguageTest, "int*", bool)   ; IsLanguageSupported
         if (bool = 1)
         {
            DllCall(NumGet(NumGet(LanguageTest+0)+6*A_PtrSize), "ptr", LanguageTest, "ptr*", hText)
            buffer := DllCall("Combase.dll\WindowsGetStringRawBuffer", "ptr", hText, "uint*", length, "ptr")
            text .= StrGet(buffer, "UTF-16") "`n"
         }
         ObjRelease(LanguageTest)
      }
      ObjRelease(LanguageList)
      return text
   }
   if (lang != CurrentLanguage) or (lang = "FirstFromAvailableLanguages")
   {
      if (OcrEngine != "")
      {
         ObjRelease(OcrEngine)
         if (CurrentLanguage != "FirstFromAvailableLanguages")
            ObjRelease(Language)
      }
      if (lang = "FirstFromAvailableLanguages")
         DllCall(NumGet(NumGet(OcrEngineStatics+0)+10*A_PtrSize), "ptr", OcrEngineStatics, "ptr*", OcrEngine)   ; TryCreateFromUserProfileLanguages
      else
      {
         CreateHString(lang, hString)
         DllCall(NumGet(NumGet(LanguageFactory+0)+6*A_PtrSize), "ptr", LanguageFactory, "ptr", hString, "ptr*", Language)   ; CreateLanguage
         DeleteHString(hString)
         DllCall(NumGet(NumGet(OcrEngineStatics+0)+9*A_PtrSize), "ptr", OcrEngineStatics, ptr, Language, "ptr*", OcrEngine)   ; TryCreateFromLanguage
      }
      if (OcrEngine = 0)
      {
         msgbox Can not use language "%lang%" for OCR, please install language pack.
         ExitApp
      }
      CurrentLanguage := lang
   }
   IRandomAccessStream := file
   DllCall(NumGet(NumGet(BitmapDecoderStatics+0)+14*A_PtrSize), "ptr", BitmapDecoderStatics, "ptr", IRandomAccessStream, "ptr*", BitmapDecoder)   ; CreateAsync
   WaitForAsync(BitmapDecoder)
   BitmapFrame := ComObjQuery(BitmapDecoder, IBitmapFrame := "{72A49A1C-8081-438D-91BC-94ECFC8185C6}")
   DllCall(NumGet(NumGet(BitmapFrame+0)+12*A_PtrSize), "ptr", BitmapFrame, "uint*", width)   ; get_PixelWidth
   DllCall(NumGet(NumGet(BitmapFrame+0)+13*A_PtrSize), "ptr", BitmapFrame, "uint*", height)   ; get_PixelHeight
   if (width > MaxDimension) or (height > MaxDimension)
   {
      msgbox Image is to big - %width%x%height%.`nIt should be maximum - %MaxDimension% pixels
      ExitApp
   }
   BitmapFrameWithSoftwareBitmap := ComObjQuery(BitmapDecoder, IBitmapFrameWithSoftwareBitmap := "{FE287C9A-420C-4963-87AD-691436E08383}")
   DllCall(NumGet(NumGet(BitmapFrameWithSoftwareBitmap+0)+6*A_PtrSize), "ptr", BitmapFrameWithSoftwareBitmap, "ptr*", SoftwareBitmap)   ; GetSoftwareBitmapAsync
   WaitForAsync(SoftwareBitmap)
   DllCall(NumGet(NumGet(OcrEngine+0)+6*A_PtrSize), "ptr", OcrEngine, ptr, SoftwareBitmap, "ptr*", OcrResult)   ; RecognizeAsync
   WaitForAsync(OcrResult)
   DllCall(NumGet(NumGet(OcrResult+0)+6*A_PtrSize), "ptr", OcrResult, "ptr*", LinesList)   ; get_Lines
   DllCall(NumGet(NumGet(LinesList+0)+7*A_PtrSize), "ptr", LinesList, "int*", count)   ; count


   loop % count
   {
      DllCall(NumGet(NumGet(LinesList+0)+6*A_PtrSize), "ptr", LinesList, "int", A_Index-1, "ptr*", OcrLine)
      DllCall(NumGet(NumGet(OcrLine+0)+6*A_PtrSize), "ptr", OcrLine, "ptr*", WordsList)   ; get_Words
      DllCall(NumGet(NumGet(WordsList+0)+7*A_PtrSize), "ptr", WordsList, "int*", WordsCount)   ; Words count
   
   
      loop % WordsCount
  
      {
      
         DllCall(NumGet(NumGet(WordsList+0)+6*A_PtrSize), "ptr", WordsList, "int", A_Index-1, "ptr*", OcrWord)
         VarSetCapacity(RECT, 16, 0)
         DllCall(NumGet(NumGet(OcrWord+0)+6*A_PtrSize), "ptr", OcrWord, "ptr", &RECT)   ; get_BoundingRect
         DllCall(NumGet(NumGet(OcrWord+0)+7*A_PtrSize), "ptr", OcrWord, "ptr*", hText)   ; get_Text
         buffer := DllCall("Combase.dll\WindowsGetStringRawBuffer", "ptr", hText, "uint*", length, "ptr")
         x := NumGet(&RECT, 0, "float")
         y := NumGet(&RECT, 4, "float")
         w := NumGet(&RECT, 8, "float")
         h := NumGet(&RECT, 12, "float")
         text := StrGet(buffer, "UTF-16")
            
         
         ;~ msgbox %text%`nx: %x%, y: %y%, w: %w%, h: %h%
           if (text = "loop") 
         
				{
                   MouseMove, X, Y, 5
				}
           
         ObjRelease(OcrWord)
       
      }
      ObjRelease(WordsList)
      ObjRelease(OcrLine)
   }
   Close := ComObjQuery(IRandomAccessStream, IClosable := "{30D5A829-7FA4-4026-83BB-D75BAE4EA99E}")
   DllCall(NumGet(NumGet(Close+0)+6*A_PtrSize), "ptr", Close)   ; Close
   ObjRelease(Close)
   Close := ComObjQuery(SoftwareBitmap, IClosable := "{30D5A829-7FA4-4026-83BB-D75BAE4EA99E}")
   DllCall(NumGet(NumGet(Close+0)+6*A_PtrSize), "ptr", Close)   ; Close
   ObjRelease(Close)
   ObjRelease(IRandomAccessStream)
   ObjRelease(BitmapDecoder)
   ObjRelease(BitmapFrame)
   ObjRelease(BitmapFrameWithSoftwareBitmap)
   ObjRelease(SoftwareBitmap)
   ObjRelease(OcrResult)
   ObjRelease(LinesList)

   ;~ ExitApp
   return text
}



CreateClass(string, interface, ByRef Class)
{
   CreateHString(string, hString)
   VarSetCapacity(GUID, 16)
   DllCall("ole32\CLSIDFromString", "wstr", interface, "ptr", &GUID)
   result := DllCall("Combase.dll\RoGetActivationFactory", "ptr", hString, "ptr", &GUID, "ptr*", Class, "uint")
   if (result != 0)
   {
      if (result = 0x80004002)
         msgbox No such interface supported
      else if (result = 0x80040154)
         msgbox Class not registered
      else
         msgbox error: %result%
      ExitApp
   }
   DeleteHString(hString)
}

CreateHString(string, ByRef hString)
{
    DllCall("Combase.dll\WindowsCreateString", "wstr", string, "uint", StrLen(string), "ptr*", hString)
}

DeleteHString(hString)
{
   DllCall("Combase.dll\WindowsDeleteString", "ptr", hString)
}

WaitForAsync(ByRef Object)
{
   AsyncInfo := ComObjQuery(Object, IAsyncInfo := "{00000036-0000-0000-C000-000000000046}")
   loop
   {
      DllCall(NumGet(NumGet(AsyncInfo+0)+7*A_PtrSize), "ptr", AsyncInfo, "uint*", status)   ; IAsyncInfo.Status
      if (status != 0)
      {
         if (status != 1)
         {
            DllCall(NumGet(NumGet(AsyncInfo+0)+8*A_PtrSize), "ptr", AsyncInfo, "uint*", ErrorCode)   ; IAsyncInfo.ErrorCode
            msgbox AsyncInfo status error: %ErrorCode%
            ExitApp
         }
         ObjRelease(AsyncInfo)
         break
      }
      sleep 10
   }
   DllCall(NumGet(NumGet(Object+0)+8*A_PtrSize), "ptr", Object, "ptr*", ObjectResult)   ; GetResults
   ObjRelease(Object)
   Object := ObjectResult
}






~Esc::ExitApp
Last edited by SteveMylo on 07 Feb 2022, 12:29, edited 1 time in total.
malcev
Posts: 1769
Joined: 12 Aug 2014, 12:37

Re: Optical character recognition (OCR) with UWP API

07 Feb 2022, 11:49

Add variable that counts instances and move mouse inside function.
ocr searches always from x= 0 and y = 0 coordinates, therefore if You send to it different starting points then just plus them to founded values.
User avatar
SteveMylo
Posts: 233
Joined: 22 Jun 2021, 00:50
Location: Australia
Contact:

Re: Optical character recognition (OCR) with UWP API

07 Feb 2022, 12:37

malcev wrote:
07 Feb 2022, 11:49
Add variable that counts instances and move mouse inside function.
Thanks Malcev. Although i feel I’m learning fast, how to add a variable to count instances has alluded me. Are you able to show me an example.
It’ll be a huge help. Then i’ll learn from there
malcev
Posts: 1769
Joined: 12 Aug 2014, 12:37

Re: Optical character recognition (OCR) with UWP API

07 Feb 2022, 12:44

Code: Select all

if (text = "loop") 
   n++
User avatar
SteveMylo
Posts: 233
Joined: 22 Jun 2021, 00:50
Location: Australia
Contact:

Re: Optical character recognition (OCR) with UWP API

09 Feb 2022, 07:14

@malcev Thanks, I spent a few hours bit i still don’t know how to incorporate the mousemove with N++.
Also all my tests had the mouse moving to every single word on the entire screen haha.
I only need it to go to one chosen instance found. 😌

Return to “Scripts and Functions (v1)”

Who is online

Users browsing this forum: No registered users and 142 guests