so, I need to write a find and replace, which I thought a ps1 could. However, ps1 files don't do what you ask, randomly breaks, tries to be non humanly readable, and replace things I never asked it to replace and doesn't replace stuff I asked it to.
I need to port it to an ahk file, which was working splendidly before it broke for not apparent reason: Here is my draft. It was working fine, I addeded severeral lines, every thing executed, now it writes a 3 byte txt. nothing works as written. Got to be a typo, but not seeing it.[*]
Code: Select all
FileRead, content, D:\Program Files\Tesseract-OCR\whiteimages\doc\Book.txt
; Replace lines that contain only numbers with an empty string
content := RegExReplace(content, "m)^\d+$", "")
; Replace different types of line breaks with spaces
content := StrReplace(content, "`n", " ")
content := RegExReplace(content, "_", " ")
content := RegExReplace(content, "°", "")
content := RegExReplace(content, "â", "")
content := RegExReplace(content, "€", "")
content := RegExReplace(content, "Â", "")
content := RegExReplace(content, "§", "")
content := RegExReplace(content, "¶", "")
; Add your replacement rules here
content := RegExReplace(content, "i) t ", " I ")
content := RegExReplace(content, "/bi)t/b", " I ")
content := RegExReplace(content, "i) td ", " I'd ")
content := RegExReplace(content, "i) id ", " I'd ")
content := RegExReplace(content, "i) t'll ", " I'll ")
content := RegExReplace(content, "i) tt's ", " It's ")
content := RegExReplace(content, "i) tts ", " Its ")
content := RegExReplace(content, "i) ts ", " Is ")
content := RegExReplace(content, "i) tm ", " i'm ")
content := RegExReplace(content, "i) t'm ", " i'm ")
content := RegExReplace(content, "i) 0 ", " to ")
content := RegExReplace(content, "i) o ", " to ")
content := RegExReplace(content, "i) ollow", " follow")
content := RegExReplace(content, "i) ailed", " failed")
content := RegExReplace(content, "i) ry ", " try ")
content := RegExReplace(content, "i) ell ", " tell ")
content := RegExReplace(content, "i) ime ", " time ")
content := RegExReplace(content, "i) ove ", " love ")
content := RegExReplace(content, "i) oved ", " loved ")
content := RegExReplace(content, "i) ost ", " lost ")
content := RegExReplace(content, "i) ose ", " lose ")
content := RegExReplace(content, "i) oss ", " loss ")
content := RegExReplace(content, "i) oser ", " loser ")
content := RegExReplace(content, "i) tself ", " itself ")
content := RegExReplace(content, "i) oom", " loom")
content := RegExReplace(content, "i) ight", " light")
content := RegExReplace(content, "i) ike ", " like ")
content := RegExReplace(content, "i) iked ", " liked ")
content := RegExReplace(content, "i) ive ", " live ")
content := RegExReplace(content, "i) ived ", " lived ")
content := RegExReplace(content, "i) hing ", " thing ")
content := RegExReplace(content, "i) hings ", " things ")
content := RegExReplace(content, "i) ook", " look")
content := RegExReplace(content, "i) elevision", " Television")
content := RegExReplace(content, "i) she old ", " she told")
content := RegExReplace(content, "i) he old ", " he told")
content := RegExReplace(content, "i) amily", " family")
content := RegExReplace(content, "i) ound", " found")
content := RegExReplace(content, "i)\\bind\\b", " find")
content := RegExReplace(content, "i)\\binds\\b", " finds")
content := RegExReplace(content, "i)\\binding\\b", " finding")
content := RegExReplace(content, "i)\\bindings\\b", " findings ")
content := RegExReplace(content, "i)\\briend\\b", " friend")
content := RegExReplace(content, "i)\\bherefore\\b", " therefore")
content := RegExReplace(content, "i) actory", " factory")
content := RegExReplace(content, "i) rom", " from")
content := RegExReplace(content, "i) ong", " long")
content := RegExReplace(content, "i) ive", " live")
content := RegExReplace(content, "i) -ive", " lived")
content := RegExReplace(content, "i) ind", " find")
content := RegExReplace(content, "i) augh", " laugh")
content := RegExReplace(content, "i) isten", " listen")
content := RegExReplace(content, "i) t'll", " I'll")
content := RegExReplace(content, "i) ix", " fix")
content := RegExReplace(content, "i) irst", " first")
; Fix contractions
content := RegExReplace(content, "i)\byoud\b", "you'd")
content := RegExReplace(content, "i)\bwhod\b", "who'd")
content := RegExReplace(content, "i)\bhed\b", "he'd")
content := RegExReplace(content, "i)\bshes\b", "she's")
content := RegExReplace(content, "i)\bwerent\b", "weren't")
content := RegExReplace(content, "i)\bcant\b", "can't")
content := RegExReplace(content, "i)\bdont\b", "don't")
content := RegExReplace(content, "i)oulnt\b", "ouldn't")
content := RegExReplace(content, "i)oulnt\b", "ouldn't")
content := RegExReplace(content, "i)\bcant\b", " can't")
content := RegExReplace(content, "i)\bid\b", " I'd")
content := RegExReplace(content, "i)\bill\b", " I'll")
content := RegExReplace(content, "i)\bhell\b", " he'll")
content := RegExReplace(content, "i)\bshell\b", " she'll")
content := RegExReplace(content, "i)\bhed\b", " he'd")
content := RegExReplace(content, "i)\bshed\b", " she'd")
content := RegExReplace(content, "\bIll\b", " I'll ")
; conditinal
content := RegExReplace(content, "i)(?<!black|fat|skinny|white|red|your|our|his|big|small|yello|ugly|my|the|a|her|their) hem ", " them ")
content := RegExReplace(content, "i)(?<!\b(black|fat|skinny|white|red|your|our|his|big|small|yellow|ugly|the|a|her|my|their|mother)\b) \bhen\b ", " then ")
content := RegExReplace(content, "i)(?<!black|white|red|your|our|his|my|blue|big|small|top|yello|ugly|the|a|her|their|brim|brimmed|new|fancy|straw|green|woolen|felt|my|Cowboy|Fedora|Panama|Baseball|Bowler|Sun|Beanie|Derby|Trilby|Checked|Plaid|Floral|Polka-dot|Wool|Silk|Leather|Knit|Felt|floppy|Safety|Hard|Winter|Summer) hat ", " that ")
content := RegExReplace(content, " st\.", " Saint")
content := RegExReplace(content, " ph\.d\.", " PHD")
content := RegExReplace(content, " dr\.", " Doctor")
content := RegExReplace(content, " mr\.", " Mister")
content := RegExReplace(content, " mrs\.", " Missus")
content := RegExReplace(content, " ms\.", " Miss")
content := RegExReplace(content, " jr\.", " Junior")
content := RegExReplace(content, " sr\.", " Senior")
content := RegExReplace(content, " co\.", " Company")
content := RegExReplace(content, " inc\.", " Incorporated")
content := RegExReplace(content, " ltd\.", " Limited")
content := RegExReplace(content, " intl\.", " International")
content := RegExReplace(content, " prof\.", " Professor")
content := RegExReplace(content, " gov\.", " Governor")
content := RegExReplace(content, " capt\.", " Captain")
content := RegExReplace(content, " sgt\.", " Sergeant")
content := RegExReplace(content, " corp\.", " Corporation")
content := RegExReplace(content, " ave\.", " Avenue")
content := RegExReplace(content, " blvd\.", " Boulevard")
content := RegExReplace(content, " ft\.", " Fort")
content := RegExReplace(content, " mt\.", " Mount")
content := RegExReplace(content, " ln\.", " Lane")
content := RegExReplace(content, " rd\.", " Road")
content := RegExReplace(content, " etc\.", " Etcetera")
content := RegExReplace(content, " esp\.", " Especially")
content := RegExReplace(content, " e\.g\.", " For example")
content := RegExReplace(content, " i\.e\.", " That is")
content := RegExReplace(content, "i) p\.(?=\s?\d)", " Page")
content := RegExReplace(content, " pp\.", " Pages")
content := RegExReplace(content, " par\.", " Paragraph")
content := RegExReplace(content, " vol\.", " Volume")
content := RegExReplace(content, " lb\.", " Pound")
content := RegExReplace(content, " oz\.", " Ounce")
content := RegExReplace(content, " gal\.", " Gallon")
content := RegExReplace(content, " qt\.", " Quart")
content := RegExReplace(content, " pt\.", " Pint")
content := RegExReplace(content, " yd\.", " Yard")
content := RegExReplace(content, "i)(?<=\d\s?)in\.", " Inch")
content := RegExReplace(content, " ft\.", " Foot")
content := RegExReplace(content, " mi\.", " Mile")
content := RegExReplace(content, " mm\.", " Millimeter")
content := RegExReplace(content, " cm\.", " Centimeter")
content := RegExReplace(content, "i)(?<=\d\s?)m\.", " Meter")
content := RegExReplace(content, " km\.", " Kilometer")
content := RegExReplace(content, " mg\.", " Milligram")
content := RegExReplace(content, "i)(?<=\d\s?)g\.", " Gram")
content := RegExReplace(content, " kg\.", " Kilogram")
content := RegExReplace(content, "i)(?<=\d\s?)l\.", " Liter")
content := RegExReplace(content, " ml\.", " Milliliter")
content := RegExReplace(content, " tbsp\.", " Tablespoon")
content := RegExReplace(content, " tsp\.", " Teaspoon")
content := RegExReplace(content, " sq\.", " Square")
content := RegExReplace(content, "i)\\byoud\\b", "you'd")
content := RegExReplace(content, "i)\\bwhod\\b", "who'd")
content := RegExReplace(content, "i)\\bhed\\b", "he'd")
content := RegExReplace(content, "i)\\bshes\\b", "she's")
content := RegExReplace(content, "i)\\bwerent\\b", "weren't")
content := RegExReplace(content, "i)\\bcant\\b", "can't")
content := RegExReplace(content, "i)\\bdont\\b", "don't")
content := RegExReplace(content, "i)\\barent\\b", "aren't")
content := RegExReplace(content, "i)\\bcant\\b", "can't")
content := RegExReplace(content, "i)\\bcouldve\\b", "could've")
content := RegExReplace(content, "i)\\bcouldnt\\b", "couldn't")
content := RegExReplace(content, "i)\\bdidnt\\b", "didn't")
content := RegExReplace(content, "i)\\bdoesnt\\b", "doesn't")
content := RegExReplace(content, "i)\\bdont\\b", "don't")
content := RegExReplace(content, "i)\\bhadnt\\b", "hadn't")
content := RegExReplace(content, "i)\\bhasnt\\b", "hasn't")
content := RegExReplace(content, "i)\\bhavent\\b", "haven't")
content := RegExReplace(content, "i)\\bhed\\b", "he'd")
content := RegExReplace(content, "i)\\bhes\\b", "he's")
content := RegExReplace(content, "i)\\bhowd\\b", "how'd")
content := RegExReplace(content, "i)\\bhowll\\b", "how'll")
content := RegExReplace(content, "i)\\bhowre\\b", "how're")
content := RegExReplace(content, "i)\\bhowve\\b", "how've")
content := RegExReplace(content, "i)\\bId\\b", "I'd")
content := RegExReplace(content, "i)\\bIll\\b", "I'll")
content := RegExReplace(content, "i)\\bIm\\b", "I'm")
content := RegExReplace(content, "i)\\bim\\b", "I'm")
content := RegExReplace(content, "i)\\bIve\\b", "I've")
content := RegExReplace(content, "i)\\bisnt\\b", "isn't")
content := RegExReplace(content, "i)\\bitd\\b", "it'd")
content := RegExReplace(content, "i)\\bitll\\b", "it'll")
content := RegExReplace(content, "i)\\bits\\b", "it's")
content := RegExReplace(content, "i)\\bIve\\b", "I've")
content := RegExReplace(content, "i)\\bmightve\\b", "might've")
content := RegExReplace(content, "i)\\bmustve\\b", "must've")
content := RegExReplace(content, "i)\\bmustnt\\b", "mustn't")
content := RegExReplace(content, "i)\\bshant\\b", "shan't")
content := RegExReplace(content, "i)\\bshed\\b", "she'd")
content := RegExReplace(content, "i)\\bshes\\b", "she's")
content := RegExReplace(content, "i)\\bshouldve\\b", "should've")
content := RegExReplace(content, "i)\\bshouldnt\\b", "shouldn't")
content := RegExReplace(content, "i)\\bthatd\\b", "that'd")
content := RegExReplace(content, "i)\\bthats\\b", "that's")
content := RegExReplace(content, "i)\\bthered\\b", "there'd")
content := RegExReplace(content, "i)\\btheres\\b", "there's")
content := RegExReplace(content, "i)\\btheyd\\b", "they'd")
content := RegExReplace(content, "i)\\btheyll\\b", "they'll")
content := RegExReplace(content, "i)\\btheyre\\b", "they're")
content := RegExReplace(content, "i)\\btheyve\\b", "they've")
content := RegExReplace(content, "i)\\bwasnt\\b", "wasn't")
content := RegExReplace(content, "i)\\bwerent\\b", "weren't")
content := RegExReplace(content, "i)\\bwhatd\\b", "what'd")
content := RegExReplace(content, "i)\\bwhatll\\b", "what'll")
content := RegExReplace(content, "i)\\bwhats\\b", "what's")
content := RegExReplace(content, "i)\\bwhatve\\b", "what've")
content := RegExReplace(content, "i)\\bwhend\\b", "when'd")
content := RegExReplace(content, "i)\\bwhens\\b", "when's")
content := RegExReplace(content, "i)\\bwhered\\b", "where'd")
content := RegExReplace(content, "i)\\bwheres\\b", "where's")
content := RegExReplace(content, "i)\\bwhereve\\b", "where've")
content := RegExReplace(content, "i)\\bwhod\\b", "who'd")
content := RegExReplace(content, "i)\\bwholl\\b", "who'll")
content := RegExReplace(content, "i) ll ", " I'll ")
; Add more replacements as needed
; Replace two or more spaces with a single space
content := RegExReplace(content, "\s{2,}", " ")
; Remove non-ASCII characters
content := RegExReplace(content, "[^\x00-\x7F]", "")
; Write the modified content to the output file
;FileDelete, D:\Program Files\Tesseract-OCR\whiteimages\doc\Book-singleLine2.txt
FileAppend, %content%, D:\Program Files\Tesseract-OCR\whiteimages\doc\Book-singleLine2 test.txt, UTF-8
MsgBox, Script completed.