Page 1 of 1

HTML Decoder

Posted: 04 Nov 2018, 11:53
by Freire
If you want to encode string to HTML: https://autohotkey.com/docs/commands/Transform.htm#HTML

HTML Decoder

This function translates HTML encoded string back to the original string.
It does have all listed HTML4 named Entities. IMO it still does a very good cover in general (Talking with HTML5 in mind).
Most of the new conventions aren't used yet. Entities References: https://en.wikipedia.org/wiki/List_of_X ... es_in_HTML

It supports Decimal encode, Hexadecimal encode and Named encode: Eg: "&" "&" "&" Are all equal to: "&"

Thanks to @CerpinTaxt[Discord] for some help with visual arrangement.

Code: Select all

htmlDecode(str) {
  html_array_1 := {"exclamation":33,"quot":34,"percent":37,"amp":38,"apos":39,"add":43,"lt":60,"equal":61,"gt":62,"nbsp":160,"iexcl":161,"cent":162,"pound":163,"curren":164,"yen":165,"brvbar":166,"sect":167,"uml":168,"copy":169,"ordf":170,"laquo":171,"not":172,"shy":173,"reg":174,"macr":175,"deg":176,"plusmn":177,"sup2":178,"sup3":179,"acute":180,"micro":181,"para":182,"middot":183,"cedil":184,"sup1":185,"ordm":186,"raquo":187,"frac14":188,"frac12":189,"frac34":190,"iquest":191,"Agrave":192,"Aacute":193,"Acirc":194,"Atilde":195,"Auml":196,"Aring":197,"AElig":198,"Ccedil":199,"Egrave":200,"Eacute":201,"Ecirc":202,"Euml":203,"Igrave":204,"Iacute":205,"Icirc":206,"Iuml":207,"ETH":208,"Ntilde":209,"Ograve":210,"Oacute":211,"Ocirc":212,"Otilde":213,"Ouml":214,"times":215,"Oslash":216,"Ugrave":217,"Uacute":218,"Ucirc":219}
  html_array_2 := {"Uuml":220,"Yacute":221,"THORN":222,"szlig":223,"agrave":224,"aacute":225,"acirc":226,"atilde":227,"auml":228,"aring":229,"aelig":230,"ccedil":231,"egrave":232,"eacute":233,"ecirc":234,"euml":235,"igrave":236,"iacute":237,"icirc":238,"iuml":239,"eth":240,"ntilde":241,"ograve":242,"oacute":243,"ocirc":244,"otilde":245,"ouml":246,"divide":247,"oslash":248,"ugrave":249,"uacute":250,"ucirc":251,"uuml":252,"yacute":253,"thorn":254,"yuml":255,"OElig":338,"oelig":339,"Scaron":352,"scaron":353,"Yuml":376,"fnof":402,"circ":710,"tilde":732,"Alpha":913,"Beta":914,"Gamma":915,"Delta":916,"Epsilon":917,"Zeta":918,"Eta":919,"Theta":920,"Iota":921,"Kappa":922,"Lambda":923,"Mu":924,"Nu":925,"Xi":926,"Omicron":927,"Pi":928,"Rho":929,"Sigma":931}
  html_array_3 := {"Tau":932,"Upsilon":933,"Phi":934,"Chi":935,"Psi":936,"Omega":937,"alpha":945,"beta":946,"gamma":947,"delta":948,"epsilon":949,"zeta":950,"eta":951,"theta":952,"iota":953,"kappa":954,"lambda":955,"mu":956,"nu":957,"xi":958,"omicron":959,"pi":960,"rho":961,"sigmaf":962,"sigma":963,"tau":964,"upsilon":965,"phi":966,"chi":967,"psi":968,"omega":969,"thetasym":977,"upsih":978,"piv":982,"ensp":8194,"emsp":8195,"thinsp":8201,"zwnj":8204,"zwj":8205,"lrm":8206,"rlm":8207,"ndash":8211,"mdash":8212,"horbar":8213,"lsquo":8216,"rsquo":8217,"sbquo":8218,"ldquo":8220,"rdquo":8221,"bdquo":8222,"dagger":8224,"Dagger":8225,"bull":8226,"hellip":8230,"permil":8240,"prime":8242,"Prime":8243,"lsaquo":8249,"rsaquo":8250,"oline":8254,"frasl":8260,"euro":8364,"image":8465,"weierp":8472,"real":8476,"trade":8482,"alefsym":8501,"larr":8592,"uarr":8593,"rarr":8594,"darr":8595,"harr":8596,"crarr":8629,"lArr":8656,"uArr":8657,"rArr":8658,"dArr":8659,"hArr":8660,"forall":8704,"part":8706,"exist":8707,"empty":8709,"nabla":8711,"isin":8712,"notin":8713,"ni":8715,"prod":8719,"sum":8721,"minus":8722,"lowast":8727,"radic":8730,"prop":8733,"infin":8734,"ang":8736,"and":8743,"or":8744,"cap":8745,"cup":8746,"int":8747,"there4":8756,"sim":8764,"cong":8773,"asymp":8776,"ne":8800,"equiv":8801,"le":8804,"ge":8805,"sub":8834,"sup":8835,"nsub":8836,"sube":8838,"supe":8839,"oplus":8853,"otimes":8855,"perp":8869,"sdot":8901,"lceil":8968,"rceil":8969,"lfloor":8970,"rfloor":8971,"lang":9001,"rang":9002,"loz":9674,"spades":9824,"clubs":9827,"hearts":9829,"diams":9830}

  While RegExMatch(str, "(?<=&#)\d+(?=;)", char)
    StringReplace, str, str, % "&#" char ";" , % Chr(char), All
  While RegExMatch(str, "i)(?<=&#x)[a-f0-9]+(?=;)", char)
    StringReplace, str, str, % "&#x" char ";" , % Chr("0x"char), All
  While RegExMatch(str, "(?<=&)\w+(?=;)", char) {
    if (html_array_1[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_1[char]), All
    } Else If (html_array_2[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_2[char]), All
    } Else If (html_array_3[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_3[char]), All
    } Else {
      MsgBox, % "ERROR:`nFound an Unkown Named Entity: &" char ";`nIt will be ignored."
      StringReplace, str, str, % "&" char ";" , % "", All
    }
  }
  Return str
}
Validation code example:

Code: Select all

encoded_str=
(`%
<div class="header-wrap--home  js-header-wrap">
<div class="header--aside js-header-aside"><a class="header__button--menu  js-side-menu-open" href="#">%&#168;$#&%&@*#(!)(@#(<>:ASD?</a><div class="header--aside__item showcase header__label"><span class="header__clickable js-hl-button" data-type="showcase">
)
decoded_str=
(`%
<div class="header-wrap--home  js-header-wrap">
<div class="header--aside js-header-aside"><a class="header__button--menu  js-side-menu-open" href="#">%¨$#&%&@*#(!)(@#(<>:ASD?</a><div class="header--aside__item showcase header__label"><span class="header__clickable js-hl-button" data-type="showcase">
)

msgbox, % htmlDecode(encoded_str)
msgbox, % htmlDecode(encoded_str)=decoded_str?"Working":"Not Working"

Return

htmlDecode(str) {
  html_array_1 := {"exclamation":33,"quot":34,"percent":37,"amp":38,"apos":39,"add":43,"lt":60,"equal":61,"gt":62,"nbsp":160,"iexcl":161,"cent":162,"pound":163,"curren":164,"yen":165,"brvbar":166,"sect":167,"uml":168,"copy":169,"ordf":170,"laquo":171,"not":172,"shy":173,"reg":174,"macr":175,"deg":176,"plusmn":177,"sup2":178,"sup3":179,"acute":180,"micro":181,"para":182,"middot":183,"cedil":184,"sup1":185,"ordm":186,"raquo":187,"frac14":188,"frac12":189,"frac34":190,"iquest":191,"Agrave":192,"Aacute":193,"Acirc":194,"Atilde":195,"Auml":196,"Aring":197,"AElig":198,"Ccedil":199,"Egrave":200,"Eacute":201,"Ecirc":202,"Euml":203,"Igrave":204,"Iacute":205,"Icirc":206,"Iuml":207,"ETH":208,"Ntilde":209,"Ograve":210,"Oacute":211,"Ocirc":212,"Otilde":213,"Ouml":214,"times":215,"Oslash":216,"Ugrave":217,"Uacute":218,"Ucirc":219}
  html_array_2 := {"Uuml":220,"Yacute":221,"THORN":222,"szlig":223,"agrave":224,"aacute":225,"acirc":226,"atilde":227,"auml":228,"aring":229,"aelig":230,"ccedil":231,"egrave":232,"eacute":233,"ecirc":234,"euml":235,"igrave":236,"iacute":237,"icirc":238,"iuml":239,"eth":240,"ntilde":241,"ograve":242,"oacute":243,"ocirc":244,"otilde":245,"ouml":246,"divide":247,"oslash":248,"ugrave":249,"uacute":250,"ucirc":251,"uuml":252,"yacute":253,"thorn":254,"yuml":255,"OElig":338,"oelig":339,"Scaron":352,"scaron":353,"Yuml":376,"fnof":402,"circ":710,"tilde":732,"Alpha":913,"Beta":914,"Gamma":915,"Delta":916,"Epsilon":917,"Zeta":918,"Eta":919,"Theta":920,"Iota":921,"Kappa":922,"Lambda":923,"Mu":924,"Nu":925,"Xi":926,"Omicron":927,"Pi":928,"Rho":929,"Sigma":931}
  html_array_3 := {"Tau":932,"Upsilon":933,"Phi":934,"Chi":935,"Psi":936,"Omega":937,"alpha":945,"beta":946,"gamma":947,"delta":948,"epsilon":949,"zeta":950,"eta":951,"theta":952,"iota":953,"kappa":954,"lambda":955,"mu":956,"nu":957,"xi":958,"omicron":959,"pi":960,"rho":961,"sigmaf":962,"sigma":963,"tau":964,"upsilon":965,"phi":966,"chi":967,"psi":968,"omega":969,"thetasym":977,"upsih":978,"piv":982,"ensp":8194,"emsp":8195,"thinsp":8201,"zwnj":8204,"zwj":8205,"lrm":8206,"rlm":8207,"ndash":8211,"mdash":8212,"horbar":8213,"lsquo":8216,"rsquo":8217,"sbquo":8218,"ldquo":8220,"rdquo":8221,"bdquo":8222,"dagger":8224,"Dagger":8225,"bull":8226,"hellip":8230,"permil":8240,"prime":8242,"Prime":8243,"lsaquo":8249,"rsaquo":8250,"oline":8254,"frasl":8260,"euro":8364,"image":8465,"weierp":8472,"real":8476,"trade":8482,"alefsym":8501,"larr":8592,"uarr":8593,"rarr":8594,"darr":8595,"harr":8596,"crarr":8629,"lArr":8656,"uArr":8657,"rArr":8658,"dArr":8659,"hArr":8660,"forall":8704,"part":8706,"exist":8707,"empty":8709,"nabla":8711,"isin":8712,"notin":8713,"ni":8715,"prod":8719,"sum":8721,"minus":8722,"lowast":8727,"radic":8730,"prop":8733,"infin":8734,"ang":8736,"and":8743,"or":8744,"cap":8745,"cup":8746,"int":8747,"there4":8756,"sim":8764,"cong":8773,"asymp":8776,"ne":8800,"equiv":8801,"le":8804,"ge":8805,"sub":8834,"sup":8835,"nsub":8836,"sube":8838,"supe":8839,"oplus":8853,"otimes":8855,"perp":8869,"sdot":8901,"lceil":8968,"rceil":8969,"lfloor":8970,"rfloor":8971,"lang":9001,"rang":9002,"loz":9674,"spades":9824,"clubs":9827,"hearts":9829,"diams":9830}

  While RegExMatch(str, "(?<=&#)\d+(?=;)", char)
    StringReplace, str, str, % "&#" char ";" , % Chr(char), All
  While RegExMatch(str, "i)(?<=&#x)[a-f0-9]+(?=;)", char)
    StringReplace, str, str, % "&#x" char ";" , % Chr("0x"char), All
  While RegExMatch(str, "(?<=&)\w+(?=;)", char) {
    if (html_array_1[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_1[char]), All
    } Else If (html_array_2[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_2[char]), All
    } Else If (html_array_3[char]){
      StringReplace, str, str, % "&" char ";" , % Chr(html_array_3[char]), All
    } Else {
      MsgBox, % "ERROR:`nFound an Unkown Named Entity: &" char ";`nIt will be ignored."
      StringReplace, str, str, % "&" char ";" , % "", All
    }
  }
  Return str
}

Re: HTML Decoder

Posted: 05 Nov 2018, 09:11
by burque505
Thank you Freire, it works well for me so far.
I changed

Code: Select all

msgbox, % htmlDecode(encoded_str)
msgbox, % htmlDecode(encoded_str)=str_confirm?"Working":"Not Working"
to

Code: Select all

msgbox, % htmlDecode(encoded_str)
msgbox, % htmlDecode(encoded_str)=decoded_str?"Working":"Not Working"
in the validation example to match the decoded string rather than one that wasn't there :)
Regards,
burque505

Re: HTML Decoder

Posted: 05 Nov 2018, 16:06
by Freire
Thanks, I've just edited. Now it's the correct variable.

Re: HTML Decoder

Posted: 18 Nov 2018, 08:01
by SKAN
I've been planning to upgrade my old function to support HTML 4.
I wouldn't use associative array(s) to resolve named entities.. Associative array keys aren't case sensitive.
In your function (for eg.) both Aacute (193) and aacute (225) resolves to Aacute (193) only.

Same problem with StringReplace command ( deprecated. Use StrReplace() ).
It will replace all Aacute and aacute in one shot unless StringCaseSense is on/1.
RegExReplace() would be considerably slower, I guess.

Then there is the problem of uppercase entities (for eg. &GT;) which needs to resolved as case insensitive match.

Re: HTML Decoder

Posted: 18 Nov 2018, 20:08
by kczx3
SKAN, I think you mean HTML 5

Re: HTML Decoder

Posted: 18 Nov 2018, 21:09
by SKAN
kczx3 wrote:
18 Nov 2018, 20:08
SKAN, I think you mean HTML 5
Why? You need it? :)
I did mean HTML 4. Its simply a matter of replacing the HTML 4 lookup table with HTML 5.
It only takes 3 lines for me to resolve HTML entities but here follows the 26KB HTML 5 lookup table that won't even init in one Static variable.

Spoiler

Re: HTML Decoder

Posted: 18 Nov 2018, 21:13
by kczx3
I guess my point was that no one should still be writing HTML 4.

Re: HTML Decoder

Posted: 18 Nov 2018, 21:27
by SKAN
I guess my point was that no one should still be writing HTML 4.
Ah, I see... Point noted. I rarely write HTML... :)

Re: HTML Decoder

Posted: 24 Nov 2018, 00:16
by Freire
Thanks for the feedback. I'll be implementing the changes.
I haven't implemented the HTML5 because I haven't seen a need for that, and as SKAN mentioned: The table is too big.
Even with the much smaller HTML4 it already doesn't fit into one variable alone.
Skan, if you update your function, post link here I'll be glad to use it.

Re: HTML Decoder

Posted: 24 Nov 2018, 07:43
by SKAN
Skan, if you update your function, post link here I'll be glad to use it.
Sure. It will take a while though. I am writing it using InStr() and SubStr(), but the problem is
StringReplace() has to be called between StringCaseSense, On and StringCaseSense, Off
I can't use StringCaseSense, On at the top of function as it overrides the case sensitivity parameter of InStr().

If you're interested, I can post here a (slower) RegEx powered version which you can adapt to your needs.

:)