From f0a6abfdebbbfd1b578d7339df1627f1afacd2be Mon Sep 17 00:00:00 2001 From: b Date: Mon, 26 Feb 2024 21:27:45 +0000 Subject: [PATCH] optimised HTM entity encoding / decoding --- botm_common.pm | 503 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 329 insertions(+), 174 deletions(-) diff --git a/botm_common.pm b/botm_common.pm index 18bde54..33572f3 100644 --- a/botm_common.pm +++ b/botm_common.pm @@ -26,7 +26,7 @@ use File::Copy; use Exporter; -our $VERSION = '1.1.2'; +our $VERSION = '1.1.3'; our @ISA = qw(Exporter); our @EXPORT = (); our @EXPORT_OK = ( @@ -879,7 +879,6 @@ use constant HTTP_STATUS => { 'unassigned' => 509, 'not_extended' => 510, 'network_authentication required' => 511, - }; sub http_header_line { @@ -1176,16 +1175,14 @@ sub read_header_env { # If $all is true then every character will be escaped sub url_encode { (my $t, my $encoding, my $all) = @_; - if ($encoding eq '') { $encoding = 'UTF-8'; } - if ($all) { - $t =~ s/(.)/url_encode_1ch($1, $encoding)/egs; + $t =~ s/./url_encode_1ch($&,$encoding)/egs; } else { - $t =~ s/([^0-9A-Za-z.~\-_])/url_encode_1ch($1, $encoding)/egs; + $t =~ s/[^0-9A-Za-z.~\-_]/url_encode_1ch($&,$encoding)/egs; } return $t; } @@ -1195,12 +1192,8 @@ sub url_encode { # $encoding is the encoding to use sub url_encode_1ch { (my $ch, my $encoding) = @_; - - if ($encoding eq '') { - $encoding = 'UTF-8'; - } $ch = encode($encoding, $ch); - $ch =~ s/(.)/sprintf('%%%02X',ord($1))/egs; + $ch =~ s/./sprintf('%%%02X',ord($&))/egs; return $ch; } @@ -1211,14 +1204,13 @@ sub url_encode_1ch { # if $plus is true then '+' will be decoded to ' '. sub url_decode { (my $t, my $encoding, my $plus) = @_; - if ($encoding eq '') { $encoding = 'utf8'; } if ($plus) { $t =~ s/\+/ /gs; } - $t =~ s/((%[0-9A-Fa-f]{2})+)/url_decode_xch($1, $encoding)/egs; + $t =~ s/(%[0-9A-Fa-f]{2})+/url_decode_xch($&,$encoding)/egs; return $t; } @@ -1228,10 +1220,6 @@ sub url_decode { sub url_decode_xch { (my $xch, my $encoding) = @_; my $y = ''; - - if ($encoding eq '') { - $encoding = 'utf8'; - } while ($xch ne '') { $y .= chr(hex(substr($xch, 1, 2))); $xch = substr($xch, 3); @@ -3801,36 +3789,306 @@ use constant HTML_ENTITY_CODE_INF => { 'yen' => [0x000A5], 'yuml' => [0x000FF] }; +# here a reverse map: from code to name, but only HTML 4.01 +# See https://www.w3.org/TR/1999/PR-html40-19990824/sgml/entities.html +use constant HTML_ENTITY_NAME => { + 0x0022 =>'quot', + 0x0026 =>'amp', + 0x003C =>'lt', + 0x003E =>'gt', + 0x00A0 =>'nbsp', + 0x00A1 =>'iexcl', + 0x00A2 =>'cent', + 0x00A3 =>'pound', + 0x00A4 =>'curren', + 0x00A5 =>'yen', + 0x00A6 =>'brvbar', + 0x00A7 =>'sect', + 0x00A8 =>'uml', + 0x00A9 =>'copy', + 0x00AA =>'ordf', + 0x00AB =>'laquo', + 0x00AC =>'not', + 0x00AD =>'shy', + 0x00AE =>'reg', + 0x00AF =>'macr', + 0x00B0 =>'deg', + 0x00B1 =>'plusmn', + 0x00B2 =>'sup2', + 0x00B3 =>'sup3', + 0x00B4 =>'acute', + 0x00B5 =>'micro', + 0x00B6 =>'para', + 0x00B7 =>'middot', + 0x00B8 =>'cedil', + 0x00B9 =>'sup1', + 0x00BA =>'ordm', + 0x00BB =>'raquo', + 0x00BC =>'frac14', + 0x00BD =>'frac12', + 0x00BE =>'frac34', + 0x00BF =>'iquest', + 0x00C0 =>'Agrave', + 0x00C1 =>'Aacute', + 0x00C2 =>'Acirc', + 0x00C3 =>'Atilde', + 0x00C4 =>'Auml', + 0x00C5 =>'Aring', + 0x00C6 =>'AElig', + 0x00C7 =>'Ccedil', + 0x00C8 =>'Egrave', + 0x00C9 =>'Eacute', + 0x00CA =>'Ecirc', + 0x00CB =>'Euml', + 0x00CC =>'Igrave', + 0x00CD =>'Iacute', + 0x00CE =>'Icirc', + 0x00CF =>'Iuml', + 0x00D0 =>'ETH', + 0x00D1 =>'Ntilde', + 0x00D2 =>'Ograve', + 0x00D3 =>'Oacute', + 0x00D4 =>'Ocirc', + 0x00D5 =>'Otilde', + 0x00D6 =>'Ouml', + 0x00D7 =>'times', + 0x00D8 =>'Oslash', + 0x00D9 =>'Ugrave', + 0x00DA =>'Uacute', + 0x00DB =>'Ucirc', + 0x00DC =>'Uuml', + 0x00DD =>'Yacute', + 0x00DE =>'THORN', + 0x00DF =>'szlig', + 0x00E0 =>'agrave', + 0x00E1 =>'aacute', + 0x00E2 =>'acirc', + 0x00E3 =>'atilde', + 0x00E4 =>'auml', + 0x00E5 =>'aring', + 0x00E6 =>'aelig', + 0x00E7 =>'ccedil', + 0x00E8 =>'egrave', + 0x00E9 =>'eacute', + 0x00EA =>'ecirc', + 0x00EB =>'euml', + 0x00EC =>'igrave', + 0x00ED =>'iacute', + 0x00EE =>'icirc', + 0x00EF =>'iuml', + 0x00F0 =>'eth', + 0x00F1 =>'ntilde', + 0x00F2 =>'ograve', + 0x00F3 =>'oacute', + 0x00F4 =>'ocirc', + 0x00F5 =>'otilde', + 0x00F6 =>'ouml', + 0x00F7 =>'divide', + 0x00F8 =>'oslash', + 0x00F9 =>'ugrave', + 0x00FA =>'uacute', + 0x00FB =>'ucirc', + 0x00FC =>'uuml', + 0x00FD =>'yacute', + 0x00FE =>'thorn', + 0x00FF =>'yuml', + 0x0152 =>'OElig', + 0x0153 =>'oelig', + 0x0160 =>'Scaron', + 0x0161 =>'scaron', + 0x0178 =>'Yuml', + 0x0192 =>'fnof', + 0x02C6 =>'circ', + 0x02DC =>'tilde', + 0x0391 =>'Alpha', + 0x0392 =>'Beta', + 0x0393 =>'Gamma', + 0x0394 =>'Delta', + 0x0395 =>'Epsilon', + 0x0396 =>'Zeta', + 0x0397 =>'Eta', + 0x0398 =>'Theta', + 0x0399 =>'Iota', + 0x039A =>'Kappa', + 0x039B =>'Lambda', + 0x039C =>'Mu', + 0x039D =>'Nu', + 0x039E =>'Xi', + 0x039F =>'Omicron', + 0x03A0 =>'Pi', + 0x03A1 =>'Rho', + 0x03A3 =>'Sigma', + 0x03A4 =>'Tau', + 0x03A5 =>'Upsilon', + 0x03A6 =>'Phi', + 0x03A7 =>'Chi', + 0x03A8 =>'Psi', + 0x03A9 =>'Omega', + 0x03B1 =>'alpha', + 0x03B2 =>'beta', + 0x03B3 =>'gamma', + 0x03B4 =>'delta', + 0x03B5 =>'epsilon', + 0x03B6 =>'zeta', + 0x03B7 =>'eta', + 0x03B8 =>'theta', + 0x03B9 =>'iota', + 0x03BA =>'kappa', + 0x03BB =>'lambda', + 0x03BC =>'mu', + 0x03BD =>'nu', + 0x03BE =>'xi', + 0x03BF =>'omicron', + 0x03C0 =>'pi', + 0x03C1 =>'rho', + 0x03C2 =>'sigmaf', + 0x03C3 =>'sigma', + 0x03C4 =>'tau', + 0x03C5 =>'upsilon', + 0x03C6 =>'phi', + 0x03C7 =>'chi', + 0x03C8 =>'psi', + 0x03C9 =>'omega', + 0x03D1 =>'thetasym', + 0x03D2 =>'upsih', + 0x03D6 =>'piv', + 0x2002 =>'ensp', + 0x2003 =>'emsp', + 0x2009 =>'thinsp', + 0x200C =>'zwnj', + 0x200D =>'zwj', + 0x200E =>'lrm', + 0x200F =>'rlm', + 0x2013 =>'ndash', + 0x2014 =>'mdash', + 0x2018 =>'lsquo', + 0x2019 =>'rsquo', + 0x201A =>'sbquo', + 0x201C =>'ldquo', + 0x201D =>'rdquo', + 0x201E =>'bdquo', + 0x2020 =>'dagger', + 0x2021 =>'Dagger', + 0x2022 =>'bull', + 0x2026 =>'hellip', + 0x2030 =>'permil', + 0x2032 =>'prime', + 0x2033 =>'Prime', + 0x2039 =>'lsaquo', + 0x203A =>'rsaquo', + 0x203E =>'oline', + 0x2044 =>'frasl', + 0x20AC =>'euro', + 0x2111 =>'image', + 0x2118 =>'weierp', + 0x211C =>'real', + 0x2122 =>'trade', + 0x2135 =>'alefsym', + 0x2190 =>'larr', + 0x2191 =>'uarr', + 0x2192 =>'rarr', + 0x2193 =>'darr', + 0x2194 =>'harr', + 0x21B5 =>'crarr', + 0x21D0 =>'lArr', + 0x21D1 =>'uArr', + 0x21D2 =>'rArr', + 0x21D3 =>'dArr', + 0x21D4 =>'hArr', + 0x2200 =>'forall', + 0x2202 =>'part', + 0x2203 =>'exist', + 0x2205 =>'empty', + 0x2207 =>'nabla', + 0x2208 =>'isin', + 0x2209 =>'notin', + 0x220B =>'ni', + 0x220F =>'prod', + 0x2211 =>'sum', + 0x2212 =>'minus', + 0x2217 =>'lowast', + 0x221A =>'radic', + 0x221D =>'prop', + 0x221E =>'infin', + 0x2220 =>'ang', + 0x2227 =>'and', + 0x2228 =>'or', + 0x2229 =>'cap', + 0x222A =>'cup', + 0x222B =>'int', + 0x2234 =>'there4', + 0x223C =>'sim', + 0x2245 =>'cong', + 0x2248 =>'asymp', + 0x2260 =>'ne', + 0x2261 =>'equiv', + 0x2264 =>'le', + 0x2265 =>'ge', + 0x2282 =>'sub', + 0x2283 =>'sup', + 0x2284 =>'nsub', + 0x2286 =>'sube', + 0x2287 =>'supe', + 0x2295 =>'oplus', + 0x2295 =>'otimes', + 0x22A5 =>'perp', + 0x22C5 =>'sdot', + 0x2308 =>'lceil', + 0x230A =>'lfloor', + 0x230B =>'rfloor', + 0x2329 =>'lang', + 0x232A =>'rang', + 0x25CA =>'loz', + 0x2660 =>'spades', + 0x2663 =>'clubs', + 0x2665 =>'hearts', + 0x2666 =>'diams' +}; + # html_entity_encode_dec() escapes a text using HTML entity references # in decimal form. -# only reserved characters will be escaped. +# reserved characters will be escaped. # $t is the text to encode # if $non_ascii is true then every non-ASCII character will be escaped # if $all is true then every character will be escaped. sub html_entity_encode_dec { (my $t, my $non_ascii, my $all) = @_; - - $t =~ s/(.)/html_entity_encode_1ch_dec($1,$non_ascii,$all)/egs; + if ($all) { + $t =~ s/./'&#'.ord($&).';'/egs; + } + elsif ($non_ascii) { + $t =~ s/[\"\&<=>\P{ASCII}]/'&#'.ord($&).';'/egs; + } + else { + $t =~ s/[\"\&<=>]/'&#'.ord($&).';'/egs; + } return $t; } # html_entity_encode_hex() escapes a text using HTML entity references # in hexadecimal form. -# only reserved characters will be escaped. +# reserved characters will be escaped. # $t is the text to encode # if $non_ascii is true then every non-ASCII character will be escaped # if $all is true then every character will be escaped. sub html_entity_encode_hex { (my $t, my $non_ascii, my $all) = @_; - - $t =~ s/(.)/html_entity_encode_1ch_hex($1,$non_ascii,$all)/egs; + if ($all) { + $t =~ s/./sprintf('&#x%X;',ord($&))/egs; + } + elsif ($non_ascii) { + $t =~ s/[\"\&<=>\P{ASCII}]/sprintf('&#x%X;',ord($&))/egs; + } + else { + $t =~ s/[\"\&<=>]/sprintf('&#x%X;',ord($&))/egs; + } return $t; } # html_entity_encode_name() escapes a text using HTML entity references # by name. -# only reserved characters will be escaped. +# reserved characters will be escaped. # $t is the text to encode # if $non_ascii is true then every non-ASCII character will be escaped # if $all is true then every character will be escaped. @@ -3839,115 +4097,45 @@ sub html_entity_encode_hex { # or if $fallback is 'hex' then character without name will be escaped # in hexadecimal form instead # or otherwise they will not be escaped -# if a character has more than 1 possible name, then -# the name will be chosen unpredicatbly. sub html_entity_encode_name { (my $t, my $non_ascii, my $all, my $fallback) = @_; - $t =~ s/(.)/html_entity_encode_1ch_name($1,$non_ascii,$all,$fallback)/egs; - return $t; -} - -# html_entity_encode_dec() escapes a single character -# as an HTML entity reference in decimal form. -# $ch is the chatacter to encode -# if $non_ascii is true then every non-ASCII character will be escaped -# if $all is true then every character will be escaped. -sub html_entity_encode_1ch_dec { - (my $ch, my $non_ascii, my $all) = @_; - - my $n = ord($ch); - if ( - $all or - ($non_ascii and ($n > 0x7F)) or - ($ch =~ /[\"\&<=>]/) - ) { - return sprintf('&#%02u;', $n); - } - else { - return $ch; + if ($all) { + $t =~ s/./html_entity_encode_1ch_name($&,$fallback)/egs; } -} - -# html_entity_encode_1ch_hex() escapes a single character -# as an HTML entity reference in hexadecimal form. -# $ch is the chatacter to encode -# if $non_ascii is true then every non-ASCII character will be escaped -# if $all is true then every character will be escaped. -sub html_entity_encode_1ch_hex { - (my $ch, my $non_ascii, my $all) = @_; - - my $n = ord($ch); - if ( - $all or - ($non_ascii and ($n > 0x7F)) or - ($ch =~ /[\"\&<=>]/) - ) { - return sprintf('&#x%02X;', $n); + elsif ($non_ascii) { + $t =~ s/[\"\&<=>\P{ASCII}]/html_entity_encode_1ch_name($&,$fallback)/egs; } else { - return $ch; + $t =~ s/[\"\&<=>]/html_entity_encode_1ch_name($&,$fallback)/egs; } + return $t; } # html_entity_encode_1ch_name() escapes a single character # as an HTML entity reference by name. -# only a reserved character will be escaped. # $ch is the chatacter to encode -# if $non_ascii is true then every non-ASCII character will be escaped -# if $all is true then every character will be escaped. # if $fallback is 'dec' then character without name will be escaped in # decimal form instead # or if $fallback is 'hex' then character without name will be escaped # in hexadecimal form instead # or otherwise they will not be escaped -# if a character has more than 1 possible name, then -# the name will be chosen unpredicatbly. sub html_entity_encode_1ch_name { - (my $ch, my $non_ascii, my $all, my $fallback) = @_; - + (my $ch, my $fallback) = @_; my $n = ord($ch); - - if ($ch =~ /[\"\&<=>]/) { - if ($ch eq '"') { - return '"'; - } - elsif ($ch eq '&') { - return '&'; - } - elsif ($ch eq '<') { - return '<'; - } - elsif ($ch eq '=') { - return '='; - } - elsif ($ch eq '>') { - return '>'; - } - else { - return $ch; - } + my $name = HTML_ENTITY_NAME->{$n}; + if ($name ne '') { + return "&$name;"; } - elsif ( - $all or - ($non_ascii and ($n > 0x7F)) - ) { - foreach my $name (keys %{+HTML_ENTITY_CODE}) { - my @code = @{HTML_ENTITY_CODE->{$name}}; - if (($code[0] == $n) and (@code == 1)) { - return "&$name;"; - } - } - - if ($fallback eq 'hex') { - return html_entity_encode_1ch_hex($ch, $non_ascii, $all); - } - elsif ($fallback eq 'dec') { - return html_entity_encode_1ch_dec($ch, $non_ascii, $all); - } + elsif ($fallback eq 'dec') { + return "&#$n;"; + } + elsif ($fallback eq 'hex') { + return sprintf('&#x%X;', $n); + } + else { + return $ch; } - - return $ch; } # html_entity_decode() decodes a text escaped by HTML entity references @@ -3956,84 +4144,51 @@ sub html_entity_encode_1ch_name { # $t is the text to decode sub html_entity_decode { (my $t) = @_; - - my $d = ''; - - while ($t ne '') { - if ($t =~ /^([^\&]+)(\&.*)$/s) { # any normal text - $d .= $1; - $t = $2; - } - elsif ($t =~ /^\&#?[A-Za-z0-9]+;/s) { # correct encoded character - $d .= html_entity_decode_1en($&); - $t = $'; - } - elsif ($t =~ /^\&[A-Za-z0-9]+/s) { # encoded character without ";" - $d .= html_entity_decode_1en($&); - $t = $'; - } - elsif ($t =~ /^\&/s) { # invalid "&" - $d .= $&; - $t = $'; - } - else { # nothing left to decode - $d .= $t; - $t = ''; - } - } - - return $d; + $t =~ s/&((#(([0-9]+)|(x[0-9A-Fa-f]+));)|([A-Za-z0-9]+;?))/html_entity_decode_1en($&)/egs; + return $t; } # html_entity_decode_1en() decodes a single HTML entity reference # in any of the available forms. # an invalid entity reference will be left unchanged. -# $en is the text to decode - it is assumed without checking -# that the text is indeed in this format. +# $en is the text to decode sub html_entity_decode_1en { (my $en) = @_; - my $y = ''; - - if ($en !~ /;$/) { # name without ";" - my $n = substr($en, 1); - # we HAVE TO iterate as we don't know where name terminates :/ - # why they thought this is a good idea ? - foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) { - if (index($n, $name) == 0) { # name (beginning of entire string) is valid: - # decode - my @code = @{HTML_ENTITY_CODE_INF->{$name}}; - foreach my $c (@code) { - $y .= chr($c); - } - # rest of the string, insert literally - $y .= substr($n, length($name)); - return $y; - } - } - # invalid name, insert literally - return $en; - } - elsif ($en =~ /^\&#([0-9]+);$/) { # decimal + if ($en =~ /^&#([0-9]+);$/) { return chr($1); } - elsif ($en =~ /^\&#x([0-9A-Fa-f]+);$/) { # hexadecimal + elsif ($en =~ /^&#x([0-9A-Fa-f]+);$/) { return chr(hex($1)); } - else { # name - my $name = substr($en, 1, length($en) - 2); - if (defined HTML_ENTITY_CODE->{$name}) { - # valid name, decode - my @code = @{HTML_ENTITY_CODE->{$name}}; - foreach my $c (@code) { - $y .= chr($c); + elsif ($en =~ /^&([A-Za-z0-9]+)(;?)$/) { + my $y = ''; + if ($2 ne '') { + if (defined HTML_ENTITY_CODE->{$1}) { + my @code = @{HTML_ENTITY_CODE->{$1}}; + foreach my $c (@code) { + $y .= chr($c); + } + return $y; } - return $y; } else { - # invalid name, insert literally - return $en; + # we HAVE TO iterate as we don't know where name terminates :/ + # why they thought this is a good idea ? + foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) { + if (index($1, $name) == 0) { # name (beginning of entire string) is valid: + # decode + my @code = @{HTML_ENTITY_CODE_INF->{$name}}; + foreach my $c (@code) { + $y .= chr($c); + } + # rest of the string, insert literally + $y .= substr($1, length($name)); + return $y; + } + } } } + return $en; } 1 -- 2.30.2