optimised HTM entity encoding / decoding

author b <rowerynaksiezycu@gmail.com>

Mon, 26 Feb 2024 21:27:45 +0000 (21:27 +0000)

committer b <rowerynaksiezycu@gmail.com>

Mon, 26 Feb 2024 21:27:45 +0000 (21:27 +0000)
author b <rowerynaksiezycu@gmail.com>
Mon, 26 Feb 2024 21:27:45 +0000 (21:27 +0000)
committer b <rowerynaksiezycu@gmail.com>
Mon, 26 Feb 2024 21:27:45 +0000 (21:27 +0000)
diff --git a/botm_common.pm b/botm_common.pm

index 18bde54359dbb3acfd59bc0d97d0f9aca8b00f06..33572f33f9ad63e0646f3d601fc487170e5f5b47 100644 (file)
--- a/botm_common.pm
+++ b/botm_common.pm
@@ -26,7 +26,7 @@ use File::Copy;
  
  use Exporter;
  
-our $VERSION     = '1.1.2';
+our $VERSION     = '1.1.3';
  our @ISA         = qw(Exporter);
  our @EXPORT      = ();
  our @EXPORT_OK   = (
@@ -879,7 +879,6 @@ use constant HTTP_STATUS => {
         'unassigned'                      => 509,
         'not_extended'                    => 510,
         'network_authentication required' => 511,
-
  };
  
  sub http_header_line {
@@ -1176,16 +1175,14 @@ sub read_header_env {
  # If $all is true then every character will be escaped
  sub url_encode {
         (my $t, my $encoding, my $all) = @_;
-       
         if ($encoding eq '') {
                 $encoding = 'UTF-8';
         }
-
         if ($all) {
-               $t =~ s/(.)/url_encode_1ch($1, $encoding)/egs;
+               $t =~ s/./url_encode_1ch($&,$encoding)/egs;
         }
         else {
-               $t =~ s/([^0-9A-Za-z.~\-_])/url_encode_1ch($1, $encoding)/egs;
+               $t =~ s/[^0-9A-Za-z.~\-_]/url_encode_1ch($&,$encoding)/egs;
         }
         return $t;
  }
@@ -1195,12 +1192,8 @@ sub url_encode {
  # $encoding is the encoding to use
  sub url_encode_1ch {
         (my $ch, my $encoding) = @_;
-       
-       if ($encoding eq '') {
-               $encoding = 'UTF-8';
-       }
         $ch = encode($encoding, $ch);
-       $ch =~ s/(.)/sprintf('%%%02X',ord($1))/egs;
+       $ch =~ s/./sprintf('%%%02X',ord($&))/egs;
         return $ch;
  }
  
@@ -1211,14 +1204,13 @@ sub url_encode_1ch {
  # if $plus is true then '+' will be decoded to ' '.
  sub url_decode {
         (my $t, my $encoding, my $plus) = @_;
-       
         if ($encoding eq '') {
                 $encoding = 'utf8';
         }
         if ($plus) {
                 $t =~ s/\+/ /gs;
         }
-       $t =~ s/((%[0-9A-Fa-f]{2})+)/url_decode_xch($1, $encoding)/egs;
+       $t =~ s/(%[0-9A-Fa-f]{2})+/url_decode_xch($&,$encoding)/egs;
         return $t;
  }
  
@@ -1228,10 +1220,6 @@ sub url_decode {
  sub url_decode_xch {
         (my $xch, my $encoding) = @_;
         my $y = '';
-       
-       if ($encoding eq '') {
-               $encoding = 'utf8';
-       }
         while ($xch ne '') {
                 $y .= chr(hex(substr($xch, 1, 2)));
                 $xch = substr($xch, 3);
@@ -3801,36 +3789,306 @@ use constant HTML_ENTITY_CODE_INF => {
         'yen'                            => [0x000A5],
         'yuml'                           => [0x000FF]
  };
+# here a reverse map: from code to name, but only HTML 4.01
+# See https://www.w3.org/TR/1999/PR-html40-19990824/sgml/entities.html
+use constant HTML_ENTITY_NAME => {
+       0x0022 =>'quot',
+       0x0026 =>'amp',
+       0x003C =>'lt',
+       0x003E =>'gt',
+       0x00A0 =>'nbsp',
+       0x00A1 =>'iexcl',
+       0x00A2 =>'cent',
+       0x00A3 =>'pound',
+       0x00A4 =>'curren',
+       0x00A5 =>'yen',
+       0x00A6 =>'brvbar',
+       0x00A7 =>'sect',
+       0x00A8 =>'uml',
+       0x00A9 =>'copy',
+       0x00AA =>'ordf',
+       0x00AB =>'laquo',
+       0x00AC =>'not',
+       0x00AD =>'shy',
+       0x00AE =>'reg',
+       0x00AF =>'macr',
+       0x00B0 =>'deg',
+       0x00B1 =>'plusmn',
+       0x00B2 =>'sup2',
+       0x00B3 =>'sup3',
+       0x00B4 =>'acute',
+       0x00B5 =>'micro',
+       0x00B6 =>'para',
+       0x00B7 =>'middot',
+       0x00B8 =>'cedil',
+       0x00B9 =>'sup1',
+       0x00BA =>'ordm',
+       0x00BB =>'raquo',
+       0x00BC =>'frac14',
+       0x00BD =>'frac12',
+       0x00BE =>'frac34',
+       0x00BF =>'iquest',
+       0x00C0 =>'Agrave',
+       0x00C1 =>'Aacute',
+       0x00C2 =>'Acirc',
+       0x00C3 =>'Atilde',
+       0x00C4 =>'Auml',
+       0x00C5 =>'Aring',
+       0x00C6 =>'AElig',
+       0x00C7 =>'Ccedil',
+       0x00C8 =>'Egrave',
+       0x00C9 =>'Eacute',
+       0x00CA =>'Ecirc',
+       0x00CB =>'Euml',
+       0x00CC =>'Igrave',
+       0x00CD =>'Iacute',
+       0x00CE =>'Icirc',
+       0x00CF =>'Iuml',
+       0x00D0 =>'ETH',
+       0x00D1 =>'Ntilde',
+       0x00D2 =>'Ograve',
+       0x00D3 =>'Oacute',
+       0x00D4 =>'Ocirc',
+       0x00D5 =>'Otilde',
+       0x00D6 =>'Ouml',
+       0x00D7 =>'times',
+       0x00D8 =>'Oslash',
+       0x00D9 =>'Ugrave',
+       0x00DA =>'Uacute',
+       0x00DB =>'Ucirc',
+       0x00DC =>'Uuml',
+       0x00DD =>'Yacute',
+       0x00DE =>'THORN',
+       0x00DF =>'szlig',
+       0x00E0 =>'agrave',
+       0x00E1 =>'aacute',
+       0x00E2 =>'acirc',
+       0x00E3 =>'atilde',
+       0x00E4 =>'auml',
+       0x00E5 =>'aring',
+       0x00E6 =>'aelig',
+       0x00E7 =>'ccedil',
+       0x00E8 =>'egrave',
+       0x00E9 =>'eacute',
+       0x00EA =>'ecirc',
+       0x00EB =>'euml',
+       0x00EC =>'igrave',
+       0x00ED =>'iacute',
+       0x00EE =>'icirc',
+       0x00EF =>'iuml',
+       0x00F0 =>'eth',
+       0x00F1 =>'ntilde',
+       0x00F2 =>'ograve',
+       0x00F3 =>'oacute',
+       0x00F4 =>'ocirc',
+       0x00F5 =>'otilde',
+       0x00F6 =>'ouml',
+       0x00F7 =>'divide',
+       0x00F8 =>'oslash',
+       0x00F9 =>'ugrave',
+       0x00FA =>'uacute',
+       0x00FB =>'ucirc',
+       0x00FC =>'uuml',
+       0x00FD =>'yacute',
+       0x00FE =>'thorn',
+       0x00FF =>'yuml',
+       0x0152 =>'OElig',
+       0x0153 =>'oelig',
+       0x0160 =>'Scaron',
+       0x0161 =>'scaron',
+       0x0178 =>'Yuml',
+       0x0192 =>'fnof',
+       0x02C6 =>'circ',
+       0x02DC =>'tilde',
+       0x0391 =>'Alpha',
+       0x0392 =>'Beta',
+       0x0393 =>'Gamma',
+       0x0394 =>'Delta',
+       0x0395 =>'Epsilon',
+       0x0396 =>'Zeta',
+       0x0397 =>'Eta',
+       0x0398 =>'Theta',
+       0x0399 =>'Iota',
+       0x039A =>'Kappa',
+       0x039B =>'Lambda',
+       0x039C =>'Mu',
+       0x039D =>'Nu',
+       0x039E =>'Xi',
+       0x039F =>'Omicron',
+       0x03A0 =>'Pi',
+       0x03A1 =>'Rho',
+       0x03A3 =>'Sigma',
+       0x03A4 =>'Tau',
+       0x03A5 =>'Upsilon',
+       0x03A6 =>'Phi',
+       0x03A7 =>'Chi',
+       0x03A8 =>'Psi',
+       0x03A9 =>'Omega',
+       0x03B1 =>'alpha',
+       0x03B2 =>'beta',
+       0x03B3 =>'gamma',
+       0x03B4 =>'delta',
+       0x03B5 =>'epsilon',
+       0x03B6 =>'zeta',
+       0x03B7 =>'eta',
+       0x03B8 =>'theta',
+       0x03B9 =>'iota',
+       0x03BA =>'kappa',
+       0x03BB =>'lambda',
+       0x03BC =>'mu',
+       0x03BD =>'nu',
+       0x03BE =>'xi',
+       0x03BF =>'omicron',
+       0x03C0 =>'pi',
+       0x03C1 =>'rho',
+       0x03C2 =>'sigmaf',
+       0x03C3 =>'sigma',
+       0x03C4 =>'tau',
+       0x03C5 =>'upsilon',
+       0x03C6 =>'phi',
+       0x03C7 =>'chi',
+       0x03C8 =>'psi',
+       0x03C9 =>'omega',
+       0x03D1 =>'thetasym',
+       0x03D2 =>'upsih',
+       0x03D6 =>'piv',
+       0x2002 =>'ensp',
+       0x2003 =>'emsp',
+       0x2009 =>'thinsp',
+       0x200C =>'zwnj',
+       0x200D =>'zwj',
+       0x200E =>'lrm',
+       0x200F =>'rlm',
+       0x2013 =>'ndash',
+       0x2014 =>'mdash',
+       0x2018 =>'lsquo',
+       0x2019 =>'rsquo',
+       0x201A =>'sbquo',
+       0x201C =>'ldquo',
+       0x201D =>'rdquo',
+       0x201E =>'bdquo',
+       0x2020 =>'dagger',
+       0x2021 =>'Dagger',
+       0x2022 =>'bull',
+       0x2026 =>'hellip',
+       0x2030 =>'permil',
+       0x2032 =>'prime',
+       0x2033 =>'Prime',
+       0x2039 =>'lsaquo',
+       0x203A =>'rsaquo',
+       0x203E =>'oline',
+       0x2044 =>'frasl',
+       0x20AC =>'euro',
+       0x2111 =>'image',
+       0x2118 =>'weierp',
+       0x211C =>'real',
+       0x2122 =>'trade',
+       0x2135 =>'alefsym',
+       0x2190 =>'larr',
+       0x2191 =>'uarr',
+       0x2192 =>'rarr',
+       0x2193 =>'darr',
+       0x2194 =>'harr',
+       0x21B5 =>'crarr',
+       0x21D0 =>'lArr',
+       0x21D1 =>'uArr',
+       0x21D2 =>'rArr',
+       0x21D3 =>'dArr',
+       0x21D4 =>'hArr',
+       0x2200 =>'forall',
+       0x2202 =>'part',
+       0x2203 =>'exist',
+       0x2205 =>'empty',
+       0x2207 =>'nabla',
+       0x2208 =>'isin',
+       0x2209 =>'notin',
+       0x220B =>'ni',
+       0x220F =>'prod',
+       0x2211 =>'sum',
+       0x2212 =>'minus',
+       0x2217 =>'lowast',
+       0x221A =>'radic',
+       0x221D =>'prop',
+       0x221E =>'infin',
+       0x2220 =>'ang',
+       0x2227 =>'and',
+       0x2228 =>'or',
+       0x2229 =>'cap',
+       0x222A =>'cup',
+       0x222B =>'int',
+       0x2234 =>'there4',
+       0x223C =>'sim',
+       0x2245 =>'cong',
+       0x2248 =>'asymp',
+       0x2260 =>'ne',
+       0x2261 =>'equiv',
+       0x2264 =>'le',
+       0x2265 =>'ge',
+       0x2282 =>'sub',
+       0x2283 =>'sup',
+       0x2284 =>'nsub',
+       0x2286 =>'sube',
+       0x2287 =>'supe',
+       0x2295 =>'oplus',
+       0x2295 =>'otimes',
+       0x22A5 =>'perp',
+       0x22C5 =>'sdot',
+       0x2308 =>'lceil',
+       0x230A =>'lfloor',
+       0x230B =>'rfloor',
+       0x2329 =>'lang',
+       0x232A =>'rang',
+       0x25CA =>'loz',
+       0x2660 =>'spades',
+       0x2663 =>'clubs',
+       0x2665 =>'hearts',
+       0x2666 =>'diams'
+};
+
  
  # html_entity_encode_dec() escapes a text using HTML entity references
  # in decimal form.
-# only reserved characters will be escaped.
+# reserved characters will be escaped.
  # $t is the text to encode
  # if $non_ascii is true then every non-ASCII character will be escaped
  # if $all is true then every character will be escaped.
  sub html_entity_encode_dec {
         (my $t, my $non_ascii, my $all) = @_;
-       
-       $t =~ s/(.)/html_entity_encode_1ch_dec($1,$non_ascii,$all)/egs;
+       if ($all) {
+               $t =~ s/./'&#'.ord($&).';'/egs;
+       }
+       elsif ($non_ascii) {
+               $t =~ s/[\"\&<=>\P{ASCII}]/'&#'.ord($&).';'/egs;
+       }
+       else {
+               $t =~ s/[\"\&<=>]/'&#'.ord($&).';'/egs;
+       }
         return $t;
  }
  
  # html_entity_encode_hex() escapes a text using HTML entity references
  # in hexadecimal form.
-# only reserved characters will be escaped.
+# reserved characters will be escaped.
  # $t is the text to encode
  # if $non_ascii is true then every non-ASCII character will be escaped
  # if $all is true then every character will be escaped.
  sub html_entity_encode_hex {
         (my $t, my $non_ascii, my $all) = @_;
-       
-       $t =~ s/(.)/html_entity_encode_1ch_hex($1,$non_ascii,$all)/egs;
+       if ($all) {
+               $t =~ s/./sprintf('&#x%X;',ord($&))/egs;
+       }
+       elsif ($non_ascii) {
+               $t =~ s/[\"\&<=>\P{ASCII}]/sprintf('&#x%X;',ord($&))/egs;
+       }
+       else {
+               $t =~ s/[\"\&<=>]/sprintf('&#x%X;',ord($&))/egs;
+       }
         return $t;
  }
  
  # html_entity_encode_name() escapes a text using HTML entity references
  # by name.
-# only reserved characters will be escaped.
+# reserved characters will be escaped.
  # $t is the text to encode
  # if $non_ascii is true then every non-ASCII character will be escaped
  # if $all is true then every character will be escaped.
@@ -3839,115 +4097,45 @@ sub html_entity_encode_hex {
  # or if $fallback is 'hex' then character without name will be escaped
  # in hexadecimal form instead
  # or otherwise they will not be escaped
-# if a character has more than 1 possible name, then
-# the name will be chosen unpredicatbly.
  sub html_entity_encode_name {
         (my $t, my $non_ascii, my $all, my $fallback) = @_;
         
-       $t =~ s/(.)/html_entity_encode_1ch_name($1,$non_ascii,$all,$fallback)/egs;
-       return $t;
-}
-
-# html_entity_encode_dec() escapes a single character
-# as an HTML entity reference in decimal form.
-# $ch is the chatacter to encode
-# if $non_ascii is true then every non-ASCII character will be escaped
-# if $all is true then every character will be escaped.
-sub html_entity_encode_1ch_dec {
-       (my $ch, my $non_ascii, my $all) = @_;
-       
-       my $n = ord($ch);
-       if (
-               $all or
-               ($non_ascii and ($n > 0x7F)) or
-               ($ch =~ /[\"\&<=>]/)
-       ) {
-               return sprintf('&#%02u;', $n);
-       }
-       else {
-               return $ch;
+       if ($all) {
+               $t =~ s/./html_entity_encode_1ch_name($&,$fallback)/egs;
         }
-}
-
-# html_entity_encode_1ch_hex() escapes a single character
-# as an HTML entity reference in hexadecimal form.
-# $ch is the chatacter to encode
-# if $non_ascii is true then every non-ASCII character will be escaped
-# if $all is true then every character will be escaped.
-sub html_entity_encode_1ch_hex {
-       (my $ch, my $non_ascii, my $all) = @_;
-       
-       my $n = ord($ch);
-       if (
-               $all or
-               ($non_ascii and ($n > 0x7F)) or
-               ($ch =~ /[\"\&<=>]/)
-       ) {
-               return sprintf('&#x%02X;', $n);
+       elsif ($non_ascii) {
+               $t =~ s/[\"\&<=>\P{ASCII}]/html_entity_encode_1ch_name($&,$fallback)/egs;
         }
         else {
-               return $ch;
+               $t =~ s/[\"\&<=>]/html_entity_encode_1ch_name($&,$fallback)/egs;
         }
+       return $t;
  }
  
  # html_entity_encode_1ch_name() escapes a single character
  # as an HTML entity reference by name.
-# only a reserved character will be escaped.
  # $ch is the chatacter to encode
-# if $non_ascii is true then every non-ASCII character will be escaped
-# if $all is true then every character will be escaped.
  # if $fallback is 'dec' then character without name will be escaped in
  # decimal form instead
  # or if $fallback is 'hex' then character without name will be escaped
  # in hexadecimal form instead
  # or otherwise they will not be escaped
-# if a character has more than 1 possible name, then
-# the name will be chosen unpredicatbly.
  sub html_entity_encode_1ch_name {
-       (my $ch, my $non_ascii, my $all, my $fallback) = @_;
-       
+       (my $ch, my $fallback) = @_;
         my $n = ord($ch);
-       
-       if ($ch =~ /[\"\&<=>]/) {
-               if ($ch eq '"') {
-                       return '&quot;';
-               }
-               elsif ($ch eq '&') {
-                       return '&amp;';
-               }
-               elsif ($ch eq '<') {
-                       return '&lt;';
-               }
-               elsif ($ch eq '=') {
-                       return '&equals;';
-               }
-               elsif ($ch eq '>') {
-                       return '&gt;';
-               }
-               else {
-                       return $ch;
-               }
+       my $name = HTML_ENTITY_NAME->{$n};
+       if ($name ne '') {
+               return "&$name;";
         }
-       elsif (
-               $all or
-               ($non_ascii and ($n > 0x7F))
-       ) {
-               foreach my $name (keys %{+HTML_ENTITY_CODE}) {
-                       my @code = @{HTML_ENTITY_CODE->{$name}};
-                       if (($code[0] == $n) and (@code == 1)) {
-                               return "&$name;";
-                       }
-               }
-               
-               if ($fallback eq 'hex') {
-                       return html_entity_encode_1ch_hex($ch, $non_ascii, $all);
-               }
-               elsif ($fallback eq 'dec') {
-                       return html_entity_encode_1ch_dec($ch, $non_ascii, $all);
-               }
+       elsif ($fallback eq 'dec') {
+               return "&#$n;";
+       }
+       elsif ($fallback eq 'hex') {
+               return sprintf('&#x%X;', $n);
+       }
+       else {
+               return $ch;
         }
-       
-       return $ch;
  }
  
  # html_entity_decode() decodes a text escaped by HTML entity references
@@ -3956,84 +4144,51 @@ sub html_entity_encode_1ch_name {
  # $t is the text to decode
  sub html_entity_decode {
         (my $t) = @_;
-       
-       my $d = '';
-       
-       while ($t ne '') {
-               if ($t =~ /^([^\&]+)(\&.*)$/s) { # any normal text
-                       $d .= $1;
-                       $t = $2;
-               }
-               elsif ($t =~ /^\&#?[A-Za-z0-9]+;/s) { # correct encoded character
-                       $d .= html_entity_decode_1en($&);
-                       $t = $';
-               }
-               elsif ($t =~ /^\&[A-Za-z0-9]+/s) { # encoded character without ";"
-                       $d .= html_entity_decode_1en($&);
-                       $t = $';
-               }
-               elsif ($t =~ /^\&/s) { # invalid "&"
-                       $d .= $&;
-                       $t = $';
-               }
-               else {  # nothing left to decode
-                       $d .= $t;
-                       $t = '';
-               }
-       }
-       
-       return $d;
+       $t =~ s/&((#(([0-9]+)|(x[0-9A-Fa-f]+));)|([A-Za-z0-9]+;?))/html_entity_decode_1en($&)/egs;
+       return $t;
  }
  
  # html_entity_decode_1en() decodes a single HTML entity reference
  # in any of the available forms.
  # an invalid entity reference will be left unchanged.
-# $en is the text to decode - it is assumed without checking
-# that the text is indeed in this format.
+# $en is the text to decode
  sub html_entity_decode_1en {
         (my $en) = @_;
-       my $y = '';
-       
-       if ($en !~ /;$/) { # name without ";"
-               my $n = substr($en, 1);
-               # we HAVE TO iterate as we don't know where name terminates :/
-               # why they thought this is a good idea ?
-               foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) {
-                       if (index($n, $name) == 0) { # name (beginning of entire string) is valid:
-                               # decode
-                               my @code = @{HTML_ENTITY_CODE_INF->{$name}};
-                               foreach my $c (@code) {
-                                       $y .= chr($c);
-                               }
-                               # rest of the string, insert literally
-                               $y .= substr($n, length($name));
-                               return $y;
-                       }
-               }
-               # invalid name, insert literally
-               return $en;
-       }
-       elsif ($en =~ /^\&#([0-9]+);$/) { # decimal
+       if ($en =~ /^&#([0-9]+);$/) {
                 return chr($1);
         }
-       elsif ($en =~ /^\&#x([0-9A-Fa-f]+);$/) { # hexadecimal
+       elsif ($en =~ /^&#x([0-9A-Fa-f]+);$/) {
                 return chr(hex($1));
         }
-       else { # name
-               my $name = substr($en, 1, length($en) - 2);
-               if (defined HTML_ENTITY_CODE->{$name}) {
-                       # valid name, decode
-                       my @code = @{HTML_ENTITY_CODE->{$name}};
-                       foreach my $c (@code) {
-                               $y .= chr($c);
+       elsif ($en =~ /^&([A-Za-z0-9]+)(;?)$/) {
+               my $y = '';
+               if ($2 ne '') {
+                       if (defined HTML_ENTITY_CODE->{$1}) {
+                               my @code = @{HTML_ENTITY_CODE->{$1}};
+                               foreach my $c (@code) {
+                                       $y .= chr($c);
+                               }
+                               return $y;
                         }
-                       return $y;
                 }
                 else {
-                       # invalid name, insert literally
-                       return $en;
+                       # we HAVE TO iterate as we don't know where name terminates :/
+                       # why they thought this is a good idea ?
+                       foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) {
+                               if (index($1, $name) == 0) { # name (beginning of entire string) is valid:
+                                       # decode
+                                       my @code = @{HTML_ENTITY_CODE_INF->{$name}};
+                                       foreach my $c (@code) {
+                                               $y .= chr($c);
+                                       }
+                                       # rest of the string, insert literally
+                                       $y .= substr($1, length($name));
+                                       return $y;
+                               }
+                       }
                 }
         }
+       return $en;
  }
  
  1
author	b <rowerynaksiezycu@gmail.com>
	Mon, 26 Feb 2024 21:27:45 +0000 (21:27 +0000)
committer	b <rowerynaksiezycu@gmail.com>
	Mon, 26 Feb 2024 21:27:45 +0000 (21:27 +0000)