From: b <rowerynaksiezycu@gmail.com>
Date: Mon, 26 Feb 2024 21:27:45 +0000 (+0000)
Subject: optimised HTM entity encoding / decoding
X-Git-Tag: v1.1.3
X-Git-Url: http://bicyclesonthemoon.info/git-projects/?a=commitdiff_plain;h=f0a6abfdebbbfd1b578d7339df1627f1afacd2be;p=botm%2Fcommon-perl

optimised HTM entity encoding / decoding
---

diff --git a/botm_common.pm b/botm_common.pm
index 18bde54..33572f3 100644
--- a/botm_common.pm
+++ b/botm_common.pm
@@ -26,7 +26,7 @@ use File::Copy;
 
 use Exporter;
 
-our $VERSION     = '1.1.2';
+our $VERSION     = '1.1.3';
 our @ISA         = qw(Exporter);
 our @EXPORT      = ();
 our @EXPORT_OK   = (
@@ -879,7 +879,6 @@ use constant HTTP_STATUS => {
 	'unassigned'                      => 509,
 	'not_extended'                    => 510,
 	'network_authentication required' => 511,
-
 };
 
 sub http_header_line {
@@ -1176,16 +1175,14 @@ sub read_header_env {
 # If $all is true then every character will be escaped
 sub url_encode {
 	(my $t, my $encoding, my $all) = @_;
-	
 	if ($encoding eq '') {
 		$encoding = 'UTF-8';
 	}
-
 	if ($all) {
-		$t =~ s/(.)/url_encode_1ch($1, $encoding)/egs;
+		$t =~ s/./url_encode_1ch($&,$encoding)/egs;
 	}
 	else {
-		$t =~ s/([^0-9A-Za-z.~\-_])/url_encode_1ch($1, $encoding)/egs;
+		$t =~ s/[^0-9A-Za-z.~\-_]/url_encode_1ch($&,$encoding)/egs;
 	}
 	return $t;
 }
@@ -1195,12 +1192,8 @@ sub url_encode {
 # $encoding is the encoding to use
 sub url_encode_1ch {
 	(my $ch, my $encoding) = @_;
-	
-	if ($encoding eq '') {
-		$encoding = 'UTF-8';
-	}
 	$ch = encode($encoding, $ch);
-	$ch =~ s/(.)/sprintf('%%%02X',ord($1))/egs;
+	$ch =~ s/./sprintf('%%%02X',ord($&))/egs;
 	return $ch;
 }
 
@@ -1211,14 +1204,13 @@ sub url_encode_1ch {
 # if $plus is true then '+' will be decoded to ' '.
 sub url_decode {
 	(my $t, my $encoding, my $plus) = @_;
-	
 	if ($encoding eq '') {
 		$encoding = 'utf8';
 	}
 	if ($plus) {
 		$t =~ s/\+/ /gs;
 	}
-	$t =~ s/((%[0-9A-Fa-f]{2})+)/url_decode_xch($1, $encoding)/egs;
+	$t =~ s/(%[0-9A-Fa-f]{2})+/url_decode_xch($&,$encoding)/egs;
 	return $t;
 }
 
@@ -1228,10 +1220,6 @@ sub url_decode {
 sub url_decode_xch {
 	(my $xch, my $encoding) = @_;
 	my $y = '';
-	
-	if ($encoding eq '') {
-		$encoding = 'utf8';
-	}
 	while ($xch ne '') {
 		$y .= chr(hex(substr($xch, 1, 2)));
 		$xch = substr($xch, 3);
@@ -3801,36 +3789,306 @@ use constant HTML_ENTITY_CODE_INF => {
 	'yen'                            => [0x000A5],
 	'yuml'                           => [0x000FF]
 };
+# here a reverse map: from code to name, but only HTML 4.01
+# See https://www.w3.org/TR/1999/PR-html40-19990824/sgml/entities.html
+use constant HTML_ENTITY_NAME => {
+	0x0022 =>'quot',
+	0x0026 =>'amp',
+	0x003C =>'lt',
+	0x003E =>'gt',
+	0x00A0 =>'nbsp',
+	0x00A1 =>'iexcl',
+	0x00A2 =>'cent',
+	0x00A3 =>'pound',
+	0x00A4 =>'curren',
+	0x00A5 =>'yen',
+	0x00A6 =>'brvbar',
+	0x00A7 =>'sect',
+	0x00A8 =>'uml',
+	0x00A9 =>'copy',
+	0x00AA =>'ordf',
+	0x00AB =>'laquo',
+	0x00AC =>'not',
+	0x00AD =>'shy',
+	0x00AE =>'reg',
+	0x00AF =>'macr',
+	0x00B0 =>'deg',
+	0x00B1 =>'plusmn',
+	0x00B2 =>'sup2',
+	0x00B3 =>'sup3',
+	0x00B4 =>'acute',
+	0x00B5 =>'micro',
+	0x00B6 =>'para',
+	0x00B7 =>'middot',
+	0x00B8 =>'cedil',
+	0x00B9 =>'sup1',
+	0x00BA =>'ordm',
+	0x00BB =>'raquo',
+	0x00BC =>'frac14',
+	0x00BD =>'frac12',
+	0x00BE =>'frac34',
+	0x00BF =>'iquest',
+	0x00C0 =>'Agrave',
+	0x00C1 =>'Aacute',
+	0x00C2 =>'Acirc',
+	0x00C3 =>'Atilde',
+	0x00C4 =>'Auml',
+	0x00C5 =>'Aring',
+	0x00C6 =>'AElig',
+	0x00C7 =>'Ccedil',
+	0x00C8 =>'Egrave',
+	0x00C9 =>'Eacute',
+	0x00CA =>'Ecirc',
+	0x00CB =>'Euml',
+	0x00CC =>'Igrave',
+	0x00CD =>'Iacute',
+	0x00CE =>'Icirc',
+	0x00CF =>'Iuml',
+	0x00D0 =>'ETH',
+	0x00D1 =>'Ntilde',
+	0x00D2 =>'Ograve',
+	0x00D3 =>'Oacute',
+	0x00D4 =>'Ocirc',
+	0x00D5 =>'Otilde',
+	0x00D6 =>'Ouml',
+	0x00D7 =>'times',
+	0x00D8 =>'Oslash',
+	0x00D9 =>'Ugrave',
+	0x00DA =>'Uacute',
+	0x00DB =>'Ucirc',
+	0x00DC =>'Uuml',
+	0x00DD =>'Yacute',
+	0x00DE =>'THORN',
+	0x00DF =>'szlig',
+	0x00E0 =>'agrave',
+	0x00E1 =>'aacute',
+	0x00E2 =>'acirc',
+	0x00E3 =>'atilde',
+	0x00E4 =>'auml',
+	0x00E5 =>'aring',
+	0x00E6 =>'aelig',
+	0x00E7 =>'ccedil',
+	0x00E8 =>'egrave',
+	0x00E9 =>'eacute',
+	0x00EA =>'ecirc',
+	0x00EB =>'euml',
+	0x00EC =>'igrave',
+	0x00ED =>'iacute',
+	0x00EE =>'icirc',
+	0x00EF =>'iuml',
+	0x00F0 =>'eth',
+	0x00F1 =>'ntilde',
+	0x00F2 =>'ograve',
+	0x00F3 =>'oacute',
+	0x00F4 =>'ocirc',
+	0x00F5 =>'otilde',
+	0x00F6 =>'ouml',
+	0x00F7 =>'divide',
+	0x00F8 =>'oslash',
+	0x00F9 =>'ugrave',
+	0x00FA =>'uacute',
+	0x00FB =>'ucirc',
+	0x00FC =>'uuml',
+	0x00FD =>'yacute',
+	0x00FE =>'thorn',
+	0x00FF =>'yuml',
+	0x0152 =>'OElig',
+	0x0153 =>'oelig',
+	0x0160 =>'Scaron',
+	0x0161 =>'scaron',
+	0x0178 =>'Yuml',
+	0x0192 =>'fnof',
+	0x02C6 =>'circ',
+	0x02DC =>'tilde',
+	0x0391 =>'Alpha',
+	0x0392 =>'Beta',
+	0x0393 =>'Gamma',
+	0x0394 =>'Delta',
+	0x0395 =>'Epsilon',
+	0x0396 =>'Zeta',
+	0x0397 =>'Eta',
+	0x0398 =>'Theta',
+	0x0399 =>'Iota',
+	0x039A =>'Kappa',
+	0x039B =>'Lambda',
+	0x039C =>'Mu',
+	0x039D =>'Nu',
+	0x039E =>'Xi',
+	0x039F =>'Omicron',
+	0x03A0 =>'Pi',
+	0x03A1 =>'Rho',
+	0x03A3 =>'Sigma',
+	0x03A4 =>'Tau',
+	0x03A5 =>'Upsilon',
+	0x03A6 =>'Phi',
+	0x03A7 =>'Chi',
+	0x03A8 =>'Psi',
+	0x03A9 =>'Omega',
+	0x03B1 =>'alpha',
+	0x03B2 =>'beta',
+	0x03B3 =>'gamma',
+	0x03B4 =>'delta',
+	0x03B5 =>'epsilon',
+	0x03B6 =>'zeta',
+	0x03B7 =>'eta',
+	0x03B8 =>'theta',
+	0x03B9 =>'iota',
+	0x03BA =>'kappa',
+	0x03BB =>'lambda',
+	0x03BC =>'mu',
+	0x03BD =>'nu',
+	0x03BE =>'xi',
+	0x03BF =>'omicron',
+	0x03C0 =>'pi',
+	0x03C1 =>'rho',
+	0x03C2 =>'sigmaf',
+	0x03C3 =>'sigma',
+	0x03C4 =>'tau',
+	0x03C5 =>'upsilon',
+	0x03C6 =>'phi',
+	0x03C7 =>'chi',
+	0x03C8 =>'psi',
+	0x03C9 =>'omega',
+	0x03D1 =>'thetasym',
+	0x03D2 =>'upsih',
+	0x03D6 =>'piv',
+	0x2002 =>'ensp',
+	0x2003 =>'emsp',
+	0x2009 =>'thinsp',
+	0x200C =>'zwnj',
+	0x200D =>'zwj',
+	0x200E =>'lrm',
+	0x200F =>'rlm',
+	0x2013 =>'ndash',
+	0x2014 =>'mdash',
+	0x2018 =>'lsquo',
+	0x2019 =>'rsquo',
+	0x201A =>'sbquo',
+	0x201C =>'ldquo',
+	0x201D =>'rdquo',
+	0x201E =>'bdquo',
+	0x2020 =>'dagger',
+	0x2021 =>'Dagger',
+	0x2022 =>'bull',
+	0x2026 =>'hellip',
+	0x2030 =>'permil',
+	0x2032 =>'prime',
+	0x2033 =>'Prime',
+	0x2039 =>'lsaquo',
+	0x203A =>'rsaquo',
+	0x203E =>'oline',
+	0x2044 =>'frasl',
+	0x20AC =>'euro',
+	0x2111 =>'image',
+	0x2118 =>'weierp',
+	0x211C =>'real',
+	0x2122 =>'trade',
+	0x2135 =>'alefsym',
+	0x2190 =>'larr',
+	0x2191 =>'uarr',
+	0x2192 =>'rarr',
+	0x2193 =>'darr',
+	0x2194 =>'harr',
+	0x21B5 =>'crarr',
+	0x21D0 =>'lArr',
+	0x21D1 =>'uArr',
+	0x21D2 =>'rArr',
+	0x21D3 =>'dArr',
+	0x21D4 =>'hArr',
+	0x2200 =>'forall',
+	0x2202 =>'part',
+	0x2203 =>'exist',
+	0x2205 =>'empty',
+	0x2207 =>'nabla',
+	0x2208 =>'isin',
+	0x2209 =>'notin',
+	0x220B =>'ni',
+	0x220F =>'prod',
+	0x2211 =>'sum',
+	0x2212 =>'minus',
+	0x2217 =>'lowast',
+	0x221A =>'radic',
+	0x221D =>'prop',
+	0x221E =>'infin',
+	0x2220 =>'ang',
+	0x2227 =>'and',
+	0x2228 =>'or',
+	0x2229 =>'cap',
+	0x222A =>'cup',
+	0x222B =>'int',
+	0x2234 =>'there4',
+	0x223C =>'sim',
+	0x2245 =>'cong',
+	0x2248 =>'asymp',
+	0x2260 =>'ne',
+	0x2261 =>'equiv',
+	0x2264 =>'le',
+	0x2265 =>'ge',
+	0x2282 =>'sub',
+	0x2283 =>'sup',
+	0x2284 =>'nsub',
+	0x2286 =>'sube',
+	0x2287 =>'supe',
+	0x2295 =>'oplus',
+	0x2295 =>'otimes',
+	0x22A5 =>'perp',
+	0x22C5 =>'sdot',
+	0x2308 =>'lceil',
+	0x230A =>'lfloor',
+	0x230B =>'rfloor',
+	0x2329 =>'lang',
+	0x232A =>'rang',
+	0x25CA =>'loz',
+	0x2660 =>'spades',
+	0x2663 =>'clubs',
+	0x2665 =>'hearts',
+	0x2666 =>'diams'
+};
+
 
 # html_entity_encode_dec() escapes a text using HTML entity references
 # in decimal form.
-# only reserved characters will be escaped.
+# reserved characters will be escaped.
 # $t is the text to encode
 # if $non_ascii is true then every non-ASCII character will be escaped
 # if $all is true then every character will be escaped.
 sub html_entity_encode_dec {
 	(my $t, my $non_ascii, my $all) = @_;
-	
-	$t =~ s/(.)/html_entity_encode_1ch_dec($1,$non_ascii,$all)/egs;
+	if ($all) {
+		$t =~ s/./'&#'.ord($&).';'/egs;
+	}
+	elsif ($non_ascii) {
+		$t =~ s/[\"\&<=>\P{ASCII}]/'&#'.ord($&).';'/egs;
+	}
+	else {
+		$t =~ s/[\"\&<=>]/'&#'.ord($&).';'/egs;
+	}
 	return $t;
 }
 
 # html_entity_encode_hex() escapes a text using HTML entity references
 # in hexadecimal form.
-# only reserved characters will be escaped.
+# reserved characters will be escaped.
 # $t is the text to encode
 # if $non_ascii is true then every non-ASCII character will be escaped
 # if $all is true then every character will be escaped.
 sub html_entity_encode_hex {
 	(my $t, my $non_ascii, my $all) = @_;
-	
-	$t =~ s/(.)/html_entity_encode_1ch_hex($1,$non_ascii,$all)/egs;
+	if ($all) {
+		$t =~ s/./sprintf('&#x%X;',ord($&))/egs;
+	}
+	elsif ($non_ascii) {
+		$t =~ s/[\"\&<=>\P{ASCII}]/sprintf('&#x%X;',ord($&))/egs;
+	}
+	else {
+		$t =~ s/[\"\&<=>]/sprintf('&#x%X;',ord($&))/egs;
+	}
 	return $t;
 }
 
 # html_entity_encode_name() escapes a text using HTML entity references
 # by name.
-# only reserved characters will be escaped.
+# reserved characters will be escaped.
 # $t is the text to encode
 # if $non_ascii is true then every non-ASCII character will be escaped
 # if $all is true then every character will be escaped.
@@ -3839,115 +4097,45 @@ sub html_entity_encode_hex {
 # or if $fallback is 'hex' then character without name will be escaped
 # in hexadecimal form instead
 # or otherwise they will not be escaped
-# if a character has more than 1 possible name, then
-# the name will be chosen unpredicatbly.
 sub html_entity_encode_name {
 	(my $t, my $non_ascii, my $all, my $fallback) = @_;
 	
-	$t =~ s/(.)/html_entity_encode_1ch_name($1,$non_ascii,$all,$fallback)/egs;
-	return $t;
-}
-
-# html_entity_encode_dec() escapes a single character
-# as an HTML entity reference in decimal form.
-# $ch is the chatacter to encode
-# if $non_ascii is true then every non-ASCII character will be escaped
-# if $all is true then every character will be escaped.
-sub html_entity_encode_1ch_dec {
-	(my $ch, my $non_ascii, my $all) = @_;
-	
-	my $n = ord($ch);
-	if (
-		$all or
-		($non_ascii and ($n > 0x7F)) or
-		($ch =~ /[\"\&<=>]/)
-	) {
-		return sprintf('&#%02u;', $n);
-	}
-	else {
-		return $ch;
+	if ($all) {
+		$t =~ s/./html_entity_encode_1ch_name($&,$fallback)/egs;
 	}
-}
-
-# html_entity_encode_1ch_hex() escapes a single character
-# as an HTML entity reference in hexadecimal form.
-# $ch is the chatacter to encode
-# if $non_ascii is true then every non-ASCII character will be escaped
-# if $all is true then every character will be escaped.
-sub html_entity_encode_1ch_hex {
-	(my $ch, my $non_ascii, my $all) = @_;
-	
-	my $n = ord($ch);
-	if (
-		$all or
-		($non_ascii and ($n > 0x7F)) or
-		($ch =~ /[\"\&<=>]/)
-	) {
-		return sprintf('&#x%02X;', $n);
+	elsif ($non_ascii) {
+		$t =~ s/[\"\&<=>\P{ASCII}]/html_entity_encode_1ch_name($&,$fallback)/egs;
 	}
 	else {
-		return $ch;
+		$t =~ s/[\"\&<=>]/html_entity_encode_1ch_name($&,$fallback)/egs;
 	}
+	return $t;
 }
 
 # html_entity_encode_1ch_name() escapes a single character
 # as an HTML entity reference by name.
-# only a reserved character will be escaped.
 # $ch is the chatacter to encode
-# if $non_ascii is true then every non-ASCII character will be escaped
-# if $all is true then every character will be escaped.
 # if $fallback is 'dec' then character without name will be escaped in
 # decimal form instead
 # or if $fallback is 'hex' then character without name will be escaped
 # in hexadecimal form instead
 # or otherwise they will not be escaped
-# if a character has more than 1 possible name, then
-# the name will be chosen unpredicatbly.
 sub html_entity_encode_1ch_name {
-	(my $ch, my $non_ascii, my $all, my $fallback) = @_;
-	
+	(my $ch, my $fallback) = @_;
 	my $n = ord($ch);
-	
-	if ($ch =~ /[\"\&<=>]/) {
-		if ($ch eq '"') {
-			return '&quot;';
-		}
-		elsif ($ch eq '&') {
-			return '&amp;';
-		}
-		elsif ($ch eq '<') {
-			return '&lt;';
-		}
-		elsif ($ch eq '=') {
-			return '&equals;';
-		}
-		elsif ($ch eq '>') {
-			return '&gt;';
-		}
-		else {
-			return $ch;
-		}
+	my $name = HTML_ENTITY_NAME->{$n};
+	if ($name ne '') {
+		return "&$name;";
 	}
-	elsif (
-		$all or
-		($non_ascii and ($n > 0x7F))
-	) {
-		foreach my $name (keys %{+HTML_ENTITY_CODE}) {
-			my @code = @{HTML_ENTITY_CODE->{$name}};
-			if (($code[0] == $n) and (@code == 1)) {
-				return "&$name;";
-			}
-		}
-		
-		if ($fallback eq 'hex') {
-			return html_entity_encode_1ch_hex($ch, $non_ascii, $all);
-		}
-		elsif ($fallback eq 'dec') {
-			return html_entity_encode_1ch_dec($ch, $non_ascii, $all);
-		}
+	elsif ($fallback eq 'dec') {
+		return "&#$n;";
+	}
+	elsif ($fallback eq 'hex') {
+		return sprintf('&#x%X;', $n);
+	}
+	else {
+		return $ch;
 	}
-	
-	return $ch;
 }
 
 # html_entity_decode() decodes a text escaped by HTML entity references
@@ -3956,84 +4144,51 @@ sub html_entity_encode_1ch_name {
 # $t is the text to decode
 sub html_entity_decode {
 	(my $t) = @_;
-	
-	my $d = '';
-	
-	while ($t ne '') {
-		if ($t =~ /^([^\&]+)(\&.*)$/s) { # any normal text
-			$d .= $1;
-			$t = $2;
-		}
-		elsif ($t =~ /^\&#?[A-Za-z0-9]+;/s) { # correct encoded character
-			$d .= html_entity_decode_1en($&);
-			$t = $';
-		}
-		elsif ($t =~ /^\&[A-Za-z0-9]+/s) { # encoded character without ";"
-			$d .= html_entity_decode_1en($&);
-			$t = $';
-		}
-		elsif ($t =~ /^\&/s) { # invalid "&"
-			$d .= $&;
-			$t = $';
-		}
-		else {  # nothing left to decode
-			$d .= $t;
-			$t = '';
-		}
-	}
-	
-	return $d;
+	$t =~ s/&((#(([0-9]+)|(x[0-9A-Fa-f]+));)|([A-Za-z0-9]+;?))/html_entity_decode_1en($&)/egs;
+	return $t;
 }
 
 # html_entity_decode_1en() decodes a single HTML entity reference
 # in any of the available forms.
 # an invalid entity reference will be left unchanged.
-# $en is the text to decode - it is assumed without checking
-# that the text is indeed in this format.
+# $en is the text to decode
 sub html_entity_decode_1en {
 	(my $en) = @_;
-	my $y = '';
-	
-	if ($en !~ /;$/) { # name without ";"
-		my $n = substr($en, 1);
-		# we HAVE TO iterate as we don't know where name terminates :/
-		# why they thought this is a good idea ?
-		foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) {
-			if (index($n, $name) == 0) { # name (beginning of entire string) is valid:
-				# decode
-				my @code = @{HTML_ENTITY_CODE_INF->{$name}};
-				foreach my $c (@code) {
-					$y .= chr($c);
-				}
-				# rest of the string, insert literally
-				$y .= substr($n, length($name));
-				return $y;
-			}
-		}
-		# invalid name, insert literally
-		return $en;
-	}
-	elsif ($en =~ /^\&#([0-9]+);$/) { # decimal
+	if ($en =~ /^&#([0-9]+);$/) {
 		return chr($1);
 	}
-	elsif ($en =~ /^\&#x([0-9A-Fa-f]+);$/) { # hexadecimal
+	elsif ($en =~ /^&#x([0-9A-Fa-f]+);$/) {
 		return chr(hex($1));
 	}
-	else { # name
-		my $name = substr($en, 1, length($en) - 2);
-		if (defined HTML_ENTITY_CODE->{$name}) {
-			# valid name, decode
-			my @code = @{HTML_ENTITY_CODE->{$name}};
-			foreach my $c (@code) {
-				$y .= chr($c);
+	elsif ($en =~ /^&([A-Za-z0-9]+)(;?)$/) {
+		my $y = '';
+		if ($2 ne '') {
+			if (defined HTML_ENTITY_CODE->{$1}) {
+				my @code = @{HTML_ENTITY_CODE->{$1}};
+				foreach my $c (@code) {
+					$y .= chr($c);
+				}
+				return $y;
 			}
-			return $y;
 		}
 		else {
-			# invalid name, insert literally
-			return $en;
+			# we HAVE TO iterate as we don't know where name terminates :/
+			# why they thought this is a good idea ?
+			foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) {
+				if (index($1, $name) == 0) { # name (beginning of entire string) is valid:
+					# decode
+					my @code = @{HTML_ENTITY_CODE_INF->{$name}};
+					foreach my $c (@code) {
+						$y .= chr($c);
+					}
+					# rest of the string, insert literally
+					$y .= substr($1, length($name));
+					return $y;
+				}
+			}
 		}
 	}
+	return $en;
 }
 
 1