From: b <rowerynaksiezycu@gmail.com>
Date: Tue, 30 May 2023 22:22:50 +0000 (+0000)
Subject: HTML entity decoding finished
X-Git-Tag: v1.0.6
X-Git-Url: http://bicyclesonthemoon.info/git-projects/?a=commitdiff_plain;h=refs%2Ftags%2Fv1.0.6;p=botm%2Fcommon-perl

HTML entity decoding finished
---

diff --git a/botm_common.pm b/botm_common.pm
index 9adfd55..0587fb6 100644
--- a/botm_common.pm
+++ b/botm_common.pm
@@ -32,7 +32,7 @@ $VERSION     = 1.000006;
 	'read_data_file', 'write_data_file',
 	'url_encode', 'url_decode',
 	'join_path',
-	'html_entity_encode_dec', 'html_entity_encode_hex', 'html_entity_encode_name'
+	'html_entity_encode_dec', 'html_entity_encode_hex', 'html_entity_encode_name', 'html_entity_decode',
 );
 %EXPORT_TAGS = ();
 
@@ -131,6 +131,7 @@ sub read_data_file {
 		}
 		
 		$line =~ s/[\n]$//g;
+		$line =~ s/[\r]$//g;
 		
 		# Empty line - end of header.
 		if ($line eq ''){
@@ -271,16 +272,16 @@ sub url_encode {
 	}
 
 	if ($all) {
-		$t =~ s/(.)/url_encode_1ch($1, $encoding)/eg;
+		$t =~ s/(.)/url_encode_1ch($1, $encoding)/egs;
 	}
 	else {
-		$t =~ s/([^0-9A-Za-z.~\-_])/url_encode_1ch($1, $encoding)/eg;
+		$t =~ s/([^0-9A-Za-z.~\-_])/url_encode_1ch($1, $encoding)/egs;
 	}
 	return $t;
 }
 
 # url_encode_1ch() escapes a single chatacter using URL-encoding.
-# $ch is the text to encode.
+# $ch is the chatacter to encode.
 # $encoding is the encoding to use
 sub url_encode_1ch {
 	(my $ch, my $encoding) = @_;
@@ -289,7 +290,7 @@ sub url_encode_1ch {
 		$encoding = "UTF-8";
 	}
 	$ch = encode($encoding, $ch);
-	$ch =~ s/(.)/sprintf('%%%02X',ord($1))/eg;
+	$ch =~ s/(.)/sprintf('%%%02X',ord($1))/egs;
 	return $ch;
 }
 
@@ -303,11 +304,11 @@ sub url_decode {
 	if ($encoding eq '') {
 		$encoding = "utf8";
 	}
-	$t =~ s/((%[0-9A-Fa-f]{2})+)/url_decode_xch($1)/eg;
+	$t =~ s/((%[0-9A-Fa-f]{2})+)/url_decode_xch($1)/egs;
 	return $t;
 }
 
-# url_decode_xc() decodes a continuous string of characters escaped
+# url_decode_xch() decodes a continuous string of characters escaped
 # $xch is the text to decode - it is assumed without checking
 # that the text is indeed in this format. 
 sub url_decode_xch {
@@ -2585,57 +2586,86 @@ use constant HTML_ENTITY_CODE_INF => {
 	'yuml'                           => [0x000FF]
 };
 
-
+# html_entity_encode_dec() escapes a text using HTML entity references
+# in decimal form.
+# only reserved characters will be escaped.
+# $t is the text to encode
+# if $all is true then every character will be escaped.
 sub html_entity_encode_dec {
 	(my $t, my $all) = @_;
 	
 	if ($all) {
-		$t =~ s/(.)/html_entity_encode_1ch_dec($1)/eg;
+		$t =~ s/(.)/html_entity_encode_1ch_dec($1)/egs;
 	}
 	else {
-		$t =~ s/([\"\&<=>])/html_entity_encode_1ch_dec($1)/eg;
+		$t =~ s/([\"\&<=>])/html_entity_encode_1ch_dec($1)/egs;
 	}
 	return $t;
 }
 
+# html_entity_encode_hex() escapes a text using HTML entity references
+# in hexadecimal form.
+# only reserved characters will be escaped.
+# $t is the text to encode
+# if $all is true then every character will be escaped.
 sub html_entity_encode_hex {
 	(my $t, my $all) = @_;
 	
 	if ($all) {
-		$t =~ s/(.)/html_entity_encode_1ch_hex($1)/eg;
+		$t =~ s/(.)/html_entity_encode_1ch_hex($1)/egs;
 	}
 	else {
-		$t =~ s/([\"\&<=>])/html_entity_encode_1ch_hex($1)/eg;
+		$t =~ s/([\"\&<=>])/html_entity_encode_1ch_hex($1)/egs;
 	}
 	return $t;
 }
 
+# html_entity_encode_name() escapes a text using HTML entity references
+# by name.
+# only reserved characters will be escaped.
+# $t is the text to encode
+# if $all is true then every character which has a name will be escaped,
+# and additionally if a character has more than 1 possible name, then
+# the name will be chosen unpredicatbly.
 sub html_entity_encode_name {
 	(my $t, my $all) = @_;
 	
 	if ($all) {
-		$t =~ s/(.)/html_entity_encode_1ch_name($1,1)/eg;
+		$t =~ s/(.)/html_entity_encode_1ch_name($1,1)/egs;
 	}
 	else {
-		$t =~ s/([\"\&<=>])/html_entity_encode_1ch_name($1,0)/eg;
+		$t =~ s/([\"\&<=>])/html_entity_encode_1ch_name($1,0)/egs;
 	}
 	return $t;
 }
 
+# html_entity_encode_dec() escapes a single character
+# as an HTML entity reference in decimal form.
+# $ch is the chatacter to encode
 sub html_entity_encode_1ch_dec {
 	(my $ch) = @_;
 	
-	$ch =~ s/(.)/sprintf('&#%02u;',ord($1))/eg;
+	$ch =~ s/(.)/sprintf('&#%02u;',ord($1))/egs;
 	return $ch;
 }
 
+# html_entity_encode_1ch_hex() escapes a single character
+# as an HTML entity reference in hexadecimal form.
+# $ch is the chatacter to encode
 sub html_entity_encode_1ch_hex {
 	(my $ch) = @_;
 	
-	$ch =~ s/(.)/sprintf('&#x%02X;',ord($1))/eg;
+	$ch =~ s/(.)/sprintf('&#x%02X;',ord($1))/egs;
 	return $ch;
 }
 
+# html_entity_encode_1ch_name() escapes a single character
+# as an HTML entity reference by name.
+# only a reserved character will be escaped.
+# $ch is the chatacter to encode
+# if $all is true then any character which has a name will be escaped,
+# and additionally if the character has more than 1 possible name, then
+# the name will be chosen unpredicatbly.
 sub html_entity_encode_1ch_name {
 	(my $ch, my $all) = @_;
 	
@@ -2671,9 +2701,90 @@ sub html_entity_encode_1ch_name {
 	}
 }
 
+# html_entity_decode() decodes a text escaped by HTML entity references
+# in any of the available forms.
+# invalid entity references will be left unchanged.
+# $t is the text to decode
+sub html_entity_decode {
+	(my $t) = @_;
+	
+	my $d = '';
+	
+	while ($t ne '') {
+		if ($t =~ /^([^\&]+)(\&.*)$/s) { # any normal text
+			$d .= $1;
+			$t = $2;
+		}
+		elsif ($t =~ /^(\&#?[A-Za-z0-9]+;)(.*)$/s) { # correct encoded character
+			$d .= html_entity_decode_1en($1);
+			$t = $2;
+		}
+		elsif ($t =~ /^(\&[A-Za-z0-9]+)(.*)$/s) { # encoded character without ";"
+			$d .= html_entity_decode_1en($1);
+			$t = $2;
+		}
+		elsif ($t =~ /^(\&)(.*)$/s) { # invalid "&"
+			$d .= $1;
+			$t = $2;
+		}
+		else {  # nothing left to decode
+			$d .= $t;
+			$t = '';
+		}
+	}
+	
+	return $d;
+}
 
-# html_entity_encode_name
-# html_entity_decode
+# html_entity_decode_1en() decodes a single HTML entity reference
+# in any of the available forms.
+# an invalid entity reference will be left unchanged.
+# $en is the text to decode - it is assumed without checking
+# that the text is indeed in this format.
+sub html_entity_decode_1en {
+	(my $en) = @_;
+	my $y = '';
+	
+	if ($en !~ /;$/) { # name without ";"
+		my $n = substr($en, 1);
+		# we HAVE TO iterate :/
+		foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) {
+			if (index($n, $name) == 0) { # name (beginning of entire string) is valid:
+				# decode
+				my @code = @{HTML_ENTITY_CODE_INF->{$name}};
+				foreach my $c (@code) {
+					$y .= chr($c);
+				}
+				# rest of the string, insert literally
+				$y .= substr($n, length($name));
+				return $y;
+			}
+		}
+		# invalid name, insert literally
+		return $en;
+	}
+	elsif ($en =~ /^\&#([0-9]+);$/) { # decimal
+		return chr($1);
+	}
+	elsif ($en =~ /^\&#x([0-9A-Fa-f]+);$/) { # hexadecimal
+		return chr(hex($1));
+	}
+	else { # name
+		my $name = substr($en, 1, length($en) - 2);
+		if (defined HTML_ENTITY_CODE->{$name}) {
+			# valid name, decode
+			my @code = @{HTML_ENTITY_CODE->{$name}};
+			foreach my $c (@code) {
+				$y .= chr($c);
+			}
+			return $y;
+		}
+		else {
+			# invalid name, insert literally
+			return $en;
+		}
+	}
+}