From 5eb6fff2ecca3f68b44aeffcfaf10663a9e72502 Mon Sep 17 00:00:00 2001 From: b Date: Tue, 30 May 2023 22:22:50 +0000 Subject: [PATCH] HTML entity decoding finished --- botm_common.pm | 147 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 129 insertions(+), 18 deletions(-) diff --git a/botm_common.pm b/botm_common.pm index 9adfd55..0587fb6 100644 --- a/botm_common.pm +++ b/botm_common.pm @@ -32,7 +32,7 @@ $VERSION = 1.000006; 'read_data_file', 'write_data_file', 'url_encode', 'url_decode', 'join_path', - 'html_entity_encode_dec', 'html_entity_encode_hex', 'html_entity_encode_name' + 'html_entity_encode_dec', 'html_entity_encode_hex', 'html_entity_encode_name', 'html_entity_decode', ); %EXPORT_TAGS = (); @@ -131,6 +131,7 @@ sub read_data_file { } $line =~ s/[\n]$//g; + $line =~ s/[\r]$//g; # Empty line - end of header. if ($line eq ''){ @@ -271,16 +272,16 @@ sub url_encode { } if ($all) { - $t =~ s/(.)/url_encode_1ch($1, $encoding)/eg; + $t =~ s/(.)/url_encode_1ch($1, $encoding)/egs; } else { - $t =~ s/([^0-9A-Za-z.~\-_])/url_encode_1ch($1, $encoding)/eg; + $t =~ s/([^0-9A-Za-z.~\-_])/url_encode_1ch($1, $encoding)/egs; } return $t; } # url_encode_1ch() escapes a single chatacter using URL-encoding. -# $ch is the text to encode. +# $ch is the chatacter to encode. # $encoding is the encoding to use sub url_encode_1ch { (my $ch, my $encoding) = @_; @@ -289,7 +290,7 @@ sub url_encode_1ch { $encoding = "UTF-8"; } $ch = encode($encoding, $ch); - $ch =~ s/(.)/sprintf('%%%02X',ord($1))/eg; + $ch =~ s/(.)/sprintf('%%%02X',ord($1))/egs; return $ch; } @@ -303,11 +304,11 @@ sub url_decode { if ($encoding eq '') { $encoding = "utf8"; } - $t =~ s/((%[0-9A-Fa-f]{2})+)/url_decode_xch($1)/eg; + $t =~ s/((%[0-9A-Fa-f]{2})+)/url_decode_xch($1)/egs; return $t; } -# url_decode_xc() decodes a continuous string of characters escaped +# url_decode_xch() decodes a continuous string of characters escaped # $xch is the text to decode - it is assumed without checking # that the text is indeed in this format. sub url_decode_xch { @@ -2585,57 +2586,86 @@ use constant HTML_ENTITY_CODE_INF => { 'yuml' => [0x000FF] }; - +# html_entity_encode_dec() escapes a text using HTML entity references +# in decimal form. +# only reserved characters will be escaped. +# $t is the text to encode +# if $all is true then every character will be escaped. sub html_entity_encode_dec { (my $t, my $all) = @_; if ($all) { - $t =~ s/(.)/html_entity_encode_1ch_dec($1)/eg; + $t =~ s/(.)/html_entity_encode_1ch_dec($1)/egs; } else { - $t =~ s/([\"\&<=>])/html_entity_encode_1ch_dec($1)/eg; + $t =~ s/([\"\&<=>])/html_entity_encode_1ch_dec($1)/egs; } return $t; } +# html_entity_encode_hex() escapes a text using HTML entity references +# in hexadecimal form. +# only reserved characters will be escaped. +# $t is the text to encode +# if $all is true then every character will be escaped. sub html_entity_encode_hex { (my $t, my $all) = @_; if ($all) { - $t =~ s/(.)/html_entity_encode_1ch_hex($1)/eg; + $t =~ s/(.)/html_entity_encode_1ch_hex($1)/egs; } else { - $t =~ s/([\"\&<=>])/html_entity_encode_1ch_hex($1)/eg; + $t =~ s/([\"\&<=>])/html_entity_encode_1ch_hex($1)/egs; } return $t; } +# html_entity_encode_name() escapes a text using HTML entity references +# by name. +# only reserved characters will be escaped. +# $t is the text to encode +# if $all is true then every character which has a name will be escaped, +# and additionally if a character has more than 1 possible name, then +# the name will be chosen unpredicatbly. sub html_entity_encode_name { (my $t, my $all) = @_; if ($all) { - $t =~ s/(.)/html_entity_encode_1ch_name($1,1)/eg; + $t =~ s/(.)/html_entity_encode_1ch_name($1,1)/egs; } else { - $t =~ s/([\"\&<=>])/html_entity_encode_1ch_name($1,0)/eg; + $t =~ s/([\"\&<=>])/html_entity_encode_1ch_name($1,0)/egs; } return $t; } +# html_entity_encode_dec() escapes a single character +# as an HTML entity reference in decimal form. +# $ch is the chatacter to encode sub html_entity_encode_1ch_dec { (my $ch) = @_; - $ch =~ s/(.)/sprintf('&#%02u;',ord($1))/eg; + $ch =~ s/(.)/sprintf('&#%02u;',ord($1))/egs; return $ch; } +# html_entity_encode_1ch_hex() escapes a single character +# as an HTML entity reference in hexadecimal form. +# $ch is the chatacter to encode sub html_entity_encode_1ch_hex { (my $ch) = @_; - $ch =~ s/(.)/sprintf('&#x%02X;',ord($1))/eg; + $ch =~ s/(.)/sprintf('&#x%02X;',ord($1))/egs; return $ch; } +# html_entity_encode_1ch_name() escapes a single character +# as an HTML entity reference by name. +# only a reserved character will be escaped. +# $ch is the chatacter to encode +# if $all is true then any character which has a name will be escaped, +# and additionally if the character has more than 1 possible name, then +# the name will be chosen unpredicatbly. sub html_entity_encode_1ch_name { (my $ch, my $all) = @_; @@ -2671,9 +2701,90 @@ sub html_entity_encode_1ch_name { } } +# html_entity_decode() decodes a text escaped by HTML entity references +# in any of the available forms. +# invalid entity references will be left unchanged. +# $t is the text to decode +sub html_entity_decode { + (my $t) = @_; + + my $d = ''; + + while ($t ne '') { + if ($t =~ /^([^\&]+)(\&.*)$/s) { # any normal text + $d .= $1; + $t = $2; + } + elsif ($t =~ /^(\&#?[A-Za-z0-9]+;)(.*)$/s) { # correct encoded character + $d .= html_entity_decode_1en($1); + $t = $2; + } + elsif ($t =~ /^(\&[A-Za-z0-9]+)(.*)$/s) { # encoded character without ";" + $d .= html_entity_decode_1en($1); + $t = $2; + } + elsif ($t =~ /^(\&)(.*)$/s) { # invalid "&" + $d .= $1; + $t = $2; + } + else { # nothing left to decode + $d .= $t; + $t = ''; + } + } + + return $d; +} -# html_entity_encode_name -# html_entity_decode +# html_entity_decode_1en() decodes a single HTML entity reference +# in any of the available forms. +# an invalid entity reference will be left unchanged. +# $en is the text to decode - it is assumed without checking +# that the text is indeed in this format. +sub html_entity_decode_1en { + (my $en) = @_; + my $y = ''; + + if ($en !~ /;$/) { # name without ";" + my $n = substr($en, 1); + # we HAVE TO iterate :/ + foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) { + if (index($n, $name) == 0) { # name (beginning of entire string) is valid: + # decode + my @code = @{HTML_ENTITY_CODE_INF->{$name}}; + foreach my $c (@code) { + $y .= chr($c); + } + # rest of the string, insert literally + $y .= substr($n, length($name)); + return $y; + } + } + # invalid name, insert literally + return $en; + } + elsif ($en =~ /^\&#([0-9]+);$/) { # decimal + return chr($1); + } + elsif ($en =~ /^\&#x([0-9A-Fa-f]+);$/) { # hexadecimal + return chr(hex($1)); + } + else { # name + my $name = substr($en, 1, length($en) - 2); + if (defined HTML_ENTITY_CODE->{$name}) { + # valid name, decode + my @code = @{HTML_ENTITY_CODE->{$name}}; + foreach my $c (@code) { + $y .= chr($c); + } + return $y; + } + else { + # invalid name, insert literally + return $en; + } + } +} -- 2.30.2