]> bicyclesonthemoon.info Git - botm/common-perl/commitdiff
HTML entity decoding finished v1.0.6
authorb <rowerynaksiezycu@gmail.com>
Tue, 30 May 2023 22:22:50 +0000 (22:22 +0000)
committerb <rowerynaksiezycu@gmail.com>
Tue, 30 May 2023 22:22:50 +0000 (22:22 +0000)
botm_common.pm

index 9adfd55354425c94c47d1dba105b561598b15a27..0587fb667f159682f7f0e1511b4c229b057589fc 100644 (file)
@@ -32,7 +32,7 @@ $VERSION     = 1.000006;
        'read_data_file', 'write_data_file',
        'url_encode', 'url_decode',
        'join_path',
-       'html_entity_encode_dec', 'html_entity_encode_hex', 'html_entity_encode_name'
+       'html_entity_encode_dec', 'html_entity_encode_hex', 'html_entity_encode_name', 'html_entity_decode',
 );
 %EXPORT_TAGS = ();
 
@@ -131,6 +131,7 @@ sub read_data_file {
                }
                
                $line =~ s/[\n]$//g;
+               $line =~ s/[\r]$//g;
                
                # Empty line - end of header.
                if ($line eq ''){
@@ -271,16 +272,16 @@ sub url_encode {
        }
 
        if ($all) {
-               $t =~ s/(.)/url_encode_1ch($1, $encoding)/eg;
+               $t =~ s/(.)/url_encode_1ch($1, $encoding)/egs;
        }
        else {
-               $t =~ s/([^0-9A-Za-z.~\-_])/url_encode_1ch($1, $encoding)/eg;
+               $t =~ s/([^0-9A-Za-z.~\-_])/url_encode_1ch($1, $encoding)/egs;
        }
        return $t;
 }
 
 # url_encode_1ch() escapes a single chatacter using URL-encoding.
-# $ch is the text to encode.
+# $ch is the chatacter to encode.
 # $encoding is the encoding to use
 sub url_encode_1ch {
        (my $ch, my $encoding) = @_;
@@ -289,7 +290,7 @@ sub url_encode_1ch {
                $encoding = "UTF-8";
        }
        $ch = encode($encoding, $ch);
-       $ch =~ s/(.)/sprintf('%%%02X',ord($1))/eg;
+       $ch =~ s/(.)/sprintf('%%%02X',ord($1))/egs;
        return $ch;
 }
 
@@ -303,11 +304,11 @@ sub url_decode {
        if ($encoding eq '') {
                $encoding = "utf8";
        }
-       $t =~ s/((%[0-9A-Fa-f]{2})+)/url_decode_xch($1)/eg;
+       $t =~ s/((%[0-9A-Fa-f]{2})+)/url_decode_xch($1)/egs;
        return $t;
 }
 
-# url_decode_xc() decodes a continuous string of characters escaped
+# url_decode_xch() decodes a continuous string of characters escaped
 # $xch is the text to decode - it is assumed without checking
 # that the text is indeed in this format. 
 sub url_decode_xch {
@@ -2585,57 +2586,86 @@ use constant HTML_ENTITY_CODE_INF => {
        'yuml'                           => [0x000FF]
 };
 
-
+# html_entity_encode_dec() escapes a text using HTML entity references
+# in decimal form.
+# only reserved characters will be escaped.
+# $t is the text to encode
+# if $all is true then every character will be escaped.
 sub html_entity_encode_dec {
        (my $t, my $all) = @_;
        
        if ($all) {
-               $t =~ s/(.)/html_entity_encode_1ch_dec($1)/eg;
+               $t =~ s/(.)/html_entity_encode_1ch_dec($1)/egs;
        }
        else {
-               $t =~ s/([\"\&<=>])/html_entity_encode_1ch_dec($1)/eg;
+               $t =~ s/([\"\&<=>])/html_entity_encode_1ch_dec($1)/egs;
        }
        return $t;
 }
 
+# html_entity_encode_hex() escapes a text using HTML entity references
+# in hexadecimal form.
+# only reserved characters will be escaped.
+# $t is the text to encode
+# if $all is true then every character will be escaped.
 sub html_entity_encode_hex {
        (my $t, my $all) = @_;
        
        if ($all) {
-               $t =~ s/(.)/html_entity_encode_1ch_hex($1)/eg;
+               $t =~ s/(.)/html_entity_encode_1ch_hex($1)/egs;
        }
        else {
-               $t =~ s/([\"\&<=>])/html_entity_encode_1ch_hex($1)/eg;
+               $t =~ s/([\"\&<=>])/html_entity_encode_1ch_hex($1)/egs;
        }
        return $t;
 }
 
+# html_entity_encode_name() escapes a text using HTML entity references
+# by name.
+# only reserved characters will be escaped.
+# $t is the text to encode
+# if $all is true then every character which has a name will be escaped,
+# and additionally if a character has more than 1 possible name, then
+# the name will be chosen unpredicatbly.
 sub html_entity_encode_name {
        (my $t, my $all) = @_;
        
        if ($all) {
-               $t =~ s/(.)/html_entity_encode_1ch_name($1,1)/eg;
+               $t =~ s/(.)/html_entity_encode_1ch_name($1,1)/egs;
        }
        else {
-               $t =~ s/([\"\&<=>])/html_entity_encode_1ch_name($1,0)/eg;
+               $t =~ s/([\"\&<=>])/html_entity_encode_1ch_name($1,0)/egs;
        }
        return $t;
 }
 
+# html_entity_encode_dec() escapes a single character
+# as an HTML entity reference in decimal form.
+# $ch is the chatacter to encode
 sub html_entity_encode_1ch_dec {
        (my $ch) = @_;
        
-       $ch =~ s/(.)/sprintf('&#%02u;',ord($1))/eg;
+       $ch =~ s/(.)/sprintf('&#%02u;',ord($1))/egs;
        return $ch;
 }
 
+# html_entity_encode_1ch_hex() escapes a single character
+# as an HTML entity reference in hexadecimal form.
+# $ch is the chatacter to encode
 sub html_entity_encode_1ch_hex {
        (my $ch) = @_;
        
-       $ch =~ s/(.)/sprintf('&#x%02X;',ord($1))/eg;
+       $ch =~ s/(.)/sprintf('&#x%02X;',ord($1))/egs;
        return $ch;
 }
 
+# html_entity_encode_1ch_name() escapes a single character
+# as an HTML entity reference by name.
+# only a reserved character will be escaped.
+# $ch is the chatacter to encode
+# if $all is true then any character which has a name will be escaped,
+# and additionally if the character has more than 1 possible name, then
+# the name will be chosen unpredicatbly.
 sub html_entity_encode_1ch_name {
        (my $ch, my $all) = @_;
        
@@ -2671,9 +2701,90 @@ sub html_entity_encode_1ch_name {
        }
 }
 
+# html_entity_decode() decodes a text escaped by HTML entity references
+# in any of the available forms.
+# invalid entity references will be left unchanged.
+# $t is the text to decode
+sub html_entity_decode {
+       (my $t) = @_;
+       
+       my $d = '';
+       
+       while ($t ne '') {
+               if ($t =~ /^([^\&]+)(\&.*)$/s) { # any normal text
+                       $d .= $1;
+                       $t = $2;
+               }
+               elsif ($t =~ /^(\&#?[A-Za-z0-9]+;)(.*)$/s) { # correct encoded character
+                       $d .= html_entity_decode_1en($1);
+                       $t = $2;
+               }
+               elsif ($t =~ /^(\&[A-Za-z0-9]+)(.*)$/s) { # encoded character without ";"
+                       $d .= html_entity_decode_1en($1);
+                       $t = $2;
+               }
+               elsif ($t =~ /^(\&)(.*)$/s) { # invalid "&"
+                       $d .= $1;
+                       $t = $2;
+               }
+               else {  # nothing left to decode
+                       $d .= $t;
+                       $t = '';
+               }
+       }
+       
+       return $d;
+}
 
-# html_entity_encode_name
-# html_entity_decode
+# html_entity_decode_1en() decodes a single HTML entity reference
+# in any of the available forms.
+# an invalid entity reference will be left unchanged.
+# $en is the text to decode - it is assumed without checking
+# that the text is indeed in this format.
+sub html_entity_decode_1en {
+       (my $en) = @_;
+       my $y = '';
+       
+       if ($en !~ /;$/) { # name without ";"
+               my $n = substr($en, 1);
+               # we HAVE TO iterate :/
+               foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) {
+                       if (index($n, $name) == 0) { # name (beginning of entire string) is valid:
+                               # decode
+                               my @code = @{HTML_ENTITY_CODE_INF->{$name}};
+                               foreach my $c (@code) {
+                                       $y .= chr($c);
+                               }
+                               # rest of the string, insert literally
+                               $y .= substr($n, length($name));
+                               return $y;
+                       }
+               }
+               # invalid name, insert literally
+               return $en;
+       }
+       elsif ($en =~ /^\&#([0-9]+);$/) { # decimal
+               return chr($1);
+       }
+       elsif ($en =~ /^\&#x([0-9A-Fa-f]+);$/) { # hexadecimal
+               return chr(hex($1));
+       }
+       else { # name
+               my $name = substr($en, 1, length($en) - 2);
+               if (defined HTML_ENTITY_CODE->{$name}) {
+                       # valid name, decode
+                       my @code = @{HTML_ENTITY_CODE->{$name}};
+                       foreach my $c (@code) {
+                               $y .= chr($c);
+                       }
+                       return $y;
+               }
+               else {
+                       # invalid name, insert literally
+                       return $en;
+               }
+       }
+}