From 41916d233bacd00c4c52d75714205feea5f068b0 Mon Sep 17 00:00:00 2001 From: b Date: Thu, 3 Dec 2015 21:48:27 +0000 Subject: [PATCH] Some more html processing git-svn-id: svn://botcastle1b/yplom/facebug1@3 7dec801f-c475-4e67-ba99-809552d69c55 --- bot.1.pl | 63 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/bot.1.pl b/bot.1.pl index 10ec131..3d00706 100644 --- a/bot.1.pl +++ b/bot.1.pl @@ -3,7 +3,7 @@ use strict; ###PROXY_LIB; -use proxy_lib qw(url2path path2urldiv getcgi divideurl readconfigfile entitydecode); +use proxy_lib qw(url2path path2urldiv getcgi divideurl readconfigfile entitydecode urldecode); use POSIX qw(strftime); ###ARCH_PATH; @@ -186,6 +186,9 @@ sub processfile { my $mode = 'thread'; my $level = 0; my $closetag=0; + my $ignoretext; + my $link; + my $hidename; local $/ = '<'; unless (defined ($text = <$contentfile>)) { @@ -217,11 +220,14 @@ sub processfile { elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bj|bk)$/)) { $mode='thread-content'; $level=0; + $ignoretext=1; + $hidename=0; + } + elsif ($tag{'<'} eq 'abbr') { + $mode = 'thread-time'; } - elsif (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/)){ - print "$tag{'id'} - $id\n"; $mode='posts'; #!!! { @@ -251,25 +257,57 @@ sub processfile { elsif ($mode eq 'thread-content') { if ($tag{'<'} eq 'div') { # There should not be any sub
s! ++$level; - $thread{'postcontent'}.="
\n
\n"; + $thread{'postcontent'}.='
'; } elsif ($tag{'<'} eq '/div') { if($level){ --$level; - $thread{'postcontent'}.="
\n
\n"; + $thread{'postcontent'}.='
'; } else { $mode = 'thread'; } } + elsif ($tag{'<'} eq 'p') { + $thread{'postcontent'}.='

'; + $ignoretext=0; + } elsif ($tag{'<'} eq '/p') { - $thread{'postcontent'}.="
\n"; + $thread{'postcontent'}.='

'; + $ignoretext=1; + } + elsif (!$ignoretext) { + if ($tag{'<'} eq 'a') { + if ($tag{'href'} =~ /^https?:\/\/([a-z0-9]+\.)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { + $thread{'postcontent'}.=''; + $link=1; + } + elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) { + $thread{'postcontent'}.=''.(($$names{$1} ne '')?$$names{$1}:$$names{'default'}); + $link=1; + $hidename=1; + } + } + elsif ($tag{'<'} eq '/a') { + if($link) { + $thread{'postcontent'}.=''; + $link=0; + $hidename=0; + } + } + # else { + # $thread{'postcontent'}.='<<<'.$tag{'<'}.'>>>'; + # } + } + + } + elsif ($mode eq 'thread-time') { + if ($tag{'<'} eq '/abbr') { + $mode = 'thread'; } - # else { - # $thread{'postcontent'}.='<'.$tag{'<'}.'>'; - # } } + if ($tag{"\\"} ne '') { $closetag = 1; next; @@ -283,7 +321,12 @@ sub processfile { $text =~ s/<$//; if($mode eq 'thread-content') { - $thread{'postcontent'}.=$text; + unless ($ignoretext or $hidename){ + $thread{'postcontent'}.=$text; + } + } + elsif ($mode eq 'thread-time') { + $thread{'timetext'}.=$text; } # print "text: $text\n"; } -- 2.30.2