From: b Date: Mon, 14 Dec 2015 21:22:17 +0000 (+0000) Subject: state machine improved, X-Git-Url: http://bicyclesonthemoon.info/git-projects/?a=commitdiff_plain;h=8041f3c76a80c9d7f74d279605f9ac75bce6493e;p=yplom%2Ffacebug1 state machine improved, state 'thread-attachment' merged with 'thread', now also processing pages with thread list (firstposts). git-svn-id: svn://botcastle1b/yplom/facebug1@7 7dec801f-c475-4e67-ba99-809552d69c55 --- diff --git a/bot.1.pl b/bot.1.pl index 61b3a77..82f50ae 100644 --- a/bot.1.pl +++ b/bot.1.pl @@ -4,7 +4,7 @@ use strict; use Fcntl; use File::Copy; ###PROXY_LIB; -use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl readconfigfile entitydecode urldecode); +use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl joinurl readconfigfile entitydecode urldecode); use POSIX qw(strftime); ###ARCH_PATH; @@ -120,6 +120,8 @@ sub processfile { my %post; my %post2; + my $pagetype; + if ($headerpath =~ /^((.+)\@h)$/) { $headerpath = $1; $basepath = $2; @@ -131,12 +133,21 @@ sub processfile { ($prot, $host, $port, $path, $query) = path2urldiv($basepath); + print 'Page '.joinurl($prot, $host, $port, $path, $query)."\n"; + ### REDESIGN THE CONDITIONS! if($query ne '') { %cgi=getcgi($query); $id = $cgi{'id'}; + if ($id =~ /^[0-9]+$/) { + $pagetype='thread'; + } + else { + $pagetype = 'group'; + } } else { $id=''; + $pagetype = 'group'; } for (my $ind=0; $ind)) { @@ -219,33 +241,89 @@ sub processfile { return; } $text =~ s/>$//; - # print "tag: $text\n"; + # # DEBUG: + # if($pagetype eq 'thread'){ + # print ">>$mode: <$text>\n"; + # } %tag = taginfo($text); } local $/ = "\n"; - if ($mode eq 'thread'){ + if ($mode eq 'threads'){ + if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/)) { + print "Thread [$1]\n"; + $mode = 'thread'; + %thread = (); + $thread{'groupid'}=$groupid; + $thread{'timenumber2'}=$timenumber; + $level = 0; + $attnumber=0; + $incomplete=0; + } + } + + elsif ($mode eq 'thread'){ + # print "+++$text+++\n"; if ($tag{'<'} eq 'h3') { $mode = 'thread-author'; } - elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bj|bk|bm)$/)) { # These are very helpful names, facebug, thank you! - $mode='thread-content'; - $level=0; - $ignoretext=1; - $hidename=0; - $link=0; + elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) { + # These are very 'helpful' class names, facebug, thank you! + # After recent changes do I still need the unreliable class name? + # Is post content always in first
after author with a 2 letter class name? + # Let's test: + if (($tag{'class'} =~ /^[a-z]{2}$/) and (!defined($thread{'postcontent'})) and (defined($thread{'author'}))) { + # if (($tag{'class'} =~ /^(bj|bk|bm)|(db|da)$/) and (!defined($thread{'postcontent'}))) { + $mode='thread-content'; + $level2=0; + $ignoretext=1; + $hidename=0; + $link=0; + } + # elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bn|bl)|(dc|db)$/)) { ### NAMES NOT RELIABLE! HAVE TO IMPROVE SERIOUSLY! + # $mode='thread-attachment'; + # $level2=0; + # $attnumber=0; + # } + else { + ++$level; + } + } + elsif (($tag{'<'} eq '/div') and $level) { + --$level; } + elsif ($tag{'<'} eq 'abbr') { $mode = 'thread-time'; } - elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bn|bl)$/)) { - $mode='thread-attachment'; - $level=0; - $attnumber=0; + + elsif ($tag{'<'} eq 'a') { + if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { + ++$attnumber; + $thread{'img-'.$attnumber}='a_'.$2; + $mode = 'thread-attachment-img'; + } + elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { + ++$attnumber; + $thread{'link-'.$attnumber}=urldecode($3); + $mode = 'thread-attachment-link'; + } + elsif ($tag{'href'} =~ /^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) { + if ($thread{'id'} eq '') { + $thread{'id'} = $2; + print "Thread $thread{'id'}\n"; + } + $mode = 'thread-replies'; + } } - elsif (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/)){ - $mode='posts'; + elsif ((($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/))or(($tag{'<'} eq '/div') and ($level ==0))) { + if ($pagetype eq 'thread') { + $mode='posts'; + } + else { + $mode='threads'; + } my $threadfile; my $threadpath = ARCH_PATH.$$settings{'id'}.'/'; @@ -267,10 +345,25 @@ sub processfile { if (flock ($threadfile, 2)) { %thread2 = readdatafile($threadfile); - if (($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'})) { + if ((($pagetype eq 'thread')and($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'}))or(($pagetype ne 'thread')and($thread2{'timenumber2'} ne '')and($thread2{'timenumber2'}>$thread{'timenumber2'}))) { print ("Newer version already saved.\n\n"); } else { + if($pagetype ne 'thread'){ + if(($thread2{'timenumber'} ne '')and($thread2{'timenumber'} > $thread{'timenumber2'})) { + print ("Newer version of post content already saved.\n"); + delete $thread{'postcontent'}; + } + elsif($incomplete) { + print ("Post content incomplete.\n"); + if(defined($thread2{'postcontent'})){ + delete $thread{'postcontent'}; + } + } + else { + $thread{'timenumber'}=$thread{'timenumber2'}; + } + } foreach my $ind (keys %thread2) { if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) { delete $thread2{$ind}; @@ -313,6 +406,9 @@ sub processfile { if ($tag{'<'} eq 'a') { if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) { my $author = $1; + if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { + $author = urldecode($2); + } if ($thread{'author'} eq '') { $thread{'author'} = $author; $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'}; @@ -326,18 +422,26 @@ sub processfile { elsif ($mode eq 'thread-content') { if ($tag{'<'} eq 'div') { # There should not be any sub
s! - ++$level; - $thread{'postcontent'}.='
'; + ++$level2; + # $thread{'postcontent'}.='
'; + $ignoretext=1; } elsif ($tag{'<'} eq '/div') { - if($level){ - --$level; - $thread{'postcontent'}.='
'; + if($level2){ + --$level2; + # $thread{'postcontent'}.='
'; + unless($level2) { + $ignoretext=0; + } } else { $mode = 'thread'; } } + elsif ($tag{'<'} eq 'br') { + $thread{'postcontent'}.='
'; + $ignoretext=0; + } elsif ($tag{'<'} eq 'p') { $thread{'postcontent'}.='

'; $ignoretext=0; @@ -353,7 +457,11 @@ sub processfile { $link=1; } elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) { - $thread{'postcontent'}.=''.(($$names{$1} ne '')?$$names{$1}:$$names{'default'}); + my $person=$1; + if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { + $person = urldecode($2); + } + $thread{'postcontent'}.=''.(($$names{$person} ne '')?$$names{$person}:$$names{'default'}); $link=1; $hidename=1; } @@ -369,7 +477,12 @@ sub processfile { # $thread{'postcontent'}.=''; } } - + elsif(($tag{'<'} eq 'a') and ($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/)) { + unless($incomplete) { + $thread{'postcontent'}.='

Post not completely archived.

'; + } + $incomplete=1; + } } elsif ($mode eq 'thread-time') { @@ -378,31 +491,32 @@ sub processfile { } } - elsif ($mode eq 'thread-attachment') { - if ($tag{'<'} eq 'div') { - ++$level; - } - elsif ($tag{'<'} eq '/div') { - if($level){ - --$level; - } - else { - $mode = 'thread'; - } - } - elsif ($tag{'<'} eq 'a') { - if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { - ++$attnumber; - $thread{'img-'.$attnumber}='a_'.$2; - $mode = 'thread-attachment-img'; - } - elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { - ++$attnumber; - $thread{'link-'.$attnumber}=urldecode($3); - $mode = 'thread-attachment-link'; - } - } - } + + # elsif ($mode eq 'thread-attachment') { + # if ($tag{'<'} eq 'div') { + # ++$level2; + # } + # elsif ($tag{'<'} eq '/div') { + # if($level2){ + # --$level2; + # } + # else { + # $mode = 'thread'; + # } + # } + # elsif ($tag{'<'} eq 'a') { + # if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { + # ++$attnumber; + # $thread{'img-'.$attnumber}='a_'.$2; + # $mode = 'thread-attachment-img'; + # } + # elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { + # ++$attnumber; + # $thread{'link-'.$attnumber}=urldecode($3); + # $mode = 'thread-attachment-link'; + # } + # } + # } elsif ($mode eq 'thread-attachment-img') { if ($tag{'<'} eq 'img') { @@ -416,7 +530,7 @@ sub processfile { } } elsif ($tag{'<'} eq '/a') { - $mode = 'thread-attachment'; + $mode = 'thread'; } } @@ -441,7 +555,7 @@ sub processfile { } } elsif ($tag{'<'} eq '/a') { - $mode = 'thread-attachment'; + $mode = 'thread'; } } @@ -451,6 +565,12 @@ sub processfile { } } + elsif ($mode eq 'thread-replies') { + if ($tag{'<'} eq '/a') { + $mode = 'thread'; + } + } + elsif ($mode eq 'posts') { if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) { @@ -476,7 +596,7 @@ sub processfile { $mode = 'post-time'; } if ($tag{'<'} eq 'div') { - if(($tag{'class'} eq '')and($post{'content'} eq '')) { + if(($tag{'class'} eq '')and(!defined($post{'content'}))) { $mode = 'post-content'; $level2=0; $ignoretext=0; @@ -593,6 +713,9 @@ sub processfile { if ($tag{'<'} eq 'a') { if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) { my $author = $1; + if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { + $author = urldecode($2); + } if ($post{'author'} eq '') { $post{'author'} = $author; $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'}; @@ -607,12 +730,16 @@ sub processfile { elsif ($mode eq 'post-content') { if ($tag{'<'} eq 'div') { # There should not be any sub
s! ++$level2; - $post{'content'}.='
'; + # $post{'content'}.='
'; + $ignoretext=1; } elsif ($tag{'<'} eq '/div') { if($level2){ --$level2; - $post{'content'}.='
'; + # $post{'content'}.='
'; + unless($level2){ + $ignoretext=0; + } } else { $mode = 'post'; @@ -651,7 +778,11 @@ sub processfile { $link=1; } elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) { - $post{'content'}.='
'.(($$names{$1} ne '')?$$names{$1}:$$names{'default'}); + my $person = $1; + if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { + $person = urldecode($2); + } + $post{'content'}.=''.(($$names{$person} ne '')?$$names{$person}:$$names{'default'}); $link=1; $hidename=1; } @@ -664,7 +795,7 @@ sub processfile { } } else { - $post{'content'}.=''; + # $post{'content'}.=''; } } @@ -696,6 +827,11 @@ sub processfile { local $/ = "\n"; $text =~ s/<$//; + # # DEBUG + # if ($pagetype eq 'thread') { + # print ">>$mode: $text\n"; + # } + if($mode eq 'thread-content') { unless ($ignoretext or $hidename){ $thread{'postcontent'}.=$text; @@ -710,6 +846,11 @@ sub processfile { elsif ($mode eq 'thread-attachment-link') { $thread{'linktext-'.$attnumber}.=$text; } + elsif ($mode eq 'thread-replies') { + if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+comments?/) { + $thread{'replies'} = $1; + } + } if($mode eq 'post-content') { unless ($ignoretext or $hidename){ @@ -720,7 +861,7 @@ sub processfile { $post{'timetext'}.=$text; } elsif ($mode eq 'post-replies') { - if($text =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) { + if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) { $post{'replies'} = $1; } }