bicyclesonthemoon.info Git - yplom/facebug1/commitdiff
state machine improved,
author: b <b@7dec801f-c475-4e67-ba99-809552d69c55>
Mon, 14 Dec 2015 21:22:17 +0000 (21:22 +0000)
committer: b <b@7dec801f-c475-4e67-ba99-809552d69c55>
Mon, 14 Dec 2015 21:22:17 +0000 (21:22 +0000)
state 'thread-attachment' merged with 'thread',
now also processing pages with thread list (firstposts).

git-svn-id: svn://botcastle1b/yplom/facebug1@7 7dec801f-c475-4e67-ba99-809552d69c55

bot.1.pl

index 61b3a77388cdbd6addb2350f33cbc1df2fa0783b..82f50ae5a6cfee998f3e1e76c4c0812faca4bacd 100644 (file)
--- a/bot.1.pl
+++ b/bot.1.pl
@@ -4,7 +4,7 @@ use strict;
 use Fcntl;
 use File::Copy;
 ###PROXY_LIB;
-use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl readconfigfile entitydecode urldecode);
+use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl joinurl readconfigfile entitydecode urldecode);
 use POSIX qw(strftime);
 
 ###ARCH_PATH;
@@ -120,6 +120,8 @@ sub processfile {
        my %post;
        my %post2;
        
+       my $pagetype;
+       
        if ($headerpath =~ /^((.+)\@h)$/) {
                $headerpath = $1;
                $basepath = $2;
@@ -131,12 +133,21 @@ sub processfile {
        
        ($prot, $host, $port, $path, $query) = path2urldiv($basepath);
        
+       print 'Page '.joinurl($prot, $host, $port, $path, $query)."\n";
+       ### REDESIGN THE CONDITIONS!
        if($query ne '') {
                %cgi=getcgi($query);
                $id = $cgi{'id'};
+               if ($id =~ /^[0-9]+$/) {
+                       $pagetype='thread';
+               }
+               else {
+                       $pagetype = 'group';
+               }
        }
        else {
                $id='';
+               $pagetype = 'group';
        }
        
        for (my $ind=0; $ind<MAX_REDIRECTIONS; ++$ind) {
@@ -172,33 +183,44 @@ sub processfile {
                }
        }
        
-       if ($id =~ /^[0-9]+$/) {
-               print "Thread $id\n";
-               
-               $thread{'id'}=$id;
-               $thread{'groupid'}=$groupid;
-               $thread{'timenumber'}=$timenumber;
-               
-               my $line;
-               
-               my %postdata;
+       # REDESIGN THE CONDITIONS!
+       if ($pagetype) {
                
                
                unless (open ($contentfile, "<",$contentpath)) {
-                       print "Can't open file";
+                       print "Can't open $contentpath.\n";
                        return;
                }
                
                my $text;
                my %tag;
-               my $mode = 'thread';
-               my $level = 0;
+               my $mode;
+               my $level;
                my $level2;
                my $closetag=0;
                my $ignoretext;
                my $link;
                my $hidename;
                my $attnumber;
+               my $incomplete;
+               
+               if ($pagetype eq 'thread') {
+                       print "Thread $id\n";
+                       
+                       $thread{'id'}=$id;
+                       $thread{'groupid'}=$groupid;
+                       $thread{'timenumber'}=$timenumber;
+                       $mode = 'thread';
+                       $level=0;
+                       $attnumber=0;
+                       $incomplete=0;
+               }
+               else { #group
+                       print "Threads\n";
+                       $mode = 'threads';
+               }
+               
+               my $line;
                
                local $/ = '<';
                unless (defined ($text = <$contentfile>)) {
@@ -219,33 +241,89 @@ sub processfile {
                                        return;
                                }
                                $text =~ s/>$//;
-                               # print "tag: $text\n";
+                               # # DEBUG:
+                               # if($pagetype eq 'thread'){
+                                       # print ">>$mode: <$text>\n";
+                               # }
                                %tag = taginfo($text);
                        }
                        local $/ = "\n";
                        
-                       if ($mode eq 'thread'){
+                       if ($mode eq 'threads'){
+                               if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/)) {
+                                       print "Thread [$1]\n";
+                                       $mode = 'thread';
+                                       %thread = ();
+                                       $thread{'groupid'}=$groupid;
+                                       $thread{'timenumber2'}=$timenumber;
+                                       $level = 0;
+                                       $attnumber=0;
+                                       $incomplete=0;
+                               }
+                       }
+                       
+                       elsif ($mode eq 'thread'){
+                               # print "+++$text+++\n";
                                if ($tag{'<'} eq 'h3') {
                                        $mode = 'thread-author';
                                }
-                               elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bj|bk|bm)$/)) { # These are very helpful names, facebug, thank you!
-                                       $mode='thread-content';
-                                       $level=0;
-                                       $ignoretext=1;
-                                       $hidename=0;
-                                       $link=0;
+                               elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) {
+                                       # These are very 'helpful' class names, facebug, thank you!
+                                       # After recent changes do I still need the unreliable class name?
+                                       # Is post content always in first <div> after author with a 2 letter class name?
+                                       # Let's test:
+                                       if (($tag{'class'} =~ /^[a-z]{2}$/) and (!defined($thread{'postcontent'})) and (defined($thread{'author'}))) {
+                                       # if (($tag{'class'} =~ /^(bj|bk|bm)|(db|da)$/) and (!defined($thread{'postcontent'}))) {
+                                               $mode='thread-content';
+                                               $level2=0;
+                                               $ignoretext=1;
+                                               $hidename=0;
+                                               $link=0;
+                                       }
+                                       # elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bn|bl)|(dc|db)$/)) { ### NAMES NOT RELIABLE! HAVE TO IMPROVE SERIOUSLY!
+                                               # $mode='thread-attachment';
+                                               # $level2=0;
+                                               # $attnumber=0;
+                                       # }
+                                       else {
+                                               ++$level;
+                                       }
+                               }
+                               elsif (($tag{'<'} eq '/div') and $level) {
+                                       --$level;
                                }
+                               
                                elsif ($tag{'<'} eq 'abbr') {
                                        $mode = 'thread-time';
                                }
-                               elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bn|bl)$/)) {
-                                       $mode='thread-attachment';
-                                       $level=0;
-                                       $attnumber=0;
+                               
+                               elsif ($tag{'<'} eq 'a') {
+                                       if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+                                               ++$attnumber;
+                                               $thread{'img-'.$attnumber}='a_'.$2;
+                                               $mode = 'thread-attachment-img';
+                                       }
+                                       elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+                                               ++$attnumber;
+                                               $thread{'link-'.$attnumber}=urldecode($3);
+                                               $mode = 'thread-attachment-link';
+                                       }
+                                       elsif ($tag{'href'} =~ /^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                               if ($thread{'id'} eq '') {
+                                                       $thread{'id'} = $2;
+                                                       print "Thread $thread{'id'}\n";
+                                               }
+                                               $mode = 'thread-replies';
+                                       }
                                }
                                
-                               elsif (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/)){
-                                       $mode='posts';
+                               elsif ((($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/))or(($tag{'<'} eq '/div') and ($level ==0))) {
+                                       if ($pagetype eq 'thread') {
+                                               $mode='posts';
+                                       }
+                                       else {
+                                               $mode='threads';
+                                       }
                                        
                                        my $threadfile;
                                        my $threadpath = ARCH_PATH.$$settings{'id'}.'/';
@@ -267,10 +345,25 @@ sub processfile {
                                                if (flock ($threadfile, 2)) {
                                                        %thread2 = readdatafile($threadfile);
                                                        
-                                                       if (($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'})) {
+                                                       if ((($pagetype eq 'thread')and($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'}))or(($pagetype ne 'thread')and($thread2{'timenumber2'} ne '')and($thread2{'timenumber2'}>$thread{'timenumber2'}))) {
                                                                print ("Newer version already saved.\n\n");
                                                        }
                                                        else {
+                                                               if($pagetype ne 'thread'){
+                                                                       if(($thread2{'timenumber'} ne '')and($thread2{'timenumber'} > $thread{'timenumber2'})) {
+                                                                               print ("Newer version of post content already saved.\n");
+                                                                               delete $thread{'postcontent'};
+                                                                       }
+                                                                       elsif($incomplete) {
+                                                                               print ("Post content incomplete.\n");
+                                                                               if(defined($thread2{'postcontent'})){
+                                                                                       delete $thread{'postcontent'};
+                                                                               }
+                                                                       }
+                                                                       else {
+                                                                               $thread{'timenumber'}=$thread{'timenumber2'};
+                                                                       }
+                                                               }
                                                                foreach my $ind (keys %thread2) {
                                                                        if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) {
                                                                                delete $thread2{$ind};
@@ -313,6 +406,9 @@ sub processfile {
                                if ($tag{'<'} eq 'a') {
                                        if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
                                                my $author = $1;
+                                               if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                                       $author = urldecode($2);
+                                               }
                                                if ($thread{'author'} eq '') {
                                                        $thread{'author'} = $author;
                                                        $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
@@ -326,18 +422,26 @@ sub processfile {
                        
                        elsif ($mode eq 'thread-content') {
                                if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
-                                       ++$level;
-                                       $thread{'postcontent'}.='<div>';
+                                       ++$level2;
+                                       # $thread{'postcontent'}.='<div>';
+                                       $ignoretext=1;
                                }
                                elsif ($tag{'<'} eq '/div') {
-                                       if($level){
-                                               --$level;
-                                               $thread{'postcontent'}.='</div>';
+                                       if($level2){
+                                               --$level2;
+                                               # $thread{'postcontent'}.='</div>';
+                                               unless($level2) {
+                                                       $ignoretext=0;
+                                               }
                                        }
                                        else {
                                                $mode = 'thread';
                                        }
                                }
+                               elsif ($tag{'<'} eq 'br') {
+                                       $thread{'postcontent'}.='<br>';
+                                       $ignoretext=0;
+                               }
                                elsif ($tag{'<'} eq 'p') {
                                        $thread{'postcontent'}.='<p>';
                                        $ignoretext=0;
@@ -353,7 +457,11 @@ sub processfile {
                                                        $link=1;
                                                }
                                                elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
-                                                       $thread{'postcontent'}.='<a href="#">'.(($$names{$1} ne '')?$$names{$1}:$$names{'default'});
+                                                       my $person=$1;
+                                                       if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                                               $person = urldecode($2);
+                                                       }
+                                                       $thread{'postcontent'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
                                                        $link=1;
                                                        $hidename=1;
                                                }
@@ -369,7 +477,12 @@ sub processfile {
                                        # $thread{'postcontent'}.='<!'.$tag{'<'}.'!>';
                                        }
                                }
-                               
+                               elsif(($tag{'<'} eq 'a') and ($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/)) {
+                                       unless($incomplete) {
+                                               $thread{'postcontent'}.='<p><b>Post not completely archived.</b></p>';
+                                       }
+                                       $incomplete=1;
+                               }
                        }
                        
                        elsif ($mode eq 'thread-time') {
@@ -378,31 +491,32 @@ sub processfile {
                                }
                        }
                        
-                       elsif ($mode eq 'thread-attachment') {
-                               if ($tag{'<'} eq 'div') {
-                                       ++$level;
-                               }
-                               elsif ($tag{'<'} eq '/div') {
-                                       if($level){
-                                               --$level;
-                                       }
-                                       else {
-                                               $mode = 'thread';
-                                       }
-                               }
-                               elsif ($tag{'<'} eq 'a') {
-                                       if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
-                                               ++$attnumber;
-                                               $thread{'img-'.$attnumber}='a_'.$2;
-                                               $mode = 'thread-attachment-img';
-                                       }
-                                       elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
-                                               ++$attnumber;
-                                               $thread{'link-'.$attnumber}=urldecode($3);
-                                               $mode = 'thread-attachment-link';
-                                       }
-                               }
-                       }
+                       
+                       # elsif ($mode eq 'thread-attachment') {
+                               # if ($tag{'<'} eq 'div') {
+                                       # ++$level2;
+                               # }
+                               # elsif ($tag{'<'} eq '/div') {
+                                       # if($level2){
+                                               # --$level2;
+                                       # }
+                                       # else {
+                                               # $mode = 'thread';
+                                       # }
+                               # }
+                               # elsif ($tag{'<'} eq 'a') {
+                                       # if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+                                               # ++$attnumber;
+                                               # $thread{'img-'.$attnumber}='a_'.$2;
+                                               # $mode = 'thread-attachment-img';
+                                       # }
+                                       # elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+                                               # ++$attnumber;
+                                               # $thread{'link-'.$attnumber}=urldecode($3);
+                                               # $mode = 'thread-attachment-link';
+                                       # }
+                               # }
+                       # }
                        
                        elsif ($mode eq 'thread-attachment-img') {
                                if ($tag{'<'} eq 'img') {
@@ -416,7 +530,7 @@ sub processfile {
                                        }
                                }
                                elsif ($tag{'<'} eq '/a') {
-                                       $mode = 'thread-attachment';
+                                       $mode = 'thread';
                                }
                        }
                        
@@ -441,7 +555,7 @@ sub processfile {
                                        }
                                }
                                elsif ($tag{'<'} eq '/a') {
-                                       $mode = 'thread-attachment';
+                                       $mode = 'thread';
                                }
                        }
                        
@@ -451,6 +565,12 @@ sub processfile {
                                }
                        }
                        
+                       elsif ($mode eq 'thread-replies') {
+                               if ($tag{'<'} eq '/a') {
+                                       $mode = 'thread';
+                               }
+                       }
+                       
                        
                        elsif ($mode eq 'posts') {
                                if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) {
@@ -476,7 +596,7 @@ sub processfile {
                                        $mode = 'post-time';
                                }
                                if ($tag{'<'} eq 'div') {
-                                       if(($tag{'class'} eq '')and($post{'content'} eq '')) {
+                                       if(($tag{'class'} eq '')and(!defined($post{'content'}))) {
                                                $mode = 'post-content';
                                                $level2=0;
                                                $ignoretext=0;
@@ -593,6 +713,9 @@ sub processfile {
                                if ($tag{'<'} eq 'a') {
                                        if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
                                                my $author = $1;
+                                               if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                                       $author = urldecode($2);
+                                               }
                                                if ($post{'author'} eq '') {
                                                        $post{'author'} = $author;
                                                        $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
@@ -607,12 +730,16 @@ sub processfile {
                        elsif ($mode eq 'post-content') {
                                if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
                                        ++$level2;
-                                       $post{'content'}.='<div>';
+                                       # $post{'content'}.='<div>';
+                                       $ignoretext=1;
                                }
                                elsif ($tag{'<'} eq '/div') {
                                        if($level2){
                                                --$level2;
-                                               $post{'content'}.='</div>';
+                                               # $post{'content'}.='</div>';
+                                               unless($level2){
+                                                       $ignoretext=0;
+                                               }
                                        }
                                        else {
                                                $mode = 'post';
@@ -651,7 +778,11 @@ sub processfile {
                                                        $link=1;
                                                }
                                                elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
-                                                       $post{'content'}.='<a href="#">'.(($$names{$1} ne '')?$$names{$1}:$$names{'default'});
+                                                       my $person = $1;
+                                                       if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                                               $person = urldecode($2);
+                                                       }
+                                                       $post{'content'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
                                                        $link=1;
                                                        $hidename=1;
                                                }
@@ -664,7 +795,7 @@ sub processfile {
                                                }
                                        }
                                        else {
-                                       $post{'content'}.='<!'.$tag{'<'}.'!>';
+                                       $post{'content'}.='<!'.$tag{'<'}.'!>';
                                        }
                                }
                                
@@ -696,6 +827,11 @@ sub processfile {
                        local $/ = "\n";
                        $text =~ s/<$//;
                        
+                       # # DEBUG
+                       # if ($pagetype eq 'thread') {
+                               # print ">>$mode: $text\n";
+                       # }
+                       
                        if($mode eq 'thread-content') {
                                unless ($ignoretext or $hidename){
                                        $thread{'postcontent'}.=$text;
@@ -710,6 +846,11 @@ sub processfile {
                        elsif ($mode eq 'thread-attachment-link') {
                                $thread{'linktext-'.$attnumber}.=$text;
                        }
+                       elsif ($mode eq 'thread-replies') {
+                               if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+comments?/) {
+                                       $thread{'replies'} = $1;
+                               }
+                       }
                        
                        if($mode eq 'post-content') {
                                unless ($ignoretext or $hidename){
@@ -720,7 +861,7 @@ sub processfile {
                                $post{'timetext'}.=$text;
                        }
                        elsif ($mode eq 'post-replies') {
-                               if($text =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
+                               if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
                                        $post{'replies'} = $1;
                                }
                        }