]> bicyclesonthemoon.info Git - yplom/facebug1/commitdiff
Saving nextposts and images.
authorb <b@7dec801f-c475-4e67-ba99-809552d69c55>
Wed, 9 Dec 2015 11:47:32 +0000 (11:47 +0000)
committerb <b@7dec801f-c475-4e67-ba99-809552d69c55>
Wed, 9 Dec 2015 11:47:32 +0000 (11:47 +0000)
git-svn-id: svn://botcastle1b/yplom/facebug1@6 7dec801f-c475-4e67-ba99-809552d69c55

bot.1.pl

index eb9f75c2186a38f481ae8b8f0aad8f33bd8b1608..61b3a77388cdbd6addb2350f33cbc1df2fa0783b 100644 (file)
--- a/bot.1.pl
+++ b/bot.1.pl
@@ -117,6 +117,9 @@ sub processfile {
        my %thread;
        my %thread2;
        
+       my %post;
+       my %post2;
+       
        if ($headerpath =~ /^((.+)\@h)$/) {
                $headerpath = $1;
                $basepath = $2;
@@ -190,6 +193,7 @@ sub processfile {
                my %tag;
                my $mode = 'thread';
                my $level = 0;
+               my $level2;
                my $closetag=0;
                my $ignoretext;
                my $link;
@@ -229,6 +233,7 @@ sub processfile {
                                        $level=0;
                                        $ignoretext=1;
                                        $hidename=0;
+                                       $link=0;
                                }
                                elsif ($tag{'<'} eq 'abbr') {
                                        $mode = 'thread-time';
@@ -243,18 +248,34 @@ sub processfile {
                                        $mode='posts';
                                        
                                        my $threadfile;
-                                       my $threadpath = ARCH_PATH.$$settings{'id'}.'/'.'thread/'.$thread{'id'};
+                                       my $threadpath = ARCH_PATH.$$settings{'id'}.'/';
+                                       unless (-d $threadpath) {
+                                               unless (mkdir $threadpath) {
+                                                       print "Can't mkdir $threadpath.\n";
+                                               }
+                                       }
+                                       $threadpath.='thread/';
+                                       unless (-d $threadpath) {
+                                               unless (mkdir $threadpath) {
+                                                       print "Can't mkdir $threadpath.\n";
+                                               }
+                                       }
+                                       $threadpath.=$thread{'id'};
                                        
                                        # if (open ($threadfile, "+<", $threadpath)) {
                                        if (sysopen ($threadfile, $threadpath, O_RDWR | O_CREAT)) {
                                                if (flock ($threadfile, 2)) {
-                                                       # In the future it might be necessary to use a different function.
-                                                       %thread2 = readheaderfile($threadfile);
+                                                       %thread2 = readdatafile($threadfile);
                                                        
                                                        if (($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'})) {
                                                                print ("Newer version already saved.\n\n");
                                                        }
                                                        else {
+                                                               foreach my $ind (keys %thread2) {
+                                                                       if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) {
+                                                                               delete $thread2{$ind};
+                                                                       }
+                                                               }
                                                                foreach my $ind (keys %thread) {
                                                                        $thread2{$ind}=$thread{$ind};
                                                                }
@@ -263,16 +284,16 @@ sub processfile {
                                                                }
                                                                
                                                                if (seek($threadfile, 0, 0)) {
+                                                                       writedatafile($threadfile,%thread2);
+                                                                       truncate ($threadfile , tell($threadfile));
+                                                                       
                                                                        foreach my $ind (keys %thread2) {
-                                                                               print $threadfile "$ind: $thread2{$ind}\n";
                                                                                print "$ind: $thread2{$ind}\n"; ####
                                                                        }
-                                                                       print $threadfile "\n";
-                                                                       truncate ($threadfile , tell($threadfile));
                                                                        print "saved.\n\n";
                                                                }
                                                                else {
-                                                                       print "Failed seek $threadfile.\n\n";
+                                                                       print "Can't seek $threadfile.\n\n";
                                                                }
                                                        }
                                                }
@@ -287,6 +308,7 @@ sub processfile {
                                        }
                                }
                        }
+                       
                        elsif ($mode eq 'thread-author') {
                                if ($tag{'<'} eq 'a') {
                                        if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
@@ -301,6 +323,7 @@ sub processfile {
                                        $mode='thread';
                                }
                        }
+                       
                        elsif ($mode eq 'thread-content') {
                                if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
                                        ++$level;
@@ -309,7 +332,7 @@ sub processfile {
                                elsif ($tag{'<'} eq '/div') {
                                        if($level){
                                                --$level;
-                                               $thread{'postcontent'}.='<div>';
+                                               $thread{'postcontent'}.='</div>';
                                        }
                                        else {
                                                $mode = 'thread';
@@ -348,6 +371,7 @@ sub processfile {
                                }
                                
                        }
+                       
                        elsif ($mode eq 'thread-time') {
                                if ($tag{'<'} eq '/abbr') {
                                        $mode = 'thread';
@@ -369,7 +393,7 @@ sub processfile {
                                elsif ($tag{'<'} eq 'a') {
                                        if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
                                                ++$attnumber;
-                                               $thread{'img-'.$attnumber}=$2;
+                                               $thread{'img-'.$attnumber}='a_'.$2;
                                                $mode = 'thread-attachment-img';
                                        }
                                        elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
@@ -387,7 +411,7 @@ sub processfile {
                                                $thread{'imgkey-'.$attnumber}=$imgkey;
                                        }
                                        else {
-                                               $thread{'img-'.$attnumber}=undef;
+                                               delete $thread{'img-'.$attnumber};
                                                --$attnumber;
                                        }
                                }
@@ -402,7 +426,7 @@ sub processfile {
                                }
                                elsif (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/safe_image\.php\?(.*&)?url=([^&]+)(&.*)?$/)) {
                                        my $imgurl = urldecode($3);
-                                       my $imgid='';
+                                       my $imgid='i_';
                                        $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
                                        
                                        while(length($imgurl)>240) {
@@ -426,12 +450,244 @@ sub processfile {
                                        $mode = 'thread-attachment-link';
                                }
                        }
+                       
+                       
+                       elsif ($mode eq 'posts') {
+                               if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) {
+                                       %post = ();
+                                       
+                                       $post{'id'} = $1;
+                                       $post{'threadid'} = $id;
+                                       $post{'groupid'} = $$settings{'id'};
+                                       $post{'timenumber'} = $timenumber;
+                                       
+                                       $mode = 'post';
+                                       $level=0;
+                                       $attnumber=0;
+                                       print "Post $post{'id'}\n";
+                               }
+                       }
+                       
+                       elsif ($mode eq 'post') {
+                               if ($tag {'<'} eq 'h3') {
+                                       $mode = 'post-author';
+                               }
+                               elsif ($tag{'<'} eq 'abbr') {
+                                       $mode = 'post-time';
+                               }
+                               if ($tag{'<'} eq 'div') {
+                                       if(($tag{'class'} eq '')and($post{'content'} eq '')) {
+                                               $mode = 'post-content';
+                                               $level2=0;
+                                               $ignoretext=0;
+                                               $hidename=0;
+                                               $link=0;
+                                       }
+                                       else {
+                                               ++$level;
+                                       }
+                               }
+                               elsif ($tag{'<'} eq '/div') {
+                                       if ($level) {
+                                               --$level;
+                                       }
+                                       else {
+                                               $mode = 'posts';
+                                               
+                                               my $postfile;
+                                               my $postpath = ARCH_PATH.$$settings{'id'}.'/';
+                                               unless (-d $postpath) {
+                                                       unless (mkdir $postpath) {
+                                                               print "Can't mkdir $postpath.\n";
+                                                       }
+                                               }
+                                               $postpath.='post/';
+                                               unless (-d $postpath) {
+                                                       unless (mkdir $postpath) {
+                                                               print "Can't mkdir $postpath.\n";
+                                                       }
+                                               }
+                                               $postpath.=$thread{'id'}.'/';
+                                               unless (-d $postpath) {
+                                                       unless (mkdir $postpath) {
+                                                               print "Can't mkdir $postpath.\n";
+                                                       }
+                                               }
+                                               $postpath.=$post{'id'};
+                                               
+                                               if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) {
+                                                       if (flock ($postfile, 2)) {
+                                                               %post2 = readdatafile($postfile);
+                                                               
+                                                               if (($post2{'timenumber'} ne '')and($post2{'timenumber'}>$post{'timenumber'})) {
+                                                                       print ("Newer version already saved.\n\n");
+                                                               }
+                                                               else {
+                                                                       foreach my $ind (keys %post2) {
+                                                                               if($ind =~ /^img(key)?-[0-9]+$/) {
+                                                                                       delete $post2{$ind};
+                                                                               }
+                                                                       }
+                                                                       foreach my $ind (keys %post) {
+                                                                               $post2{$ind}=$post{$ind};
+                                                                       }
+                                                                       if ($post2{'key'} eq '') {
+                                                                               $post2{'key'} = key(KEY_BITS);
+                                                                       }
+                                                                       if (seek($postfile, 0, 0)) {
+                                                                               writedatafile($postfile,%post2);
+                                                                               truncate ($postfile , tell($postfile));
+                                                               
+                                                                               foreach my $ind (keys %post2) {
+                                                                                       print "$ind: $post2{$ind}\n";
+                                                                               }
+                                                                               print "saved.\n\n";
+                                                                       }
+                                                                       else {
+                                                                               print "Can't seek $postfile.\n\n";
+                                                                       }
+                                                               }
+                                                       }
+                                                       else {
+                                                               print "Can't lock $postfile.\n\n";
+                                                       }
+                                                       close ($postfile);
+                                               }
+                                               else
+                                               {
+                                                       print "Can't open $postpath.\n\n";
+                                               }
+                                       }
+                               }
+                               
+                               elsif ($tag{'<'} eq 'a') {
+                                       if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+                                               ++$attnumber;
+                                               $post{'img-'.$attnumber}='a_'.$2;
+                                               $mode = 'post-img';
+                                       }
+                                       elsif ($tag{'href'} =~ /^\/comment\/replies\/?\?/) {
+                                               $mode = 'post-replies';
+                                       }
+                               }
+                       }
+                       
+                       elsif ($mode eq 'post-img') {
+                               if ($tag{'<'} eq 'img') {
+                                       my $imgkey = saveimg($tag{'src'},$post{'img-'.$attnumber},$$settings{'id'});
+                                       if ($imgkey ne '') {
+                                               $post{'imgkey-'.$attnumber}=$imgkey;
+                                       }
+                                       else {
+                                               delete $post{'img-'.$attnumber};
+                                               --$attnumber;
+                                       }
+                               }
+                               elsif ($tag{'<'} eq '/a') {
+                                       $mode = 'post';
+                               }
+                       }
+                       
+                       
+                       elsif ($mode eq 'post-author') {
+                               if ($tag{'<'} eq 'a') {
+                                       if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
+                                               my $author = $1;
+                                               if ($post{'author'} eq '') {
+                                                       $post{'author'} = $author;
+                                                       $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
+                                               }
+                                       }
+                               }
+                               elsif ($tag{'<'} eq '/h3') {
+                                       $mode='post';
+                               }
+                       }
+                       
+                       elsif ($mode eq 'post-content') {
+                               if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
+                                       ++$level2;
+                                       $post{'content'}.='<div>';
+                               }
+                               elsif ($tag{'<'} eq '/div') {
+                                       if($level2){
+                                               --$level2;
+                                               $post{'content'}.='</div>';
+                                       }
+                                       else {
+                                               $mode = 'post';
+                                       }
+                               }
+                               elsif ($tag{'<'} eq 'p') {
+                                       $post{'content'}.='<p>';
+                               }
+                               elsif ($tag{'<'} eq '/p') {
+                                       $post{'content'}.='</p>';
+                               }
+                               elsif (!$ignoretext) {
+                                       if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) {
+                                               my $imgurl = urldecode($2);
+                                               my $imgid = 'r_';
+                                               $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
+                                               
+                                               while(length($imgurl)>240) {
+                                                       $imgid.=substr($imgurl,0,120).'-/';
+                                                       $imgurl=substr($imgurl,120);
+                                               }
+                                               $imgid.=$imgurl;
+                                       
+                                               my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
+                                               if ($imgkey ne '') {
+                                                       $post{'content'}.='<img src="&img@i'.$imgid.'@k'.$imgkey.';" alt="'.$tag{'alt'}.'">';
+                                               }
+                                               else {
+                                                       $post{'content'}.='<img src="" alt="'.$tag{'alt'}.'">';
+                                               }
+                                               
+                                       }
+                                       elsif ($tag{'<'} eq 'a') {
+                                               if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+                                                       $post{'content'}.='<a href="'.urldecode($3).'">';
+                                                       $link=1;
+                                               }
+                                               elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
+                                                       $post{'content'}.='<a href="#">'.(($$names{$1} ne '')?$$names{$1}:$$names{'default'});
+                                                       $link=1;
+                                                       $hidename=1;
+                                               }
+                                       }
+                                       elsif ($tag{'<'} eq '/a') {
+                                               if($link) {
+                                                       $post{'content'}.='</a>';
+                                                       $link=0;
+                                                       $hidename=0;
+                                               }
+                                       }
+                                       else {
+                                       $post{'content'}.='<!'.$tag{'<'}.'!>';
+                                       }
+                               }
+                               
+                       }
+                       
+                       elsif ($mode eq 'post-time') {
+                               if ($tag{'<'} eq '/abbr') {
+                                       $mode = 'post';
+                               }
+                       }
+                       
+                       elsif ($mode eq 'post-replies') {
+                               if ($tag{'<'} eq '/a') {
+                                       $mode = 'post';
+                               }
+                       }
+                       
+                       
                        if ($tag{"\\"} ne '') {
                                $closetag = 1;
                                next;
                        }
                        
-                       
                        local $/ = '<';
                        unless (defined ($text = <$contentfile>)) {
                                close($contentfile);
@@ -455,18 +711,24 @@ sub processfile {
                                $thread{'linktext-'.$attnumber}.=$text;
                        }
                        
-                       # print "text: $text\n";
+                       if($mode eq 'post-content') {
+                               unless ($ignoretext or $hidename){
+                                       $post{'content'}.=$text;
+                               }
+                       }
+                       elsif ($mode eq 'post-time') {
+                               $post{'timetext'}.=$text;
+                       }
+                       elsif ($mode eq 'post-replies') {
+                               if($text =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
+                                       $post{'replies'} = $1;
+                               }
+                       }
+                       
                }
                
                
                
-               # while (defined ($line = <$contentfile>)) {
-                       # if ($firstline) {
-                               # $firstline = 0;
-                               # next;
-                       # }
-                       # # print "$line\n";
-               # }
                close ($contentfile);
                
        }
@@ -494,12 +756,12 @@ sub readheaderfile {
                my $headname='';
                my $headval='';
                
-               if($line =~ /^[ \t]+([^ \t].*)$/){
+               if($line =~ /^[ \t]+([^ \t](.*[^ \t])?)[ \t]*$/){
                        if($lastname ne '') {
                                $header{$lastname}.=$1;
                        }
                }
-               elsif ($line =~ /^([^:]*):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) {
+               elsif ($line =~ /^([^:]+):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) {
                        $headname = lc($1);
                        $headval = $2;
                        
@@ -520,6 +782,95 @@ sub readheaderfile {
        return %header;
 }
 
+# Very similar to header file reading.
+# Differences:
+#
+# After field name and colon there must be exactly one whitespace (space or
+# tab). Any other leading or trailing whitespace (but not the newline character
+# at the end of the line) is treated as part of the field value.
+#
+# When header field is split into multiple lines the next lines must start with
+# exactly one whitespace (tab or space) Any other leading or trailing whitespace
+# (but not the newline character at the end of the line) is treated as part of
+# the field value. the lines will be joined with a newline between them.
+#
+# When the same field name appears it replaces the previous one.
+sub readdatafile {
+       (my $headerpath) = @_;
+       my $headerfile;
+       my %header;
+       
+       if(ref($headerpath)) {
+               $headerfile=$headerpath;
+       }
+       else {
+               unless (open ($headerfile, "<", $headerpath)) {
+                       return %header;
+               }
+       }
+       
+       my $lastname='';
+       
+       while (defined(my $line = <$headerfile>)) {
+               $line =~ s/[\r\n]$//g;
+               my $headname='';
+               my $headval='';
+               
+               if($line =~ /^[ \t](.*)$/){
+                       if($lastname ne '') {
+                               $header{$lastname}.="\n".$1;
+                       }
+               }
+               elsif ($line =~ /^([^:]+):[ \t](.*)$/) {
+                       $headname = lc($1);
+                       $headval = $2;
+                       
+                       # if ($header{$headname} ne '') {
+                               # $header{$headname}.=$headval;
+                       # }
+                       # else {
+                               $header{$headname}=$headval;
+                       # }
+                       $lastname = $headname;
+               }
+       }
+       
+       unless (ref($headerpath)) {
+               close ($headerfile);
+       }
+       
+       return %header;
+}
+
+sub writedatafile {
+       (my $headerpath, my %header) = @_;
+       my $headerfile;
+       
+       if(ref($headerpath)) {
+               $headerfile=$headerpath;
+       }
+       else {
+               unless (open ($headerfile, ">", $headerpath)) {
+                       return 0;
+               }
+       }
+       
+       foreach my $ind (keys %header) {
+               my $headname = $ind;
+               my $headval = $header{$ind};
+               $headval =~ s/\r//g;
+               $headval =~ s/\n/\n /g;
+               print $headerfile "$headname: $headval\n";
+       }
+       print $headerfile "\n";
+       
+       unless (ref($headerpath)) {
+               close ($headerfile);
+       }
+       
+       return 1;
+}
+
 sub gettimenumber {
        (my $date) = @_;
        my $year;
@@ -672,16 +1023,31 @@ sub saveimg {
        (my $prot, my $host, my $port, my $path, my $query);
        my @stat;
        
-       print "  Image $id\n";
+       print "Image $id\n";
+       
+       $basepath=ARCH_PATH.$groupid.'/';
+       unless (-d $basepath){
+               unless (mkdir $basepath) {
+                       print "Can't mkdir $basepath.\n";
+                       return '';
+               }
+       }
+       $basepath.='image/';
+       unless (-d $basepath){
+               unless (mkdir $basepath) {
+                       print "Can't mkdir $basepath.\n";
+                       return '';
+               }
+       }
+       
        
-       $basepath=ARCH_PATH.$groupid.'/image/';
        my $idtemp = $id;
        while((my $ind = index($idtemp,'/'))>=0) {
                $basepath.=substr($idtemp, 0, $ind+1);
                $idtemp=substr($idtemp,$ind+1);
                unless (-d $basepath){
                        unless (mkdir $basepath) {
-                               print "  Can't mkdir $basepath.\n";
+                               print "Can't mkdir $basepath.\n";
                                return '';
                        }
                }
@@ -696,7 +1062,7 @@ sub saveimg {
                        if (flock ($headfile, 2)) {
                                %header = readheaderfile($headfile);
                                if ($header{'key'} ne '') {
-                                       print "  Already saved.\n\n";
+                                       print "Already saved.\n\n";
                                        close($headfile);
                                        return $header{'key'};
                                }
@@ -705,7 +1071,7 @@ sub saveimg {
                                }
                        }
                        else {
-                               print "  Can't lock $headpath.\n\n";
+                               print "Can't lock $headpath.\n\n";
                                close($headfile);
                                return '';
                        }
@@ -716,7 +1082,7 @@ sub saveimg {
        $archbasepath = urldiv2path($prot, $host, $port, $path, $query);
        $archheadpath = $archbasepath.'@h';
        $archimgpath = $archbasepath.'@v';
-       print "  url: $url\n";
+       print "url: $url\n";
        for (my $ind=0; $ind<MAX_REDIRECTIONS; ++$ind) {
                %archheader = readheaderfile($archheadpath); 
                
@@ -728,7 +1094,7 @@ sub saveimg {
                        if ($location !~ /^[a-z]+:\/\//) {
                                $location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location;
                        }
-                       print "  Redirect: $location\n";
+                       print "Redirect: $location\n";
                        ($prot, $host, $port, $path, $query) = divideurl($location);
                        $archbasepath = urldiv2path($prot, $host, $port, $path, $query);
                        $archheadpath = $archbasepath.'@h';
@@ -739,7 +1105,7 @@ sub saveimg {
                }
        }
        if ($archheader{'status'} !~ /^200 /) {
-               print "  Not found ($url $archheader{'status'}).\n\n";
+               print "Not found ($archheader{'status'}).\n\n";
                if ($headopen) {
                        close ($headfile);
                }
@@ -756,7 +1122,7 @@ sub saveimg {
                $header{'content-length'}=$stat[7];
        }
        else {
-               print "  Can't stat $imgpath. $stat[7]\n";
+               print "Can't stat $imgpath. $stat[7]\n";
        }
        $header{'id'}=$id;
        $header{'groupid'}=$groupid;
@@ -764,18 +1130,18 @@ sub saveimg {
        
        if ($headopen) {
                unless (seek($headfile,0,0)) {
-                       print "  Can't seek $headpath.\n\n";
+                       print "Can't seek $headpath.\n\n";
                        close ($headfile);
                        return '';
                }
        }
        else{
                unless (open($headfile,">",$headpath)) {
-                       print "  Can't open $headpath.\n\n";
+                       print "Can't open $headpath.\n\n";
                        return '';
                }
                unless (flock ($headfile, 2)) {
-                       print "  Can't lock $headpath.\n\n";
+                       print "Can't lock $headpath.\n\n";
                        close($headfile);
                        return '';
                }
@@ -783,13 +1149,13 @@ sub saveimg {
        
        foreach my $ind (keys %header) {
                print $headfile "$ind: $header{$ind}\n";
-               print "  $ind: $header{$ind}\n";
+               print "$ind: $header{$ind}\n";
        }
        print $headfile "\n";
        close ($headfile);
        
        unless (copy($archimgpath,$imgpath)) {
-               print "  Can't copy $archimgpath.\n\n";
+               print "Can't copy $archimgpath.\n\n";
                return '';
        }
        print "saved.\n\n";