]> bicyclesonthemoon.info Git - yplom/facebug1/commitdiff
File processing started.
authorb <b@7dec801f-c475-4e67-ba99-809552d69c55>
Tue, 1 Dec 2015 23:36:54 +0000 (23:36 +0000)
committerb <b@7dec801f-c475-4e67-ba99-809552d69c55>
Tue, 1 Dec 2015 23:36:54 +0000 (23:36 +0000)
git-svn-id: svn://botcastle1b/yplom/facebug1@2 7dec801f-c475-4e67-ba99-809552d69c55

bot.1.pl
configure.pl

index cc473a5155b6f8cf4d3746e80fdea2d3bcb510e6..10ec13153520bbba6d567cdf3ffb9b99a774ba7c 100644 (file)
--- a/bot.1.pl
+++ b/bot.1.pl
-###PERL;\r
-\r
-use strict;\r
-\r
-###PROXY_LIB;\r
-use proxy_lib qw(url2path path2urldiv getcgi divideurl);\r
-use POSIX qw(strftime);\r
-\r
-my $time = time();\r
-print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n";\r
-for (my $ind=0; $ind < scalar @ARGV; ++$ind) {\r
-       my %set = {};\r
-       my $basepath='';\r
-       my $configfile;\r
-       \r
-       unless (open ($configfile, "<", $ARGV[$ind])) {\r
-               print "Cannot open $ARGV[$ind]\n";\r
-               next;\r
-       }\r
-       \r
-       while (defined(my $line = <$configfile>)) {\r
-               $line =~ s/[\r\n]//g;\r
-               $line =~ s/#.*$//;\r
-               if ($line =~ /^ *([a-zA-Z0-9_]+) *= *(.*)$/){\r
-                       my $name=$1;\r
-                       my $value=$2;\r
-                       $value =~ s/ *$//; \r
-                       $set{$name}=$value;\r
-               }\r
-       }\r
-       close ($configfile);\r
-       \r
-       if ($set{'id'} =~ /^([0-9]+)$/) {\r
-               my $basepath = url2path('https://m.facebook.com/groups/'.$1).'@q/';\r
-               processdir($basepath);\r
-       }\r
-       else {\r
-               print "Invalid ID $ARGV[$ind]\n";\r
-               next;\r
-       }\r
-}\r
-\r
-sub processdir {\r
-       (my $dirpath) = @_;\r
-       my $dir;\r
-       my $subpath;\r
-       my $subpathfull;\r
-       my @stat;\r
-\r
-       unless ( opendir ($dir, $dirpath)) {\r
-               return;\r
-       }\r
-       while (defined($subpath = readdir $dir)) {\r
-               $subpathfull=$dirpath.$subpath;\r
-               if ($subpath =~ /^\.\.?$/) {\r
-                       next;\r
-               }\r
-               if (-f $subpathfull) {\r
-                       processfile($subpathfull);\r
-               }\r
-               elsif (-d $subpathfull) {\r
-                       processdir($subpathfull.'/');\r
-               }\r
-       }\r
-       closedir ($dir);        \r
-}\r
-\r
-sub processfile {\r
-       (my $headerpath) = @_;\r
-       my $basepath;\r
-       my $contentpath;\r
-       \r
-       my %header;\r
-       \r
-       my $prot;\r
-       my $host;\r
-       my $port;\r
-       my $path;\r
-       my $query;\r
-       \r
-       my %cgi;\r
-       \r
-       my $id;\r
-       \r
-       if ($headerpath =~ /^((.+)\@h)$/) {\r
-               $headerpath = $1;\r
-               $basepath = $2;\r
-               $contentpath = $basepath.'@v';\r
-       }\r
-       else {\r
-               return;\r
-       }\r
-       \r
-       ($prot, $host, $port, $path, $query) = path2urldiv($basepath);\r
-       \r
-       if($query ne '') {\r
-               %cgi=getcgi($query);\r
-               $id = $cgi{'id'};\r
-       }\r
-       else {\r
-               $id='';\r
-       }\r
-       \r
-       while () {\r
-               %header = readheaderfile($headerpath); \r
-               if ($header{'status'} =~ /^200 /) {\r
-                       last;\r
-               }\r
-               elsif ($header{'status'} =~ /^30[1237] /) {\r
-                       my $location;\r
-                       unless (defined($location = $header{'location'})) {\r
-                               return;\r
-                       }\r
-                       if ($location !~ /^[a-z]+:\/\//) {\r
-                               $location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location;\r
-                       }\r
-                       ($prot, $host, $port, $path, $query) = divideurl($location);\r
-                       $basepath = urldiv2path($prot, $host, $port, $path, $query);\r
-                       $headerpath = $basepath.'@h';\r
-                       $contentpath = $basepath.'@v';\r
-               }\r
-               else {\r
-                       return;\r
-               }\r
-       }\r
-       \r
-       if ($id =~ /^[0-9]+$/) {\r
-               \r
-               \r
-               print "e\n";\r
-               \r
-               \r
-               \r
-               \r
-       }\r
-}\r
-\r
-sub readheaderfile {\r
-       (my $headerpath) = @_;\r
-       my $headerfile;\r
-       my %header;\r
-       \r
-       my $lastname='';\r
-       \r
-       if (open ($headerfile, "<", $headerpath)) {\r
-               while (defined(my $line = <$headerfile>)) {\r
-                       $line =~ s/[\r\n]$//g;\r
-                       my $headname='';\r
-                       my $headval='';\r
-                       \r
-                       if($line =~ /^[ \t]+([^ \t].*)$/){\r
-                               if($lastname ne '') {\r
-                                       $header{$lastname}.=$1;\r
-                               }\r
-                       }\r
-                       elsif ($line =~ /^([^:]*):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) {\r
-                               $headname = lc($1);\r
-                               $headval = $2;\r
-                               \r
-                               if ($header{$headname} ne '') {\r
-                                       $header{$headname}.=', '.$headval;\r
-                               }\r
-                               else {\r
-                                       $header{$headname}=$headval;\r
-                               }\r
-                               $lastname = $headname;\r
-                       }\r
-               }\r
-               close ($headerfile);\r
-       }\r
-       return %header;\r
+###PERL;
+
+use strict;
+
+###PROXY_LIB;
+use proxy_lib qw(url2path path2urldiv getcgi divideurl readconfigfile entitydecode);
+use POSIX qw(strftime);
+
+###ARCH_PATH;
+###GROUPSETTINGS_PATH;
+
+my $time = time();
+
+print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n";
+if (scalar @ARGV) {
+       for (my $ind=0; $ind < scalar @ARGV; ++$ind) {
+               processgroup($ARGV[$ind]);
+       }
+}
+else {
+       my $dir;
+       my $subpath;
+       if (opendir ($dir, GROUPSETTINGS_PATH)) {
+               while (defined ($subpath = readdir $dir)) {
+                       processgroup($subpath);
+               }
+               closedir ($dir);
+       }
+}
+
+sub processgroup {
+       (my $filenumber) = @_;
+       my $settingspath;
+       my %settings;
+       my $groupid;
+       my $namespath;
+       my %names;
+       my $archpath;
+       
+       if ($filenumber =~ /^([0-9]+)$/) {
+               $settingspath = GROUPSETTINGS_PATH.$1;
+       }
+       else {
+               return;
+       }
+       
+       %settings = readconfigfile($settingspath);
+       
+       #The group id SHOULD be the filename. But what if it isn't?
+       if ($settings{'id'} =~ /^([0-9]+)$/) {
+               $groupid=$1;
+       }
+       else {
+               return;
+       }
+       
+       $namespath = GROUPSETTINGS_PATH.$groupid.'-names';
+       %names = readconfigfile($namespath);
+       if ($names{'default'} eq '') {
+               return;
+       }
+       
+       $archpath = url2path('https://m.facebook.com/groups/'.$groupid).'@q/';
+       
+       print "Group $groupid\n";
+       
+       processdir($archpath,$groupid,\%settings,\%names);
+}
+
+sub processdir {
+       (my $dirpath, my $groupid, my $settings, my $names) = @_;
+       my $dir;
+       my $subpath;
+       my $subpathfull;
+       
+       unless ( opendir ($dir, $dirpath)) {
+               return;
+       }
+       while (defined($subpath = readdir $dir)) {
+               $subpathfull=$dirpath.$subpath;
+               if ($subpath =~ /^\.\.?$/) {
+                       next;
+               }
+               if (-f $subpathfull) {
+                       processfile($subpathfull, $groupid, $settings, $names);
+               }
+               elsif (-d $subpathfull) {
+                       processdir($subpathfull.'/', $groupid, $settings, $names);
+               }
+       }
+       closedir ($dir);        
+}
+
+sub processfile {
+       (my $headerpath, my $groupid, my $settings, my $names) = @_;
+       my $basepath;
+       my $contentpath;
+       my $contentfile;
+       
+       my %header;
+       
+       my $prot;
+       my $host;
+       my $port;
+       my $path;
+       my $query;
+       
+       my %cgi;
+       
+       my $id;
+       my $timenumber;
+       
+       my %thread;
+       
+       if ($headerpath =~ /^((.+)\@h)$/) {
+               $headerpath = $1;
+               $basepath = $2;
+               $contentpath = $basepath.'@v';
+       }
+       else {
+               return;
+       }
+       
+       ($prot, $host, $port, $path, $query) = path2urldiv($basepath);
+       
+       if($query ne '') {
+               %cgi=getcgi($query);
+               $id = $cgi{'id'};
+       }
+       else {
+               $id='';
+       }
+       
+       while () {
+               %header = readheaderfile($headerpath); 
+               if ($header{'status'} =~ /^200 /) {
+                       last;
+               }
+               elsif ($header{'status'} =~ /^30[1237] /) {
+                       my $location;
+                       unless (defined($location = $header{'location'})) {
+                               return;
+                       }
+                       if ($location !~ /^[a-z]+:\/\//) {
+                               $location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location;
+                       }
+                       ($prot, $host, $port, $path, $query) = divideurl($location);
+                       $basepath = urldiv2path($prot, $host, $port, $path, $query);
+                       $headerpath = $basepath.'@h';
+                       $contentpath = $basepath.'@v';
+               }
+               else {
+                       return;
+               }
+       }
+       
+       unless ($timenumber = gettimenumber ($header{'date'})) {
+               my @stat;
+               unless (@stat = stat $contentpath) {
+                       $timenumber='00000000000000';
+               }
+               else {
+                       $timenumber = strftime('%Y%m%d%h%M%S',gmtime($stat[9]));
+               }
+       }
+       
+       if ($id =~ /^[0-9]+$/) {
+               print "Thread $id\n";
+               
+               $thread{'id'}=$id;
+               $thread{'groupid'}=$groupid;
+               $thread{'timenumber'}=$timenumber;
+               
+               my $line;
+               
+               my %postdata;
+               
+               
+               unless (open ($contentfile, "<",$contentpath)) {
+                       print "Can't open file";
+                       return;
+               }
+               
+               my $text;
+               my %tag;
+               my $mode = 'thread';
+               my $level = 0;
+               my $closetag=0;
+               
+               local $/ = '<';
+               unless (defined ($text = <$contentfile>)) {
+                       close($contentfile);
+                       return;
+               }
+               while () {
+                       if ($closetag){
+                               $tag{'<'} = '/'.$tag{'<'};
+                               $tag{'/'}='/';
+                               $tag{"\\"}=undef;
+                               $closetag=0;
+                       }
+                       else {
+                               local $/ = '>';
+                               unless (defined ($text = <$contentfile>)) {
+                                       close($contentfile);
+                                       return;
+                               }
+                               $text =~ s/>$//;
+                               # print "tag: $text\n";
+                               %tag = taginfo($text);
+                       }
+                       
+                       if ($mode eq 'thread'){
+                               if ($tag{'<'} eq 'h3') {
+                                       $mode = 'thread-author';
+                               }
+                               elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bj|bk)$/)) {
+                                       $mode='thread-content';
+                                       $level=0;
+                               }
+                               
+                               
+                               elsif (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/)){
+                                       print "$tag{'id'} - $id\n";
+                                       $mode='posts';
+                                       
+                                       #!!! {
+                                       
+                                       foreach my $ind (keys %thread) {
+                                               print "$ind: $thread{$ind}\n";
+                                       }
+                                       print "\n";
+                                       
+                                       #!!! }
+                               }
+                       }
+                       elsif ($mode eq 'thread-author') {
+                               if ($tag{'<'} eq 'a') {
+                                       if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
+                                               my $author = $1;
+                                               if ($thread{'author'} eq '') {
+                                                       $thread{'author'} = $author;
+                                                       $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
+                                               }
+                                       }
+                               }
+                               elsif ($tag{'<'} eq '/h3') {
+                                       $mode='thread';
+                               }
+                       }
+                       elsif ($mode eq 'thread-content') {
+                               if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
+                                       ++$level;
+                                       $thread{'postcontent'}.="<br>\n<br>\n";
+                               }
+                               elsif ($tag{'<'} eq '/div') {
+                                       if($level){
+                                               --$level;
+                                               $thread{'postcontent'}.="<br>\n<br>\n";
+                                       }
+                                       else {
+                                               $mode = 'thread';
+                                       }
+                               }
+                               elsif ($tag{'<'} eq '/p') {
+                                       $thread{'postcontent'}.="<br>\n";
+                               }
+                               # else {
+                                       # $thread{'postcontent'}.='<'.$tag{'<'}.'>';
+                               # }
+                       }
+                       
+                       if ($tag{"\\"} ne '') {
+                               $closetag = 1;
+                               next;
+                       }
+                       
+                       local $/ = '<';
+                       unless (defined ($text = <$contentfile>)) {
+                               close($contentfile);
+                               return;
+                       }
+                       $text =~ s/<$//;
+                       
+                       if($mode eq 'thread-content') {
+                               $thread{'postcontent'}.=$text;
+                       }
+                       # print "text: $text\n";
+               }
+               
+               
+               
+               # while (defined ($line = <$contentfile>)) {
+                       # if ($firstline) {
+                               # $firstline = 0;
+                               # next;
+                       # }
+                       # # print "$line\n";
+               # }
+               close ($contentfile);
+               
+       }
+}
+
+sub readheaderfile {
+       (my $headerpath) = @_;
+       my $headerfile;
+       my %header;
+       
+       my $lastname='';
+       
+       if (open ($headerfile, "<", $headerpath)) {
+               while (defined(my $line = <$headerfile>)) {
+                       $line =~ s/[\r\n]$//g;
+                       my $headname='';
+                       my $headval='';
+                       
+                       if($line =~ /^[ \t]+([^ \t].*)$/){
+                               if($lastname ne '') {
+                                       $header{$lastname}.=$1;
+                               }
+                       }
+                       elsif ($line =~ /^([^:]*):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) {
+                               $headname = lc($1);
+                               $headval = $2;
+                               
+                               if ($header{$headname} ne '') {
+                                       $header{$headname}.=', '.$headval;
+                               }
+                               else {
+                                       $header{$headname}=$headval;
+                               }
+                               $lastname = $headname;
+                       }
+               }
+               close ($headerfile);
+       }
+       return %header;
+}
+
+sub gettimenumber {
+       (my $date) = @_;
+       my $year;
+       my $month;
+       my $day;
+       my $hour;
+       my $minute;
+       my $second;
+       
+       # see https://tools.ietf.org/html/rfc2616#section-3.3.1
+       if ($date =~ /^[A-Za-z]{3}, ([0-9]{2}) ([A-Za-z]{3}) ([0-9]{4}) ([0-9]{2}):([0-9]{2}):([0-9]{2})/){
+               $day=$1;
+               $month=lc($2);
+               $year=$3;
+               $hour=$4;
+               $minute=$5;
+               $second=$6;
+       }
+       elsif ($date =~ /^[A-Za-z]{3,}, ([0-9]{2})-([A-Za-z]{3})-([0-9]{2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})/) {
+               $day=$1;
+               $month=lc($2);
+               $year='20'.$3; # Assuming 21st century!
+               $hour=$4;
+               $minute=$5;
+               $second=$6;
+       }
+       elsif ($date =~ /^[A-Za-z]{3} ([A-Za-z]{3}) ([ 0-9][0-9]) ([0-9]{2}):([0-9]{2}):([0-9]{2}) ([0-9]{4})/) {
+               $month=lc($1);
+               $day=$2;
+               $hour=$3;
+               $minute=$4;
+               $second=$5;
+               $year=$6;
+               $day =~ s/ /0/;
+       }
+       else {
+               return undef;
+       }
+       
+       if ($month =~ /^jan/) {
+               $month = '01';
+       }
+       elsif ($month =~ /^feb/) {
+               $month = '02';
+       }
+       elsif ($month =~ /^mar/) {
+               $month = '03';
+       }
+       elsif ($month =~ /^apr/) {
+               $month = '04';
+       }
+       elsif ($month =~ /^may/) {
+               $month = '05';
+       }
+       elsif ($month =~ /^jun/) {
+               $month = '06';
+       }
+       elsif ($month =~ /^jul/) {
+               $month = '07';
+       }
+       elsif ($month =~ /^aug/) {
+               $month = '08';
+       }
+       elsif ($month =~ /^sep/) {
+               $month = '09';
+       }
+       elsif ($month =~ /^oct/) {
+               $month = '10';
+       }
+       elsif ($month =~ /^nov/) {
+               $month = '11';
+       }
+       elsif ($month =~ /^dec/) {
+               $month = '12';
+       }
+       else {
+               return undef;
+       }
+       
+       return $year.$month.$day.$hour.$minute.$second;
+}
+
+sub taginfo {
+       (my $tagtext) = @_;
+       my %tag;
+       
+       # if ($tagtext =~ /^div/) {
+               # print "$tagtext\n";
+       # }
+       
+       if ($tagtext =~ /^((\/?)[^ \t\r\n<>\/"=]+)([ \t\r\n].*)?$/) {
+               $tag{'<'}=$1;
+               if ($2 ne '') {
+                       $tag{'/'} = '/';
+               }
+               $tagtext = $3;
+       }
+       else {
+               return %tag;
+       }
+       
+       while ($tagtext =~ /^[ \t\r\n]*([^ \t\r\n<>\/"=]+)[ \t\r\n]*=[ \t\r\n]*"([^"]+)"([ \t\r\n].*)?$/) {
+               $tag{$1} = entitydecode($2);
+               $tagtext = $3;
+       }
+       
+       if ($tagtext =~ /\/[ \t\r\n]*$/) {
+               $tag{"\\"}="\\";
+       }
+       # if ($tag{'<'} eq 'div') { 
+       # foreach my $ind (keys %tag) {
+                                               # print "$ind: $tag{$ind}\n";
+                                       # }
+                                       # print "\n";
+       # }
+       return %tag; 
 }
\ No newline at end of file
index 148fef28b69fee8eb1d709d70d2c2da77904100b..909b26299e99fd08cd66fe54ad0faa147d4826a0 100644 (file)
@@ -34,6 +34,8 @@ close ($configfile);
 # Now generate things to be inserted.
 
 $def{'PROXY_ARCH_PATH'}   = "use constant PROXY_ARCH_PATH    => '".$set{'proxy_data_path'}."archive/';";
+$def{'ARCH_PATH'}         = "use constant ARCH_PATH          => '".$set{'data_path'}."group/';";
+$def{'GROUPSETTINGS_PATH'}= "use constant GROUPSETTINGS_PATH => '".$set{'data_path'}."groupsettings/';";
 
 $def{'PROXY_LIB'} = "use lib '".$set{'proxy_lib_path'}."';";