From 5ae4b73b31cb5c7fdb83d116b30a7d4cb1c3491d Mon Sep 17 00:00:00 2001 From: b Date: Tue, 1 Dec 2015 23:36:54 +0000 Subject: [PATCH] File processing started. git-svn-id: svn://botcastle1b/yplom/facebug1@2 7dec801f-c475-4e67-ba99-809552d69c55 --- bot.1.pl | 627 +++++++++++++++++++++++++++++++++++++-------------- configure.pl | 2 + 2 files changed, 458 insertions(+), 171 deletions(-) diff --git a/bot.1.pl b/bot.1.pl index cc473a5..10ec131 100644 --- a/bot.1.pl +++ b/bot.1.pl @@ -1,172 +1,457 @@ -###PERL; - -use strict; - -###PROXY_LIB; -use proxy_lib qw(url2path path2urldiv getcgi divideurl); -use POSIX qw(strftime); - -my $time = time(); -print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n"; -for (my $ind=0; $ind < scalar @ARGV; ++$ind) { - my %set = {}; - my $basepath=''; - my $configfile; - - unless (open ($configfile, "<", $ARGV[$ind])) { - print "Cannot open $ARGV[$ind]\n"; - next; - } - - while (defined(my $line = <$configfile>)) { - $line =~ s/[\r\n]//g; - $line =~ s/#.*$//; - if ($line =~ /^ *([a-zA-Z0-9_]+) *= *(.*)$/){ - my $name=$1; - my $value=$2; - $value =~ s/ *$//; - $set{$name}=$value; - } - } - close ($configfile); - - if ($set{'id'} =~ /^([0-9]+)$/) { - my $basepath = url2path('https://m.facebook.com/groups/'.$1).'@q/'; - processdir($basepath); - } - else { - print "Invalid ID $ARGV[$ind]\n"; - next; - } -} - -sub processdir { - (my $dirpath) = @_; - my $dir; - my $subpath; - my $subpathfull; - my @stat; - - unless ( opendir ($dir, $dirpath)) { - return; - } - while (defined($subpath = readdir $dir)) { - $subpathfull=$dirpath.$subpath; - if ($subpath =~ /^\.\.?$/) { - next; - } - if (-f $subpathfull) { - processfile($subpathfull); - } - elsif (-d $subpathfull) { - processdir($subpathfull.'/'); - } - } - closedir ($dir); -} - -sub processfile { - (my $headerpath) = @_; - my $basepath; - my $contentpath; - - my %header; - - my $prot; - my $host; - my $port; - my $path; - my $query; - - my %cgi; - - my $id; - - if ($headerpath =~ /^((.+)\@h)$/) { - $headerpath = $1; - $basepath = $2; - $contentpath = $basepath.'@v'; - } - else { - return; - } - - ($prot, $host, $port, $path, $query) = path2urldiv($basepath); - - if($query ne '') { - %cgi=getcgi($query); - $id = $cgi{'id'}; - } - else { - $id=''; - } - - while () { - %header = readheaderfile($headerpath); - if ($header{'status'} =~ /^200 /) { - last; - } - elsif ($header{'status'} =~ /^30[1237] /) { - my $location; - unless (defined($location = $header{'location'})) { - return; - } - if ($location !~ /^[a-z]+:\/\//) { - $location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location; - } - ($prot, $host, $port, $path, $query) = divideurl($location); - $basepath = urldiv2path($prot, $host, $port, $path, $query); - $headerpath = $basepath.'@h'; - $contentpath = $basepath.'@v'; - } - else { - return; - } - } - - if ($id =~ /^[0-9]+$/) { - - - print "e\n"; - - - - - } -} - -sub readheaderfile { - (my $headerpath) = @_; - my $headerfile; - my %header; - - my $lastname=''; - - if (open ($headerfile, "<", $headerpath)) { - while (defined(my $line = <$headerfile>)) { - $line =~ s/[\r\n]$//g; - my $headname=''; - my $headval=''; - - if($line =~ /^[ \t]+([^ \t].*)$/){ - if($lastname ne '') { - $header{$lastname}.=$1; - } - } - elsif ($line =~ /^([^:]*):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) { - $headname = lc($1); - $headval = $2; - - if ($header{$headname} ne '') { - $header{$headname}.=', '.$headval; - } - else { - $header{$headname}=$headval; - } - $lastname = $headname; - } - } - close ($headerfile); - } - return %header; +###PERL; + +use strict; + +###PROXY_LIB; +use proxy_lib qw(url2path path2urldiv getcgi divideurl readconfigfile entitydecode); +use POSIX qw(strftime); + +###ARCH_PATH; +###GROUPSETTINGS_PATH; + +my $time = time(); + +print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n"; +if (scalar @ARGV) { + for (my $ind=0; $ind < scalar @ARGV; ++$ind) { + processgroup($ARGV[$ind]); + } +} +else { + my $dir; + my $subpath; + if (opendir ($dir, GROUPSETTINGS_PATH)) { + while (defined ($subpath = readdir $dir)) { + processgroup($subpath); + } + closedir ($dir); + } +} + +sub processgroup { + (my $filenumber) = @_; + my $settingspath; + my %settings; + my $groupid; + my $namespath; + my %names; + my $archpath; + + if ($filenumber =~ /^([0-9]+)$/) { + $settingspath = GROUPSETTINGS_PATH.$1; + } + else { + return; + } + + %settings = readconfigfile($settingspath); + + #The group id SHOULD be the filename. But what if it isn't? + if ($settings{'id'} =~ /^([0-9]+)$/) { + $groupid=$1; + } + else { + return; + } + + $namespath = GROUPSETTINGS_PATH.$groupid.'-names'; + %names = readconfigfile($namespath); + if ($names{'default'} eq '') { + return; + } + + $archpath = url2path('https://m.facebook.com/groups/'.$groupid).'@q/'; + + print "Group $groupid\n"; + + processdir($archpath,$groupid,\%settings,\%names); +} + +sub processdir { + (my $dirpath, my $groupid, my $settings, my $names) = @_; + my $dir; + my $subpath; + my $subpathfull; + + unless ( opendir ($dir, $dirpath)) { + return; + } + while (defined($subpath = readdir $dir)) { + $subpathfull=$dirpath.$subpath; + if ($subpath =~ /^\.\.?$/) { + next; + } + if (-f $subpathfull) { + processfile($subpathfull, $groupid, $settings, $names); + } + elsif (-d $subpathfull) { + processdir($subpathfull.'/', $groupid, $settings, $names); + } + } + closedir ($dir); +} + +sub processfile { + (my $headerpath, my $groupid, my $settings, my $names) = @_; + my $basepath; + my $contentpath; + my $contentfile; + + my %header; + + my $prot; + my $host; + my $port; + my $path; + my $query; + + my %cgi; + + my $id; + my $timenumber; + + my %thread; + + if ($headerpath =~ /^((.+)\@h)$/) { + $headerpath = $1; + $basepath = $2; + $contentpath = $basepath.'@v'; + } + else { + return; + } + + ($prot, $host, $port, $path, $query) = path2urldiv($basepath); + + if($query ne '') { + %cgi=getcgi($query); + $id = $cgi{'id'}; + } + else { + $id=''; + } + + while () { + %header = readheaderfile($headerpath); + if ($header{'status'} =~ /^200 /) { + last; + } + elsif ($header{'status'} =~ /^30[1237] /) { + my $location; + unless (defined($location = $header{'location'})) { + return; + } + if ($location !~ /^[a-z]+:\/\//) { + $location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location; + } + ($prot, $host, $port, $path, $query) = divideurl($location); + $basepath = urldiv2path($prot, $host, $port, $path, $query); + $headerpath = $basepath.'@h'; + $contentpath = $basepath.'@v'; + } + else { + return; + } + } + + unless ($timenumber = gettimenumber ($header{'date'})) { + my @stat; + unless (@stat = stat $contentpath) { + $timenumber='00000000000000'; + } + else { + $timenumber = strftime('%Y%m%d%h%M%S',gmtime($stat[9])); + } + } + + if ($id =~ /^[0-9]+$/) { + print "Thread $id\n"; + + $thread{'id'}=$id; + $thread{'groupid'}=$groupid; + $thread{'timenumber'}=$timenumber; + + my $line; + + my %postdata; + + + unless (open ($contentfile, "<",$contentpath)) { + print "Can't open file"; + return; + } + + my $text; + my %tag; + my $mode = 'thread'; + my $level = 0; + my $closetag=0; + + local $/ = '<'; + unless (defined ($text = <$contentfile>)) { + close($contentfile); + return; + } + while () { + if ($closetag){ + $tag{'<'} = '/'.$tag{'<'}; + $tag{'/'}='/'; + $tag{"\\"}=undef; + $closetag=0; + } + else { + local $/ = '>'; + unless (defined ($text = <$contentfile>)) { + close($contentfile); + return; + } + $text =~ s/>$//; + # print "tag: $text\n"; + %tag = taginfo($text); + } + + if ($mode eq 'thread'){ + if ($tag{'<'} eq 'h3') { + $mode = 'thread-author'; + } + elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bj|bk)$/)) { + $mode='thread-content'; + $level=0; + } + + + elsif (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/)){ + print "$tag{'id'} - $id\n"; + $mode='posts'; + + #!!! { + + foreach my $ind (keys %thread) { + print "$ind: $thread{$ind}\n"; + } + print "\n"; + + #!!! } + } + } + elsif ($mode eq 'thread-author') { + if ($tag{'<'} eq 'a') { + if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) { + my $author = $1; + if ($thread{'author'} eq '') { + $thread{'author'} = $author; + $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'}; + } + } + } + elsif ($tag{'<'} eq '/h3') { + $mode='thread'; + } + } + elsif ($mode eq 'thread-content') { + if ($tag{'<'} eq 'div') { # There should not be any sub
s! + ++$level; + $thread{'postcontent'}.="
\n
\n"; + } + elsif ($tag{'<'} eq '/div') { + if($level){ + --$level; + $thread{'postcontent'}.="
\n
\n"; + } + else { + $mode = 'thread'; + } + } + elsif ($tag{'<'} eq '/p') { + $thread{'postcontent'}.="
\n"; + } + # else { + # $thread{'postcontent'}.='<'.$tag{'<'}.'>'; + # } + } + + if ($tag{"\\"} ne '') { + $closetag = 1; + next; + } + + local $/ = '<'; + unless (defined ($text = <$contentfile>)) { + close($contentfile); + return; + } + $text =~ s/<$//; + + if($mode eq 'thread-content') { + $thread{'postcontent'}.=$text; + } + # print "text: $text\n"; + } + + + + # while (defined ($line = <$contentfile>)) { + # if ($firstline) { + # $firstline = 0; + # next; + # } + # # print "$line\n"; + # } + close ($contentfile); + + } +} + +sub readheaderfile { + (my $headerpath) = @_; + my $headerfile; + my %header; + + my $lastname=''; + + if (open ($headerfile, "<", $headerpath)) { + while (defined(my $line = <$headerfile>)) { + $line =~ s/[\r\n]$//g; + my $headname=''; + my $headval=''; + + if($line =~ /^[ \t]+([^ \t].*)$/){ + if($lastname ne '') { + $header{$lastname}.=$1; + } + } + elsif ($line =~ /^([^:]*):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) { + $headname = lc($1); + $headval = $2; + + if ($header{$headname} ne '') { + $header{$headname}.=', '.$headval; + } + else { + $header{$headname}=$headval; + } + $lastname = $headname; + } + } + close ($headerfile); + } + return %header; +} + +sub gettimenumber { + (my $date) = @_; + my $year; + my $month; + my $day; + my $hour; + my $minute; + my $second; + + # see https://tools.ietf.org/html/rfc2616#section-3.3.1 + if ($date =~ /^[A-Za-z]{3}, ([0-9]{2}) ([A-Za-z]{3}) ([0-9]{4}) ([0-9]{2}):([0-9]{2}):([0-9]{2})/){ + $day=$1; + $month=lc($2); + $year=$3; + $hour=$4; + $minute=$5; + $second=$6; + } + elsif ($date =~ /^[A-Za-z]{3,}, ([0-9]{2})-([A-Za-z]{3})-([0-9]{2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})/) { + $day=$1; + $month=lc($2); + $year='20'.$3; # Assuming 21st century! + $hour=$4; + $minute=$5; + $second=$6; + } + elsif ($date =~ /^[A-Za-z]{3} ([A-Za-z]{3}) ([ 0-9][0-9]) ([0-9]{2}):([0-9]{2}):([0-9]{2}) ([0-9]{4})/) { + $month=lc($1); + $day=$2; + $hour=$3; + $minute=$4; + $second=$5; + $year=$6; + $day =~ s/ /0/; + } + else { + return undef; + } + + if ($month =~ /^jan/) { + $month = '01'; + } + elsif ($month =~ /^feb/) { + $month = '02'; + } + elsif ($month =~ /^mar/) { + $month = '03'; + } + elsif ($month =~ /^apr/) { + $month = '04'; + } + elsif ($month =~ /^may/) { + $month = '05'; + } + elsif ($month =~ /^jun/) { + $month = '06'; + } + elsif ($month =~ /^jul/) { + $month = '07'; + } + elsif ($month =~ /^aug/) { + $month = '08'; + } + elsif ($month =~ /^sep/) { + $month = '09'; + } + elsif ($month =~ /^oct/) { + $month = '10'; + } + elsif ($month =~ /^nov/) { + $month = '11'; + } + elsif ($month =~ /^dec/) { + $month = '12'; + } + else { + return undef; + } + + return $year.$month.$day.$hour.$minute.$second; +} + +sub taginfo { + (my $tagtext) = @_; + my %tag; + + # if ($tagtext =~ /^div/) { + # print "$tagtext\n"; + # } + + if ($tagtext =~ /^((\/?)[^ \t\r\n<>\/"=]+)([ \t\r\n].*)?$/) { + $tag{'<'}=$1; + if ($2 ne '') { + $tag{'/'} = '/'; + } + $tagtext = $3; + } + else { + return %tag; + } + + while ($tagtext =~ /^[ \t\r\n]*([^ \t\r\n<>\/"=]+)[ \t\r\n]*=[ \t\r\n]*"([^"]+)"([ \t\r\n].*)?$/) { + $tag{$1} = entitydecode($2); + $tagtext = $3; + } + + if ($tagtext =~ /\/[ \t\r\n]*$/) { + $tag{"\\"}="\\"; + } + # if ($tag{'<'} eq 'div') { + # foreach my $ind (keys %tag) { + # print "$ind: $tag{$ind}\n"; + # } + # print "\n"; + # } + return %tag; } \ No newline at end of file diff --git a/configure.pl b/configure.pl index 148fef2..909b262 100644 --- a/configure.pl +++ b/configure.pl @@ -34,6 +34,8 @@ close ($configfile); # Now generate things to be inserted. $def{'PROXY_ARCH_PATH'} = "use constant PROXY_ARCH_PATH => '".$set{'proxy_data_path'}."archive/';"; +$def{'ARCH_PATH'} = "use constant ARCH_PATH => '".$set{'data_path'}."group/';"; +$def{'GROUPSETTINGS_PATH'}= "use constant GROUPSETTINGS_PATH => '".$set{'data_path'}."groupsettings/';"; $def{'PROXY_LIB'} = "use lib '".$set{'proxy_lib_path'}."';"; -- 2.30.2