-###PERL;\r
-\r
-use strict;\r
-\r
-###PROXY_LIB;\r
-use proxy_lib qw(url2path path2urldiv getcgi divideurl);\r
-use POSIX qw(strftime);\r
-\r
-my $time = time();\r
-print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n";\r
-for (my $ind=0; $ind < scalar @ARGV; ++$ind) {\r
- my %set = {};\r
- my $basepath='';\r
- my $configfile;\r
- \r
- unless (open ($configfile, "<", $ARGV[$ind])) {\r
- print "Cannot open $ARGV[$ind]\n";\r
- next;\r
- }\r
- \r
- while (defined(my $line = <$configfile>)) {\r
- $line =~ s/[\r\n]//g;\r
- $line =~ s/#.*$//;\r
- if ($line =~ /^ *([a-zA-Z0-9_]+) *= *(.*)$/){\r
- my $name=$1;\r
- my $value=$2;\r
- $value =~ s/ *$//; \r
- $set{$name}=$value;\r
- }\r
- }\r
- close ($configfile);\r
- \r
- if ($set{'id'} =~ /^([0-9]+)$/) {\r
- my $basepath = url2path('https://m.facebook.com/groups/'.$1).'@q/';\r
- processdir($basepath);\r
- }\r
- else {\r
- print "Invalid ID $ARGV[$ind]\n";\r
- next;\r
- }\r
-}\r
-\r
-sub processdir {\r
- (my $dirpath) = @_;\r
- my $dir;\r
- my $subpath;\r
- my $subpathfull;\r
- my @stat;\r
-\r
- unless ( opendir ($dir, $dirpath)) {\r
- return;\r
- }\r
- while (defined($subpath = readdir $dir)) {\r
- $subpathfull=$dirpath.$subpath;\r
- if ($subpath =~ /^\.\.?$/) {\r
- next;\r
- }\r
- if (-f $subpathfull) {\r
- processfile($subpathfull);\r
- }\r
- elsif (-d $subpathfull) {\r
- processdir($subpathfull.'/');\r
- }\r
- }\r
- closedir ($dir); \r
-}\r
-\r
-sub processfile {\r
- (my $headerpath) = @_;\r
- my $basepath;\r
- my $contentpath;\r
- \r
- my %header;\r
- \r
- my $prot;\r
- my $host;\r
- my $port;\r
- my $path;\r
- my $query;\r
- \r
- my %cgi;\r
- \r
- my $id;\r
- \r
- if ($headerpath =~ /^((.+)\@h)$/) {\r
- $headerpath = $1;\r
- $basepath = $2;\r
- $contentpath = $basepath.'@v';\r
- }\r
- else {\r
- return;\r
- }\r
- \r
- ($prot, $host, $port, $path, $query) = path2urldiv($basepath);\r
- \r
- if($query ne '') {\r
- %cgi=getcgi($query);\r
- $id = $cgi{'id'};\r
- }\r
- else {\r
- $id='';\r
- }\r
- \r
- while () {\r
- %header = readheaderfile($headerpath); \r
- if ($header{'status'} =~ /^200 /) {\r
- last;\r
- }\r
- elsif ($header{'status'} =~ /^30[1237] /) {\r
- my $location;\r
- unless (defined($location = $header{'location'})) {\r
- return;\r
- }\r
- if ($location !~ /^[a-z]+:\/\//) {\r
- $location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location;\r
- }\r
- ($prot, $host, $port, $path, $query) = divideurl($location);\r
- $basepath = urldiv2path($prot, $host, $port, $path, $query);\r
- $headerpath = $basepath.'@h';\r
- $contentpath = $basepath.'@v';\r
- }\r
- else {\r
- return;\r
- }\r
- }\r
- \r
- if ($id =~ /^[0-9]+$/) {\r
- \r
- \r
- print "e\n";\r
- \r
- \r
- \r
- \r
- }\r
-}\r
-\r
-sub readheaderfile {\r
- (my $headerpath) = @_;\r
- my $headerfile;\r
- my %header;\r
- \r
- my $lastname='';\r
- \r
- if (open ($headerfile, "<", $headerpath)) {\r
- while (defined(my $line = <$headerfile>)) {\r
- $line =~ s/[\r\n]$//g;\r
- my $headname='';\r
- my $headval='';\r
- \r
- if($line =~ /^[ \t]+([^ \t].*)$/){\r
- if($lastname ne '') {\r
- $header{$lastname}.=$1;\r
- }\r
- }\r
- elsif ($line =~ /^([^:]*):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) {\r
- $headname = lc($1);\r
- $headval = $2;\r
- \r
- if ($header{$headname} ne '') {\r
- $header{$headname}.=', '.$headval;\r
- }\r
- else {\r
- $header{$headname}=$headval;\r
- }\r
- $lastname = $headname;\r
- }\r
- }\r
- close ($headerfile);\r
- }\r
- return %header;\r
+###PERL;
+
+use strict;
+
+###PROXY_LIB;
+use proxy_lib qw(url2path path2urldiv getcgi divideurl readconfigfile entitydecode);
+use POSIX qw(strftime);
+
+###ARCH_PATH;
+###GROUPSETTINGS_PATH;
+
+my $time = time();
+
+print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n";
+if (scalar @ARGV) {
+ for (my $ind=0; $ind < scalar @ARGV; ++$ind) {
+ processgroup($ARGV[$ind]);
+ }
+}
+else {
+ my $dir;
+ my $subpath;
+ if (opendir ($dir, GROUPSETTINGS_PATH)) {
+ while (defined ($subpath = readdir $dir)) {
+ processgroup($subpath);
+ }
+ closedir ($dir);
+ }
+}
+
+sub processgroup {
+ (my $filenumber) = @_;
+ my $settingspath;
+ my %settings;
+ my $groupid;
+ my $namespath;
+ my %names;
+ my $archpath;
+
+ if ($filenumber =~ /^([0-9]+)$/) {
+ $settingspath = GROUPSETTINGS_PATH.$1;
+ }
+ else {
+ return;
+ }
+
+ %settings = readconfigfile($settingspath);
+
+ #The group id SHOULD be the filename. But what if it isn't?
+ if ($settings{'id'} =~ /^([0-9]+)$/) {
+ $groupid=$1;
+ }
+ else {
+ return;
+ }
+
+ $namespath = GROUPSETTINGS_PATH.$groupid.'-names';
+ %names = readconfigfile($namespath);
+ if ($names{'default'} eq '') {
+ return;
+ }
+
+ $archpath = url2path('https://m.facebook.com/groups/'.$groupid).'@q/';
+
+ print "Group $groupid\n";
+
+ processdir($archpath,$groupid,\%settings,\%names);
+}
+
+sub processdir {
+ (my $dirpath, my $groupid, my $settings, my $names) = @_;
+ my $dir;
+ my $subpath;
+ my $subpathfull;
+
+ unless ( opendir ($dir, $dirpath)) {
+ return;
+ }
+ while (defined($subpath = readdir $dir)) {
+ $subpathfull=$dirpath.$subpath;
+ if ($subpath =~ /^\.\.?$/) {
+ next;
+ }
+ if (-f $subpathfull) {
+ processfile($subpathfull, $groupid, $settings, $names);
+ }
+ elsif (-d $subpathfull) {
+ processdir($subpathfull.'/', $groupid, $settings, $names);
+ }
+ }
+ closedir ($dir);
+}
+
+sub processfile {
+ (my $headerpath, my $groupid, my $settings, my $names) = @_;
+ my $basepath;
+ my $contentpath;
+ my $contentfile;
+
+ my %header;
+
+ my $prot;
+ my $host;
+ my $port;
+ my $path;
+ my $query;
+
+ my %cgi;
+
+ my $id;
+ my $timenumber;
+
+ my %thread;
+
+ if ($headerpath =~ /^((.+)\@h)$/) {
+ $headerpath = $1;
+ $basepath = $2;
+ $contentpath = $basepath.'@v';
+ }
+ else {
+ return;
+ }
+
+ ($prot, $host, $port, $path, $query) = path2urldiv($basepath);
+
+ if($query ne '') {
+ %cgi=getcgi($query);
+ $id = $cgi{'id'};
+ }
+ else {
+ $id='';
+ }
+
+ while () {
+ %header = readheaderfile($headerpath);
+ if ($header{'status'} =~ /^200 /) {
+ last;
+ }
+ elsif ($header{'status'} =~ /^30[1237] /) {
+ my $location;
+ unless (defined($location = $header{'location'})) {
+ return;
+ }
+ if ($location !~ /^[a-z]+:\/\//) {
+ $location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location;
+ }
+ ($prot, $host, $port, $path, $query) = divideurl($location);
+ $basepath = urldiv2path($prot, $host, $port, $path, $query);
+ $headerpath = $basepath.'@h';
+ $contentpath = $basepath.'@v';
+ }
+ else {
+ return;
+ }
+ }
+
+ unless ($timenumber = gettimenumber ($header{'date'})) {
+ my @stat;
+ unless (@stat = stat $contentpath) {
+ $timenumber='00000000000000';
+ }
+ else {
+ $timenumber = strftime('%Y%m%d%h%M%S',gmtime($stat[9]));
+ }
+ }
+
+ if ($id =~ /^[0-9]+$/) {
+ print "Thread $id\n";
+
+ $thread{'id'}=$id;
+ $thread{'groupid'}=$groupid;
+ $thread{'timenumber'}=$timenumber;
+
+ my $line;
+
+ my %postdata;
+
+
+ unless (open ($contentfile, "<",$contentpath)) {
+ print "Can't open file";
+ return;
+ }
+
+ my $text;
+ my %tag;
+ my $mode = 'thread';
+ my $level = 0;
+ my $closetag=0;
+
+ local $/ = '<';
+ unless (defined ($text = <$contentfile>)) {
+ close($contentfile);
+ return;
+ }
+ while () {
+ if ($closetag){
+ $tag{'<'} = '/'.$tag{'<'};
+ $tag{'/'}='/';
+ $tag{"\\"}=undef;
+ $closetag=0;
+ }
+ else {
+ local $/ = '>';
+ unless (defined ($text = <$contentfile>)) {
+ close($contentfile);
+ return;
+ }
+ $text =~ s/>$//;
+ # print "tag: $text\n";
+ %tag = taginfo($text);
+ }
+
+ if ($mode eq 'thread'){
+ if ($tag{'<'} eq 'h3') {
+ $mode = 'thread-author';
+ }
+ elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bj|bk)$/)) {
+ $mode='thread-content';
+ $level=0;
+ }
+
+
+ elsif (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/)){
+ print "$tag{'id'} - $id\n";
+ $mode='posts';
+
+ #!!! {
+
+ foreach my $ind (keys %thread) {
+ print "$ind: $thread{$ind}\n";
+ }
+ print "\n";
+
+ #!!! }
+ }
+ }
+ elsif ($mode eq 'thread-author') {
+ if ($tag{'<'} eq 'a') {
+ if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
+ my $author = $1;
+ if ($thread{'author'} eq '') {
+ $thread{'author'} = $author;
+ $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
+ }
+ }
+ }
+ elsif ($tag{'<'} eq '/h3') {
+ $mode='thread';
+ }
+ }
+ elsif ($mode eq 'thread-content') {
+ if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
+ ++$level;
+ $thread{'postcontent'}.="<br>\n<br>\n";
+ }
+ elsif ($tag{'<'} eq '/div') {
+ if($level){
+ --$level;
+ $thread{'postcontent'}.="<br>\n<br>\n";
+ }
+ else {
+ $mode = 'thread';
+ }
+ }
+ elsif ($tag{'<'} eq '/p') {
+ $thread{'postcontent'}.="<br>\n";
+ }
+ # else {
+ # $thread{'postcontent'}.='<'.$tag{'<'}.'>';
+ # }
+ }
+
+ if ($tag{"\\"} ne '') {
+ $closetag = 1;
+ next;
+ }
+
+ local $/ = '<';
+ unless (defined ($text = <$contentfile>)) {
+ close($contentfile);
+ return;
+ }
+ $text =~ s/<$//;
+
+ if($mode eq 'thread-content') {
+ $thread{'postcontent'}.=$text;
+ }
+ # print "text: $text\n";
+ }
+
+
+
+ # while (defined ($line = <$contentfile>)) {
+ # if ($firstline) {
+ # $firstline = 0;
+ # next;
+ # }
+ # # print "$line\n";
+ # }
+ close ($contentfile);
+
+ }
+}
+
+sub readheaderfile {
+ (my $headerpath) = @_;
+ my $headerfile;
+ my %header;
+
+ my $lastname='';
+
+ if (open ($headerfile, "<", $headerpath)) {
+ while (defined(my $line = <$headerfile>)) {
+ $line =~ s/[\r\n]$//g;
+ my $headname='';
+ my $headval='';
+
+ if($line =~ /^[ \t]+([^ \t].*)$/){
+ if($lastname ne '') {
+ $header{$lastname}.=$1;
+ }
+ }
+ elsif ($line =~ /^([^:]*):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) {
+ $headname = lc($1);
+ $headval = $2;
+
+ if ($header{$headname} ne '') {
+ $header{$headname}.=', '.$headval;
+ }
+ else {
+ $header{$headname}=$headval;
+ }
+ $lastname = $headname;
+ }
+ }
+ close ($headerfile);
+ }
+ return %header;
+}
+
+sub gettimenumber {
+ (my $date) = @_;
+ my $year;
+ my $month;
+ my $day;
+ my $hour;
+ my $minute;
+ my $second;
+
+ # see https://tools.ietf.org/html/rfc2616#section-3.3.1
+ if ($date =~ /^[A-Za-z]{3}, ([0-9]{2}) ([A-Za-z]{3}) ([0-9]{4}) ([0-9]{2}):([0-9]{2}):([0-9]{2})/){
+ $day=$1;
+ $month=lc($2);
+ $year=$3;
+ $hour=$4;
+ $minute=$5;
+ $second=$6;
+ }
+ elsif ($date =~ /^[A-Za-z]{3,}, ([0-9]{2})-([A-Za-z]{3})-([0-9]{2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})/) {
+ $day=$1;
+ $month=lc($2);
+ $year='20'.$3; # Assuming 21st century!
+ $hour=$4;
+ $minute=$5;
+ $second=$6;
+ }
+ elsif ($date =~ /^[A-Za-z]{3} ([A-Za-z]{3}) ([ 0-9][0-9]) ([0-9]{2}):([0-9]{2}):([0-9]{2}) ([0-9]{4})/) {
+ $month=lc($1);
+ $day=$2;
+ $hour=$3;
+ $minute=$4;
+ $second=$5;
+ $year=$6;
+ $day =~ s/ /0/;
+ }
+ else {
+ return undef;
+ }
+
+ if ($month =~ /^jan/) {
+ $month = '01';
+ }
+ elsif ($month =~ /^feb/) {
+ $month = '02';
+ }
+ elsif ($month =~ /^mar/) {
+ $month = '03';
+ }
+ elsif ($month =~ /^apr/) {
+ $month = '04';
+ }
+ elsif ($month =~ /^may/) {
+ $month = '05';
+ }
+ elsif ($month =~ /^jun/) {
+ $month = '06';
+ }
+ elsif ($month =~ /^jul/) {
+ $month = '07';
+ }
+ elsif ($month =~ /^aug/) {
+ $month = '08';
+ }
+ elsif ($month =~ /^sep/) {
+ $month = '09';
+ }
+ elsif ($month =~ /^oct/) {
+ $month = '10';
+ }
+ elsif ($month =~ /^nov/) {
+ $month = '11';
+ }
+ elsif ($month =~ /^dec/) {
+ $month = '12';
+ }
+ else {
+ return undef;
+ }
+
+ return $year.$month.$day.$hour.$minute.$second;
+}
+
+sub taginfo {
+ (my $tagtext) = @_;
+ my %tag;
+
+ # if ($tagtext =~ /^div/) {
+ # print "$tagtext\n";
+ # }
+
+ if ($tagtext =~ /^((\/?)[^ \t\r\n<>\/"=]+)([ \t\r\n].*)?$/) {
+ $tag{'<'}=$1;
+ if ($2 ne '') {
+ $tag{'/'} = '/';
+ }
+ $tagtext = $3;
+ }
+ else {
+ return %tag;
+ }
+
+ while ($tagtext =~ /^[ \t\r\n]*([^ \t\r\n<>\/"=]+)[ \t\r\n]*=[ \t\r\n]*"([^"]+)"([ \t\r\n].*)?$/) {
+ $tag{$1} = entitydecode($2);
+ $tagtext = $3;
+ }
+
+ if ($tagtext =~ /\/[ \t\r\n]*$/) {
+ $tag{"\\"}="\\";
+ }
+ # if ($tag{'<'} eq 'div') {
+ # foreach my $ind (keys %tag) {
+ # print "$ind: $tag{$ind}\n";
+ # }
+ # print "\n";
+ # }
+ return %tag;
}
\ No newline at end of file