From: b
Date: Wed, 13 Jan 2016 20:55:31 +0000 (+0000)
Subject: Changes suggested by others; license info
X-Git-Url: http://bicyclesonthemoon.info/git-projects/?a=commitdiff_plain;h=4b2afb47ffdb02146c707afa5266ffc179e24fcb;p=yplom%2Ffacebug1
Changes suggested by others; license info
git-svn-id: svn://botcastle1b/yplom/facebug1@16 7dec801f-c475-4e67-ba99-809552d69c55
---
diff --git a/bot.1.pl b/bot.1.pl
index 08eed9c..0223127 100644
--- a/bot.1.pl
+++ b/bot.1.pl
@@ -1,7 +1,7 @@
###PERL;
# bot is generated from bot.1.pl
-# 02.01.2016
+# 13.01.2016
#
# This is the facebook bot. It depends on the proxy.
# It reads pages from m.facebook.com archived on the proxy, extracts threads,
@@ -27,7 +27,7 @@ use Fcntl;
use File::Copy;
###PROXY_LIB;
###FACEBUG_LIB;
-use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl joinurl readconfigfile entitydecode entityencode urldecode readheaderfile);
+use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl joinurl readconfigfile entitydecode entityencode urldecode urlencode readheaderfile);
use facebug_lib qw(key readdatafile writedatafile gettimenumber);
use POSIX qw(strftime locale_h);
@@ -35,6 +35,7 @@ use POSIX qw(strftime locale_h);
###GROUPSETTINGS_PATH;
###KEY_BITS;
###MAX_REDIRECTIONS;
+###DETAILED_LOGS;
my $time = time();
srand ($time-$$);
@@ -299,10 +300,12 @@ sub processfile {
my $closetag=0;# if there is a tag to close
my $ignoretext;# if text should be ignored and not added to post/thread content
my $link; # if bot is inside a link
+ my $namelink;
my $hidename; # if bot is inside a part which contains a name to be hidden
my $attnumber; # number of current attachment
my $incomplete;# if thread firstpost's content is incomplete
my $firstpost; # if the bot is in the firstpost (important if pagetype='post')
+ my $description;
my $content;
@@ -411,6 +414,12 @@ sub processfile {
# thread author is in first
if (($tag{'<'} eq 'h3') and ($thread{($shared?'shared-':'').'author'} eq '')) {
$mode = 'thread-author';
+ $link=0;
+ $namelink=0;
+ unless ($shared) {
+ $description=0;
+ }
+ $hidename;
}
# elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) {
elsif ($tag{'<'} eq 'div') {
@@ -422,6 +431,7 @@ sub processfile {
$ignoretext=1; # text in firstposts only inside
$hidename=0; $link=0; + $namelink=0; } # The thread contains "shared" content elsif (($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/) and !$shared and ($thread{'shared'} eq '')) { @@ -450,7 +460,7 @@ sub processfile { if (lc($tag{'aria-label'})eq 'likes') { $mode='thread-likes'; } - ###DUPLICADED! + ### this code had to be duplicated: # there is an image attached elsif ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { my $imgnum=$2; @@ -499,7 +509,7 @@ sub processfile { $mode = 'thread-attachment-img'; } } - ###END OF DUPLICATED + ###END OF DUPLICATED CODE # there is a link attached elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { ++$attnumber; @@ -609,6 +619,10 @@ sub processfile { } } + if ($description <= 0) { + delete $thread{'description'}; + } + # overwrite previous information with new one foreach my $ind (keys %thread) { $thread2{$ind}=$thread{$ind}; @@ -622,8 +636,10 @@ sub processfile { writedatafile($threadfile,%thread2); truncate ($threadfile , tell($threadfile)); - foreach my $ind (keys %thread2) { - print "$ind: $thread2{$ind}\n"; #### + if (DETAILED_LOGS) { + foreach my $ind (keys %thread2) { + print "$ind: $thread2{$ind}\n"; #### + } } print "saved.\n\n"; } @@ -648,19 +664,61 @@ sub processfile { # author name elsif ($mode eq 'thread-author') { # name can be found in hyperlinks - if ($tag{'<'} eq 'a') { + if ($tag{'<'} eq 'span') { + $description=-1; + } + elsif ($tag{'<'} eq 'a') { + if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { + unless ($shared) { + $thread{'description'}.=''; + $link=1; + } + } # there are two types of facebook user IDs - if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)\/?(\?.*)?$/) { - my $author = $1; - if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { - $author = urldecode($2); + elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)\/?(\?.*)?$/) { + my $person = $1; + unless ($person =~ 
/^(photo|post).php$/) { + if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { + $person = urldecode($2); + } + unless ($shared) { + $thread{'description'}.=''.(($$names{$person} ne '')?$$names{$person}:$$names{'default'}); + $link=1; + $namelink=1; + $hidename=1; + } + + if ($thread{($shared?'shared-':'').'author'} eq '') { + $thread{($shared?'shared-':'').'author'} = $person; + $thread{($shared?'shared-':'').'name'} = ($$names{$person} ne '')?$$names{$person}:$$names{'default'}; + } + } + } + elsif ($tag{'href'} eq '#') { + unless ($shared) { + $thread{'description'}.=''.$$names{'default'}; + $link=1; + $namelink=1; + $hidename=1; } if ($thread{($shared?'shared-':'').'author'} eq '') { - $thread{($shared?'shared-':'').'author'} = $author; - $thread{($shared?'shared-':'').'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'}; + $thread{($shared?'shared-':'').'name'} = $$names{'default'}; } } } + elsif ($tag{'<'} eq '/a') { + if($link) { + if($namelink) { + $thread{'description'}.=''; + $namelink=0; + } + else { + $thread{'description'}.=''; + } + $link=0; + $hidename=0; + } + } # go out of
Post not completely archived.
'; + $thread{($shared?'shared-':'').'postcontent'}.='Post not completely archived.
'; } $incomplete=1; } - ### HAD TO BE DUPLICATED :( + ### this code had to be duplicated: # there is an image attached @@ -969,6 +1040,7 @@ sub processfile { $ignoretext=0; $hidename=0; $link=0; + $namelink=0; } else { ++$level; @@ -1046,8 +1118,10 @@ sub processfile { writedatafile($postfile,%post2); truncate ($postfile , tell($postfile)); - foreach my $ind (keys %post2) { - print "$ind: $post2{$ind}\n"; + if(DETAILED_LOGS) { + foreach my $ind (keys %post2) { + print "$ind: $post2{$ind}\n"; + } } print "saved.\n\n"; } @@ -1230,14 +1304,27 @@ sub processfile { if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { $person = urldecode($2); } - $post{'content'}.=''.(($$names{$person} ne '')?$$names{$person}:$$names{'default'}); + $post{'content'}.=''.(($$names{$person} ne '')?$$names{$person}:$$names{'default'}); + $link=1; + $namelink=1; + $hidename=1; + } + elsif ($tag{'href'} eq '#') { + $post{'content'}.=''.$$names{'default'}; $link=1; + $namelink=1; $hidename=1; } } elsif ($tag{'<'} eq '/a') { if($link) { - $post{'content'}.=''; + if ($namelink) { + $post{'content'}.=''; + $namelink=0; + } + else { + $post{'content'}.=''; + } $link=0; $hidename=0; } @@ -1298,12 +1385,20 @@ sub processfile { $thread{($shared?'shared-':'').'postcontent'}.=$text; } } + if($mode eq 'thread-author') { + unless ($hidename or $shared){ + $thread{'description'}.=$text; + if (($text =~ /[a-z]/) and (!$link) and ($description == 0)) { + $description = 1; + } + } + } # the format facebook uses for showing time is not always helpful (for # example: "2 mins ago") There is no setting to change it in facebook. # at least in the m.facebook.com. The bot corrently DOES NOT interpret the # text. 
elsif ($mode eq 'thread-time') { - $thread{($shared?'shared-':'').'timetext'}.=$text; + $thread{($shared?'shared-':'').'timetext'}.=facebooktime($timenumber,$text); } elsif ($mode eq 'thread-attachment-link-title') { $thread{($shared?'shared-':'').'linktitle-'.$attnumber}.=$text; @@ -1328,7 +1423,7 @@ sub processfile { } } elsif ($mode eq 'post-time') { - $post{'timetext'}.=$text; + $post{'timetext'}.=facebooktime($timenumber,$text); } elsif ($mode eq 'post-link-title') { $post{'linktitle-'.$attnumber}.=$text; @@ -1472,7 +1567,9 @@ sub saveimg { $archbasepath = urldiv2path($prot, $host, $port, $path, $query); $archheadpath = $archbasepath.'@h'; $archimgpath = $archbasepath.'@v'; - print "url: $url\n"; + if (DETAILED_LOGS) { + print "url: $url\n"; + } # Read the http header. Only interested in the status. If a redirection then # follow it and read again. If 200 it's ok to continue processing. Otherwise # return. @@ -1487,7 +1584,9 @@ sub saveimg { if ($location !~ /^[a-z]+:\/\//) { $location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location; } - print "Redirect: $location\n"; + if (DETAILED_LOGS) { + print "Redirect: $location\n"; + } ($prot, $host, $port, $path, $query) = divideurl($location); $archbasepath = urldiv2path($prot, $host, $port, $path, $query); $archheadpath = $archbasepath.'@h'; @@ -1558,3 +1657,220 @@ sub saveimg { return $header{'key'}; } + +# function to get time from facebook time text +# arguments: 1 - time number (when page was downloaded), 2 - facebook time text +# returns a time number on success and the unchanged time text on failure. 
+sub facebooktime { + (my $timenumber, my $timetext) = @_; + $timetext = lc ($timetext); + + my $year; + my $month; + my $day; + my $hour; + my $minute; + my $ampm; + + my $subtype; + my $subvalue; + my $returnvalue; + + # There are multible possible format of facebook time text format + + # Jan 2 + # January 2 + if ($timetext =~ /^ *([a-z]{3,}) +([0-9]+) *$/) { + $month = $1; + $day = int($2); + $year = int(substr($timenumber,0,4)); + $hour=''; + $minute=''; + } + # Jan 2, 2015 + # January 2, 2015 + elsif ($timetext =~ /^ *([a-z]{3,}) +([0-9]+), +([0-9]{4}) *$/) { + $month = $1; + $day = int($2); + $year = int($3); + $hour=''; + $minute=''; + } + # Jan 2 at 10:02am + # January 2 at 10:02am + elsif ($timetext =~ /^ *([a-z]{3,}) +([0-9]+) +at +([0-9]{1,2}):([0-9]{2})([ap]m) *$/) { + $month = $1; + $day = int($2); + $year = int(substr($timenumber,0,4)); + $hour=int($3); + $minute=int($4); + $ampm=$5; + if($ampm eq 'pm') { + $hour += 12; + } + } + # Jan 2, 2015 at 10:02am + # January 2, 2015 at 10:02am + elsif ($timetext =~ /^ *([a-z]{3,}) +([0-9]+), +([0-9]{4}) +at +([0-9]{1,2}):([0-9]{2})([ap]m) *$/) { + $month = $1; + $day = int($2); + $year = int($3); + $hour=int($4); + $minute=int($5); + $ampm=$6; + if($ampm eq 'pm') { + $hour += 12; + } + } + # 1 hr + # 2 hours ago + # etc... + elsif ($timetext =~ /^ *([0-9]+) +h(ou)?rs?( +ago)? *$/) { + $subtype='h'; + $subvalue=int($1); + $hour = int(substr($timenumber,8,2)); + } + # 4 mins + # 2 minutes ago + # etc... + elsif ($timetext =~ /^ *([0-9]+) +min(ute)?s?( +ago)? *$/) { + $subtype='m'; + $subvalue=int($1); + $minute = int(substr($timenumber,10,2)); + $hour = int(substr($timenumber,8,2)); + } + # 2 days ago + # etc... + elsif ($timetext =~ /^ *([0-9]+) +days?( +ago)? 
*$/) { + $subtype='d'; + $subvalue=int($1); + } + elsif ($timetext =~ /^ *yesterday +at +([0-9]{1,2}):([0-9]{2})([ap]m) */) { + $subtype='d'; + $subvalue=1; + $hour=int($1); + $minute=int($2); + $ampm=$3; + if($ampm eq 'pm') { + $hour += 12; + } + } + else { + # print "FAIL 1 -".urlencode($timetext)."-\n"; + return $timetext; + } + + if ($year !~ /^[0-9]+$/) { + $year = int(substr($timenumber,0,4)); + } + if ($month !~ /^[0-9]+$/) { + if ($month eq '') { + $month = int(substr($timenumber,4,2)); + } + elsif ($month =~ /^jan/) { + $month=1; + } + elsif ($month =~ /^feb/) { + $month=2; + } + elsif ($month =~ /^mar/) { + $month=3; + } + elsif ($month =~ /^apr/) { + $month=4; + } + elsif ($month =~ /^may/) { + $month=5; + } + elsif ($month =~ /^jun/) { + $month=6; + } + elsif ($month =~ /^jul/) { + $month=7; + } + elsif ($month =~ /^aug/) { + $month=8; + } + elsif ($month =~ /^sep/) { + $month=9; + } + elsif ($month =~ /^oct/) { + $month=10; + } + elsif ($month =~ /^nov/) { + $month=11; + } + elsif ($month =~ /^dec/) { + $month=12; + } + else { + # print "FAIL 2\n"; + return $timetext; + } + + } + if ($day !~ /^[0-9]+$/) { + $day = substr($timenumber,6,2); + } + + if ($subtype eq 'm') { + $minute -= $subvalue; + $subvalue = 0; + while($minute <0) { + $minute += 60; + ++$subvalue; + $subtype = 'h'; + } + } + if ($subtype eq 'h') { + $hour -= $subvalue; + $subvalue = 0; + while($hour <0) { + $hour += 24; + ++$subvalue; + $subtype = 'd'; + } + } + if ($subtype eq 'd') { + $day -= $subvalue; + if ($day <1) { + --$month; + if($month<1) { + --$year; + $month=12; + $day = 31; + } + else { + if ($month == 2) { + if($year%4) { + $day = 28; + } + elsif ($year%100) { + $day = 29; + } + elsif ($year%400) { + $day = 28; + } + else { + $day = 29; + } + } + elsif ((($month%2) and ($month < 8)) or ((!($month%2)) and ($month >= 8))) { + $day = 31; + } + else { + $day = 30; + } + } + } + } + + $returnvalue= sprintf('%04u%02u%02u',$year,$month,$day); + if($hour ne '') { + 
$returnvalue.=sprintf('%02u',$hour); + if($minute ne '') { + $returnvalue.=sprintf('%02u',$minute); + } + } + return $returnvalue; +} \ No newline at end of file diff --git a/config.1.txt b/config.1.txt index f0ba966..ed26be0 100644 --- a/config.1.txt +++ b/config.1.txt @@ -1,4 +1,24 @@ # config.txt is generated from config.1.txt +# 13.01.2016 +# +# The file with the autogenerated configurations for Apache2 and crontab +# +# Copyright (C) 2015-2016 Balthasar SzczepaÅski +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see'.$$post{'description'}.'
'; + } + print $$postcontent.'