#!/usr/bin/perl

# se.pl by Davide +mala Eynard, 2010-2011
# run me as "perl se.pl <testsearch.json" - if you have downloaded me from
# the right place you also know what that json file is ;-) , otherwise check
# the related post on http://davide.eynard.it

use WWW::Mechanize;
use URI::Escape;
use JSON;
use Data::Dumper;
use utf8;

# Unbuffer the currently selected output handle so progress messages appear
# as soon as they are printed.
$| = 1;

# ----------------------------------------------------------------------------
# Tunables
#
my $DEBUG     = 1; # when true, progress/diagnostic lines are printed
my $PAGENUM   = 3; # maximum number of result pages parsed per engine
my $SLEEP     = 2; # default pause (seconds) per page, used when an engine
                   # profile has no 'sleep' value of its own
my $AUTOCHECK = 0; # value handed to WWW::Mechanize's autocheck setting
my $USE_PROXY = 0; # if yahoo bans you, set this to 1 ;-)
my $PROXY     = "http://localhost:8118"; # I use tor to anonymize my connections...

# ----------------------------------------------------------------------------
# Shared state
#
my %results_hash; # per-domain results, filled in by the search loop
my %urllist;      # list of gathered URL names (weighted according to their occurrences)
my %engines;      # list of engine profiles
my %domainlist;   # list of gathered domain names

# Counters initialised here; nothing else in this file reads or updates them.
my $results_total  = 0;
my $results_unique = 0;
my $results_domain = 0;

# ----------------------------------------------------------------------------

# Slurp the JSON configuration. The diamond operator reads standard input,
# or every file named on the command line if any were given.
my $json = join '', <>;

if ($json eq ""){
	dieWithErr("Where is the json?");
}

# from_json() dies on malformed input. Catch that so the caller still gets
# the usual JSON error envelope instead of a bare Perl error message.
my $jsonobj;
if (!eval { $jsonobj = from_json($json); 1 }){
	my $reason = $@;
	chomp $reason;
	dieWithErr("Invalid json: $reason");
}

# The rest of the script indexes the configuration as a hash ('search',
# 'engines'), so reject any other top-level value up front.
if (ref($jsonobj) ne 'HASH'){
	dieWithErr("Invalid json: top-level value must be an object");
}

# Browser object used for every request. The autocheck setting is given to
# the constructor rather than written into the object afterwards.
my $mech = WWW::Mechanize->new(autocheck => $AUTOCHECK);

# Send http and ftp traffic through the configured proxy when enabled.
$mech->proxy(['http', 'ftp'], $PROXY) if $USE_PROXY;

# Engine profiles, taken from the configuration that has already been parsed.
# Each profile is read below for the keys 'id', 'url', 'fields', 'regexp' and
# 'next', plus the optional 'sleep' and 'nextURL'.
my $engines = $jsonobj->{'engines'};

# Run every search term against every engine and collect the results in
# %urllist (hit count per URL) and %results_hash (per-domain details).
#
# NOTE: when $DEBUG is on, the progress lines below go to the same handle as
# the final JSON document and are therefore interleaved with it.
foreach my $searchkey (keys %{$jsonobj->{'search'}}){
	my $search = $jsonobj->{'search'}{$searchkey};

	foreach my $engine (keys %$engines){
		my $profile  = $engines->{$engine};
		my $engineId = $profile->{'id'};
		print "[i] Searching $search on $engine... " if $DEBUG;

		# Load the engine's search page and submit its form, replacing the
		# literal placeholder "$search" in each field with the search term.
		# %fields is a copy, so the profile itself is left untouched.
		$mech->get($profile->{'url'});
		my %fields = %{$profile->{'fields'}};
		foreach my $fk (keys %fields){
			$fields{$fk} =~ s/\$search/$search/g;
		}
		print Dumper(\%fields)."\n" if $DEBUG;
		my $res = $mech->submit_form(fields => \%fields);

		# Guard against a missing response as well as an unsuccessful one.
		if (!$res or !$res->is_success()){
			dieWithErr("Probs contacting $engine\n");
		}
		print "done.\n" if $DEBUG;

		# Per-engine values that do not change from page to page.
		my $pattern   = $profile->{'regexp'};
		my $result_re = qr/$pattern/si;
		# get sleep time from config, if it's not present take the default
		my $sleeptime = $profile->{'sleep'} ? $profile->{'sleep'} : $SLEEP;

		# $i is the 1-based number of the page being parsed
		my $i = 0;

		# $results_partial contains the number of results for each engine
		my $results_partial = 0;

		while ($res && $res->is_success()){
			print "I have downloaded ". $res->base() ."\n" if $DEBUG;
			sleep $sleeptime;

			# parse results from engine
			$i++;
			print "  [i] Parsing page $i...\n" if $DEBUG;
			my $content = $mech->content();

			# $j is the 0-based position of the result within this page
			my $j = -1;
			while ($content =~ /$result_re/g){
				$j++;
				$results_partial++;
				my $referer = $1;
				$urllist{$referer}++;

				# Work out the domain afresh for every result, so a URL that
				# does not match can never be filed under the previous
				# result's domain. Both http and https are accepted, with or
				# without a trailing slash.
				my $mydomain;
				if ($referer =~ m{(https?://[^/]+)(?:/|\z)}i){
					$mydomain = $1;
				}
				if (!defined $mydomain){
					print "  [w] No domain found in $referer, skipping\n" if $DEBUG;
					next;
				}

				# 0-based overall rank; the formula assumes 10 results per page
				my $rank = ($i-1)*10+$j;
				print "$engineId : $search : $mydomain : $referer\n" if $DEBUG;

				# domain has to be the key, so:
				# 1) Increment domain->results
				$results_hash{$mydomain}{'results'}++;
				# 2) Add the engine to the domain engines list
				$results_hash{$mydomain}{'engines'}{$engineId}++;
				# 3) Add URL->{engine, rank}
				push @{$results_hash{$mydomain}{'urls'}{$referer}}, {
					"engineId" => $engineId,
					"rank"     => $rank,
					"termId"   => $searchkey,
				};
			}
			last if ($i >= $PAGENUM);

			# get next page from engine: the link text must match 'next' and,
			# when 'nextURL' is configured, the absolute URL must match it too
			my $next_text = $profile->{'next'};
			my %link_filter = (text_regex => qr/$next_text/);
			if (defined ($profile->{'nextURL'})){
				print "Using filter on URL too: ". $profile->{'nextURL'}."\n" if $DEBUG;
				my $next_url = $profile->{'nextURL'};
				$link_filter{'url_abs_regex'} = qr/$next_url/;
			}
			$res = $mech->follow_link(%link_filter);
		}

		# Record the count under the engine's own name.
		$engines{$engine}{'results'} = $results_partial;
	}
}

# Rank the collected URLs by how often they were seen, most frequent first.
# Ties are broken alphabetically so that the order is reproducible from run
# to run (hash key order alone is not).
my @rankedURLs = sort { $urllist{$b} <=> $urllist{$a} or $a cmp $b } keys %urllist;

# Result envelope: a status flag plus the search terms, the per-domain
# results and the ranked URL list.
my %output = (
	'status'   => "OK",
	'contents' => {
		'terms'   => $jsonobj->{'search'},
		'domains' => \%results_hash,
		'urls'    => \@rankedURLs,
	},
);

my $jsonOutput = JSON->new;
my $jsonResult = $jsonOutput->pretty->encode(\%output);

# comment this if you don't want to enable utf8 encoding
utf8::encode($jsonResult);

print $jsonResult;

exit;

#----------------------------------------------------------------------------

# Report a fatal error as a JSON document and stop the script.
#
# Takes one argument, the error message. Prints a pretty-printed JSON object
# whose "status" is "ERROR" and whose "contents" is the message, then exits.
# It never returns to the caller.
sub dieWithErr{
        my ($message) = @_;

        my %envelope = (
                status   => "ERROR",
                contents => $message,
        );

        print JSON->new->pretty->encode(\%envelope);
        exit;
}