#!/usr/bin/perl
# se.pl by Davide +mala Eynard, 2010-2011
# run me as "perl se.pl <testsearch.json" - if you have downloaded me from
# the right place you also know what that json file is ;-) , otherwise check
# the related post on http://davide.eynard.it
#
# Input (JSON, read from the files named on the command line or from STDIN):
#   {
#     "search":  { <termId>: <search string>, ... },
#     "engines": { <engine name>: {
#                    "id":      identifier stored with every result,
#                    "url":     page holding the search form,
#                    "fields":  form fields; the literal text $search inside
#                               a value is replaced by the search string,
#                    "regexp":  pattern whose first capture group is a
#                               result URL,
#                    "next":    pattern matching the text of the
#                               "next page" link,
#                    "nextURL": (optional) pattern the absolute URL of the
#                               "next page" link must match as well,
#                    "sleep":   (optional) seconds to wait after each page
#                  }, ... }
#   }
#
# Output (JSON on STDOUT):
#   { "status": "OK",
#     "contents": { "terms":   the "search" object from the input,
#                   "domains": per-domain result counts, engines and URLs,
#                   "urls":    every URL found, most frequent first } }
# or, on failure, { "status": "ERROR", "contents": <message> }.
#
# Debug messages go to STDERR so that STDOUT always holds a single JSON
# document.

use strict;
use warnings;

use WWW::Mechanize;
use URI::Escape;
use JSON;
use Data::Dumper;
use utf8;

$| = 1;

my $SLEEP     = 2;    # default pause (seconds) after each downloaded page
my $AUTOCHECK = 0;    # WWW::Mechanize autocheck: errors are handled here
my $USE_PROXY = 0;    # if yahoo bans you, set this to 1 ;-)
my $PROXY     = "http://localhost:8118";    # I use tor to anonymize my connections...
my $DEBUG     = 1;    # print progress messages (on STDERR)
my $PAGENUM   = 3;    # maximum number of result pages fetched per engine

# ----------------------------------------------------------------------------

my %urllist;         # URL => number of times it was returned (any engine/term)
my %results_hash;    # domain => { results, engines => {...}, urls => {...} }

# ----------------------------------------------------------------------------

# Read the whole JSON configuration (all files on the command line, or STDIN).
my $json = join '', <>;
if ($json eq "") {
    dieWithErr("Where is the json?");
}

# Parse once; a malformed document is reported as a JSON error instead of
# killing the script with a raw Perl exception.
my $jsonobj = eval { from_json($json) };
if (!defined $jsonobj || ref $jsonobj ne 'HASH') {
    dieWithErr("Invalid json: " . ($@ || "top-level value is not an object"));
}

my $engines  = $jsonobj->{'engines'} || {};
my $searches = $jsonobj->{'search'}  || {};

my $mech = WWW::Mechanize->new(autocheck => $AUTOCHECK);
if ($USE_PROXY) {
    $mech->proxy(['http', 'ftp'], $PROXY);
}

# Keys are sorted so that the order of the requests is reproducible.
foreach my $searchkey (sort keys %$searches) {
    my $search = $searches->{$searchkey};

    foreach my $engine (sort keys %$engines) {
        my $profile  = $engines->{$engine};
        my $engineId = $profile->{'id'};

        debugMsg("[i] Searching $search on $engine... \n");

        # Load the page holding the search form.
        my $res = $mech->get($profile->{'url'});
        if (!$res || !$res->is_success()) {
            dieWithErr("Probs contacting $engine\n");
        }

        # Work on a copy of the field template so that the placeholder is
        # still there for the next search term.
        my %fields = %{ $profile->{'fields'} || {} };
        foreach my $fk (keys %fields) {
            $fields{$fk} =~ s/\$search/$search/g;
        }
        debugMsg(Dumper(\%fields) . "\n");

        # submit_form may die (e.g. no form on the page): treat that like
        # any other failure to contact the engine.
        $res = eval { $mech->submit_form(fields => \%fields) };
        if (!$res || !$res->is_success()) {
            dieWithErr("Probs contacting $engine\n");
        }
        debugMsg("done.\n");

        # Compile the engine patterns once instead of at every match.
        my $result_pattern = $profile->{'regexp'};
        my $result_re      = qr/$result_pattern/si;
        my $next_text      = $profile->{'next'};
        my $next_url       = $profile->{'nextURL'};

        my $page            = 0;
        my $results_partial = 0;    # number of results found on this engine

        while ($res && $res->is_success()) {
            debugMsg("I have downloaded " . $res->base() . "\n");

            # engine-specific sleep time, or the default when unset (or 0)
            sleep($profile->{'sleep'} || $SLEEP);

            $page++;
            debugMsg(" [i] Parsing page $page...\n");

            my $content  = $mech->content();
            my $position = -1;    # zero-based index of the result in the page

            while ($content =~ /$result_re/g) {
                my $referer = $1;
                next if !defined $referer;    # pattern without a capture

                $position++;
                $results_partial++;
                $urllist{$referer}++;

                my $rank = ($page - 1) * 10 + $position;

                # The domain is extracted afresh for every result: a URL
                # without a recognizable scheme://host/ part must not be
                # filed under the domain of the previous result.
                my ($mydomain) = $referer =~ m{(https?://[^/]+)/}i;
                if (!defined $mydomain) {
                    debugMsg(" [w] No domain found in $referer, skipping\n");
                    next;
                }

                debugMsg("$engineId : $search : $mydomain : $referer\n");

                # domain has to be the key, so:
                # 1) Increment domain->results
                $results_hash{$mydomain}{'results'}++;
                # 2) Add the engine to the domain engines list
                $results_hash{$mydomain}{'engines'}{$engineId}++;
                # 3) Add URL->{engine, rank, term}
                push @{ $results_hash{$mydomain}{'urls'}{$referer} },
                    {
                        "engineId" => $engineId,
                        "rank"     => $rank,
                        "termId"   => $searchkey,
                    };
            }

            last if $page >= $PAGENUM;

            # Follow the "next page" link; follow_link returns undef when
            # no link matches, which ends the loop.
            my %link_filter = (text_regex => qr/$next_text/);
            if (defined $next_url) {
                debugMsg("Using filter on URL too: " . $next_url . "\n");
                $link_filter{url_abs_regex} = qr/$next_url/;
            }
            $res = $mech->follow_link(%link_filter);
        }

        # Keep the per-engine count on the engine profile (it is not part
        # of the generated output).
        $profile->{'results'} = $results_partial;
    }
}

# Most frequent URLs first; ties are broken alphabetically so that the
# output is reproducible.
my @rankedURLs =
    sort { $urllist{$b} <=> $urllist{$a} or $a cmp $b } keys %urllist;

my %output = (
    'status'   => "OK",
    'contents' => {
        'terms'   => $jsonobj->{'search'},
        'domains' => \%results_hash,
        'urls'    => \@rankedURLs,
    },
);

my $jsonResult = JSON->new->pretty->encode(\%output);
# comment this if you don't want to enable utf8 encoding
utf8::encode($jsonResult);
print $jsonResult;
exit;

#----------------------------------------------------------------------------

# dieWithErr($error)
# Prints { "status": "ERROR", "contents": $error } as pretty JSON on STDOUT
# and terminates the script. The exit status is 0, so callers have to look
# at the "status" field to detect a failure.
sub dieWithErr {
    my $error = shift;

    my %results = (
        status   => "ERROR",
        contents => $error,
    );
    print JSON->new->pretty->encode(\%results);
    exit;
}

# debugMsg(@text)
# Prints @text on STDERR when $DEBUG is true; does nothing otherwise.
sub debugMsg {
    my @text = @_;

    print STDERR @text if $DEBUG;
    return;
}