#!/usr/bin/perl

package common;

use 5.005; # this is needed to use the map qr// trick 
use Exporter;
use LWP::UserAgent;
use Data::Dumper;
use HTTP::Cookies;
use URI;

@ISA 	= qw 	(Exporter);
@EXPORT = qw 	(
		exturl 
		getpage 
		walkpages
		walkpages_loop
		);

#----------------------------------------------------------------------------
# CONSTANTS

my $MAX_RETRIES = 5;

#----------------------------------------------------------------------------
# VARIABLES

my $ua;	# global user agent (for efficiency)
my $kj; # global cookie jar
my %h_vurl;

#----------------------------------------------------------------------------
#

BEGIN{

  @MyAgent::ISA = qw(LWP::UserAgent); # set inheritance

  $ua = MyAgent->new;
  $ua->agent("Two/0.01");
  $ua->env_proxy; # load proxy from environment variables *_proxy
  
  $kj = new HTTP::Cookies;
  $kj->load("cookies.txt");
  # activate cookie jar for the useragent - every page will support cookies now!
  $ua->cookie_jar($kj);
  push @{ $ua->requests_redirectable }, 'POST';
  $h_vurl{_}=0; 
}

#----------------------------------------------------------------------------
#

sub getpage{
	my $url = shift;
	
	my $retries = $MAX_RETRIES;
	# connect to the main url
	my $req = new HTTP::Request GET=> $url;
	my $res = $ua->request($req);

	while ((!$res->is_success)&&($retries)){
		my $retried = $MAX_RETRIES-$retries+1;
		warn "Error connecting to $url:\n";#.Dumper($res->headers);
		warn "Retrying ($retried)...\n";
		$res = $ua->request($req);
		$retries--;
	}

	unless ($res->is_success){
		warn "REALLY could not connect to $url (ignoring)\n";
		return;
	}

	# parse main page content for conferences
	$content = $res->content;
	return $content;
}

#----------------------------------------------------------------------------
#

sub exturl{
	my ($content,$regexp1,$regexp2,$pageurl) = @_;

	my (@links, @links2); 	# these are used to store links
	my %hash;		# this is used to store links occurrencies
	
	# if $regexp1 or $regexp2 are simple strings (scalars) and not array
	# of strings, convert them in an array of one element.
	my @a_regexp1 = (ref($regexp1) ? @$regexp1 : $regexp1);
	my @a_regexp2 = (ref($regexp2) ? @$regexp2 : $regexp2);

	# powerful regexp! :)
	while ($content =~ /<a.*?href\s*=[\s\"]*([^\s\">]+).*?>(.*?)(<\/a>|<\/td>)/gsi){
		my $url = $1;
		my $str = $2;
		
		# create an absolute version of the url, if it's relative
		my $uri1 = URI->new($pageurl);
		my $uri2 = URI->new($url);
		$url = $uri2->abs($uri1);
		
		my @comp_re1 = map qr/$_/i, @a_regexp1;
		my @comp_re2 = map qr/$_/i, @a_regexp2;
		for (@comp_re1) {
			push (@links,$url) if $url =~ /$_/;
		}
		for (@comp_re2) {
			push (@links,$url) if $str =~ /$_/;
		}
	}

	undef @ret_links;
	for (@links){
	      $hash{$_}++ || push @links2,$_;
        }

	return @links2;
}

#----------------------------------------------------------------------------
#

sub walkpages{
	my ($url,$regexp1,$regexp2,$regexp3,$regexp4,$eachpage) = @_;

	my %hash;
	my (@in_links, @out_links, @ret_links);
	
	my @a_url    = (ref($url)     ? @$url     : $url    );
	my @a_refol1 = (ref($regexp1) ? @$regexp1 : $regexp1);
	my @a_refol2 = (ref($regexp2) ? @$regexp2 : $regexp2);
	my @a_recol1 = (ref($regexp3) ? @$regexp3 : $regexp3);
	my @a_recol2 = (ref($regexp4) ? @$regexp4 : $regexp4);
	
	# extract links which match top level regexps
	# if LOOP then push on top the array I've just shifted
	my $a_re1 = shift (@a_refol1);
	my $a_re2 = shift (@a_refol2);
	my $empty = !($a_re1 || $a_re2); # are both the arrays empty?

	foreach $s_url (@a_url){
		$h_vurl{$s_url}++;
		print "Working on url $s_url:\n";
		# download url
		$content = getpage ($s_url);
		# NOTE: the values of @a_re1 and @a_re2 are array REFS!
		# first I check for the links to FOLLOW
		my @links = exturl ($content,$a_re1,$a_re2,$s_url);
		
		# then I should check for the links to COLLECT
		push @out_links, exturl ($content,\@a_recol1,\@a_recol2,$s_url) if ($eachpage||$empty);
		
		# if any link is contained in the parent array, delete it
		undef @in_links;
		for (@links){
		      $h_vurl{$_}++ || push @in_links,$_;
        	}

		# foreach link walkpage (link) with shifted regexp array
		if (!$empty){
			push @out_links, walkpages(\@in_links,\@a_refol1,\@a_refol2,\@a_recol1,\@a_recol2,$eachpage);
		}
	}
	for (@out_links){
		$hash{$_}++ || push @ret_links,$_;
	}
	return @ret_links;
}

#----------------------------------------------------------------------------
#

sub walkpages_loop{
	my ($url,$regexp1,$regexp2,$regexp3,$regexp4,$depth) = @_;
	
	my %hash;
	my (@in_links, @out_links, @ret_links);
	
	my @a_url    = (ref($url)     ? @$url     : $url    );
	my @a_refol1 = (ref($regexp1) ? @$regexp1 : $regexp1);
	my @a_refol2 = (ref($regexp2) ? @$regexp2 : $regexp2);
	my @a_recol1 = (ref($regexp3) ? @$regexp3 : $regexp3);
	my @a_recol2 = (ref($regexp4) ? @$regexp4 : $regexp4);
	
	# extract links which match top level regexps
	# if LOOP then push on top the array I've just shifted
	my $a_re1 = shift (@a_refol1);
	my $a_re2 = shift (@a_refol2);

	# push the values at the end of the arrays
	push @a_refol1 , $a_re1; 
	push @a_refol2 , $a_re2;
	
	foreach $s_url (@a_url){
		$h_vurl{$s_url}++;
		print "$depth: Working on url $s_url:\n";
		# download url
		$content = getpage ($s_url);
		# NOTE: the values of @a_re1 and @a_re2 are array REFS!
		# first I check for the links to FOLLOW
		my @links = exturl ($content,$a_re1,$a_re2,$s_url);
		
		# then I should check for the links to COLLECT
		push @out_links, exturl ($content,\@a_recol1,\@a_recol2,$s_url);
		
		# if any link is contained in the parent array, delete it
		undef @in_links;
		for (@links){
		      $h_vurl{$_}++ || push @in_links,$_;
        	}

		# foreach link walkpage (link) with shifted regexp array
		# if depth is !=1, of course
		# and passing (depth-1) or 0 if depth==0
		if ($depth!=1){
			$depth-- if $depth;
			push @out_links, walkpages_loop(\@in_links,\@a_refol1,\@a_refol2,\@a_recol1,\@a_recol2,$depth);
		}
	}
	undef @ret_links;
	for (@out_links){
		$hash{$_}++ || push @ret_links,$_;
	}
	return @ret_links;
}

1;