#!/usr/bin/perl -w
use strict;
use HTML::LinkExtor;
use URI::URL;
use HTTP::Request;
use HTTP::Response;
use LWP::Parallel::UserAgent;
my $VERSION = "Mozilla/5.0 (compatible; Monrobot/1.1; +http://shunix.com)";
$| = 1; # autoflush so progress messages appear immediately
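
# seed URLs: the crawl starts from these pages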
my @startlinks = (
    "http://www.wanadoo.fr",
    "http://danzcontrib2.free.fr"
);
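
# create and configure the parallel user agent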
my $ua = LWP::Parallel::UserAgent->new();
$ua->agent($VERSION); # identify the robot to the servers it visits
$ua->wait(0);
$ua->protocols_allowed(['http']); # crawl plain HTTP only
$ua->timeout(4); # give up on unresponsive servers after 4 seconds
$ua->env_proxy; # honour proxy settings from the environment
$ua->requests_redirectable(['HEAD']); # only HEAD requests may be redirected automatically
$ua->max_redirect(15); # cap redirect chains at a sane depth
$ua->protocols_forbidden(['mailto', 'https', 'gopher', 'ftp', 'socks', 'file']);
$ua->redirect(1); # follow redirects automatically
$ua->max_hosts(200); # maximum number of hosts accessed in parallel
$ua->max_req(200); # maximum number of parallel requests per host
sub spider (%); # forward declaration so the prototype is known at call time
spider(URL => shift @startlinks); # seed the crawl with the first start link
sub spider (%) {
    my %args = @_;
    push(@startlinks, $args{URL});
  WORKLOOP:
    while (my $link = shift @startlinks) {
        # skip this link if it is queued again later (crude de-duplication)
        for (my $i = 0; $i < @startlinks; $i++) {
            next WORKLOOP if $link eq $startlinks[$i];
        }
        print ">>>>> working on $link\n";
        HTML::LinkExtor->new(
            sub {
                my ($t, %a) = @_;
                # collect href (anchors) and src (images), resolved to
                # absolute URLs against the page being parsed; note that
                # <img> carries its URL in src, not in an img attribute
                my @links = map  { url($_, $link)->abs() }
                            grep { defined } @a{qw/href src/};
                # mark links we already used as seeds for removal
                foreach my $start_link (@startlinks) {
                    my $i = 0;
                    for (0 .. $#links) {
                        if ($links[$i++] eq $start_link) {
                            $links[$i - 1] = "'REMOVE'";
                        }
                    }
                }
                # sort, then mark adjacent duplicates for removal
                @links = sort @links;
                for (my $i = 0; $i < $#links; $i++) {
                    $links[$i] = "'REMOVE'" if $links[$i] eq $links[$i + 1];
                }
                @links = grep { $_ ne "'REMOVE'" } @links;
                print "+ $_\n" foreach @links;
                push @startlinks, @links if @links;
            }
        )->parse(
            do {
                # fetch the page; hand the parser HTML only, an empty string otherwise
                my $r = $ua->simple_request(HTTP::Request->new("GET", $link));
                $r->content_type eq "text/html" ? $r->content : "";
            }
        );
    }
}
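
__END__

Note on parallelism: as written, the crawl above fetches each page with
$ua->simple_request(), which blocks on one URL at a time, so the max_hosts
and max_req settings never actually come into play. Below is a minimal
sketch, assuming the same @startlinks seed list, of the register/wait
pattern that LWP::Parallel::UserAgent is designed around:

    use LWP::Parallel::UserAgent;
    use HTTP::Request;

    my $pua = LWP::Parallel::UserAgent->new();
    $pua->max_hosts(200); # same limits as the configuration above
    $pua->max_req(200);

    # queue every request first; nothing is fetched yet
    foreach my $url (@startlinks) {
        # register() returns undef on success, or an error response
        if (my $err = $pua->register(HTTP::Request->new(GET => $url))) {
            print $err->error_as_HTML;
        }
    }

    # now fetch everything in parallel, with a timeout in seconds
    my $entries = $pua->wait(10);
    foreach my $key (keys %$entries) {
        my $res = $entries->{$key}->response; # an HTTP::Response object
        print $res->request->url, " -> ", $res->code, "\n";
    }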