| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 
 | #!C:/PROGRA~1/Perl/bin/perl.exe -w
use cgi::carp;
use strict;
use LWP::UserAgent;
use HTTP::Response;
use HTTP::Request;
use LWP::RobotUA;
use URI;
 
#---------------------------------------
# Creation de l'agent
# Create the users agent
#---------------------------------------
my $VERSION = "Bot/1.1 (+http://shunix.com)";
my $ua = LWP::UserAgent->new;
   $ua->agent('Mozilla/5.0 (compatible; Bot/1.1)');
   $ua = new LWP::RobotUA('Mozilla/5.0 (compatible; Bot/1.1)', '!!!!!@free.fr');
#---------------------------------------
# Configuration du robot
# Configure the robot
#---------------------------------------
 
  $ua->protocols_allowed( ['http'] );
  $ua->timeout(4);
  $ua->requests_redirectable( ['HEAD'] );
  $ua->max_redirect(10);
  $ua->protocols_forbidden( ['mailto','https','gopher','ftp','socks','file'] );
 
#----------------------------------------
# Chargement des la liste des site
# Loading site list
#----------------------------------------
my @url = ("http://free.fr");
     @url = allow(@url);
 
#----------------------------------------
# Configuration des variales
# Configuring the variable
#----------------------------------------
$| = 1;
my %urls;
my $url = join("", @url);
 
#----------------------------------------
# Robot code
# Code du robot
#----------------------------------------
  sub explore {
  foreach $url (keys %urls)
  {
  my $base_uri = URI->new( $url );
  my $request = new HTTP::Request('GET', $url);
  my $response = $ua->request($request);
  if ($response->is_success) {
     # print $response->content; # Hear we take the good page
 
     my $link = $response->content;
     print "$link\n";
     my @links;  
        while (my $content =~ m/(?:<a [^>]+ href=|<frame [^>]+ src=) 
          (?: "((?:[^"\\]|\\")+)" ) 
             [^>]*> 
            /xig){ 
            push @links, $1; 
         }
          @links=(@links, $link);
 
       foreach my $lien ( @links) {
       $lien = URI->new_abs( $lien, $base_uri )->as_string();
        $urls{$lien}++; 
         if( $urls{$lien} == 1 ) {
           $url = "";
           $url = $lien;
           &explore($url); # Faire du récusif a l'arrache
          print "Nouveau lien : $lien\n"; 
        } 
      }    
    } 
  }
 }
 
 
#----------------------------------------
# Supprésion des extentions interdites
#----------------------------------------
 
use Regexp::Assemble;
{ # scope pour rendre $re 'statique'
  my $re;
  sub allow
   {
    if( not $re ) {
        my $r = Regexp::Assemble->new;
        $r->add( "\\.$_\$" ) for
            qw/ htm asp aspx nux mspx cfm html xhtml jhtml php php3
                php4 shtml jsp php4 php5 jpg jpeg png gif doc rtf pdf
                xls pm shtm
            /;
        $re = $r->re;
    }
 
    return grep { /$re/ } @_;
  }
} | 
Partager