1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
|
#!/usr/bin/env perl
use strict;
use warnings;
# cutter: implementation elided in this listing (`...` is a placeholder; on
# Perl 5.12+ a literal `...` statement dies with "Unimplemented" when run).
# The call sites below invoke it as
#     ($pos, $field) = cutter($start_token, $end_token, $offset, $text);
# i.e. four arguments in, a two-element list out.
# NOTE(review): presumably it returns the text found between the two tokens,
# searching $text from $offset, plus the position where the search stopped --
# confirm against the real body.
sub cutter { ... }
# cutteregexp: implementation elided in this listing (`...` placeholder).
# Only referenced from commented-out calls further down, where it takes the
# same four arguments and is unpacked into the same two-element list as
# cutter().
# NOTE(review): per the comment next to those calls it is meant for tokens
# that contain regular expressions -- confirm against the real body.
sub cutteregexp { ... }
# Load the saved search-results page into $result as a single string.
my $path_file = "search_google.html";

# Three-argument open with a lexical handle: the mode can no longer be
# altered by the file name, and a missing/unreadable file is reported
# immediately instead of silently producing an undefined $result.
open(my $txt_fh, '<', $path_file)
    or die "Cannot open '$path_file' for reading: $!";

# Slurp the whole file in one read; $/ is localised so the change to the
# input record separator does not leak out of this expression.
my $result = do { local $/; <$txt_fh> };
$result = '' unless defined $result;    # keep later index()/substr() calls warning-free

close($txt_fh);
# Scan $result hit by hit. A hit is the text between one "<li class=g>"
# marker and the next; for each hit the URL, heading and summary are pulled
# out with cutter() and printed.
#
# $beg / $end delimit the current hit inside $result; $pos is the running
# offset inside the current snippet, threaded through successive cutter()
# calls. All three are explicitly initialised (a list assignment from a
# single scalar would only set the first one).
my ($beg, $pos, $end) = (0, 0, 0);

while (1) {
    # Delimiter pairs for each field of a hit.
    my $tok1 = "<a href=\"";        # URL starts after this ...
    my $tok2 = "\"";                # ... and ends at the closing quote
    my $tok3 = ">";                 # heading starts after the tag closes ...
    my $tok4 = "</a>";              # ... and ends at the closing anchor
    my $tok5 = "<font size=-1>";    # summary start
    my $tok6 = "<br>";              # summary end
    my $google_tok = "<li class=g>";

    # Locate the extent of the next hit.
    $beg = index($result, $google_tok, $beg);
    $end = index($result, $google_tok, $beg + 1);

    # Stop when no further marker pair exists.
    # NOTE(review): a hit is only processed when another marker follows it,
    # so the final hit on the page is skipped -- confirm this is intended.
    last if $beg == -1 || $end == -1;

    # $result is e.g. the HTML of a Google search; carve out this hit.
    my $snippet = substr($result, $beg, $end - $beg);

    # Plain-string extraction of each field, each search resuming where the
    # previous one stopped.
    my ($url, $heading, $summary);
    ($pos, $url)     = cutter($tok1, $tok2, 0,    $snippet);
    ($pos, $heading) = cutter($tok3, $tok4, $pos, $snippet);
    ($pos, $summary) = cutter($tok5, $tok6, $pos, $snippet);

    # Variant for tokens that contain regular expressions:
    # ($pos, $url)     = cutteregexp($tok1, $tok2, 0,    $snippet);
    # ($pos, $heading) = cutteregexp($tok3, $tok4, $pos, $snippet);
    # ($pos, $summary) = cutteregexp($tok5, $tok6, $pos, $snippet);

    # cutter() is not visible here; treat a missing field as empty so the
    # output below does not trigger "uninitialized value" warnings.
    for my $field ($url, $heading, $summary) {
        $field = '' unless defined $field;
    }

    # Print the extracted fields (not the delimiter tokens).
    print "Title : " . $heading . "<br>";
    print "content : " . $summary . "<br>";
    print "URL : " . $url . "<br>";

    # Continue from the marker that opens the next hit.
    $beg = $end;
}
Partager