1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
|
#!/usr/bin/perl
use warnings;
use strict;

# Build a lemma "matrix" file from a tagged English corpus.
#
# Input : one token per line, three tab-separated columns (token, category
#         tag, lemma), read from $fichier_corpus_en.
# Output: $fichier_en, where selected lemmas/tokens are written separated by
#         ", " and each line whose first column starts with A61K<digits>
#         terminates the current row with that column in single quotes.

# English lemmas
my $fichier_corpus_en = 'title-abs-en-A61K-medline-tagger.txt';
my $fichier_en        = 'matrice.txt';

open my $fh_corpus_en, "<", $fichier_corpus_en
    or die "Impossible de lire $fichier_corpus_en : $!\n";
open my $fh_en, ">", $fichier_en
    or die "Impossible d'ouvrir $fichier_en en sortie : $!\n";

# Lemmas matching this pattern are not written by the lemma branch below.
# NOTE(review): the alternatives are unanchored, so any lemma merely
# containing one of them is rejected too (e.g. "dose" contains "do").
# Confirm whether whole-word matching was intended before changing it.
my $lemmes_exclus = qr/<unknown>|be|can|use|do|have|make|show|mg|kg|ml/;

# Return a copy of a raw token with "mg"/"kg"/"ml" (plus one following
# character, if any) removed and every remaining non-word character replaced
# by "XXX". The substitutions run sequentially on purpose: an earlier removal
# can expose a match for a later one.
sub nettoyer_token {
    my ($brut) = @_;
    $brut =~ s/mg.?//g;
    $brut =~ s/kg.?//g;
    $brut =~ s/ml.?//g;
    $brut =~ s/\W/XXX/g;
    return $brut;
}

# Number of row-terminating A61K lines seen; only read by the optional debug
# output kept (commented out) below.
my $id = 0;

while (defined(my $ligne = <$fh_corpus_en>)) {
    chomp($ligne);
    my ($token, $cat, $lemme) = $ligne =~ /^([^\t]+)\t+([^\t]+)\t+([^\t]+)$/;

    # A line that does not have exactly three columns leaves all three
    # variables undefined. Handle it here instead of matching against undef.
    if (!defined $lemme) {
        # Still close the row when such a line carries the A61K marker,
        # labelling it with the first column rather than an empty string.
        if ($ligne =~ /^A61K\d+/) {
            my ($etiquette) = $ligne =~ /^([^\t]+)/;
            print {$fh_en} "'$etiquette'\n";
            $id++;
        }
        next;
    }

    if ($cat =~ /(NN)|(VB)|(JJ)/ and not $lemme =~ $lemmes_exclus) {
        # NN/VB/JJ-tagged entry with an accepted lemma: write the lemma.
        print {$fh_en} $lemme.", ";
    }
    elsif ($cat =~ /NN|NP/ and $lemme =~ /<unknown>/ and not $token =~ /A61K\d+/) {
        # NN- or NP-tagged entry whose lemma is unknown: fall back to the
        # cleaned-up surface token.
        print {$fh_en} nettoyer_token($token).", ";
    }
    elsif ($ligne =~ /^A61K\d+/) {
        # A61K marker line: terminate the current row with the quoted token.
        print {$fh_en} "'$token'\n";
        $id++;
    }
    # print {$fh_en} "ID".$id." ";
}
#print {$fh_en} "ID".$id;

close $fh_corpus_en;
# Buffered write errors only surface at close time, so check it.
close $fh_en or die "Impossible de fermer $fichier_en : $!\n";
Partager