1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
| #!/usr/local/bin/perl
use strict;
use warnings;
#------------------ sub_ClustalW.pl
use Bio::Perl;
use Bio::Tools::Run::Alignment::Clustalw;
use Bio::AlignIO::msf;
use Date::Calc qw(:all);
use FileHandle;
my ($start_year, $start_month, $start_day) = Today([+1]);
my ($start_hour, $start_min, $start_sec) = Now([+1]);
=h
use FileHandle;
my $way = 'P:/Perl/scripts2/ClustalW';
# écriture des calculs de ClustalW dans un fichier de sortie
my $fh = FileHandle->new($way.'/calc.txt');
close $fh;
close STDOUT;
open STDOUT, '>', $way.'P:/Perl/scripts2/ClustalW/calc.txt'
or die "...";
=cut
my $rep = 'P:/Theorie/Jean_Francois/ClustalW/';
my @fichiers = glob ($rep."ITS2_SEQ-1.txt");
@fichiers = grep {$_ !~ /^\.\.?$/} @fichiers;
my @times_array;
foreach my $fich (@fichiers){
# tester le nombre de séquences
my $nbr_seq = 0;
my $in = Bio::SeqIO->new(-file => $fich, '-format' => 'Fasta');
while ( my $seq = $in->next_seq()){
$nbr_seq++;
}
if($nbr_seq > 1){
&ClustalW2 ($fich, $nbr_seq);
}
else{
print "moins de 2 séquences pour $fich\n";
}
}
# my @fichiers_msf = glob ($rep."*.msf");
# unlink (@fichiers_msf);
# calcul du temps d'exécution (prend 25 secondes)
my ($end_year,$end_month,$end_day) = Today([+1]);
my ($end_hour,$end_min,$end_sec) = Now([+1]);
my ($D_y, $D_m, $D_d, $Dh, $Dm, $Ds) =
Delta_YMDHMS($start_year, $start_month, $start_day, $start_hour, $start_min, $start_sec,
$end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec);
map {print "$_";} @times_array;
# nb on tient compte du jour car le script lancer le soir peut s'éxécuter sur 2 jours différents
# néanmoins, on ne tient compte du jour que dans le calcul et on n'affiche que le jour car
# le script tournera moins de 24h
print "\nDate::Calc TOT : ";
print $Dh." h ".$Dm." min ".$Ds." sec\n";
sub ClustalW2 {
my $file = $_[0];
my $nbr_seq = $_[1];
if (-e $file){
my ($path) = ( $file =~ m/(.+)\.txt$/);
my ($file_name) = $file =~ m{.+/(\w+)\.txt$};
# on ne fait l'alignement que si le fichier .fsa
# n'existe pas encore
my $fich_fsa = $path.'.fsa';
if ( ! -e $fich_fsa) {
my ($start_year, $start_month, $start_day) = Today([+1]);
my ($start_hour, $start_min, $start_sec) = Now([+1]);
my $fich_msf = $path.'.msf';
# my $fich_msf_fh = FileHandle->new (">".$fich_msf);
# close $fich_msf_fh;
my @params = (
'gapopen' => 15,
'ktuple' => 4,
'type' => 'dna',
'outfile' => $fich_msf,
'format' => 'Fasta',
'outorder' => 'aligned',
);
# Bio::Perl : reads an array of sequences
my @seq_array = read_all_sequences($file,'fasta');
# and pass the factory a reference to that array
my $seq_array_ref = \@seq_array;
my $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
$factory->executable("C:/ClustalW2/clustalw2.exe");
# $factory->executable("C:/ClustalW/clustalw.exe");
my $aln = $factory->align($seq_array_ref);
# création du fichier fasta
my $in_msf = Bio::AlignIO->new(-file => $fich_msf , -format => 'msf');
my $out_fsa = Bio::AlignIO->new(-file => ">".$fich_fsa , -format => 'fasta');
while ( my $aln = $in_msf->next_aln() ) {
$out_fsa->write_aln($aln);
}
# calcul du temps d'exécution (prend 25 secondes)
my ($end_year,$end_month,$end_day) = Today([+1]);
my ($end_hour,$end_min,$end_sec) = Now([+1]);
my ($D_y, $D_m, $D_d, $Dh, $Dm, $Ds) =
Delta_YMDHMS($start_year, $start_month, $start_day, $start_hour, $start_min, $start_sec,
$end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec);
# nb on tient compte du jour car le script lancer le soir peut s'éxécuter sur 2 jours différents
# néanmoins, on ne tient compte du jour que dans le calcul et on n'affiche que le jour car
# le script tournera moins de 24h
push @times_array, "Date::Calc :\t$file_name\t".$Dh." h ".$Dm." min ".$Ds." sec\t$nbr_seq seq\n";
}
else {
print "\n ===> $fich_fsa existe déjà\n\n";
}
}
else{
print "\n ===> ERREUR AVEC LE FICHIER $file\n\n";
}
}
=h
GAPOPEN Default value DNA: 15.0
valeurs par défaut cf http://www.ebi.ac.uk/Tools/clustalw2/pairgap_frame.html
PARAMETER FOR ALIGNMENT COMPUTATION
KTUPLE
Title : KTUPLE
Description : (optional) set the word size to be used in the alignment
This is the size of exactly matching fragment that is used.
INCREASE for speed (max= 2 for proteins; 4 for DNA),
DECREASE for sensitivity.
For longer sequences (e.g. >1000 residues) you may
need to increase the default
TOPDIAGS
Title : TOPDIAGS
Description : (optional) number of best diagonals to use
The number of k-tuple matches on each diagonal
(in an imaginary dot-matrix plot) is calculated.
Only the best ones (with most matches) are used in
the alignment. This parameter specifies how many.
Decrease for speed; increase for sensitivity.
WINDOW
Title : WINDOW
Description : (optional) window size
This is the number of diagonals around each of the 'best'
diagonals that will be used. Decrease for speed;
increase for sensitivity.
PAIRGAP
Title : PAIRGAP
Description : (optional) gap penalty for pairwise alignments
This is a penalty for each gap in the fast alignments.
It has little affect on the speed or sensitivity except
for extreme values.
FIXEDGAP
Title : FIXEDGAP
Description : (optional) fixed length gap penalty
FLOATGAP
Title : FLOATGAP
Description : (optional) variable length gap penalty
MATRIX
Title : MATRIX
Default : PAM100 for DNA - PAM250 for protein alignment
Description : (optional) substitution matrix used in the multiple
alignments. Depends on the version of clustalw as to
what default matrix will be used
PROTEIN WEIGHT MATRIX leads to a new menu where you are
offered a choice of weight matrices. The default for
proteins in version 1.8 is the PAM series derived by
Gonnet and colleagues. Note, a series is used! The
actual matrix that is used depends on how similar the
sequences to be aligned at this alignment step
are. Different matrices work differently at each
evolutionary distance.
DNA WEIGHT MATRIX leads to a new menu where a single
matrix (not a series) can be selected. The default is
the matrix used by BESTFIT for comparison of nucleic
acid sequences.
TYPE
Title : TYPE
Description : (optional) sequence type: protein or DNA. This allows
you to explicitly overide the programs attempt at
guessing the type of the sequence. It is only useful
if you are using sequences with a VERY strange
composition.
OUTPUT
Title : OUTPUT
Description : (optional) clustalw supports GCG or PHYLIP or PIR or
Clustal format. See the Bio::AlignIO modules for
which formats are supported by bioperl.
OUTFILE
Title : OUTFILE
Description : (optional) Name of clustalw output file. If not set
module will erase output file. In any case alignment will
be returned in the form of SimpleAlign objects
TRANSMIT
Title : TRANSMIT
Description : (optional) transitions not weighted. The default is to
weight transitions as more favourable than other
mismatches in DNA alignments. This switch makes all
nucleotide mismatches equally weighted. |