1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
|
#!/usr/bin/perl -w
####################################
#
# getfastanamesandlength.pl -f fasta-file -o output_file
#
# reads concatanated fasta files, writes name, length, CpGsand GpCs into TAB file #for each fasta sequence
#####################################
use diagnostics;
use strict;
use Carp;
use FileHandle;
use File::Path;
use File::Basename;
use Getopt::Std;
my @methylnam = ();
my @methylseq = ();
my @methylnam1 = ();
my @methylseq1 = ();
my @methylnam1_sorted = ();
my @temp = ();
my $minimum_length = 0; #minimal acceptable sequence length
my $acceptable_proc_n = 100; #minimal acceptable percentage of non ATGC
my $seq_length = 0;
my $proc_n = 0;
my $FILE = FileHandle->new();
my $seq = "";
my $num = 0;
my $i;
my $duplicates_nr = 0;
my $outfile = "";
my $j;
my $seqnumber = 0;
my $seqname = "";
my $CG_number = 0;
my $C_number = 0;
my $G_number = 0;
my $GC_number = 0;
use vars qw( $opt_f $opt_o);
# Command line parsing
#
getopts('f:o:');
#
# Read input file and split into fasta sequences on the fly
#
#replace Mac and PC CRLF by UNIX LF
system ("perl -pi -e 's/\r/\n/g' $opt_f");
open($FILE,$opt_f) || croak(sprintf("Cannot open file \"%s\"",$opt_f));
while ( <$FILE> ) {
chomp;
s/\ /_/g;
if ( /^>(\S+)/)
{
push(@methylnam,$1);
push(@methylseq,"");
}
else
{
$methylseq[-1] .= $_;
}
}
close($FILE) || croak(sprintf("Cannot close open file \"%s\"",$opt_f));
$outfile = $opt_o;
#print header
open($FILE,">".$outfile) || croak(sprintf("Cannot open file \"%s\"",$outfile));
print $FILE "#name\tlength\tCpGs\tGpCs\tCs\tGs\tCpG o\/e\n";
close ($FILE);
open($FILE,">>".$outfile) || croak(sprintf("Cannot open file \"%s\"",$outfile));
for $i (0 .. $#methylnam)
{
@temp = ();
@temp=split('\|',$methylnam[$i]);
$seqname=$temp[1];
print $FILE (($methylnam[$i])."\t".(length($methylseq[$i]))."\t".($methylseq[$i]=~ s/CG/CG/gi)."\t".($methylseq[$i]=~ s/GC/GC/gi)."\t".($methylseq[$i]=~ s/C/C/gi)."\t".($methylseq[$i]=~ s/G/G/gi)."\t".((($methylseq[$i]=~ s/CG/CG/gi)/(($methylseq[$i]=~ s/C/C/gi)*($methylseq[$i]=~ s/G/G/gi)))*(((length($methylseq[$i]))^2)/(length($methylseq[$i]))-1))."\n");
}
close ($FILE); |
Partager