sahrendt0 · November 28, 2012 01:38
diff --git a/hw8.pl b/hw8.pl
 #!/usr/bin/perl
 # Script: hw8.pl
 # Description: Trim adapter, clean reads
 # Author: Steven Ahrendt
 # email: sahre001@ucr.edu
 ##################################
 # Adapter: CTGTAGGCACCATCAAT
 #  trim if it contains a perfect match to first 6 bases
 ##################################

 use warnings;
 use strict;
 use Bio::Seq;
 use Bio::SeqIO;

 my $infile = shift;
 my ($seq_obj,$seq_str,%seqs);
 my $adapter = "CTGTAGGCACCATCAAT";
 my $a_m = 6; #Seq must match first 6 bases of adapter for trimming
 my $match = substr($adapter,0,$a_m);
 my $c = 0;
 my $total = 0;
 my $clean = 0;
 open(my $fh => "zcat $infile |") || die $!;
 while(<$fh>)
 {
  chomp $_;
  if($_ =~ m/^>/)
  {
    if($c != 0)
    {
      $seq_obj->seq($seq_str);
      $total++;
      if ((my $a_pos = adapterFind($seq_obj->seq)) > -1)
      {
        $seq_obj->seq(substr($seq_str,0,$a_pos));
        if ($seq_obj->seq !~ m/N+/g)
        {
          if($seq_obj->length >= 18)
          {
            $seqs{$seq_obj->display_id} = $seq_obj;
            $clean++;
            #push(@seqs,$seq_obj);
          }
        }
      }
    }
    my $header = substr($_,1);
    $seq_obj = Bio::Seq->new(-display_id => $header);
    $seq_str = "";
  }
  else
  {
    $seq_str = $_;
  }
  $c++;
 }
 $seq_obj->seq($seq_str);
 $total++;
 if(($seq_obj->length >= 18) && ($seq_obj->seq !~ m/N+/g) && ($seq_obj->seq =~ m/^$match/))
 {
  $seqs{$seq_obj->display_id} = $seq_obj;
  $clean++;
 }
 #push(@seqs,$seq_obj);
 close($fh);

 open(RES,">results");
 print RES "$total read(s)\n";
 print RES "$clean clean read(s)\n";

 #print $seqs[0]->seq,"\n";
 open(SIZES,">sizes");
 my $clean_reads = Bio::SeqIO->new(-file => ">clean_reads.fa",
                                  -format => "fasta");
 foreach my $key (keys %seqs)
 {
  print SIZES $seqs{$key}->length,"\n";
  $clean_reads->write_seq($seqs{$key});
 }
 close(SIZES);

 ## Step 2: Cluster
 my %cluster;
 my $unique_reads = Bio::SeqIO->new(-file => ">unique_reads.fa",
                                   -format => "fasta");
 my $unique = 0;
 foreach my $key (keys %seqs)
 {
  $cluster{ $seqs{$key}->seq} = $seqs{$key};
  $unique_reads->write_seq($cluster{$seqs{$key}->seq});
  $unique++;
 }

 print RES "$unique unique read(s)\n";
 close(RES);

 sub adapterFind {
  my $seq = shift;
  my $pos = -1;
  my $len = -1;
  while(($pos = index($seq,$match,$pos)) > -1)
  {
    $len = $pos;
    $pos++;
  }
  return $len;
 }
diff --git a/mirbase_comp.pl b/mirbase_comp.pl
 #!/usr/bin/perl
 # Script: mirbase_comp.pl
 # Description: GEN220 HW8, compare unique reads to mirbase 
 # Author: Steven Ahrendt
 # email: sahre001@ucr.edu
 #####################################
 use strict;
 use warnings;
 use Bio::Seq;
 use Bio::SeqIO;
 use List::MoreUtils 'true';
 #my $count = true { /pattern/ } @list_of_strings;

 my $mirfile = $ARGV[0];
 my $reads_file = $ARGV[1];
 my (%mirnas,%clean_reads);
 my (@m_seqs, @c_seqs);
 %mirnas = readFile($mirfile);
 %clean_reads = readFile($reads_file);
 foreach my $c_key (keys %clean_reads)
 {
  push(@c_seqs, $clean_reads{$c_key}->seq);
 }

 open(OUT,">outfile");
 foreach my $keym (keys %mirnas)
 {
  my $mseq = $mirnas{$keym}->seq;
  $mseq =~ s/U/T/g;
  next if (!(my $match = true {/$mseq/} @c_seqs));
  my $norm = ($match*10000000)/(scalar @c_seqs); # RPTM
  print "$keym\t$norm\n";
  print OUT "$keym\t$norm\n"; 
 }
 close(OUT);

 ## Subroutine: readFile
 #   Input: Name of a fasta file (**can be gzipped**)
 #   Returns: Hash of Bio::Seq objects with display_id as hash key
 sub readFile {
  my (%seq_hash,$seq_str,$seq_obj,$header);
  my $infile = shift;
 #print $infile,"\n";
  if((split(/\./,$infile))[-1] eq "gz")
  {
    open(IN => "zcat $infile |") || die $!;
    my $c=0;
    foreach my $line (<IN>)
    {
      chomp $line;
      #print $line,"\n";
      if($line =~ m/^>/)
      {
        if($c != 0)
        {
          $seq_obj->seq($seq_str);
          $seq_hash{$seq_obj->display_id} = $seq_obj;
        }
        $header = substr($line,1);
        $seq_obj = Bio::Seq->new(-display_id => $header);
        $seq_str = "";
      }
      else
      {  
        $seq_str = $line; #join("",$seq_str,$_);
      }
      $c++;
    }
    $seq_obj->seq($seq_str);
    $seq_hash{$seq_obj->display_id} = $seq_obj;
    close(IN);
  }
  else
  {
    my $seqIO_obj = Bio::SeqIO->new(-file => "<$infile",
                                    -format => "fasta");
    while($seq_obj = $seqIO_obj->next_seq)
    {
      $seq_hash{$seq_obj->display_id} = $seq_obj;
    }
  }
  return %seq_hash;
 }
diff --git a/seqlen.pl b/seqlen.pl
 #!/usr/bin/perl -w
 # Script: seqlen.pl
 # Description: Extracts sequence lengths from a fasta file
 # Author: Steven Ahrendt
 # email: sahrendt0@gmail.com
 # Date: 6.17.11
 #####################################################
 # Usage: seqlen.pl [-v] fastafile
 #####################################################

 use strict;
 use warnings;
 use Bio::SeqIO;
 use List::Util qw[min max];

 my $screen_out = 0;
 # Handle command-line options
 use Getopt::Std;
 my %opts;
 getopts('v', \%opts);
 if (exists $opts{'v'}) 
 {
  $screen_out = 1;
 }


 my (@tmp,@len);
 my @lengths;
 my $infile = $ARGV[0];
 print `perl -pi -e 's/\r/\n/g' $infile`;

 my $numseqs = 0;

 my $fastafile = Bio::SeqIO->new(-file=>"<$infile", -format=>"fasta");
 my $csvfile = join(".",infilename($infile),"csv");

 open(OUT,">$csvfile");
 while(my $seq = $fastafile->next_seq)
 {
 	push(@lengths,$seq->length);
 	print OUT $seq->display_id,",",$seq->length,"\n";
 	if($screen_out)
  {
    print $seq->display_id,",",$seq->length,"\n";
  }
 }
 close(OUT);

 $numseqs = @lengths;
 print "# Seqs:\t$numseqs\n";
 print "Min:\t",min(@lengths),"\n";
 print "Max:\t",max(@lengths),"\n";
 printf("Mean:\t%.2f\n",mean(@lengths));
 if($numseqs == 1)
 {
 	printf("Stdv:\tNA\n");
 }
 else
 {
 	printf("Stdv:\t%.2f\n",stddev(@lengths));
 }
 print "-----\n";
 print "Lengths are in \"$csvfile\"\n";
 print "-----\n";

 sub stddev {
 	my @a = @_;
 	my $n = @a;
 	my $c = (sum(@a) ** 2)/$n;
 	my $num = sumsqr(@a) - $c;

 	#print "\n";
 	#print sumsqr(@a);
 	#print " - ";
 	#print "(",sum(@a),"^2)/($n)\n";
 	#print "-----------------------\n";
 	#print $n,"-1\n";

 	return ($num/($n-1)) ** 0.5;
 }

 sub mean {
 	my @a = @_;
 	my $n = @a;
 	return (sum(@a)/$n);
 }


 sub sumsqr {
 	my @a = @_;
 	my $ret = 0;
 	foreach my $i (@a)
 	{
 		$ret += ($i**2);
 	}
 	return $ret;
 }

 sub sum {
 	my @a = @_;
 	my $ret = 0;
 	foreach my $i (@a)
 	{
 		$ret += $i;
 	}
 	return $ret;
 }

 sub round {
 	my($number) = shift;
 	return int($number + .5);
 }

 sub infilename {
 	my $file = shift;
 	return substr($file,0,index($file,"."));
 }
	#!/usr/bin/perl
	# Script: hw8.pl
	# Description: Trim adapter, clean reads
	# Author: Steven Ahrendt
	# email: sahre001@ucr.edu
	##################################
	# Adapter: CTGTAGGCACCATCAAT
	# trim if it contains a perfect match to first 6 bases
	##################################

	use warnings;
	use strict;
	use Bio::Seq;
	use Bio::SeqIO;

	my $infile = shift;
	my ($seq_obj,$seq_str,%seqs);
	my $adapter = "CTGTAGGCACCATCAAT";
	my $a_m = 6; #Seq must match first 6 bases of adapter for trimming
	my $match = substr($adapter,0,$a_m);
	my $c = 0;
	my $total = 0;
	my $clean = 0;
	open(my $fh => "zcat $infile \|") \|\| die $!;
	while(<$fh>)
	{
	chomp $_;
	if($_ =~ m/^>/)
	{
	if($c != 0)
	{
	$seq_obj->seq($seq_str);
	$total++;
	if ((my $a_pos = adapterFind($seq_obj->seq)) > -1)
	{
	$seq_obj->seq(substr($seq_str,0,$a_pos));
	if ($seq_obj->seq !~ m/N+/g)
	{
	if($seq_obj->length >= 18)
	{
	$seqs{$seq_obj->display_id} = $seq_obj;
	$clean++;
	#push(@seqs,$seq_obj);
	}
	}
	}
	}
	my $header = substr($_,1);
	$seq_obj = Bio::Seq->new(-display_id => $header);
	$seq_str = "";
	}
	else
	{
	$seq_str = $_;
	}
	$c++;
	}
	$seq_obj->seq($seq_str);
	$total++;
	if(($seq_obj->length >= 18) && ($seq_obj->seq !~ m/N+/g) && ($seq_obj->seq =~ m/^$match/))
	{
	$seqs{$seq_obj->display_id} = $seq_obj;
	$clean++;
	}
	#push(@seqs,$seq_obj);
	close($fh);

	open(RES,">results");
	print RES "$total read(s)\n";
	print RES "$clean clean read(s)\n";

	#print $seqs[0]->seq,"\n";
	open(SIZES,">sizes");
	my $clean_reads = Bio::SeqIO->new(-file => ">clean_reads.fa",
	-format => "fasta");
	foreach my $key (keys %seqs)
	{
	print SIZES $seqs{$key}->length,"\n";
	$clean_reads->write_seq($seqs{$key});
	}
	close(SIZES);

	## Step 2: Cluster
	my %cluster;
	my $unique_reads = Bio::SeqIO->new(-file => ">unique_reads.fa",
	-format => "fasta");
	my $unique = 0;
	foreach my $key (keys %seqs)
	{
	$cluster{ $seqs{$key}->seq} = $seqs{$key};
	$unique_reads->write_seq($cluster{$seqs{$key}->seq});
	$unique++;
	}

	print RES "$unique unique read(s)\n";
	close(RES);

	sub adapterFind {
	my $seq = shift;
	my $pos = -1;
	my $len = -1;
	while(($pos = index($seq,$match,$pos)) > -1)
	{
	$len = $pos;
	$pos++;
	}
	return $len;
	}
	#!/usr/bin/perl
	# Script: mirbase_comp.pl
	# Description: GEN220 HW8, compare unique reads to mirbase
	# Author: Steven Ahrendt
	# email: sahre001@ucr.edu
	#####################################
	use strict;
	use warnings;
	use Bio::Seq;
	use Bio::SeqIO;
	use List::MoreUtils 'true';
	#my $count = true { /pattern/ } @list_of_strings;

	my $mirfile = $ARGV[0];
	my $reads_file = $ARGV[1];
	my (%mirnas,%clean_reads);
	my (@m_seqs, @c_seqs);
	%mirnas = readFile($mirfile);
	%clean_reads = readFile($reads_file);
	foreach my $c_key (keys %clean_reads)
	{
	push(@c_seqs, $clean_reads{$c_key}->seq);
	}

	open(OUT,">outfile");
	foreach my $keym (keys %mirnas)
	{
	my $mseq = $mirnas{$keym}->seq;
	$mseq =~ s/U/T/g;
	next if (!(my $match = true {/$mseq/} @c_seqs));
	my $norm = ($match*10000000)/(scalar @c_seqs); # RPTM
	print "$keym\t$norm\n";
	print OUT "$keym\t$norm\n";
	}
	close(OUT);

	## Subroutine: readFile
	# Input: Name of a fasta file (can be gzipped)
	# Returns: Hash of Bio::Seq objects with display_id as hash key
	sub readFile {
	my (%seq_hash,$seq_str,$seq_obj,$header);
	my $infile = shift;
	#print $infile,"\n";
	if((split(/\./,$infile))[-1] eq "gz")
	{
	open(IN => "zcat $infile \|") \|\| die $!;
	my $c=0;
	foreach my $line (<IN>)
	{
	chomp $line;
	#print $line,"\n";
	if($line =~ m/^>/)
	{
	if($c != 0)
	{
	$seq_obj->seq($seq_str);
	$seq_hash{$seq_obj->display_id} = $seq_obj;
	}
	$header = substr($line,1);
	$seq_obj = Bio::Seq->new(-display_id => $header);
	$seq_str = "";
	}
	else
	{
	$seq_str = $line; #join("",$seq_str,$_);
	}
	$c++;
	}
	$seq_obj->seq($seq_str);
	$seq_hash{$seq_obj->display_id} = $seq_obj;
	close(IN);
	}
	else
	{
	my $seqIO_obj = Bio::SeqIO->new(-file => "<$infile",
	-format => "fasta");
	while($seq_obj = $seqIO_obj->next_seq)
	{
	$seq_hash{$seq_obj->display_id} = $seq_obj;
	}
	}
	return %seq_hash;
	}
	#!/usr/bin/perl -w
	# Script: seqlen.pl
	# Description: Extracts sequence lengths from a fasta file
	# Author: Steven Ahrendt
	# email: sahrendt0@gmail.com
	# Date: 6.17.11
	#####################################################
	# Usage: seqlen.pl [-v] fastafile
	#####################################################

	use strict;
	use warnings;
	use Bio::SeqIO;
	use List::Util qw[min max];

	my $screen_out = 0;
	# Handle command-line options
	use Getopt::Std;
	my %opts;
	getopts('v', \%opts);
	if (exists $opts{'v'})
	{
	$screen_out = 1;
	}


	my (@tmp,@len);
	my @lengths;
	my $infile = $ARGV[0];
	print `perl -pi -e 's/\r/\n/g' $infile`;

	my $numseqs = 0;

	my $fastafile = Bio::SeqIO->new(-file=>"<$infile", -format=>"fasta");
	my $csvfile = join(".",infilename($infile),"csv");

	open(OUT,">$csvfile");
	while(my $seq = $fastafile->next_seq)
	{
	push(@lengths,$seq->length);
	print OUT $seq->display_id,",",$seq->length,"\n";
	if($screen_out)
	{
	print $seq->display_id,",",$seq->length,"\n";
	}
	}
	close(OUT);

	$numseqs = @lengths;
	print "# Seqs:\t$numseqs\n";
	print "Min:\t",min(@lengths),"\n";
	print "Max:\t",max(@lengths),"\n";
	printf("Mean:\t%.2f\n",mean(@lengths));
	if($numseqs == 1)
	{
	printf("Stdv:\tNA\n");
	}
	else
	{
	printf("Stdv:\t%.2f\n",stddev(@lengths));
	}
	print "-----\n";
	print "Lengths are in \"$csvfile\"\n";
	print "-----\n";

	sub stddev {
	my @a = @_;
	my $n = @a;
	my $c = (sum(@a) ** 2)/$n;
	my $num = sumsqr(@a) - $c;

	#print "\n";
	#print sumsqr(@a);
	#print " - ";
	#print "(",sum(@a),"^2)/($n)\n";
	#print "-----------------------\n";
	#print $n,"-1\n";

	return ($num/($n-1)) ** 0.5;
	}

	sub mean {
	my @a = @_;
	my $n = @a;
	return (sum(@a)/$n);
	}


	sub sumsqr {
	my @a = @_;
	my $ret = 0;
	foreach my $i (@a)
	{
	$ret += ($i**2);
	}
	return $ret;
	}

	sub sum {
	my @a = @_;
	my $ret = 0;
	foreach my $i (@a)
	{
	$ret += $i;
	}
	return $ret;
	}

	sub round {
	my($number) = shift;
	return int($number + .5);
	}

	sub infilename {
	my $file = shift;
	return substr($file,0,index($file,"."));
	}