Skip to content

Instantly share code, notes, and snippets.

@sahrendt0
Created November 28, 2012 01:38
Show Gist options
  • Save sahrendt0/4158493 to your computer and use it in GitHub Desktop.
Save sahrendt0/4158493 to your computer and use it in GitHub Desktop.
GEN 220 Week 8
#!/usr/bin/perl
# Script: hw8.pl
# Description: Trim adapter, clean reads
# Author: Steven Ahrendt
# email: sahre001@ucr.edu
##################################
# Adapter: CTGTAGGCACCATCAAT
# trim if it contains a perfect match to first 6 bases
##################################
use warnings;
use strict;
use Bio::Seq;
use Bio::SeqIO;
my $infile = shift;
my ($seq_obj,$seq_str,%seqs);
my $adapter = "CTGTAGGCACCATCAAT";
my $a_m = 6; #Seq must match first 6 bases of adapter for trimming
my $match = substr($adapter,0,$a_m);
my $c = 0;
my $total = 0;
my $clean = 0;
open(my $fh => "zcat $infile |") || die $!;
while(<$fh>)
{
chomp $_;
if($_ =~ m/^>/)
{
if($c != 0)
{
$seq_obj->seq($seq_str);
$total++;
if ((my $a_pos = adapterFind($seq_obj->seq)) > -1)
{
$seq_obj->seq(substr($seq_str,0,$a_pos));
if ($seq_obj->seq !~ m/N+/g)
{
if($seq_obj->length >= 18)
{
$seqs{$seq_obj->display_id} = $seq_obj;
$clean++;
#push(@seqs,$seq_obj);
}
}
}
}
my $header = substr($_,1);
$seq_obj = Bio::Seq->new(-display_id => $header);
$seq_str = "";
}
else
{
$seq_str = $_;
}
$c++;
}
$seq_obj->seq($seq_str);
$total++;
if(($seq_obj->length >= 18) && ($seq_obj->seq !~ m/N+/g) && ($seq_obj->seq =~ m/^$match/))
{
$seqs{$seq_obj->display_id} = $seq_obj;
$clean++;
}
#push(@seqs,$seq_obj);
close($fh);
open(RES,">results");
print RES "$total read(s)\n";
print RES "$clean clean read(s)\n";
#print $seqs[0]->seq,"\n";
open(SIZES,">sizes");
my $clean_reads = Bio::SeqIO->new(-file => ">clean_reads.fa",
-format => "fasta");
foreach my $key (keys %seqs)
{
print SIZES $seqs{$key}->length,"\n";
$clean_reads->write_seq($seqs{$key});
}
close(SIZES);
## Step 2: Cluster
my %cluster;
my $unique_reads = Bio::SeqIO->new(-file => ">unique_reads.fa",
-format => "fasta");
my $unique = 0;
foreach my $key (keys %seqs)
{
$cluster{ $seqs{$key}->seq} = $seqs{$key};
$unique_reads->write_seq($cluster{$seqs{$key}->seq});
$unique++;
}
print RES "$unique unique read(s)\n";
close(RES);
sub adapterFind {
my $seq = shift;
my $pos = -1;
my $len = -1;
while(($pos = index($seq,$match,$pos)) > -1)
{
$len = $pos;
$pos++;
}
return $len;
}
#!/usr/bin/perl
# Script: mirbase_comp.pl
# Description: GEN220 HW8, compare unique reads to mirbase
# Author: Steven Ahrendt
# email: sahre001@ucr.edu
#####################################
use strict;
use warnings;
use Bio::Seq;
use Bio::SeqIO;
use List::MoreUtils 'true';
#my $count = true { /pattern/ } @list_of_strings;
my $mirfile = $ARGV[0];
my $reads_file = $ARGV[1];
my (%mirnas,%clean_reads);
my (@m_seqs, @c_seqs);
%mirnas = readFile($mirfile);
%clean_reads = readFile($reads_file);
foreach my $c_key (keys %clean_reads)
{
push(@c_seqs, $clean_reads{$c_key}->seq);
}
open(OUT,">outfile");
foreach my $keym (keys %mirnas)
{
my $mseq = $mirnas{$keym}->seq;
$mseq =~ s/U/T/g;
next if (!(my $match = true {/$mseq/} @c_seqs));
my $norm = ($match*10000000)/(scalar @c_seqs); # RPTM
print "$keym\t$norm\n";
print OUT "$keym\t$norm\n";
}
close(OUT);
## Subroutine: readFile
# Input: Name of a fasta file (**can be gzipped**)
# Returns: Hash of Bio::Seq objects with display_id as hash key
sub readFile {
my (%seq_hash,$seq_str,$seq_obj,$header);
my $infile = shift;
#print $infile,"\n";
if((split(/\./,$infile))[-1] eq "gz")
{
open(IN => "zcat $infile |") || die $!;
my $c=0;
foreach my $line (<IN>)
{
chomp $line;
#print $line,"\n";
if($line =~ m/^>/)
{
if($c != 0)
{
$seq_obj->seq($seq_str);
$seq_hash{$seq_obj->display_id} = $seq_obj;
}
$header = substr($line,1);
$seq_obj = Bio::Seq->new(-display_id => $header);
$seq_str = "";
}
else
{
$seq_str = $line; #join("",$seq_str,$_);
}
$c++;
}
$seq_obj->seq($seq_str);
$seq_hash{$seq_obj->display_id} = $seq_obj;
close(IN);
}
else
{
my $seqIO_obj = Bio::SeqIO->new(-file => "<$infile",
-format => "fasta");
while($seq_obj = $seqIO_obj->next_seq)
{
$seq_hash{$seq_obj->display_id} = $seq_obj;
}
}
return %seq_hash;
}
#!/usr/bin/perl -w
# Script: seqlen.pl
# Description: Extracts sequence lengths from a fasta file
# Author: Steven Ahrendt
# email: sahrendt0@gmail.com
# Date: 6.17.11
#####################################################
# Usage: seqlen.pl [-v] fastafile
#####################################################
use strict;
use warnings;
use Bio::SeqIO;
use List::Util qw[min max];
my $screen_out = 0;
# Handle command-line options
use Getopt::Std;
my %opts;
getopts('v', \%opts);
if (exists $opts{'v'})
{
$screen_out = 1;
}
my (@tmp,@len);
my @lengths;
my $infile = $ARGV[0];
print `perl -pi -e 's/\r/\n/g' $infile`;
my $numseqs = 0;
my $fastafile = Bio::SeqIO->new(-file=>"<$infile", -format=>"fasta");
my $csvfile = join(".",infilename($infile),"csv");
open(OUT,">$csvfile");
while(my $seq = $fastafile->next_seq)
{
push(@lengths,$seq->length);
print OUT $seq->display_id,",",$seq->length,"\n";
if($screen_out)
{
print $seq->display_id,",",$seq->length,"\n";
}
}
close(OUT);
$numseqs = @lengths;
print "# Seqs:\t$numseqs\n";
print "Min:\t",min(@lengths),"\n";
print "Max:\t",max(@lengths),"\n";
printf("Mean:\t%.2f\n",mean(@lengths));
if($numseqs == 1)
{
printf("Stdv:\tNA\n");
}
else
{
printf("Stdv:\t%.2f\n",stddev(@lengths));
}
print "-----\n";
print "Lengths are in \"$csvfile\"\n";
print "-----\n";
sub stddev {
my @a = @_;
my $n = @a;
my $c = (sum(@a) ** 2)/$n;
my $num = sumsqr(@a) - $c;
#print "\n";
#print sumsqr(@a);
#print " - ";
#print "(",sum(@a),"^2)/($n)\n";
#print "-----------------------\n";
#print $n,"-1\n";
return ($num/($n-1)) ** 0.5;
}
sub mean {
my @a = @_;
my $n = @a;
return (sum(@a)/$n);
}
sub sumsqr {
my @a = @_;
my $ret = 0;
foreach my $i (@a)
{
$ret += ($i**2);
}
return $ret;
}
sub sum {
my @a = @_;
my $ret = 0;
foreach my $i (@a)
{
$ret += $i;
}
return $ret;
}
sub round {
my($number) = shift;
return int($number + .5);
}
sub infilename {
my $file = shift;
return substr($file,0,index($file,"."));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment