Created
May 5, 2014 13:17
-
-
Save ianyfchang/dc759a538df9dfa04873 to your computer and use it in GitHub Desktop.
Extracts TSS windows, exons, introns, 5' UTRs and 3' UTRs from UCSC knownGene.txt and kgXref.txt as BED files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
import java.util.*; | |
/**
 * Splits a gene table (UCSC knownGene layout) into five BED files: TSS windows,
 * exons, introns, 5' UTRs and 3' UTRs.
 *
 * <p>Columns read from each gene row (tab separated, 0-based index):
 * 0 = transcript id, 1 = chromosome, 2 = strand, 3 = txStart, 4 = txEnd,
 * 5 = cdsStart, 6 = cdsEnd, 7 = exon count, 8 = comma separated exon starts,
 * 9 = comma separated exon ends. Every output record is named
 * {@code <id>:<xref value>:<feature>}, where the xref value is column 4 of the
 * cross-reference row whose column 0 equals the transcript id (the literal
 * text {@code null} when there is no such row).
 *
 * <p>Coordinates are copied through unchanged, so all outputs use the same
 * start-inclusive / end-exclusive convention as the exon columns of the input.
 */
public class ExtractIntronExonUTRFromKnownGeneAndKgXref {

    /**
     * Entry point.
     *
     * @param args {@code args[0]} gene table, {@code args[1]} cross-reference table,
     *             {@code args[2]} number of bases added on each side of the TSS
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws Exception on I/O failure or on a malformed numeric column
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            throw new IllegalArgumentException(
                    "usage: java ExtractIntronExonUTRFromKnownGeneAndKgXref "
                            + "<knownGene.txt> <kgXref.txt> <tssFlank>");
        }
        Map<String, String> xref = loadXref(args[1]);
        int tss = Integer.parseInt(args[2]);
        String prefix = outputPrefix(args[0]);

        // try-with-resources: every stream is flushed and closed even when a row fails to parse.
        try (BufferedReader br = new BufferedReader(new FileReader(args[0]));
             BufferedWriter bwTSS = new BufferedWriter(new FileWriter(prefix + "tss" + tss + ".bed"));
             BufferedWriter bwI = new BufferedWriter(new FileWriter(prefix + "introns.bed"));
             BufferedWriter bwE = new BufferedWriter(new FileWriter(prefix + "exons.bed"));
             BufferedWriter bw3U = new BufferedWriter(new FileWriter(prefix + "3UTR.bed"));
             BufferedWriter bw5U = new BufferedWriter(new FileWriter(prefix + "5UTR.bed"))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Blank lines and '#' header lines carry no gene record.
                if (line.isEmpty() || line.charAt(0) == '#') {
                    continue;
                }
                String[] x = line.split("\t", -1);
                String chrom = x[1];
                boolean forward = x[2].equals("+");
                String geneinfo = x[0] + ":" + xref.get(x[0]) + ":";

                int exonsize = Integer.parseInt(x[7]);
                String[] exonstart = x[8].split(",");
                String[] exonend = x[9].split(",");
                for (int i = 0; i < exonsize; i++) {
                    // Exons are listed in ascending coordinate order; on the reverse
                    // strand the numbering therefore runs backwards.
                    int exonNo = forward ? i + 1 : exonsize - i;
                    writeBed(bwE, chrom, exonstart[i], exonend[i], geneinfo + "Exon" + exonNo);
                    if (i < exonsize - 1) { // a following exon exists, so there is an intron between them
                        int intronNo = forward ? i + 1 : exonsize - i - 1;
                        writeBed(bwI, chrom, exonend[i], exonstart[i + 1], geneinfo + "Intron" + intronNo);
                    }
                }

                int txstart = Integer.parseInt(x[3]);
                int txend = Integer.parseInt(x[4]);
                int cdsstart = Integer.parseInt(x[5]);
                int cdsend = Integer.parseInt(x[6]);

                // TSS window: anchored on txStart for '+', on txEnd otherwise; start clamped at 0.
                int anchor = forward ? txstart : txend;
                writeBed(bwTSS, chrom, String.valueOf(Math.max(0, anchor - tss)),
                        String.valueOf(anchor + tss), geneinfo + x[2]);

                // cdsStart == cdsEnd means there is no coding region, hence no UTRs to report.
                if (cdsstart != cdsend) {
                    // Each UTR is written as one interval from the transcript boundary to the
                    // CDS boundary (not split by exons). The CDS coordinates are used as-is:
                    // with end-exclusive intervals, [txStart, cdsStart) and [cdsEnd, txEnd)
                    // are exactly the non-coding flanks.
                    if (txstart != cdsstart) {
                        BufferedWriter out = forward ? bw5U : bw3U;
                        writeBed(out, chrom, x[3], String.valueOf(cdsstart),
                                geneinfo + (forward ? "5'UTR" : "3'UTR"));
                    }
                    if (txend != cdsend) {
                        BufferedWriter out = forward ? bw3U : bw5U;
                        writeBed(out, chrom, String.valueOf(cdsend), x[4],
                                geneinfo + (forward ? "3'UTR" : "5'UTR"));
                    }
                }
            }
        }
    }

    /** Reads the cross-reference table into a map from column 0 to column 4. */
    private static Map<String, String> loadXref(String path) throws IOException {
        Map<String, String> xref = new HashMap<>();
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = br.readLine()) != null) {
                // limit -1 keeps trailing empty columns, so an empty column 4 maps to ""
                // instead of raising ArrayIndexOutOfBoundsException.
                String[] x = line.split("\t", -1);
                if (x.length > 4) {
                    xref.put(x[0], x[4]);
                }
            }
        }
        return xref;
    }

    /**
     * Derives the output file prefix from the gene table's file name by removing a
     * trailing "txt" only (e.g. "knownGene.txt" becomes "knownGene.").
     */
    private static String outputPrefix(String geneTablePath) {
        return new File(geneTablePath).getName().replaceAll("txt$", "");
    }

    /** Writes one four-column, tab separated BED record terminated by a newline. */
    private static void writeBed(Writer out, String chrom, String start, String end, String name)
            throws IOException {
        out.write(chrom + "\t" + start + "\t" + end + "\t" + name + "\n");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment