Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ianyfchang/dc759a538df9dfa04873 to your computer and use it in GitHub Desktop.
Save ianyfchang/dc759a538df9dfa04873 to your computer and use it in GitHub Desktop.
Extracting TSS, Exons, Introns, 5'UTR and 3' UTR from UCSC knownGene.txt and kgXref.txt
import java.io.*;
import java.util.*;
/**
 * Extracts TSS windows, exons, introns, 5' UTRs and 3' UTRs from a UCSC
 * knownGene.txt table and writes each feature class to its own BED file.
 *
 * <p>Usage: {@code java ExtractIntronExonUTRFromKnownGeneAndKgXref
 * <knownGene.txt> <kgXref.txt> <tssFlank>}
 *
 * <p>Columns read from each tab-separated knownGene line (zero-based):
 * 0 = gene id, 1 = chromosome, 2 = strand, 3 = txStart, 4 = txEnd,
 * 5 = cdsStart, 6 = cdsEnd, 7 = exon count, 8 = comma-separated exon starts,
 * 9 = comma-separated exon ends. From each kgXref line, column 0 is the gene
 * id and column 4 is used as its label (presumably the gene symbol, per the
 * UCSC kgXref schema).
 *
 * <p>Output files are created in the current working directory. Their names
 * are the knownGene file name with a trailing "txt" removed, followed by
 * {@code tss<tssFlank>.bed}, {@code introns.bed}, {@code exons.bed},
 * {@code 3UTR.bed} and {@code 5UTR.bed}. Every record has four tab-separated
 * columns: chromosome, start, end, and {@code geneId:label:feature}.
 *
 * <p>Input coordinates are written through unchanged, i.e. they are treated
 * as 0-based half-open intervals, which is what both the UCSC tables and the
 * BED format use.
 */
public class ExtractIntronExonUTRFromKnownGeneAndKgXref {

    /** Number of leading knownGene columns this tool reads (id .. exon ends). */
    private static final int REQUIRED_COLUMNS = 10;

    /** Zero-based kgXref column whose value becomes the gene label. */
    private static final int XREF_LABEL_COLUMN = 4;

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} knownGene.txt path, {@code args[1]}
     *             kgXref.txt path, {@code args[2]} number of bases to extend
     *             on each side of the transcription start site
     * @throws IllegalArgumentException if arguments are missing or a
     *                                  knownGene line is malformed
     * @throws Exception on I/O failure or unparsable numbers
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: java ExtractIntronExonUTRFromKnownGeneAndKgXref"
                            + " <knownGene.txt> <kgXref.txt> <tssFlank>");
        }
        Map<String, String> xref = readXref(args[1]);
        int tss = Integer.parseInt(args[2]);
        String prefix = outputPrefix(args[0]);

        // try-with-resources closes the reader and flushes/closes every writer
        // even when a malformed line aborts processing half-way through.
        try (BufferedReader br = new BufferedReader(new FileReader(args[0]));
                BufferedWriter bwTSS = new BufferedWriter(new FileWriter(prefix + "tss" + tss + ".bed"));
                BufferedWriter bwI = new BufferedWriter(new FileWriter(prefix + "introns.bed"));
                BufferedWriter bwE = new BufferedWriter(new FileWriter(prefix + "exons.bed"));
                BufferedWriter bw3U = new BufferedWriter(new FileWriter(prefix + "3UTR.bed"));
                BufferedWriter bw5U = new BufferedWriter(new FileWriter(prefix + "5UTR.bed"))) {
            String line;
            int lineNumber = 0;
            while ((line = br.readLine()) != null) {
                lineNumber++;
                if (line.trim().isEmpty()) {
                    continue; // tolerate blank lines, e.g. a trailing newline
                }
                String[] x = line.split("\t");
                if (x.length < REQUIRED_COLUMNS) {
                    throw new IllegalArgumentException(args[0] + " line " + lineNumber
                            + ": expected at least " + REQUIRED_COLUMNS
                            + " tab-separated columns but found " + x.length);
                }

                String chrom = x[1];
                String strand = x[2];
                boolean forward = strand.equals("+");
                // Name column prefix "geneId:label:". A gene without a kgXref
                // entry gets the literal label "null" (unchanged legacy output).
                String geneinfo = x[0] + ":" + xref.get(x[0]) + ":";

                int exonCount = Integer.parseInt(x[7]);
                String[] exonStart = x[8].split(",");
                String[] exonEnd = x[9].split(",");
                if (exonStart.length < exonCount || exonEnd.length < exonCount) {
                    throw new IllegalArgumentException(args[0] + " line " + lineNumber
                            + ": exon count " + exonCount
                            + " exceeds the number of listed exon coordinates");
                }

                // Exons are listed in genomic order; on the reverse strand the
                // transcript runs the other way, so numbering counts down.
                for (int i = 0; i < exonCount; i++) {
                    int exonNumber = forward ? i + 1 : exonCount - i;
                    bwE.write(chrom + "\t" + exonStart[i] + "\t" + exonEnd[i] + "\t"
                            + geneinfo + "Exon" + exonNumber + "\n");
                    if (i < exonCount - 1) { // an intron follows every exon but the last
                        int intronNumber = forward ? i + 1 : exonCount - i - 1;
                        bwI.write(chrom + "\t" + exonEnd[i] + "\t" + exonStart[i + 1] + "\t"
                                + geneinfo + "Intron" + intronNumber + "\n");
                    }
                }

                int txStart = Integer.parseInt(x[3]);
                int txEnd = Integer.parseInt(x[4]);
                int cdsStart = Integer.parseInt(x[5]);
                int cdsEnd = Integer.parseInt(x[6]);

                // TSS window: the transcript starts at txStart on the forward
                // strand and at txEnd on the reverse strand. The lower bound is
                // clamped so it never goes below the chromosome start.
                int tssAnchor = forward ? txStart : txEnd;
                int tssStart = Math.max(0, tssAnchor - tss);
                int tssEnd = tssAnchor + tss;
                bwTSS.write(chrom + "\t" + tssStart + "\t" + tssEnd + "\t" + geneinfo + strand + "\n");

                // cdsStart == cdsEnd means there is no coding region, hence no UTRs.
                if (cdsStart != cdsEnd) {
                    // Half-open intervals: the region left of the CDS is
                    // [txStart, cdsStart) and the region right of it is
                    // [cdsEnd, txEnd). No +1/-1 adjustment is needed; adjusting
                    // would drop one base from every UTR.
                    BufferedWriter leftWriter = forward ? bw5U : bw3U;
                    BufferedWriter rightWriter = forward ? bw3U : bw5U;
                    String leftLabel = forward ? "5'UTR" : "3'UTR";
                    String rightLabel = forward ? "3'UTR" : "5'UTR";
                    if (txStart != cdsStart) {
                        leftWriter.write(chrom + "\t" + txStart + "\t" + cdsStart + "\t"
                                + geneinfo + leftLabel + "\n");
                    }
                    if (txEnd != cdsEnd) {
                        rightWriter.write(chrom + "\t" + cdsEnd + "\t" + txEnd + "\t"
                                + geneinfo + rightLabel + "\n");
                    }
                }
            }
        }
    }

    /** Loads the gene id to label mapping (column 0 to column 4) from a kgXref file. */
    private static Map<String, String> readXref(String path) throws IOException {
        Map<String, String> xref = new HashMap<String, String>();
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                String[] x = line.split("\t");
                // split() drops trailing empty fields, so a row whose label and
                // all later columns are empty is shorter than expected.
                String label = x.length > XREF_LABEL_COLUMN ? x[XREF_LABEL_COLUMN] : "";
                xref.put(x[0], label);
            }
        }
        return xref;
    }

    /** Derives the output file-name prefix: the input file name minus a trailing "txt". */
    private static String outputPrefix(String inputPath) {
        String name = new File(inputPath).getName();
        // Only the extension is stripped (the dot is kept as separator);
        // a "txt" elsewhere in the name is preserved.
        return name.endsWith("txt") ? name.substring(0, name.length() - 3) : name;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment