Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ianyfchang/254b893eb71529651f1d to your computer and use it in GitHub Desktop.
Save ianyfchang/254b893eb71529651f1d to your computer and use it in GitHub Desktop.
Splits each region (chromosome, start, end, strand) of a BED-like file into a fixed number of bins, listing the bins in reverse order for minus-strand regions
import java.util.zip.*;
import java.io.*;
import java.util.*;
/**
 * Splits every region of a tab-separated BED-like file into a fixed number of
 * bins and prints one BED-style line per bin on standard output.
 *
 * <p>Each input line must have at least four tab-separated columns:
 * chromosome, zero-based start, end, strand. For regions whose strand column
 * is {@code -} the bins are emitted in reverse order, so bin number 1 is
 * always the first bin in the direction of the strand.
 *
 * <p>Usage: {@code java GenerateBinsForRegionsAndReverseByStrand bed-like number-of-bins}
 */
public class GenerateBinsForRegionsAndReverseByStrand{
    // Not referenced anywhere in this class; kept so that code outside this
    // file that may touch the field still compiles.
    static LinkedHashMap<String, String> data = new LinkedHashMap<String, String>();

    /**
     * Reads the BED-like file named by {@code args[0]} (plain text, or gzip
     * when the name ends in {@code .gz}) and writes the bins of every region
     * to standard output. A running count of processed regions is written to
     * standard error.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the
     *             number of bins each region is divided into (positive integer)
     * @throws Exception if the file cannot be read, a coordinate is not an
     *                   integer, or a non-blank line has fewer than 4 columns
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("java -Xmx14g GenerateBinsForRegionsAndReverseByStrand bed-like number-of-bins");
            System.exit(1);
        }
        int bin = Integer.parseInt(args[1]);
        if (bin < 1) {
            // 0 would divide by zero below; a negative count would silently print nothing.
            System.err.println("number-of-bins must be a positive integer, got " + bin);
            System.exit(1);
        }
        /*
        Example input lines (tab-separated in the real file):
        chr1 703405 705146 - bt126 uc001abo.1 - 0 uc001abo.1 - -276 0
        chr1 751975 753075 + bt126 uc009vjn.1 + 0 uc009vjn.1 + 0 0
        chr1 794294 795479 + bt126 uc001abt.2 - 6851 uc001abt.2 - 5667 0
        chr1 829557 831008 + bt126 uc001abu.1 + -6119 uc001abu.1 + -4669 0
        */
        BufferedReader br = openReader(args[0]);
        try {
            String line;
            int lineNo = 0;
            int idx = 0;
            while ((line = br.readLine()) != null) {
                lineNo++;
                if (line.trim().length() == 0) {
                    continue; // tolerate blank lines, e.g. a trailing one
                }
                String[] fields = line.split("\t");
                if (fields.length < 4) {
                    throw new IllegalArgumentException("line " + lineNo
                            + ": expected at least 4 tab-separated columns (chr, start, end, strand) but found "
                            + fields.length);
                }
                // The whole original record, comma-joined, is appended to the name column.
                String original = line.replace('\t', ',');
                // Start is zero-based in the input; +1 makes it one-based for the
                // bin arithmetic, and 1 is subtracted again when printing.
                int first = Integer.parseInt(fields[1]) + 1;
                int last = Integer.parseInt(fields[2]);

                List<int[]> bins = splitIntoBins(first, last, bin);
                if (fields[3].equals("-")) {
                    Collections.reverse(bins);
                }
                for (int j = 0; j < bins.size(); j++) {
                    int[] range = bins.get(j);
                    int start = range[0] - 1; // back to zero-based
                    int end = range[1];
                    // Coordinates are clamped at 0 in the BED columns; the name keeps the raw values.
                    int realstart = Math.max(start, 0);
                    int realend = Math.max(end, 0);
                    // columns: chr, start, end, name (chr,start,end,binNumber,originalRecord)
                    System.out.println(fields[0] + "\t" + realstart + "\t" + realend + "\t"
                            + fields[0] + "," + start + "," + end + "," + (j + 1) + "," + original);
                }
                System.err.print(++idx + "\r");
            }
        } finally {
            br.close();
        }
    }

    /** Opens {@code path} for line reading, decompressing it when the name ends in ".gz". */
    private static BufferedReader openReader(String path) throws IOException {
        InputStream in = new FileInputStream(path);
        try {
            if (path.endsWith(".gz")) {
                in = new GZIPInputStream(in);
            }
            // Platform default charset, as the FileReader used previously.
            return new BufferedReader(new InputStreamReader(in));
        } catch (IOException e) {
            in.close();
            throw e;
        }
    }

    /**
     * Cuts the one-based range [first, last] into {@code count} consecutive
     * {start, end} pairs; the last pair always ends at {@code last}.
     */
    private static List<int[]> splitIntoBins(int first, int last, int count) {
        int size = (last - first + 1) / count;
        List<int[]> bins = new ArrayList<int[]>(count);
        for (int i = 0; i < count; i++) {
            int binStart = first + i * size;
            // NOTE(review): a non-final bin ends on the position where the next
            // bin starts, so adjacent bins share one base. Kept as-is because
            // changing it alters the output; confirm whether this is intended.
            int binEnd = (i == count - 1) ? last : first + (i + 1) * size;
            bins.add(new int[] {binStart, binEnd});
        }
        return bins;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment