diff --git a/.gitignore b/.gitignore index a1c2a23..bd8eb75 100644 --- a/.gitignore +++ b/.gitignore @@ -19,5 +19,7 @@ *.tar.gz *.rar +.idea* + # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* diff --git a/msp.iml b/msp.iml new file mode 100644 index 0000000..b1c5c22 --- /dev/null +++ b/msp.iml @@ -0,0 +1,12 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/src/META-INF/MANIFEST.MF b/src/META-INF/MANIFEST.MF index 85846ea..3eb3083 100644 --- a/src/META-INF/MANIFEST.MF +++ b/src/META-INF/MANIFEST.MF @@ -1,3 +1,3 @@ Manifest-Version: 1.0 -Main-Class: buildgraph.BuildDeBruijnGraph +Main-Class: dumbo.OrderingOptimizer diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java deleted file mode 100644 index 00ea1eb..0000000 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ /dev/null @@ -1,190 +0,0 @@ -package buildgraph; - -import buildgraph.Ordering.*; -import buildgraph.Ordering.UHS.UHSFrequencySignatureOrdering; -import buildgraph.Ordering.UHS.UHSSignatureOrdering; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.AbstractMap; -import java.util.HashMap; - -public class BuildDeBruijnGraph { - - public static void main(String[] args) throws IOException { - -// String infile = "/home/gaga/data-scratch/yaelbenari/datas/chr14.fastq"; -// String infile = "/home/gaga/data-scratch/yaelbenari/datas/smalldata.fastq"; - String infile = "/home/gaga/data-scratch/yaelbenari/datas/breastCancer.fastq"; -// String infile = "/home/gaga/data-scratch/yaelbenari/datas/beeData.fastq"; -// String infile = "/home/gaga/data-scratch/yaelbenari/datas/workspace/72.fastq"; - - int k = 60, pivot_len = 7, bufferSize = 8192, numThreads = 1, hsmapCapacity = 10000000; -// int readLen = 124; -// int readLen = 101; - int readLen = 100; - int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// - boolean readable = 
false; - String orderingName = "uhs_freq_sig"; - int xor = 0; //11101101; - - if (args.length > 0 && args[0].equals("-help")) { - System.out.print("Usage: java -jar BuildDeBruijnGraph.jar -in InputPath -k k -L readLength[options]\n" + - "Options Available: \n" + - "[-NB numOfBlocks] : (Integer) Number Of Kmer Blocks. Default: 256" + "\n" + - "[-p pivotLength] : (Integer) Pivot Length. Default: 12" + "\n" + - "[-t numOfThreads] : (Integer) Number Of Threads. Default: 1" + "\n" + - "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" + - "[-o order] : lexico or sig or uhs or uhs_sig" + "\n" + - "[-r readable] : (Boolean) Output Format: true means readable text, false means binary. Default: false" + "\n"); - return; - } - - for (int i = 0; i < args.length; i += 2) { - if(args[i].equals("-in")) - infile = args[i+1]; - else if(args[i].equals("-k")) - k = new Integer(args[i+1]); - else if(args[i].equals("-NB")) - numBlocks = new Integer(args[i+1]); - else - if(args[i].equals("-o")) - orderingName = args[i+1]; - else if(args[i].equals("-p")) - pivot_len = new Integer(args[i+1]); - else if(args[i].equals("-b")) - bufferSize = new Integer(args[i+1]); - else if(args[i].equals("-L")) - readLen = new Integer(args[i+1]); - else if(args[i].equals("-t")) - numThreads = new Integer(args[i+1]); - else if(args[i].equals("-r")) - readable = new Boolean(args[i+1]); - else{ - System.out.println("Wrong with arguments. 
Abort!"); - return; - } - } - - - IOrdering ordering; - switch (orderingName) - { - case "lexico": - ordering = new LexicographicOrdering(pivot_len); - break;; - case "uhs": - ordering = new UHSSignatureOrdering(xor, pivot_len, false, true); - case "random": - //ordering = - break; - } - -// UHSFrequencySignatureOrdering uhs_freq_sig = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, true); -// uhs_freq_sig.initRank(); -// HashMap orderingNames = new HashMap() {{ -// put("lexico", new LexicographicOrdering(pivot_len)); -// put("sig", new LexicographicSignatureOrdering(pivot_len)); -// put("uhs_sig", new UHSSignatureOrdering(xor, pivot_len, false, true)); -// put("uhs_freq", new UniversalFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, false)); -// put("uhs_freq_sig", uhs_freq_sig); -// }}; - - - IOrdering ordering = orderingNames.get(orderingName); -// IOrdering ordering = new LexicographicSignatureOrdering(pivot_len); - Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); - Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity); - - try { - - System.out.println("Program Configuration:"); - System.out.print("Input File: " + infile + "\n" + - "Kmer Length: " + k + "\n" + - "Read Length: " + readLen + "\n" + - "# Of Blocks: " + numBlocks + "\n" + - "Pivot Length: " + pivot_len + "\n" + - "# Of Threads: " + numThreads + "\n" + - "R/W Buffer Size: " + bufferSize + "\n" + - "Ordering: " + orderingName + "\n" + - "x xor: " + xor + "\n" + - "Output Format: " + (readable == true ? 
"Text" : "Binary") + "\n"); - - long maxID = partition.Run(); - - AbstractMap distinctKmersPerPartition = map.Run(numThreads); - BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); - - HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); - BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); -// -// -// long time1 = 0; -// long t1 = System.currentTimeMillis(); -// System.out.println("Merge IDReplaceTables Begin!"); -// String sortcmd = "sort -t $\'\t\' -o IDReplaceTable +0 -1 -n -m Maps/maps*"; -// Runtime.getRuntime().exec(new String[]{"/bin/sh", "-c", sortcmd}, null, null).waitFor(); -// long t2 = System.currentTimeMillis(); -// time1 = (t2 - t1) / 1000; -// System.out.println("Time used for merging: " + time1 + " seconds!"); -// -// Replace replace = new Replace("IDReplaceTable", "OutGraph", k, bufferSize, readLen, maxID); -// replace.Run(readable); - - - } catch (Exception E) { - System.out.println("Exception caught!"); - E.printStackTrace(); - } - - } - - public static HashMap getBytesPerFile() { - File folder = new File("./Nodes"); - File[] listOfFiles = folder.listFiles(); - - HashMap bytesPerFile = new HashMap<>(); - - for (int i = 0; i < listOfFiles.length; i++) { - if (listOfFiles[i].isFile()) - bytesPerFile.put(Long.parseLong(listOfFiles[i].getName().replace("nodes", "")), listOfFiles[i].length()); - } - return bytesPerFile; - } - - public static void writeToFile(AbstractMap data, String fileName) { - File file = new File(fileName); - - BufferedWriter bf = null; - ; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - bf.write("x = {"); - bf.newLine(); - - //iterate map entries - for (java.util.Map.Entry entry : data.entrySet()) { - bf.write(entry.getKey() + ":" + entry.getValue() + ","); - bf.newLine(); - } - bf.write("}"); - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - 
bf.close(); - } catch (Exception e) { - } - } - - } - -} diff --git a/src/buildgraph/Kmer64.java b/src/buildgraph/Kmer64.java deleted file mode 100644 index 9e6ac27..0000000 --- a/src/buildgraph/Kmer64.java +++ /dev/null @@ -1,78 +0,0 @@ -package buildgraph; - -public class Kmer64 extends Object { - - public long high; - public long low; - - private final static char[] baseDic = {'A', 'C', 'G', 'T'}; - private final static int[] intDic = {0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3}; - - private final int base2int(char base) { - return intDic[base - 'A']; - } - - - public Kmer64(char[] str, int start, int end, boolean rev) { - - this.high = this.low = 0; - - int len = end - start; - - if (!rev) { - if (len <= 32) { - for (int i = start; i <= end - 1; i++) { - this.low = (this.low << 2) + base2int(str[i]); - } - } else { - for (int i = end - 32; i <= end - 1; i++) { - this.low = (this.low << 2) + base2int(str[i]); - } - - for (int i = start; i <= end - 33; i++) { - this.high = (this.high << 2) + base2int(str[i]); - } - } - } else { - if (len <= 32) { - for (int i = end - 1; i >= start; i--) { - this.low = (this.low << 2) + 3 ^ base2int(str[i]); - } - } else { - for (int i = start + 31; i >= start; i--) { - this.low = (this.low << 2) + 3 ^ base2int(str[i]); - } - - for (int i = end - 1; i >= start + 32; i--) { - this.high = (this.high << 2) + 3 ^ base2int(str[i]); - } - } - } - - } - - public Kmer64(long low, long high) { - this.low = low; - this.high = high; - } - - @Override - public boolean equals(Object another) { - Kmer64 k = (Kmer64) another; - if (this.high == k.high && this.low == k.low) - return true; - else - return false; - } - - @Override - public int hashCode() { - return (int) ((low ^ (low >>> 32)) ^ (high ^ (high >>> 32))); - } - - - public String toString() { - return this.high + "," + this.low; - } -} - diff --git a/src/buildgraph/Map.java b/src/buildgraph/Map.java deleted file mode 100644 index a98a9d0..0000000 --- 
a/src/buildgraph/Map.java +++ /dev/null @@ -1,316 +0,0 @@ -package buildgraph; - -import java.io.*; -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.CountDownLatch; - - -public class Map{ - - private int k; - private int numOfBlocks; - private int bufSize; - - private Object lock_blocks = new Object(); - - private int capacity; - - private int blockID; - - private long forAndVal; - private long forAndVal32; - - private static int[] valTable = StringUtils.valTable; - - public Map(int kk, int numberOfBlocks, int bufferSize, int HScapacity){ - this.k = kk; - this.numOfBlocks = numberOfBlocks; - this.bufSize = bufferSize; - this.capacity = HScapacity; - this.blockID = 0; - this.forAndVal = (long)Math.pow(2, 2*(k-32)) - 1; - this.forAndVal32 = (long)Math.pow(2, 2*k) - 1; - } - - public class MyThread extends Thread{ - private CountDownLatch threadsSignal; - private HashSet fileNames; - private ConcurrentHashMap distinctKmersPerPartition; - - public MyThread(CountDownLatch threadsSignal, HashSet fileNames, ConcurrentHashMap distinctKmersPerPartition){ - super(); - this.threadsSignal = threadsSignal; - this.fileNames = fileNames; - this.distinctKmersPerPartition = distinctKmersPerPartition; - } - - @Override - public void run(){ - System.out.println(Thread.currentThread().getName() + "Start..."); - - FileReader fr; - BufferedReader bfr; - FileWriter fw; - BufferedWriter bfw; - - - String line; - - int p,j; - long cnt; - Kmer64 k1, k1_rev; - - - try{ - File dir = new File("Maps"); - if(!dir.exists()) - dir.mkdir(); - - while(blockID nodes = new HashMap(capacity); - - while((line = bfr.readLine()) != null){ - - String[] strs = line.split("\t"); - cnt = Long.parseLong(strs[1]); - - long preOriginal = -1, preReplace = -1, Original = -1, Replace = -1; - long diff = -1; - boolean newOut = true, next = false; - - Long ReplaceObj, Replace_revObj; - - char[] lineCharArray = strs[0].toCharArray(); - k1 = new 
Kmer64(lineCharArray,0,k,false); - k1_rev = new Kmer64(lineCharArray,0,k,true); - - int bound = strs[0].length() - k + 1; - - for(j = 0; j < bound; j++){ - - if(j != 0){ - if(k > 32){ - k1 = new Kmer64((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A'], ((k1.high<<2) + valTable[lineCharArray[k+j-33]-'A']) & forAndVal); - k1_rev = new Kmer64((k1_rev.low>>>2) + ((k1_rev.high&3)<<62), (k1_rev.high>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-33)<<1))); - } - else{ - k1 = new Kmer64(((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A']) & forAndVal32, 0); - k1_rev = new Kmer64((k1_rev.low>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-1)<<1)), 0); - } - } - - ReplaceObj = nodes.get(k1); - Replace_revObj = nodes.get(k1_rev); - - if(ReplaceObj == null && Replace_revObj == null){ - nodes.put(k1, cnt+j*2); - - if(!newOut && !next){ - bfw.write(preOriginal+"\t"+preReplace); - bfw.newLine(); - - newOut = true; - } - - } - else{ - if(ReplaceObj!=null){ - Original = cnt+j*2; - Replace = ReplaceObj; - } - else{ - Original = cnt+j*2; - Replace = Replace_revObj+1; - } - - if(newOut){ - bfw.write(Original+"\t"+Replace+"\t"); - newOut = false; - next = true; - } - - else if(Original-preOriginal==2){ - if(next){ - diff = Replace - preReplace; - if(diff==2){ - bfw.write("+\t"); - next = false; - } - else if(diff==-2){ - bfw.write("-\t"); - next = false; - } - else{ - bfw.write("\n"+Original+"\t"+Replace+"\t"); - } - } - else{ - if(Replace - preReplace != diff){ - bfw.write(preOriginal+"\t"+preReplace); - bfw.newLine(); - - bfw.write(Original+"\t"+Replace+"\t"); - next = true; - } - } - } - - else if(next==true){ - - bfw.write("\n"+Original+"\t"+Replace+"\t"); - } - - preOriginal = Original; - preReplace = Replace; - } - - } - - if(!newOut && !next){ - bfw.write(preOriginal+"\t"+preReplace); - bfw.newLine(); - } - else if(next){ - bfw.newLine(); - } - } - - if(p%100 == 0) System.out.println(p); - distinctKmersPerPartition.put((long)p, (long)nodes.size()); - - 
nodes.clear(); - nodes = null; - - bfw.close(); - fw.close(); - bfr.close(); - fr.close(); - bfw = null; - fw = null; - bfr = null; - fr = null; - } - - }catch(Exception E){ - System.out.println("Exception caught!"); - E.printStackTrace(); - } - - threadsSignal.countDown(); - System.out.println(Thread.currentThread().getName() + "End. Remaining" + threadsSignal.getCount() + " threads"); - - } - } - - - private AbstractMap BuildMap(int threadNum, HashSet fileNames) throws Exception{ - CountDownLatch threadSignal = new CountDownLatch(threadNum); - - ConcurrentHashMap distinctKmersPerPartition = new ConcurrentHashMap<>(); - - for(int i=0;i Run(int numThreads) throws Exception{ - long time1=0; - - HashSet fileNames = getNodesFileNames(); - - long t1 = System.currentTimeMillis(); - System.out.println("Build Maps Begin!"); - AbstractMap distinctKmersPerPartition= BuildMap(numThreads, fileNames); - long t2 = System.currentTimeMillis(); - time1 = (t2-t1)/1000; - System.out.println("Time used for building maps: " + time1 + " seconds!"); - - return distinctKmersPerPartition; - - } - - private HashSet getNodesFileNames(){ - File[] files = (new File("./Nodes")).listFiles(); - List fileNames = new LinkedList<>(); - for(File file : files){ - if(file.isFile()){ - fileNames.add(file.getName()); - } - } - return new HashSet<>(fileNames); - } - - public static void main(String[] args){ - - int k = 15, numBlocks = 256, numThreads = 1, bufferSize = 8192, hsmapCapacity = 1000000; - - if(args[0].equals("-help")){ - System.out.print("Usage: java -jar Map.jar -k k -NB numOfBlocks [options]\n" + - "Options Available: \n" + - "[-t numOfThreads] : (Integer) Number Of Threads. Default: 1" + "\n" + - "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" + - "[-c capacity] : (Integer) Hashmap Capacity. 
Default: 1000000" + "\n"); - return; - } - - for(int i=0; i 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - for (int i = 0; i < len; i++) { - if (a[froma + i] < b[fromb + i]) - return -1; - else if (a[froma + i] > b[fromb + i]) - return 1; - } - return 0; - } -} diff --git a/src/buildgraph/Ordering/LexicographicSignatureOrdering.java b/src/buildgraph/Ordering/LexicographicSignatureOrdering.java deleted file mode 100644 index 7b816c4..0000000 --- a/src/buildgraph/Ordering/LexicographicSignatureOrdering.java +++ /dev/null @@ -1,39 +0,0 @@ -package buildgraph.Ordering; - -import buildgraph.StringUtils; - -import java.io.IOException; - -public class LexicographicSignatureOrdering extends LexicographicOrdering { - private SignatureUtils signatureUtils; - private StringUtils stringUtils; - public LexicographicSignatureOrdering(int pivotLen) throws IOException { - super(pivotLen); - signatureUtils = new SignatureUtils(pivotLen); - stringUtils = new StringUtils(); - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { -// boolean aAllowed = signatureUtils.isAllowed(a, froma, froma + len); -// boolean bAllowed = signatureUtils.isAllowed(b, fromb, fromb + len); - int x = stringUtils.getDecimal(a, froma, froma + pivotLen); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen); - boolean aAllowed = signatureUtils.isAllowed(a, froma, x); - boolean bAllowed = signatureUtils.isAllowed(b, fromb, y); - - if (!aAllowed && bAllowed) { - return 1; - } else if (!bAllowed && aAllowed) { - return -1; - } - - for (int i = 0; i < len; i++) { - if (a[froma + i] < b[fromb + i]) - return -1; - else if (a[froma + i] > b[fromb + i]) - return 1; - } - return 0; - } -} diff --git a/src/buildgraph/Ordering/SignatureUtils.java b/src/buildgraph/Ordering/SignatureUtils.java deleted file mode 100644 index 8212663..0000000 --- a/src/buildgraph/Ordering/SignatureUtils.java 
+++ /dev/null @@ -1,44 +0,0 @@ -package buildgraph.Ordering; - -import java.util.HashMap; - -public class SignatureUtils { - - private int len; - protected byte[] isPmerAllowed; - - public SignatureUtils(int len){ - this.len = len; - isPmerAllowed = new byte[(int)Math.pow(4, len)]; - } - - public boolean isAllowed(char[] a, int from, int aDecimal) { - int isAllowed = isPmerAllowed[aDecimal]; - if(isAllowed != 0){ - return isAllowed == 1; - } - - int lastIndex = from + len - 1; - if (a[from] == 'A' && a[from + 2] == 'A') { - if (a[from + 1] <= 'C') { // C or A - isPmerAllowed[aDecimal] = -1; - return false; - } - } else if (a[lastIndex] == 'T' && a[lastIndex - 2] == 'T') { - if (a[lastIndex - 1] >='G') { // G or T - isPmerAllowed[aDecimal] = -1; - return false; - } - } - - for (int i = from + 2; i < lastIndex; i++) { - if (a[i] == 'A' && a[i + 1] == 'A') { - isPmerAllowed[aDecimal] = -1; - return false; - } - } - isPmerAllowed[aDecimal] = 1; - return true; - } - -} diff --git a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java deleted file mode 100644 index a0a0231..0000000 --- a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java +++ /dev/null @@ -1,98 +0,0 @@ -package buildgraph.Ordering.UHS; - -import java.io.*; - -public class UHSFrequencySignatureOrdering extends UHSSignatureOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private long[] pmerFrequency; - private boolean isInit; - - public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature, boolean useCache) throws IOException { - super(0, pivotLen, useSignature, useCache); - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - pmerFrequency = new long[(int)Math.pow(4, pivotLen)]; - isInit = false; - } - - @Override - public void initRank() throws IOException { - initFrequency(); - super.initRank(); - 
isRankInit = true; - } - - protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) throws IOException { - int baseCompareValue = strcmpBase(x, y); - if (baseCompareValue != BOTH_IN_UHS) { - return baseCompareValue; - } - - // from down here - both in UHS - - if(useSignature){ - if (!xAllowed && yAllowed) { - return 1; - } else if (!yAllowed && xAllowed) { - return -1; - } - } - - // both allowed or both not allowed - if(pmerFrequency[x] == pmerFrequency[y]){ - if(x 1000000){ - break; - } - } - } - bfrG.close(); - frG.close(); - } - - - - -} diff --git a/src/buildgraph/Ordering/UHS/UHSOrderingBase.java b/src/buildgraph/Ordering/UHS/UHSOrderingBase.java deleted file mode 100644 index 3820b30..0000000 --- a/src/buildgraph/Ordering/UHS/UHSOrderingBase.java +++ /dev/null @@ -1,147 +0,0 @@ -package buildgraph.Ordering.UHS; - -import buildgraph.Ordering.IOrdering; -import buildgraph.StringUtils; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; - -public abstract class UHSOrderingBase implements IOrdering { - - protected byte[] uhsBits; - protected StringUtils stringUtils; - - protected static final int BOTH_IN_UHS = 824; - protected int pivotLen; - - protected int[] rankOfPmer; - protected boolean isRankInit; - - - public UHSOrderingBase(int pivotLen) throws IOException { - this.pivotLen = pivotLen; - stringUtils = new StringUtils(); - uhsBits = uhsBitSet(pivotLen); - rankOfPmer = new int[(int) Math.pow(4, pivotLen)]; - Arrays.fill(rankOfPmer, Integer.MAX_VALUE); - isRankInit = false; - } - - protected abstract int calculateStrcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException; - - - public boolean isInUHS(int pmerDecimal) { - int pmerDecimalDiv8 = pmerDecimal >> 3; - int pmerDecimalMod8 = pmerDecimal & 0b111; - if (((this.uhsBits[pmerDecimalDiv8] >> (pmerDecimalMod8)) & 1) == 1) { - return true; - } - 
return false; - } - - public boolean isInUHS(char[] a, int from, int to) { - return isInUHS(stringUtils.getDecimal(a, from, to)); - } - - protected int strcmpBase(int x, int y) { - if (x == y) - return 0; - - boolean xInUHS = isInUHS(x); - boolean yInUHS = isInUHS(y); - if (xInUHS && !yInUHS) { - return -1; - } else if (!xInUHS && yInUHS) { - return 1; - } - return BOTH_IN_UHS; - } - - private byte[] uhsBitSet(int pivotLen) throws IOException { - int n = (int) Math.pow(4, pivotLen) / 8; - int i = 0; - byte[] bits = new byte[n]; - - String DocksFile = "res_" + pivotLen + ".txt"; - FileReader frG = new FileReader(DocksFile); - int count = 0; - - BufferedReader reader; - try { - reader = new BufferedReader(frG); - String line; - while ((line = reader.readLine()) != null) { - i = stringUtils.getDecimal(line.toCharArray(), 0, pivotLen); - bits[i / 8] |= 1 << (i % 8); - count++; - } - reader.close(); - } catch (IOException e) { - e.printStackTrace(); - } - System.out.println(count); - frG.close(); - - return bits; - } - - public void initRank() throws IOException { - System.out.println("start init rank"); - HashSet pmers = getPmersInUHS(); - char[][] pmersArr = new char[pmers.size()][pivotLen]; - pmers.toArray(pmersArr); - Arrays.sort(pmersArr, (o1, o2) -> { - try { - return calculateStrcmp(o1, o2, 0, 0, pivotLen); - } catch (IOException e) { - e.printStackTrace(); - } - return 0; - }); - for (int i = 0; i < pmersArr.length; i++) { - rankOfPmer[stringUtils.getDecimal(pmersArr[i], 0, pivotLen)] = i; - } - System.out.println("finish init rank"); - } - - private HashSet getPmersInUHS() { - HashSet pmers = new HashSet<>(); - StringBuilder sb = new StringBuilder(pivotLen); - for (int i = 0; i < pivotLen; i++) sb.append('A'); - generate(pmers, sb, 0); - return pmers; - - } - - private void generate(HashSet pmers, StringBuilder sb, int n) { - char[] alphabet = {'A', 'C', 'G', 'T'}; - if (n == sb.capacity()) { - char[] pmer = sb.toString().toCharArray(); - if (isInUHS(pmer, 0, 
pivotLen)) { - pmers.add(pmer); - } - return; - } - for (char letter : alphabet) { - sb.setCharAt(n, letter); - generate(pmers, sb, n + 1); - } - } - - protected static HashMap pivotLengthToHexRepresentation = new HashMap() { - { - put(5, 0x3ff); - put(6, 0xfff); - put(7, 0x3fff); - put(8, 0xffff); - put(10, 0xfffff); - put(12, 0xffffff); - } - - }; - -} diff --git a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java b/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java deleted file mode 100644 index 136aa5b..0000000 --- a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java +++ /dev/null @@ -1,105 +0,0 @@ -package buildgraph.Ordering.UHS; - -import buildgraph.Ordering.SignatureUtils; -import buildgraph.StringUtils; - -import java.io.IOException; - -public class UHSSignatureOrdering extends UHSOrderingBase { - private SignatureUtils signatureUtils; - protected boolean useSignature; - private boolean useCache; - protected int xor; - - - public UHSSignatureOrdering(int xor, int pivotLen, boolean useSignature, boolean useCache) throws IOException { - super(pivotLen); - this.xor = xor; - this.useSignature = useSignature; - this.useCache = useCache; - signatureUtils = new SignatureUtils(pivotLen); - } - - public UHSSignatureOrdering(int pivotLen, boolean useSignature, boolean useCache) throws IOException { - this(0, pivotLen, useSignature, useCache); - } - - - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException { - if(!isRankInit) throw new IOException("rank not initialized yet"); - - int x = stringUtils.getDecimal(a, froma, froma + pivotLen); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen); - - if (x == y) return 0; - - // isRankInit = true here - if (rankOfPmer[x] < rankOfPmer[y]) { - return -1; - } - return 1; - } - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - int j = stringUtils.getDecimal(a, min_pos, min_pos + pivotLen); - int prev 
= j; - boolean prevAllowed = signatureUtils.isAllowed(a, min_pos, prev), jAllowed = true; - int hexRepresentation = pivotLengthToHexRepresentation.get(pivotLen); - for (int i = from + 1; i <= to - pivotLen; i++) { - j = ((j * 4) ^ (StringUtils.valTable[a[i + pivotLen - 1] - 'A'])) & hexRepresentation; - - if (useSignature) - jAllowed = signatureUtils.isAllowed(a, i, j); - - if (isInUHS(j)) { - if (strcmpSignature(prev, j, prevAllowed, jAllowed) > 0) { - min_pos = i; - prev = j; - } - - } - prevAllowed = jAllowed; - } - return min_pos; - } - - protected int calculateStrcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException { - int x = stringUtils.getDecimal(a, froma, froma + pivotLen); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen); - - if (x == y) return 0; - - boolean aAllowed = true, bAllowed = true; - if (useSignature) { - aAllowed = signatureUtils.isAllowed(a, froma, x); - bAllowed = signatureUtils.isAllowed(b, fromb, y); - } - - return strcmpSignature(x, y, aAllowed, bAllowed); - } - - protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) throws IOException { - int baseCompareValue = strcmpBase(x, y); - if (baseCompareValue != BOTH_IN_UHS) { - return baseCompareValue; - } - // from down here - both in UHS - if (useSignature) { - if (!xAllowed && yAllowed) { - return 1; - } else if (!yAllowed && xAllowed) { - return -1; - } - } - // both allowed or both not allowed - if ((x ^ xor) < (y ^ xor)) - return -1; - else - return 1; - - } -} diff --git a/src/buildgraph/Partition.java b/src/buildgraph/Partition.java deleted file mode 100644 index 5a43374..0000000 --- a/src/buildgraph/Partition.java +++ /dev/null @@ -1,245 +0,0 @@ -package buildgraph; - -import buildgraph.Ordering.IOrdering; -import buildgraph.Ordering.UHS.UHSOrderingBase; - -import java.io.*; - -public class Partition { - - private int k; - private String inputfile; - private int numOfBlocks; - private int pivotLen; - private int bufSize; - - 
private FileReader frG; - private BufferedReader bfrG; - private FileWriter[] fwG; - private BufferedWriter[] bfwG; - - private int readLen; - private IOrdering ordering; - - private StringUtils stringUtils; - - private int numOpenFiles; - - - public Partition(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrdering ordering) { - this.k = kk; - this.inputfile = infile; - this.numOfBlocks = numberOfBlocks; - this.pivotLen = pivotLength; - this.bufSize = bufferSize; - this.readLen = readLen; - this.ordering = ordering; - this.stringUtils = new StringUtils(); - this.numOpenFiles = 0; - } - - - private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throws IOException { - - int len = a.length; - int pos1 = ordering.findSmallest(a, from, to); - int pos2 = ordering.findSmallest(b, len - to, len - from); - - if (ordering.strcmp(a, b, pos1, pos2, pivotLen) < 0) { - flag[0] = 0; - return pos1; - } else { - flag[0] = 1; - return pos2; - } - } - - private int calPosNew(char[] a, int from, int to) { - return stringUtils.getDecimal(a, from, to) % numOfBlocks; - } - - private long DistributeNodes() throws IOException { - frG = new FileReader(inputfile); - bfrG = new BufferedReader(frG, bufSize); - fwG = new FileWriter[numOfBlocks]; - bfwG = new BufferedWriter[numOfBlocks]; - - String describeline; - - int prepos, substart = 0, subend, min_pos = -1; - - char[] lineCharArray = new char[readLen]; - - int[] flag = new int[1]; - - long cnt = 0, outcnt = 0; - - File dir = new File("Nodes"); - if (!dir.exists()) - dir.mkdir(); - - - while ((describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - prepos = -1; - if (stringUtils.isReadLegal(lineCharArray)) { - - substart = 0; - - outcnt = cnt; - - int len = readLen; - - char[] revCharArray = stringUtils.getReversedRead(lineCharArray); - - min_pos = findPosOfMin(lineCharArray, revCharArray, 0, k, flag); - - cnt += 2; - - int bound = 
len - k + 1; - - for (int i = 1; i < bound; i++) { - - if (i > (flag[0] == 0 ? min_pos : len - min_pos - pivotLen)) { - - int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - min_pos = findPosOfMin(lineCharArray, revCharArray, i, i + k, flag); - - if (temp != (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen))) { - prepos = temp; - subend = i - 1 + k; - - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - - substart = i; - outcnt = cnt; - } - - } else { - - if (ordering.strcmp(lineCharArray, revCharArray, k + i - pivotLen, len - i - k, pivotLen) < 0) { - if (ordering.strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLen, min_pos, pivotLen) < 0) { - boolean enter = true; - if (ordering instanceof UHSOrderingBase) { - if (!((UHSOrderingBase) ordering).isInUHS(lineCharArray, k + i - pivotLen, k + i)) { - enter = false; - } - } - if (enter) { - int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - min_pos = k + i - pivotLen; - - if (temp != calPosNew(lineCharArray, min_pos, min_pos + pivotLen)) { - prepos = temp; - subend = i - 1 + k; - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - - substart = i; - outcnt = cnt; - } - - flag[0] = 0; - } - } - } else { - if (ordering.strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLen) < 0) { - boolean enter = true; - if (ordering instanceof UHSOrderingBase) { - if (!((UHSOrderingBase) ordering).isInUHS(revCharArray, len - i - k, len - i - k + pivotLen)) { - enter = false; - } - } - if (enter) { - int temp = (flag[0] == 0 ? 
calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - min_pos = -k - i + len; - - if (temp != calPosNew(revCharArray, min_pos, min_pos + pivotLen)) { - prepos = temp; - subend = i - 1 + k; - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - - substart = i; - outcnt = cnt; - } - flag[0] = 1; - } - } - } - } - - cnt += 2; - } - subend = len; - prepos = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - } - } - - System.out.println("Largest ID is " + cnt); - - for (int i = 0; i < bfwG.length; i++) { - if (bfwG[i] != null) { - bfwG[i].close(); - fwG[i].close(); - } - } - - bfrG.close(); - frG.close(); - - return cnt; - } - - private void tryCreateWriterForPmer(int prepos) throws IOException { - if (numOpenFiles == 16000) { - for (int i = 0; i < bfwG.length; i++) { - if (bfwG[i] != null) { - bfwG[i].close(); - fwG[i].close(); - bfwG[i] = null; - fwG[i] = null; - } - } - Runtime.getRuntime().gc(); - numOpenFiles = 0; - } - - if (bfwG[prepos] == null) { - fwG[prepos] = new FileWriter("Nodes/nodes" + prepos, true); - bfwG[prepos] = new BufferedWriter(fwG[prepos], bufSize); - numOpenFiles += 1; - } - } - - private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray, long outcnt) throws IOException { - tryCreateWriterForPmer(prepos); - - BufferedWriter writer = bfwG[prepos]; - - writer.write(lineCharArray, substart, subend - substart); - writer.write("\t" + outcnt); - writer.newLine(); - } - - public long Run() throws Exception { - long time1 = 0; - long t1 = System.currentTimeMillis(); - System.out.println("Distribute Nodes Begin!"); - long maxID = DistributeNodes(); - long t2 = System.currentTimeMillis(); - time1 = (t2 - t1) / 1000; - System.out.println("Time used for distributing nodes: " + time1 + " seconds!"); - return maxID; - } 
- -} \ No newline at end of file diff --git a/src/buildgraph/Replace.java b/src/buildgraph/Replace.java deleted file mode 100644 index 0c6e6d5..0000000 --- a/src/buildgraph/Replace.java +++ /dev/null @@ -1,237 +0,0 @@ -package buildgraph; - -import java.io.*; - -public class Replace { - - private String replaceTableFile; - private String outputGraphFile; - private int k; - private int bufSize; - private long largestID; - - private FileReader fr; - private BufferedReader bfr; - private FileWriter fw; - private BufferedWriter bfw; - - private int readLen; - - public Replace(String infile, String outfile, int k, int bufferSize, int readLen, long largestID){ - this.replaceTableFile = infile; - this.outputGraphFile = outfile; - this.k = k; - this.bufSize = bufferSize; - this.readLen = readLen; - this.largestID = largestID; - } - - private void DoReplace() throws IOException{ - fr = new FileReader(replaceTableFile); - bfr = new BufferedReader(fr, bufSize); - fw = new FileWriter(outputGraphFile); - bfw = new BufferedWriter(fw, bufSize); - - long originalID, replaceID; - - String str; - String[] strs = null; - - if((str=bfr.readLine())!=null){ - strs = str.split("\t"); - originalID = new Long(strs[0]); - replaceID = new Long(strs[1]); - } - else{ - originalID = Long.MAX_VALUE; - replaceID = Long.MAX_VALUE; - } - - int modValue = ((readLen-k+1)<<1); - - for(long i=0; i 3){ - long rangeEnd = Long.parseLong(strs[4]); - if(strs[2].equals("+")){ - for(long temp=replaceID+2; temp<=rangeEnd; temp+=2){ - bfw.write(temp + " "); - } - } - else if(strs[2].equals("-")){ - for(long temp=replaceID-2; temp>=rangeEnd; temp-=2){ - bfw.write(temp + " "); - } - } - i = Long.parseLong(strs[3]); - } - - if((str=bfr.readLine())!=null){ - strs = str.split("\t"); - originalID = new Long(strs[0]); - replaceID = new Long(strs[1]); - } - else{ - originalID = Long.MAX_VALUE; - replaceID = Long.MAX_VALUE; - } - } - else{ - bfw.write(i + " "); - } - - if((i+2) % modValue == 0) - bfw.newLine(); - } - - 
bfw.close(); - fw.close(); - bfr.close(); - fr.close(); - } - - private void DoReplaceBin() throws IOException{ - fr = new FileReader(replaceTableFile); - bfr = new BufferedReader(fr, bufSize); - DataOutputStream out = null; - out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new File(outputGraphFile)), bufSize)); - - long originalID, replaceID; - - String str; - String[] strs = null; - - if((str=bfr.readLine())!=null){ - strs = str.split("\t"); - originalID = new Long(strs[0]); - replaceID = new Long(strs[1]); - } - else{ - originalID = Long.MAX_VALUE; - replaceID = Long.MAX_VALUE; - } - - - for(long i=0; i 3){ - long rangeEnd = Long.parseLong(strs[4]); - if(strs[2].equals("+")){ - for(long temp=replaceID+2; temp<=rangeEnd; temp+=2){ - out.writeLong(temp); - } - } - else if(strs[2].equals("-")){ - for(long temp=replaceID-2; temp>=rangeEnd; temp-=2){ - out.writeLong(temp); - } - } - i = Long.parseLong(strs[3]); - } - - if((str=bfr.readLine())!=null){ - strs = str.split("\t"); - originalID = new Long(strs[0]); - replaceID = new Long(strs[1]); - } - else{ - originalID = Long.MAX_VALUE; - replaceID = Long.MAX_VALUE; - } - } - else{ - out.writeLong(i); - } - - } - - out.close(); - bfr.close(); - fr.close(); - } - - public void Run(boolean readable) throws Exception{ - - long time1=0; - - long t1 = System.currentTimeMillis(); - System.out.println("Replace IDs Begin!"); - - if(readable) - DoReplace(); - else - DoReplaceBin(); - - long t2 = System.currentTimeMillis(); - time1 = (t2-t1)/1000; - System.out.println("Time used for replacing IDs: " + time1 + " seconds!"); - - } - - public static void main(String[] args){ - - String infile = "E:\\test.txt"; - String outfile = "E:\\testOut.txt"; - int k = 15, bufferSize = 8192, readLen = 101; - long largestID = 0; - boolean readable = false; - - if(args[0].equals("-help")){ - System.out.print("Usage: java -jar Replace.jar -in InputTablePath -out outGraphPath -k k -L readLength -m largestID[options]\n" + - 
"Options Available: \n" + - "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" + - "[-r readable] : (Boolean) Output Format: true means readable text, false means binary. Default: false" + "\n"); - return; - } - - for(int i=0; i min_pos) { + min_pos = ordering.findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + frequencies[minValueNormalized] += k; + } else if (ordering.compareMmer(currentValue, minValue) < 0) { + int lastIndexInWindow = k + i - pivotLength; + min_pos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + frequencies[minValueNormalized] += k; + } + else + frequencies[minValueNormalized]++; + } + } + } + + + protected void initFrequency() throws Exception { + boolean keepSample = true; + long numSampled = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + + String describeline, line; + char[] lineCharArray; + + int readLen; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + line = bfrG.readLine(); + readLen = line.length(); + lineCharArray = line.toCharArray(); + + if(readLen < k) + continue; + + concurrentCounter(lineCharArray); + numSampled += readLen - k; + if (numSampled > statisticsSamples) + keepSample = false; + } + bfrG.close(); + frG.close(); + } + + + public long[] getStatistics() { + long[] stats = new long[numMmers]; + for (int i = 0; i < numMmers; i++) { + stats[i] = frequencies[i]; + } + return stats; + } + + +} diff --git a/src/dumbo/ExportUtils.java b/src/dumbo/ExportUtils.java new file mode 100644 index 0000000..d3aee6d --- /dev/null +++ b/src/dumbo/ExportUtils.java @@ -0,0 +1,146 @@ +package dumbo; + +import java.io.*; +import java.util.AbstractMap; +import java.util.HashMap; +import java.util.LinkedList; 
+ +public class ExportUtils { + public void exportOrderingForCpp(long[] currentOrdering) { + File file = new File("ranks.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < currentOrdering.length; i++) { + bf.write(Long.toString(currentOrdering[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } + +// public long[] importOrdering(String fileName, int pivotLength) throws Exception { +// String line; +// LinkedList ranks = new LinkedList<>(); +// +// File file = new File(fileName); +// BufferedReader bfr = null; +// +// try { +// bfr = new BufferedReader(new FileReader(file)); +// while ((line = bfr.readLine()) != null) { +// ranks.add(Long.getLong(line)); +// } +// +// } catch (IOException e) { +// e.printStackTrace(); +// } finally { +// bfr.close(); +// } +// +// if (ranks.size() != (int) Math.pow(4, pivotLength)) { +// throw new Exception("rank file of wrong size"); +// } +// int i = 0; +// long[] ordering = new long[(int) Math.pow(4, pivotLength)]; +// while (ranks.size() > 0) { +// ordering[i] = ranks.pop(); +// i++; +// } +// return ordering; +// +// } + + public void exportBinningForCpp(long[] statFrequency) { + File file = new File("freq.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < statFrequency.length; i++) { + bf.write(Long.toString(statFrequency[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } + + public HashMap getBytesPerFile() { + File folder = new File("./Nodes"); + File[] listOfFiles = folder.listFiles(); + + HashMap bytesPerFile = new HashMap<>(); + + for (int i = 0; i < listOfFiles.length; i++) { + if 
(listOfFiles[i].isFile()) + bytesPerFile.put(Long.parseLong(listOfFiles[i].getName().replace("nodes", "")), listOfFiles[i].length()); + } + return bytesPerFile; + } + + public void writeToFile(long[] arr, String fileName) { + HashMap map = new HashMap<>(); + for (long i = 0; i < arr.length; i++) { + map.put(i, arr[(int)i]); + } + writeToFile(map, fileName); + } + + public void writeToFile(AbstractMap data, String fileName) { + File file = new File(fileName); + + BufferedWriter bf = null; + + + try { + bf = new BufferedWriter(new FileWriter(file)); + + bf.write("x = {"); + bf.newLine(); + + //iterate map entries + for (java.util.Map.Entry entry : data.entrySet()) { + bf.write(entry.getKey() + ":" + entry.getValue() + ","); + bf.newLine(); + } + bf.write("}"); + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + + } +} diff --git a/src/dumbo/LoadCounter.java b/src/dumbo/LoadCounter.java new file mode 100644 index 0000000..a1334da --- /dev/null +++ b/src/dumbo/LoadCounter.java @@ -0,0 +1,210 @@ +package dumbo; + +import dumbo.Ordering.OrderingBase; +import dumbo.Ordering.Standard.SignatureUtils; +import dumbo.StringUtils; +import net.agkn.hll.HLL; +import net.agkn.hll.HLLType; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; + +import java.security.MessageDigest; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; + +public class LoadCounter { + private String inputFile; + private int readLen; + private int bufSize; + private int k; + private HashMap frequency; + private Object[] frequencyLocks; + private int[] frequencies; + private OrderingBase ordering; 
+ + private StringUtils stringUtils; + + private int pivotLength; + private long statisticsSamples; + + private int mask; + private int numMmers; + + + + + public LoadCounter( + int pivotLength, String infile, int readLen, int bufSize, int k, long statisticsSamples, OrderingBase ordering) { + this.pivotLength = pivotLength; + this.statisticsSamples = statisticsSamples; + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.k = k; + numMmers = (int) Math.pow(4, pivotLength); + frequency = new HashMap<>(numMmers); + frequencies = new int[numMmers]; + this.ordering = ordering; + stringUtils = new StringUtils(); + mask = numMmers - 1; + frequencyLocks = new Object[numMmers + 1]; + for (int i = 0; i < frequencyLocks.length - 1; i++) { + frequencyLocks[i] = new Object(); + } + } + + + private void concurrentCounter(char[] lineCharArray) throws Exception { + int min_pos, minValue, minValueNormalized, currentValue, numSampled = 0; + + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = ordering.findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + + updateStatistics(minValueNormalized, line, 0); + + int bound = readLen - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; + + if (i > min_pos) { + min_pos = ordering.findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + } else { + int lastIndexInWindow = k + i - pivotLength; + if (ordering.compareMmer(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue 
= currentValue; + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + } + } + updateStatistics(minValueNormalized, line, i); + } + } + } + + + protected void initFrequency() throws Exception { + + + boolean keepSample = true; + long numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + + String describeline; + char[] lineCharArray = new char[readLen]; + + ThreadPoolExecutor executor = + (ThreadPoolExecutor) Executors.newFixedThreadPool(1); + + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + +// char[] localLineCharArray = lineCharArray.clone(); +// executor.submit(() -> { +// concurrentCounter(localLineCharArray); +// return null; +// }); + + concurrentCounter(lineCharArray); + numSampled += readLen - k; + if (numSampled > statisticsSamples) + keepSample = false; + + } + + executor.shutdown(); + bfrG.close(); + frG.close(); + } + + private void updateStatistics(int minValueNormalized, String line, int startPosition) { +// synchronized (frequencyLocks[minValueNormalized]) +// { + if (!frequency.containsKey(minValueNormalized)) +// frequency.put(minValueNormalized, new HLL(11, 5)); /// about 3gb of ram before going to sparse + frequency.put(minValueNormalized, new HLL(11, 5, 0, true, HLLType.FULL)); + frequency.get(minValueNormalized).addRaw(hashString(stringUtils.getCanonical(line.substring(startPosition, k + startPosition)))); +// } + //synchronized (frequencyLocks[numMmers]) + frequencies[minValueNormalized]++; + + + } + + private long hashString(String s) { + long h = 1125899906842597L; // prime + int len = s.length(); + for (int i = 0; i < len; i++) { + h = 31 * h + s.charAt(i); + } + return h; + } + + + public long[] getStatistics() { + long[] stats = new long[numMmers]; + for (int i = 0; i < numMmers; i++) { + if (frequency.containsKey(i)) { + stats[i] = 
frequency.get(i).cardinality(); + } +// if (i < stringUtils.getReversedMmer(i, pivotLength)) { +// stats[i] += 1000; +// } + } + + + + // pure counters +// System.out.println("x = ["); +// for (int i = 0; i < stats.length; i++) { +// System.out.print(stats[i]+ ", "); +// } +// System.out.println("]"); + + + // all ratios +// System.out.println("x = ["); +// for (int j = 0; j < stats.length; j++) { +// if(frequencies[j] != 0) +// System.out.print((float) stats[j] / frequencies[j] + ", "); +// else +// System.out.print("0, "); +// } +// System.out.println("]"); +// ConcurrentLinkedQueuex = new ConcurrentLinkedQueue<>(); +// x.remove() + +// long max = Arrays.stream(stats).max().getAsLong(); +// for (int i = 0; i < numMmers; i++) { +// if (stats[i] > 0 && stats[i] * 1.1 > max) { +// stats[i] *= 1.1; +// } +// } + return stats; + } + + +} diff --git a/src/dumbo/MinimizerCounter.java b/src/dumbo/MinimizerCounter.java new file mode 100644 index 0000000..fbfb63f --- /dev/null +++ b/src/dumbo/MinimizerCounter.java @@ -0,0 +1,84 @@ +package dumbo; + +import dumbo.Ordering.OrderingBase; + +import java.io.*; +import java.util.*; + +public class MinimizerCounter { + + private int k; + private String kmerSetFile; + private int pivotLen; + private int bufSize; + + private FileReader frG; + private BufferedReader bfrG; + + private OrderingBase ordering; + + private StringUtils stringUtils; + + private long[] minimizerCounters; + + + public MinimizerCounter(int kk, String kmerSetFile, int pivotLength, int bufferSize, OrderingBase ordering) { + this.k = kk; + this.kmerSetFile = kmerSetFile; + this.pivotLen = pivotLength; + this.bufSize = bufferSize; + this.ordering = ordering; + this.stringUtils = new StringUtils(); + minimizerCounters = new long[(int) Math.pow(4, pivotLength)]; + } + + + private long[] getMinimizersCounters() throws Exception { + frG = new FileReader(kmerSetFile); + bfrG = new BufferedReader(frG, bufSize); + + + String describeline, line; + + int minPos; + char[] 
lineCharArray; + + + int minValue, minValueNormalized, currentValue, start; + while ((describeline = bfrG.readLine()) != null) { + +// bfrG.read(lineCharArray, 0, k); +// bfrG.read(); + + line = bfrG.readLine(); + int readLen = line.length(); + if(readLen != k) + throw new Exception("Input row is not of length k"); + lineCharArray = line.toCharArray(); + + if (stringUtils.isReadLegal(lineCharArray)) { + minPos = ordering.findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, minPos, minPos + pivotLen); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLen); + minimizerCounters[minValueNormalized]++; + } + } + + bfrG.close(); + frG.close(); + return minimizerCounters.clone(); + } + + public long[] Run() throws Exception { + long time1 = 0; + long t1 = System.currentTimeMillis(); + System.out.println("Minimizers counting Begin!"); + long[] counters = getMinimizersCounters(); + + long t2 = System.currentTimeMillis(); + time1 = (t2 - t1) / 1000; + System.out.println("Time used for counting minimizers appearances: " + time1 + " seconds!"); + return counters; + } + +} \ No newline at end of file diff --git a/src/dumbo/Ordering/IterativeOrdering.java b/src/dumbo/Ordering/IterativeOrdering.java new file mode 100644 index 0000000..7725809 --- /dev/null +++ b/src/dumbo/Ordering/IterativeOrdering.java @@ -0,0 +1,225 @@ +package dumbo.Ordering; + +import dumbo.Ordering.Standard.SignatureUtils; +import dumbo.StringUtils; + +import java.io.*; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; + +public class IterativeOrdering extends OrderingBase { + private String inputFile; + + private int bufSize; + private int k; + private SignatureUtils signatureUtils; + private HashMap> frequency; + + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; + + private double percentagePunishment; + + private long[] 
statFrequency; + + private boolean useSignature; + + private boolean initialized; + + + public IterativeOrdering( + int pivotLength, String infile, int bufSize, int k, int roundSamples, int rounds, + int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature) { + super(pivotLength); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.percentagePunishment = percentagePunishment; + this.useSignature = useSignature; + this.inputFile = infile; + + this.bufSize = bufSize; + this.k = k; + signatureUtils = new SignatureUtils(pivotLength); + initialized = false; + } + + public IterativeOrdering( + int pivotLength, String infile, int bufSize, int k, int roundSamples, int rounds, + int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, int[] initialOrdering) { + this(pivotLength, infile, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); + mmerRanks = initialOrdering.clone(); + initialized = true; + badArgumentsThrow(); + } + + public IterativeOrdering( + int pivotLength, String infile, int bufSize, int k, int roundSamples, int rounds, + int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, OrderingBase initialOrdering) throws IOException { + this(pivotLength, infile, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); + mmerRanks = initialOrdering.getRanks().clone(); + initialized = true; + badArgumentsThrow(); + } + + private void badArgumentsThrow() { + if (mmerRanks.length != numMmers) + throw new IllegalArgumentException("initialOrdering is not of correct size"); + if (useSignature) + throw new IllegalArgumentException("Can't initialize ordering from outside with useSignature as true"); + } + + + protected void initFrequency() throws Exception { + + if 
(!initialized) { + for (int i = 0; i < numMmers; i++) { + int canonical = Math.min(i, stringUtils.getReversedMmer(i, pivotLength)); + mmerRanks[i] = canonical; + mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] = canonical; + } + if (useSignature) { + for (int i = 0; i < numMmers; i++) { + if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { + mmerRanks[i] += numMmers; + mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; + } + } + } + } + + + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + statFrequency = new long[numMmers]; + HashMap> pmerFrequency = new HashMap<>(roundSamples * 2); + + String skippedDescribeLine, line; + char[] lineCharArray;// = new char[readLen]; + int readLen; + + + int min_pos = -1; + int minValue, currentValue, minValueNormalized; + + while (keepSample && (skippedDescribeLine = bfrG.readLine()) != null) { + + line = bfrG.readLine(); + readLen = line.length(); + lineCharArray = line.toCharArray(); + + if(readLen < k) + continue; + +// bfrG.read(lineCharArray, 0, readLen); +// bfrG.read(); +// String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + + updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, 0); + + int bound = readLen - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, 
min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + + updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i); + } else { + int lastIndexInWindow = k + i - pivotLength; + if (compareMmer(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + + updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i); + } + } + updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i); + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { // TODO: SHOULD THIS BE < and not <= + numSampled = 0; + adaptOrdering(pmerFrequency); + pmerFrequency.clear(); + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + normalize(); + bfrG.close(); + frG.close(); + } + + private void updateStatistics(int roundNumber, HashMap> pmerFrequency, int minValueNormalized, String line, int startPosition) { + if (roundNumber == rounds) + statFrequency[minValueNormalized]++; + else { + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(stringUtils.getCanonical(line.substring(startPosition, k + startPosition))); + } + } + + + private void adaptOrdering(HashMap> pmerFrequency) { + int[] frequencies = new int[numMmers]; + for (Integer i : pmerFrequency.keySet()) { + frequencies[i] = pmerFrequency.get(i).size(); + } + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < frequencies.length; k++) { + if (frequencies[k] > biggest) { + biggest = frequencies[k]; + biggestIndex = k; + } + } +// TODO: might not be necessary to change both. 
+ int newRank = mmerRanks[biggestIndex] + (int) (numMmers * percentagePunishment); + mmerRanks[biggestIndex] = newRank; + mmerRanks[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = newRank; + frequencies[biggestIndex] = 0; + frequencies[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = 0; + } + } + + + @Override + public void initializeRanks() throws Exception { + isRankInitialized = true; + initFrequency(); + } + +} diff --git a/src/dumbo/Ordering/OrderingBase.java b/src/dumbo/Ordering/OrderingBase.java new file mode 100644 index 0000000..ecf76db --- /dev/null +++ b/src/dumbo/Ordering/OrderingBase.java @@ -0,0 +1,73 @@ +package dumbo.Ordering; + +import dumbo.StringUtils; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; + +public abstract class OrderingBase { + + protected int pivotLength; + protected int numMmers; + protected int mask; + + protected StringUtils stringUtils; + + protected int[] mmerRanks; + protected boolean isRankInitialized; + + public OrderingBase(int pivotLength) { + this.pivotLength = pivotLength; + this.numMmers = (int) Math.pow(4, pivotLength); + this.mask = numMmers - 1; + this.stringUtils = new StringUtils(); + this.mmerRanks = new int[numMmers]; + this.isRankInitialized = false; + } + + + public abstract void initializeRanks() throws Exception; + + public int compareMmer(int x, int y) throws Exception { + if (!isRankInitialized) + throw new Exception("problema - rank not initialized"); + + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + + if (a == b) return 0; + if (mmerRanks[a] < mmerRanks[b]) return -1; + return 1; + } + + public int[] getRanks() { + return mmerRanks.clone(); + } + + public int findSmallest(char[] a, int from, int to) throws Exception { + int min_pos = from; + int minValue = stringUtils.getDecimal(a, min_pos, min_pos + pivotLength); + int currentValue = minValue; + for (int i = from + 1; i <= to - 
pivotLength; i++) { + currentValue = ((currentValue << 2) + StringUtils.valTable[a[i + pivotLength - 1] - 'A']) & mask; + if (compareMmer(minValue, currentValue) > 0) { + min_pos = i; + minValue = currentValue; + } + } + + return min_pos; + } + + protected void normalize() { + Integer[] temp = new Integer[mmerRanks.length]; + for (int i = 0; i < temp.length; i++) + temp[i] = i; + + Arrays.sort(temp, Comparator.comparingLong(a -> mmerRanks[a])); + for (int i = 0; i < temp.length; i++) { + mmerRanks[temp[i]] = i; + } + } +} diff --git a/src/dumbo/Ordering/Standard/LexicographicOrdering.java b/src/dumbo/Ordering/Standard/LexicographicOrdering.java new file mode 100644 index 0000000..4a318e0 --- /dev/null +++ b/src/dumbo/Ordering/Standard/LexicographicOrdering.java @@ -0,0 +1,36 @@ +package dumbo.Ordering.Standard; + + +import dumbo.Ordering.OrderingBase; + +import java.io.IOException; +import java.util.Arrays; + +public class LexicographicOrdering extends OrderingBase { + + public LexicographicOrdering(int pivotLength) { + super(pivotLength); + } + + @Override + public void initializeRanks() throws IOException { + Integer[] mmers = new Integer[numMmers]; + for (int i = 0; i < mmers.length; i++) { + mmers[i] = i; + } + + Arrays.sort(mmers, this::rawCompareMmer); + for (int i = 0; i < mmers.length; i++) { + mmerRanks[mmers[i]] = i; + } + System.out.println("finish init rank"); + isRankInitialized = true; + } + + + protected int rawCompareMmer(int x, int y) { + return Integer.compare(stringUtils.getNormalizedValue(x, pivotLength), stringUtils.getNormalizedValue(y, pivotLength)); + } + + +} diff --git a/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java b/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java new file mode 100644 index 0000000..f5e93aa --- /dev/null +++ b/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java @@ -0,0 +1,54 @@ +package dumbo.Ordering.Standard; + +import java.io.IOException; +import java.util.Arrays; +import 
java.util.HashSet; + +public class LexicographicSignatureOrdering extends LexicographicOrdering { + protected SignatureUtils signatureUtils; + + public LexicographicSignatureOrdering(int pivotLen) throws IOException { + super(pivotLen); + signatureUtils = new SignatureUtils(pivotLen); + } + + @Override + public void initializeRanks() throws IOException { + Arrays.fill(mmerRanks, Integer.MAX_VALUE); + + HashSet normalizedAllowedMmers = new HashSet<>(); + for (int i = 0; i < numMmers; i++) { + if (signatureUtils.isAllowed(stringUtils.getNormalizedValue(i, pivotLength))) + normalizedAllowedMmers.add(stringUtils.getNormalizedValue(i, pivotLength)); + } + + Integer[] mmers = new Integer[normalizedAllowedMmers.size()]; + normalizedAllowedMmers.toArray(mmers); + + Arrays.sort(mmers); + + for (int i = 0; i < mmers.length; i++) { + mmerRanks[mmers[i]] = i; + } + normalize(); + System.out.println("finish init rank"); + isRankInitialized = true; + } + + @Override + protected int rawCompareMmer(int x, int y) { + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + + boolean aAllowed = signatureUtils.isAllowed(a); + boolean bAllowed = signatureUtils.isAllowed(b); + + if (!aAllowed && bAllowed) { + return 1; + } else if (!bAllowed && aAllowed) { + return -1; + } + + return Integer.compare(a, b); + } +} diff --git a/src/dumbo/Ordering/Standard/RandomOrdering.java b/src/dumbo/Ordering/Standard/RandomOrdering.java new file mode 100644 index 0000000..bb0a0c5 --- /dev/null +++ b/src/dumbo/Ordering/Standard/RandomOrdering.java @@ -0,0 +1,41 @@ +package dumbo.Ordering.Standard; + +import dumbo.Ordering.OrderingBase; + +import java.io.IOException; +import java.util.Arrays; + +public class RandomOrdering extends OrderingBase { + private int xor; + + public RandomOrdering(int pivotLen, int xor) { + super(pivotLen); + this.xor = xor; + } + + @Override + public void initializeRanks() throws IOException { + Integer[] mmers = new 
/**
 * Memoized filter deciding which m-mers are allowed as signatures.
 *
 * <p>An m-mer is given either as a packed integer (two bits per base) or as a
 * window of a {@code char[]}. Verdicts are cached per packed value in
 * {@link #isPmerAllowed}: {@code 0} = not evaluated yet, {@code 1} = allowed,
 * {@code -1} = rejected.
 *
 * <p>Base letters in the comments follow the labels of the original code,
 * which imply the encoding A=0, C=1, G=2, T=3.
 * NOTE(review): confirm against {@code StringUtils.valTable}.
 *
 * <p>NOTE(review): the two overloads apply different rule sets (for example,
 * the {@code char[]} overload has no "TT*" suffix rule and starts its "AA"
 * scan at {@code from + 2}) but share one cache, so a verdict may depend on
 * which overload saw the m-mer first. Behaviour is kept as is.
 */
public class SignatureUtils {

    private static final byte ALLOWED = 1;
    private static final byte REJECTED = -1;

    private int len;
    protected byte[] isPmerAllowed;

    /**
     * @param len m-mer length in bases; the cache holds {@code 4^len} entries
     */
    public SignatureUtils(int len) {
        this.len = len;
        isPmerAllowed = new byte[(int) Math.pow(4, len)];
    }

    /**
     * Tells whether a packed m-mer is an allowed signature.
     *
     * <p>The verdict is always cached under the m-mer that was queried.
     * Previously the value was shifted in place during the scan and the verdict
     * was stored under the shifted prefix, corrupting the entry of an unrelated
     * m-mer and leaving the queried one uncached.
     *
     * @param mmer packed m-mer, used as index into the cache
     * @return {@code true} if allowed
     */
    public boolean isAllowed(int mmer) {
        byte cached = isPmerAllowed[mmer];
        if (cached != 0) {
            return cached == ALLOWED;
        }

        boolean allowed = evaluatePacked(mmer);
        isPmerAllowed[mmer] = allowed ? ALLOWED : REJECTED;
        return allowed;
    }

    /**
     * Tells whether the m-mer starting at {@code a[from]} is an allowed
     * signature, caching the verdict under {@code aDecimal}.
     *
     * @param a        read characters
     * @param from     start index of the m-mer in {@code a}
     * @param aDecimal packed value of that m-mer, used as cache index
     * @return {@code true} if allowed
     */
    public boolean isAllowed(char[] a, int from, int aDecimal) {
        byte cached = isPmerAllowed[aDecimal];
        if (cached != 0) {
            return cached == ALLOWED;
        }

        boolean allowed = evaluateWindow(a, from);
        isPmerAllowed[aDecimal] = allowed ? ALLOWED : REJECTED;
        return allowed;
    }

    /** Applies the packed-integer rules without touching the cache. */
    private boolean evaluatePacked(int mmer) {
        if ((mmer & 0x3f) == 0x3f) { // TTT suffix
            return false;
        }
        if ((mmer & 0x3f) == 0x3b) { // TGT suffix
            return false;
        }
        // Originally labelled "TG* suffix !!!! consider issue #152"; under the
        // encoding implied by the two checks above, this mask tests "TT*".
        if ((mmer & 0x3c) == 0x3c) {
            return false;
        }

        // Work on a copy so the caller's index stays intact.
        int rest = mmer;
        for (int j = 0; j < len - 3; ++j) {
            if ((rest & 0xf) == 0) { // AA inside
                return false;
            }
            rest >>= 2;
        }

        // 'rest' now holds the three leading bases.
        if (rest == 0) { // AAA prefix
            return false;
        }
        if (rest == 0x04) { // ACA prefix
            return false;
        }
        if ((rest & 0xf) == 0) { // *AA prefix
            return false;
        }
        return true;
    }

    /** Applies the character-window rules without touching the cache. */
    private boolean evaluateWindow(char[] a, int from) {
        int lastIndex = from + len - 1;
        if (a[from] == 'A' && a[from + 2] == 'A') {
            if (a[from + 1] <= 'C') { // C or A
                return false;
            }
        } else if (a[lastIndex] == 'T' && a[lastIndex - 2] == 'T') {
            if (a[lastIndex - 1] >= 'G') { // G or T
                return false;
            }
        }

        for (int i = from + 2; i < lastIndex; i++) {
            if (a[i] == 'A' && a[i + 1] == 'A') {
                return false;
            }
        }
        return true;
    }

}
"iterativeOrdering"; + int numRounds = 0, elementsToPush = 0, samplesPerRound = 0; + long statSamples = 0; + double punishPercentage = 1; + String version = "10"; + String kmerSetFile = null; + + if (args.length > 0 && args[0].equals("-help")) { + System.out.print("Usage: java -jar BuildDeBruijnGraph.jar -in InputPath -k k -L readLength[options]\n" + + "Options Available: \n" + + "[-p pivotLength] : (Integer) Pivot Length. Default: 12" + "\n" + + "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" + + "[-o order] : lexico or sig or uhs or uhs_sig" + "\n"); + return; + } + + for (int i = 0; i < args.length; i += 2) { + if (args[i].equals("-in")) + infile = args[i + 1]; + else if (args[i].equals("-v")) + version = args[i + 1]; + else if (args[i].equals("-k")) + k = new Integer(args[i + 1]); + else if (args[i].equals("-kmers-file")) + kmerSetFile = args[i + 1]; + +// else +// if(args[i].equals("-o")) +// orderingName = args[i+1]; + else if (args[i].equals("-p")) + pivot_len = new Integer(args[i + 1]); + else if (args[i].equals("-b")) + bufferSize = new Integer(args[i + 1]); + else if (args[i].equals("-L")) + readLen = new Integer(args[i + 1]); + else if (args[i].equals("-rounds")) + numRounds = new Integer(args[i + 1]); + else if (args[i].equals("-samples")) + samplesPerRound = new Integer(args[i + 1]); + else if (args[i].equals("-elementsToPush")) + elementsToPush = new Integer(args[i + 1]); + else if (args[i].equals("-statSamples")) + statSamples = new Long(args[i + 1]); + else if (args[i].equals("-punishPercentage")) + punishPercentage = new Double(args[i + 1]); + else { + System.out.println("Wrong with arguments. 
Abort!"); + System.out.println(args[i]); + return; + } + } + + System.out.println("Optimizing an ordering:"); + System.out.print("Input File: " + infile + "\n" + + "Kmer Length: " + k + "\n" + + "Pivot Length: " + pivot_len + "\n" + + "R/W Buffer Size: " + bufferSize + "\n" + + "Read length" + readLen + "\n" + + "Ordering: " + orderingName + "\n"); + + + + OrderingBase ordering = null; + System.out.println(version); + switch (version) { + + case "9-normalized": + IterativeOrdering iterative = new IterativeOrdering(pivot_len, infile, bufferSize, k, + samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false); + iterative.initializeRanks(); + ordering = iterative; + break; + case "9-normalized-signature": + IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, bufferSize, k, + samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, true); + iterativeSignature.initializeRanks(); + ordering = iterativeSignature; + break; + case "signature": + ordering = new LexicographicSignatureOrdering(pivot_len); + ordering.initializeRanks(); + break; + case "lexicographic": + ordering = new LexicographicOrdering(pivot_len); + ordering.initializeRanks(); + break; + case "random": + Random r = new Random(); + ordering = new RandomOrdering(pivot_len, r.nextInt()); + ordering.initializeRanks(); + break; + } + + ExportUtils exportUtils = new ExportUtils(); + + int[] ranks = ordering.getRanks(); + long[] longRanks = new long[ranks.length]; + for (int i = 0; i < longRanks.length; longRanks[i]=ranks[i], i++) ; + + exportUtils.exportOrderingForCpp(longRanks); + + + long[] counters; + if (kmerSetFile != null) { + try { + + System.out.println("Counting minimizer appearances:"); + System.out.print("Input File: " + kmerSetFile + "\n" + + "Kmer Length: " + k + "\n" + + "Pivot Length: " + pivot_len + "\n" + + "Ordering: " + orderingName + "\n"); + + MinimizerCounter minimizerCounter = new MinimizerCounter(k, kmerSetFile, pivot_len, bufferSize, 
ordering); + counters = minimizerCounter.Run(); + + exportUtils.writeToFile(counters, orderingName + pivot_len + "_" + "kmers"); + } catch (Exception E) { + System.out.println("Exception caught!"); + E.printStackTrace(); + } + } + if (statSamples > 0) { + System.out.println("Collecting stats for binning"); +// LoadCounter counter = new LoadCounter(pivot_len, infile, readLen, bufferSize, k, statSamples, ordering); +// counter.initFrequency(); + + BinSizeCounter counter = new BinSizeCounter(pivot_len, infile, bufferSize, k, statSamples, ordering); + counter.initFrequency(); + + counters = counter.getStatistics(); + exportUtils.exportBinningForCpp(counters); + + } + } + + +} diff --git a/src/buildgraph/StringUtils.java b/src/dumbo/StringUtils.java similarity index 51% rename from src/buildgraph/StringUtils.java rename to src/dumbo/StringUtils.java index 450dd19..98f57b4 100644 --- a/src/buildgraph/StringUtils.java +++ b/src/dumbo/StringUtils.java @@ -1,4 +1,4 @@ -package buildgraph; +package dumbo; public class StringUtils { @@ -17,6 +17,18 @@ public int getDecimal(char[] a, int from, int to){ return val; } + public long getLDecimal(char[] a, int from, int to){ + + long val=0; + + for(int i=from; i>= 2; + } + return rev; + } + + public String getCanonical(String line) { + String x = new String(getReversedRead(line.toCharArray())); + for (int i = 0; i < line.length(); i++) { + if (line.charAt(i) < x.charAt(i)) + return line; + else if (line.charAt(i) > x.charAt(i)) + return x; + } + return x; + } + + public int getNormalizedValue(int minValue, int pivotLength) { + return Math.min(minValue, getReversedMmer(minValue, pivotLength)); + } }