diff --git a/.gitignore b/.gitignore
index a1c2a23..bd8eb75 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,5 +19,7 @@
*.tar.gz
*.rar
+.idea*
+
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
diff --git a/msp.iml b/msp.iml
new file mode 100644
index 0000000..b1c5c22
--- /dev/null
+++ b/msp.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/META-INF/MANIFEST.MF b/src/META-INF/MANIFEST.MF
index 85846ea..3eb3083 100644
--- a/src/META-INF/MANIFEST.MF
+++ b/src/META-INF/MANIFEST.MF
@@ -1,3 +1,3 @@
Manifest-Version: 1.0
-Main-Class: buildgraph.BuildDeBruijnGraph
+Main-Class: dumbo.OrderingOptimizer
diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java
deleted file mode 100644
index 00ea1eb..0000000
--- a/src/buildgraph/BuildDeBruijnGraph.java
+++ /dev/null
@@ -1,190 +0,0 @@
-package buildgraph;
-
-import buildgraph.Ordering.*;
-import buildgraph.Ordering.UHS.UHSFrequencySignatureOrdering;
-import buildgraph.Ordering.UHS.UHSSignatureOrdering;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.AbstractMap;
-import java.util.HashMap;
-
-public class BuildDeBruijnGraph {
-
- public static void main(String[] args) throws IOException {
-
-// String infile = "/home/gaga/data-scratch/yaelbenari/datas/chr14.fastq";
-// String infile = "/home/gaga/data-scratch/yaelbenari/datas/smalldata.fastq";
- String infile = "/home/gaga/data-scratch/yaelbenari/datas/breastCancer.fastq";
-// String infile = "/home/gaga/data-scratch/yaelbenari/datas/beeData.fastq";
-// String infile = "/home/gaga/data-scratch/yaelbenari/datas/workspace/72.fastq";
-
- int k = 60, pivot_len = 7, bufferSize = 8192, numThreads = 1, hsmapCapacity = 10000000;
-// int readLen = 124;
-// int readLen = 101;
- int readLen = 100;
- int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;//
- boolean readable = false;
- String orderingName = "uhs_freq_sig";
- int xor = 0; //11101101;
-
- if (args.length > 0 && args[0].equals("-help")) {
- System.out.print("Usage: java -jar BuildDeBruijnGraph.jar -in InputPath -k k -L readLength[options]\n" +
- "Options Available: \n" +
- "[-NB numOfBlocks] : (Integer) Number Of Kmer Blocks. Default: 256" + "\n" +
- "[-p pivotLength] : (Integer) Pivot Length. Default: 12" + "\n" +
- "[-t numOfThreads] : (Integer) Number Of Threads. Default: 1" + "\n" +
- "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" +
- "[-o order] : lexico or sig or uhs or uhs_sig" + "\n" +
- "[-r readable] : (Boolean) Output Format: true means readable text, false means binary. Default: false" + "\n");
- return;
- }
-
- for (int i = 0; i < args.length; i += 2) {
- if(args[i].equals("-in"))
- infile = args[i+1];
- else if(args[i].equals("-k"))
- k = new Integer(args[i+1]);
- else if(args[i].equals("-NB"))
- numBlocks = new Integer(args[i+1]);
- else
- if(args[i].equals("-o"))
- orderingName = args[i+1];
- else if(args[i].equals("-p"))
- pivot_len = new Integer(args[i+1]);
- else if(args[i].equals("-b"))
- bufferSize = new Integer(args[i+1]);
- else if(args[i].equals("-L"))
- readLen = new Integer(args[i+1]);
- else if(args[i].equals("-t"))
- numThreads = new Integer(args[i+1]);
- else if(args[i].equals("-r"))
- readable = new Boolean(args[i+1]);
- else{
- System.out.println("Wrong with arguments. Abort!");
- return;
- }
- }
-
-
- IOrdering ordering;
- switch (orderingName)
- {
- case "lexico":
- ordering = new LexicographicOrdering(pivot_len);
- break;;
- case "uhs":
- ordering = new UHSSignatureOrdering(xor, pivot_len, false, true);
- case "random":
- //ordering =
- break;
- }
-
-// UHSFrequencySignatureOrdering uhs_freq_sig = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, true);
-// uhs_freq_sig.initRank();
-// HashMap orderingNames = new HashMap() {{
-// put("lexico", new LexicographicOrdering(pivot_len));
-// put("sig", new LexicographicSignatureOrdering(pivot_len));
-// put("uhs_sig", new UHSSignatureOrdering(xor, pivot_len, false, true));
-// put("uhs_freq", new UniversalFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, false));
-// put("uhs_freq_sig", uhs_freq_sig);
-// }};
-
-
- IOrdering ordering = orderingNames.get(orderingName);
-// IOrdering ordering = new LexicographicSignatureOrdering(pivot_len);
- Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering);
- Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity);
-
- try {
-
- System.out.println("Program Configuration:");
- System.out.print("Input File: " + infile + "\n" +
- "Kmer Length: " + k + "\n" +
- "Read Length: " + readLen + "\n" +
- "# Of Blocks: " + numBlocks + "\n" +
- "Pivot Length: " + pivot_len + "\n" +
- "# Of Threads: " + numThreads + "\n" +
- "R/W Buffer Size: " + bufferSize + "\n" +
- "Ordering: " + orderingName + "\n" +
- "x xor: " + xor + "\n" +
- "Output Format: " + (readable == true ? "Text" : "Binary") + "\n");
-
- long maxID = partition.Run();
-
- AbstractMap distinctKmersPerPartition = map.Run(numThreads);
- BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers");
-
- HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile();
- BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes");
-//
-//
-// long time1 = 0;
-// long t1 = System.currentTimeMillis();
-// System.out.println("Merge IDReplaceTables Begin!");
-// String sortcmd = "sort -t $\'\t\' -o IDReplaceTable +0 -1 -n -m Maps/maps*";
-// Runtime.getRuntime().exec(new String[]{"/bin/sh", "-c", sortcmd}, null, null).waitFor();
-// long t2 = System.currentTimeMillis();
-// time1 = (t2 - t1) / 1000;
-// System.out.println("Time used for merging: " + time1 + " seconds!");
-//
-// Replace replace = new Replace("IDReplaceTable", "OutGraph", k, bufferSize, readLen, maxID);
-// replace.Run(readable);
-
-
- } catch (Exception E) {
- System.out.println("Exception caught!");
- E.printStackTrace();
- }
-
- }
-
- public static HashMap getBytesPerFile() {
- File folder = new File("./Nodes");
- File[] listOfFiles = folder.listFiles();
-
- HashMap bytesPerFile = new HashMap<>();
-
- for (int i = 0; i < listOfFiles.length; i++) {
- if (listOfFiles[i].isFile())
- bytesPerFile.put(Long.parseLong(listOfFiles[i].getName().replace("nodes", "")), listOfFiles[i].length());
- }
- return bytesPerFile;
- }
-
- public static void writeToFile(AbstractMap data, String fileName) {
- File file = new File(fileName);
-
- BufferedWriter bf = null;
- ;
-
- try {
- bf = new BufferedWriter(new FileWriter(file));
-
- bf.write("x = {");
- bf.newLine();
-
- //iterate map entries
- for (java.util.Map.Entry entry : data.entrySet()) {
- bf.write(entry.getKey() + ":" + entry.getValue() + ",");
- bf.newLine();
- }
- bf.write("}");
- bf.flush();
-
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
-
- try {
- //always close the writer
- bf.close();
- } catch (Exception e) {
- }
- }
-
- }
-
-}
diff --git a/src/buildgraph/Kmer64.java b/src/buildgraph/Kmer64.java
deleted file mode 100644
index 9e6ac27..0000000
--- a/src/buildgraph/Kmer64.java
+++ /dev/null
@@ -1,78 +0,0 @@
-package buildgraph;
-
-public class Kmer64 extends Object {
-
- public long high;
- public long low;
-
- private final static char[] baseDic = {'A', 'C', 'G', 'T'};
- private final static int[] intDic = {0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3};
-
- private final int base2int(char base) {
- return intDic[base - 'A'];
- }
-
-
- public Kmer64(char[] str, int start, int end, boolean rev) {
-
- this.high = this.low = 0;
-
- int len = end - start;
-
- if (!rev) {
- if (len <= 32) {
- for (int i = start; i <= end - 1; i++) {
- this.low = (this.low << 2) + base2int(str[i]);
- }
- } else {
- for (int i = end - 32; i <= end - 1; i++) {
- this.low = (this.low << 2) + base2int(str[i]);
- }
-
- for (int i = start; i <= end - 33; i++) {
- this.high = (this.high << 2) + base2int(str[i]);
- }
- }
- } else {
- if (len <= 32) {
- for (int i = end - 1; i >= start; i--) {
- this.low = (this.low << 2) + 3 ^ base2int(str[i]);
- }
- } else {
- for (int i = start + 31; i >= start; i--) {
- this.low = (this.low << 2) + 3 ^ base2int(str[i]);
- }
-
- for (int i = end - 1; i >= start + 32; i--) {
- this.high = (this.high << 2) + 3 ^ base2int(str[i]);
- }
- }
- }
-
- }
-
- public Kmer64(long low, long high) {
- this.low = low;
- this.high = high;
- }
-
- @Override
- public boolean equals(Object another) {
- Kmer64 k = (Kmer64) another;
- if (this.high == k.high && this.low == k.low)
- return true;
- else
- return false;
- }
-
- @Override
- public int hashCode() {
- return (int) ((low ^ (low >>> 32)) ^ (high ^ (high >>> 32)));
- }
-
-
- public String toString() {
- return this.high + "," + this.low;
- }
-}
-
diff --git a/src/buildgraph/Map.java b/src/buildgraph/Map.java
deleted file mode 100644
index a98a9d0..0000000
--- a/src/buildgraph/Map.java
+++ /dev/null
@@ -1,316 +0,0 @@
-package buildgraph;
-
-import java.io.*;
-import java.util.*;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.CountDownLatch;
-
-
-public class Map{
-
- private int k;
- private int numOfBlocks;
- private int bufSize;
-
- private Object lock_blocks = new Object();
-
- private int capacity;
-
- private int blockID;
-
- private long forAndVal;
- private long forAndVal32;
-
- private static int[] valTable = StringUtils.valTable;
-
- public Map(int kk, int numberOfBlocks, int bufferSize, int HScapacity){
- this.k = kk;
- this.numOfBlocks = numberOfBlocks;
- this.bufSize = bufferSize;
- this.capacity = HScapacity;
- this.blockID = 0;
- this.forAndVal = (long)Math.pow(2, 2*(k-32)) - 1;
- this.forAndVal32 = (long)Math.pow(2, 2*k) - 1;
- }
-
- public class MyThread extends Thread{
- private CountDownLatch threadsSignal;
- private HashSet fileNames;
- private ConcurrentHashMap distinctKmersPerPartition;
-
- public MyThread(CountDownLatch threadsSignal, HashSet fileNames, ConcurrentHashMap distinctKmersPerPartition){
- super();
- this.threadsSignal = threadsSignal;
- this.fileNames = fileNames;
- this.distinctKmersPerPartition = distinctKmersPerPartition;
- }
-
- @Override
- public void run(){
- System.out.println(Thread.currentThread().getName() + "Start...");
-
- FileReader fr;
- BufferedReader bfr;
- FileWriter fw;
- BufferedWriter bfw;
-
-
- String line;
-
- int p,j;
- long cnt;
- Kmer64 k1, k1_rev;
-
-
- try{
- File dir = new File("Maps");
- if(!dir.exists())
- dir.mkdir();
-
- while(blockID nodes = new HashMap(capacity);
-
- while((line = bfr.readLine()) != null){
-
- String[] strs = line.split("\t");
- cnt = Long.parseLong(strs[1]);
-
- long preOriginal = -1, preReplace = -1, Original = -1, Replace = -1;
- long diff = -1;
- boolean newOut = true, next = false;
-
- Long ReplaceObj, Replace_revObj;
-
- char[] lineCharArray = strs[0].toCharArray();
- k1 = new Kmer64(lineCharArray,0,k,false);
- k1_rev = new Kmer64(lineCharArray,0,k,true);
-
- int bound = strs[0].length() - k + 1;
-
- for(j = 0; j < bound; j++){
-
- if(j != 0){
- if(k > 32){
- k1 = new Kmer64((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A'], ((k1.high<<2) + valTable[lineCharArray[k+j-33]-'A']) & forAndVal);
- k1_rev = new Kmer64((k1_rev.low>>>2) + ((k1_rev.high&3)<<62), (k1_rev.high>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-33)<<1)));
- }
- else{
- k1 = new Kmer64(((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A']) & forAndVal32, 0);
- k1_rev = new Kmer64((k1_rev.low>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-1)<<1)), 0);
- }
- }
-
- ReplaceObj = nodes.get(k1);
- Replace_revObj = nodes.get(k1_rev);
-
- if(ReplaceObj == null && Replace_revObj == null){
- nodes.put(k1, cnt+j*2);
-
- if(!newOut && !next){
- bfw.write(preOriginal+"\t"+preReplace);
- bfw.newLine();
-
- newOut = true;
- }
-
- }
- else{
- if(ReplaceObj!=null){
- Original = cnt+j*2;
- Replace = ReplaceObj;
- }
- else{
- Original = cnt+j*2;
- Replace = Replace_revObj+1;
- }
-
- if(newOut){
- bfw.write(Original+"\t"+Replace+"\t");
- newOut = false;
- next = true;
- }
-
- else if(Original-preOriginal==2){
- if(next){
- diff = Replace - preReplace;
- if(diff==2){
- bfw.write("+\t");
- next = false;
- }
- else if(diff==-2){
- bfw.write("-\t");
- next = false;
- }
- else{
- bfw.write("\n"+Original+"\t"+Replace+"\t");
- }
- }
- else{
- if(Replace - preReplace != diff){
- bfw.write(preOriginal+"\t"+preReplace);
- bfw.newLine();
-
- bfw.write(Original+"\t"+Replace+"\t");
- next = true;
- }
- }
- }
-
- else if(next==true){
-
- bfw.write("\n"+Original+"\t"+Replace+"\t");
- }
-
- preOriginal = Original;
- preReplace = Replace;
- }
-
- }
-
- if(!newOut && !next){
- bfw.write(preOriginal+"\t"+preReplace);
- bfw.newLine();
- }
- else if(next){
- bfw.newLine();
- }
- }
-
- if(p%100 == 0) System.out.println(p);
- distinctKmersPerPartition.put((long)p, (long)nodes.size());
-
- nodes.clear();
- nodes = null;
-
- bfw.close();
- fw.close();
- bfr.close();
- fr.close();
- bfw = null;
- fw = null;
- bfr = null;
- fr = null;
- }
-
- }catch(Exception E){
- System.out.println("Exception caught!");
- E.printStackTrace();
- }
-
- threadsSignal.countDown();
- System.out.println(Thread.currentThread().getName() + "End. Remaining" + threadsSignal.getCount() + " threads");
-
- }
- }
-
-
- private AbstractMap BuildMap(int threadNum, HashSet fileNames) throws Exception{
- CountDownLatch threadSignal = new CountDownLatch(threadNum);
-
- ConcurrentHashMap distinctKmersPerPartition = new ConcurrentHashMap<>();
-
- for(int i=0;i Run(int numThreads) throws Exception{
- long time1=0;
-
- HashSet fileNames = getNodesFileNames();
-
- long t1 = System.currentTimeMillis();
- System.out.println("Build Maps Begin!");
- AbstractMap distinctKmersPerPartition= BuildMap(numThreads, fileNames);
- long t2 = System.currentTimeMillis();
- time1 = (t2-t1)/1000;
- System.out.println("Time used for building maps: " + time1 + " seconds!");
-
- return distinctKmersPerPartition;
-
- }
-
- private HashSet getNodesFileNames(){
- File[] files = (new File("./Nodes")).listFiles();
- List fileNames = new LinkedList<>();
- for(File file : files){
- if(file.isFile()){
- fileNames.add(file.getName());
- }
- }
- return new HashSet<>(fileNames);
- }
-
- public static void main(String[] args){
-
- int k = 15, numBlocks = 256, numThreads = 1, bufferSize = 8192, hsmapCapacity = 1000000;
-
- if(args[0].equals("-help")){
- System.out.print("Usage: java -jar Map.jar -k k -NB numOfBlocks [options]\n" +
- "Options Available: \n" +
- "[-t numOfThreads] : (Integer) Number Of Threads. Default: 1" + "\n" +
- "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" +
- "[-c capacity] : (Integer) Hashmap Capacity. Default: 1000000" + "\n");
- return;
- }
-
- for(int i=0; i 0)
- min_pos = i;
- }
-
- return min_pos;
- }
-
- @Override
- public int strcmp(char[] a, char[] b, int froma, int fromb, int len) {
- for (int i = 0; i < len; i++) {
- if (a[froma + i] < b[fromb + i])
- return -1;
- else if (a[froma + i] > b[fromb + i])
- return 1;
- }
- return 0;
- }
-}
diff --git a/src/buildgraph/Ordering/LexicographicSignatureOrdering.java b/src/buildgraph/Ordering/LexicographicSignatureOrdering.java
deleted file mode 100644
index 7b816c4..0000000
--- a/src/buildgraph/Ordering/LexicographicSignatureOrdering.java
+++ /dev/null
@@ -1,39 +0,0 @@
-package buildgraph.Ordering;
-
-import buildgraph.StringUtils;
-
-import java.io.IOException;
-
-public class LexicographicSignatureOrdering extends LexicographicOrdering {
- private SignatureUtils signatureUtils;
- private StringUtils stringUtils;
- public LexicographicSignatureOrdering(int pivotLen) throws IOException {
- super(pivotLen);
- signatureUtils = new SignatureUtils(pivotLen);
- stringUtils = new StringUtils();
- }
-
- @Override
- public int strcmp(char[] a, char[] b, int froma, int fromb, int len) {
-// boolean aAllowed = signatureUtils.isAllowed(a, froma, froma + len);
-// boolean bAllowed = signatureUtils.isAllowed(b, fromb, fromb + len);
- int x = stringUtils.getDecimal(a, froma, froma + pivotLen);
- int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen);
- boolean aAllowed = signatureUtils.isAllowed(a, froma, x);
- boolean bAllowed = signatureUtils.isAllowed(b, fromb, y);
-
- if (!aAllowed && bAllowed) {
- return 1;
- } else if (!bAllowed && aAllowed) {
- return -1;
- }
-
- for (int i = 0; i < len; i++) {
- if (a[froma + i] < b[fromb + i])
- return -1;
- else if (a[froma + i] > b[fromb + i])
- return 1;
- }
- return 0;
- }
-}
diff --git a/src/buildgraph/Ordering/SignatureUtils.java b/src/buildgraph/Ordering/SignatureUtils.java
deleted file mode 100644
index 8212663..0000000
--- a/src/buildgraph/Ordering/SignatureUtils.java
+++ /dev/null
@@ -1,44 +0,0 @@
-package buildgraph.Ordering;
-
-import java.util.HashMap;
-
-public class SignatureUtils {
-
- private int len;
- protected byte[] isPmerAllowed;
-
- public SignatureUtils(int len){
- this.len = len;
- isPmerAllowed = new byte[(int)Math.pow(4, len)];
- }
-
- public boolean isAllowed(char[] a, int from, int aDecimal) {
- int isAllowed = isPmerAllowed[aDecimal];
- if(isAllowed != 0){
- return isAllowed == 1;
- }
-
- int lastIndex = from + len - 1;
- if (a[from] == 'A' && a[from + 2] == 'A') {
- if (a[from + 1] <= 'C') { // C or A
- isPmerAllowed[aDecimal] = -1;
- return false;
- }
- } else if (a[lastIndex] == 'T' && a[lastIndex - 2] == 'T') {
- if (a[lastIndex - 1] >='G') { // G or T
- isPmerAllowed[aDecimal] = -1;
- return false;
- }
- }
-
- for (int i = from + 2; i < lastIndex; i++) {
- if (a[i] == 'A' && a[i + 1] == 'A') {
- isPmerAllowed[aDecimal] = -1;
- return false;
- }
- }
- isPmerAllowed[aDecimal] = 1;
- return true;
- }
-
-}
diff --git a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java
deleted file mode 100644
index a0a0231..0000000
--- a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java
+++ /dev/null
@@ -1,98 +0,0 @@
-package buildgraph.Ordering.UHS;
-
-import java.io.*;
-
-public class UHSFrequencySignatureOrdering extends UHSSignatureOrdering {
- private String inputFile;
- private int readLen;
- private int bufSize;
- private long[] pmerFrequency;
- private boolean isInit;
-
- public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature, boolean useCache) throws IOException {
- super(0, pivotLen, useSignature, useCache);
- this.inputFile = infile;
- this.readLen = readLen;
- this.bufSize = bufSize;
- pmerFrequency = new long[(int)Math.pow(4, pivotLen)];
- isInit = false;
- }
-
- @Override
- public void initRank() throws IOException {
- initFrequency();
- super.initRank();
- isRankInit = true;
- }
-
- protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) throws IOException {
- int baseCompareValue = strcmpBase(x, y);
- if (baseCompareValue != BOTH_IN_UHS) {
- return baseCompareValue;
- }
-
- // from down here - both in UHS
-
- if(useSignature){
- if (!xAllowed && yAllowed) {
- return 1;
- } else if (!yAllowed && xAllowed) {
- return -1;
- }
- }
-
- // both allowed or both not allowed
- if(pmerFrequency[x] == pmerFrequency[y]){
- if(x 1000000){
- break;
- }
- }
- }
- bfrG.close();
- frG.close();
- }
-
-
-
-
-}
diff --git a/src/buildgraph/Ordering/UHS/UHSOrderingBase.java b/src/buildgraph/Ordering/UHS/UHSOrderingBase.java
deleted file mode 100644
index 3820b30..0000000
--- a/src/buildgraph/Ordering/UHS/UHSOrderingBase.java
+++ /dev/null
@@ -1,147 +0,0 @@
-package buildgraph.Ordering.UHS;
-
-import buildgraph.Ordering.IOrdering;
-import buildgraph.StringUtils;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-
-public abstract class UHSOrderingBase implements IOrdering {
-
- protected byte[] uhsBits;
- protected StringUtils stringUtils;
-
- protected static final int BOTH_IN_UHS = 824;
- protected int pivotLen;
-
- protected int[] rankOfPmer;
- protected boolean isRankInit;
-
-
- public UHSOrderingBase(int pivotLen) throws IOException {
- this.pivotLen = pivotLen;
- stringUtils = new StringUtils();
- uhsBits = uhsBitSet(pivotLen);
- rankOfPmer = new int[(int) Math.pow(4, pivotLen)];
- Arrays.fill(rankOfPmer, Integer.MAX_VALUE);
- isRankInit = false;
- }
-
- protected abstract int calculateStrcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException;
-
-
- public boolean isInUHS(int pmerDecimal) {
- int pmerDecimalDiv8 = pmerDecimal >> 3;
- int pmerDecimalMod8 = pmerDecimal & 0b111;
- if (((this.uhsBits[pmerDecimalDiv8] >> (pmerDecimalMod8)) & 1) == 1) {
- return true;
- }
- return false;
- }
-
- public boolean isInUHS(char[] a, int from, int to) {
- return isInUHS(stringUtils.getDecimal(a, from, to));
- }
-
- protected int strcmpBase(int x, int y) {
- if (x == y)
- return 0;
-
- boolean xInUHS = isInUHS(x);
- boolean yInUHS = isInUHS(y);
- if (xInUHS && !yInUHS) {
- return -1;
- } else if (!xInUHS && yInUHS) {
- return 1;
- }
- return BOTH_IN_UHS;
- }
-
- private byte[] uhsBitSet(int pivotLen) throws IOException {
- int n = (int) Math.pow(4, pivotLen) / 8;
- int i = 0;
- byte[] bits = new byte[n];
-
- String DocksFile = "res_" + pivotLen + ".txt";
- FileReader frG = new FileReader(DocksFile);
- int count = 0;
-
- BufferedReader reader;
- try {
- reader = new BufferedReader(frG);
- String line;
- while ((line = reader.readLine()) != null) {
- i = stringUtils.getDecimal(line.toCharArray(), 0, pivotLen);
- bits[i / 8] |= 1 << (i % 8);
- count++;
- }
- reader.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println(count);
- frG.close();
-
- return bits;
- }
-
- public void initRank() throws IOException {
- System.out.println("start init rank");
- HashSet pmers = getPmersInUHS();
- char[][] pmersArr = new char[pmers.size()][pivotLen];
- pmers.toArray(pmersArr);
- Arrays.sort(pmersArr, (o1, o2) -> {
- try {
- return calculateStrcmp(o1, o2, 0, 0, pivotLen);
- } catch (IOException e) {
- e.printStackTrace();
- }
- return 0;
- });
- for (int i = 0; i < pmersArr.length; i++) {
- rankOfPmer[stringUtils.getDecimal(pmersArr[i], 0, pivotLen)] = i;
- }
- System.out.println("finish init rank");
- }
-
- private HashSet getPmersInUHS() {
- HashSet pmers = new HashSet<>();
- StringBuilder sb = new StringBuilder(pivotLen);
- for (int i = 0; i < pivotLen; i++) sb.append('A');
- generate(pmers, sb, 0);
- return pmers;
-
- }
-
- private void generate(HashSet pmers, StringBuilder sb, int n) {
- char[] alphabet = {'A', 'C', 'G', 'T'};
- if (n == sb.capacity()) {
- char[] pmer = sb.toString().toCharArray();
- if (isInUHS(pmer, 0, pivotLen)) {
- pmers.add(pmer);
- }
- return;
- }
- for (char letter : alphabet) {
- sb.setCharAt(n, letter);
- generate(pmers, sb, n + 1);
- }
- }
-
- protected static HashMap pivotLengthToHexRepresentation = new HashMap() {
- {
- put(5, 0x3ff);
- put(6, 0xfff);
- put(7, 0x3fff);
- put(8, 0xffff);
- put(10, 0xfffff);
- put(12, 0xffffff);
- }
-
- };
-
-}
diff --git a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java b/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java
deleted file mode 100644
index 136aa5b..0000000
--- a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java
+++ /dev/null
@@ -1,105 +0,0 @@
-package buildgraph.Ordering.UHS;
-
-import buildgraph.Ordering.SignatureUtils;
-import buildgraph.StringUtils;
-
-import java.io.IOException;
-
-public class UHSSignatureOrdering extends UHSOrderingBase {
- private SignatureUtils signatureUtils;
- protected boolean useSignature;
- private boolean useCache;
- protected int xor;
-
-
- public UHSSignatureOrdering(int xor, int pivotLen, boolean useSignature, boolean useCache) throws IOException {
- super(pivotLen);
- this.xor = xor;
- this.useSignature = useSignature;
- this.useCache = useCache;
- signatureUtils = new SignatureUtils(pivotLen);
- }
-
- public UHSSignatureOrdering(int pivotLen, boolean useSignature, boolean useCache) throws IOException {
- this(0, pivotLen, useSignature, useCache);
- }
-
-
-
- @Override
- public int strcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException {
- if(!isRankInit) throw new IOException("rank not initialized yet");
-
- int x = stringUtils.getDecimal(a, froma, froma + pivotLen);
- int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen);
-
- if (x == y) return 0;
-
- // isRankInit = true here
- if (rankOfPmer[x] < rankOfPmer[y]) {
- return -1;
- }
- return 1;
- }
-
- @Override
- public int findSmallest(char[] a, int from, int to) throws IOException {
- int min_pos = from;
- int j = stringUtils.getDecimal(a, min_pos, min_pos + pivotLen);
- int prev = j;
- boolean prevAllowed = signatureUtils.isAllowed(a, min_pos, prev), jAllowed = true;
- int hexRepresentation = pivotLengthToHexRepresentation.get(pivotLen);
- for (int i = from + 1; i <= to - pivotLen; i++) {
- j = ((j * 4) ^ (StringUtils.valTable[a[i + pivotLen - 1] - 'A'])) & hexRepresentation;
-
- if (useSignature)
- jAllowed = signatureUtils.isAllowed(a, i, j);
-
- if (isInUHS(j)) {
- if (strcmpSignature(prev, j, prevAllowed, jAllowed) > 0) {
- min_pos = i;
- prev = j;
- }
-
- }
- prevAllowed = jAllowed;
- }
- return min_pos;
- }
-
- protected int calculateStrcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException {
- int x = stringUtils.getDecimal(a, froma, froma + pivotLen);
- int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen);
-
- if (x == y) return 0;
-
- boolean aAllowed = true, bAllowed = true;
- if (useSignature) {
- aAllowed = signatureUtils.isAllowed(a, froma, x);
- bAllowed = signatureUtils.isAllowed(b, fromb, y);
- }
-
- return strcmpSignature(x, y, aAllowed, bAllowed);
- }
-
- protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) throws IOException {
- int baseCompareValue = strcmpBase(x, y);
- if (baseCompareValue != BOTH_IN_UHS) {
- return baseCompareValue;
- }
- // from down here - both in UHS
- if (useSignature) {
- if (!xAllowed && yAllowed) {
- return 1;
- } else if (!yAllowed && xAllowed) {
- return -1;
- }
- }
- // both allowed or both not allowed
- if ((x ^ xor) < (y ^ xor))
- return -1;
- else
- return 1;
-
- }
-}
diff --git a/src/buildgraph/Partition.java b/src/buildgraph/Partition.java
deleted file mode 100644
index 5a43374..0000000
--- a/src/buildgraph/Partition.java
+++ /dev/null
@@ -1,245 +0,0 @@
-package buildgraph;
-
-import buildgraph.Ordering.IOrdering;
-import buildgraph.Ordering.UHS.UHSOrderingBase;
-
-import java.io.*;
-
-public class Partition {
-
- private int k;
- private String inputfile;
- private int numOfBlocks;
- private int pivotLen;
- private int bufSize;
-
- private FileReader frG;
- private BufferedReader bfrG;
- private FileWriter[] fwG;
- private BufferedWriter[] bfwG;
-
- private int readLen;
- private IOrdering ordering;
-
- private StringUtils stringUtils;
-
- private int numOpenFiles;
-
-
- public Partition(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrdering ordering) {
- this.k = kk;
- this.inputfile = infile;
- this.numOfBlocks = numberOfBlocks;
- this.pivotLen = pivotLength;
- this.bufSize = bufferSize;
- this.readLen = readLen;
- this.ordering = ordering;
- this.stringUtils = new StringUtils();
- this.numOpenFiles = 0;
- }
-
-
- private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throws IOException {
-
- int len = a.length;
- int pos1 = ordering.findSmallest(a, from, to);
- int pos2 = ordering.findSmallest(b, len - to, len - from);
-
- if (ordering.strcmp(a, b, pos1, pos2, pivotLen) < 0) {
- flag[0] = 0;
- return pos1;
- } else {
- flag[0] = 1;
- return pos2;
- }
- }
-
- private int calPosNew(char[] a, int from, int to) {
- return stringUtils.getDecimal(a, from, to) % numOfBlocks;
- }
-
- private long DistributeNodes() throws IOException {
- frG = new FileReader(inputfile);
- bfrG = new BufferedReader(frG, bufSize);
- fwG = new FileWriter[numOfBlocks];
- bfwG = new BufferedWriter[numOfBlocks];
-
- String describeline;
-
- int prepos, substart = 0, subend, min_pos = -1;
-
- char[] lineCharArray = new char[readLen];
-
- int[] flag = new int[1];
-
- long cnt = 0, outcnt = 0;
-
- File dir = new File("Nodes");
- if (!dir.exists())
- dir.mkdir();
-
-
- while ((describeline = bfrG.readLine()) != null) {
-
- bfrG.read(lineCharArray, 0, readLen);
- bfrG.read();
-
- prepos = -1;
- if (stringUtils.isReadLegal(lineCharArray)) {
-
- substart = 0;
-
- outcnt = cnt;
-
- int len = readLen;
-
- char[] revCharArray = stringUtils.getReversedRead(lineCharArray);
-
- min_pos = findPosOfMin(lineCharArray, revCharArray, 0, k, flag);
-
- cnt += 2;
-
- int bound = len - k + 1;
-
- for (int i = 1; i < bound; i++) {
-
- if (i > (flag[0] == 0 ? min_pos : len - min_pos - pivotLen)) {
-
- int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen));
-
- min_pos = findPosOfMin(lineCharArray, revCharArray, i, i + k, flag);
-
- if (temp != (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen))) {
- prepos = temp;
- subend = i - 1 + k;
-
-
- writeToFile(prepos, substart, subend, lineCharArray, outcnt);
-
- substart = i;
- outcnt = cnt;
- }
-
- } else {
-
- if (ordering.strcmp(lineCharArray, revCharArray, k + i - pivotLen, len - i - k, pivotLen) < 0) {
- if (ordering.strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLen, min_pos, pivotLen) < 0) {
- boolean enter = true;
- if (ordering instanceof UHSOrderingBase) {
- if (!((UHSOrderingBase) ordering).isInUHS(lineCharArray, k + i - pivotLen, k + i)) {
- enter = false;
- }
- }
- if (enter) {
- int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen));
-
- min_pos = k + i - pivotLen;
-
- if (temp != calPosNew(lineCharArray, min_pos, min_pos + pivotLen)) {
- prepos = temp;
- subend = i - 1 + k;
-
- writeToFile(prepos, substart, subend, lineCharArray, outcnt);
-
- substart = i;
- outcnt = cnt;
- }
-
- flag[0] = 0;
- }
- }
- } else {
- if (ordering.strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLen) < 0) {
- boolean enter = true;
- if (ordering instanceof UHSOrderingBase) {
- if (!((UHSOrderingBase) ordering).isInUHS(revCharArray, len - i - k, len - i - k + pivotLen)) {
- enter = false;
- }
- }
- if (enter) {
- int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen));
-
- min_pos = -k - i + len;
-
- if (temp != calPosNew(revCharArray, min_pos, min_pos + pivotLen)) {
- prepos = temp;
- subend = i - 1 + k;
-
- writeToFile(prepos, substart, subend, lineCharArray, outcnt);
-
- substart = i;
- outcnt = cnt;
- }
- flag[0] = 1;
- }
- }
- }
- }
-
- cnt += 2;
- }
- subend = len;
- prepos = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen));
-
- writeToFile(prepos, substart, subend, lineCharArray, outcnt);
- }
- }
-
- System.out.println("Largest ID is " + cnt);
-
- for (int i = 0; i < bfwG.length; i++) {
- if (bfwG[i] != null) {
- bfwG[i].close();
- fwG[i].close();
- }
- }
-
- bfrG.close();
- frG.close();
-
- return cnt;
- }
-
- private void tryCreateWriterForPmer(int prepos) throws IOException {
- if (numOpenFiles == 16000) {
- for (int i = 0; i < bfwG.length; i++) {
- if (bfwG[i] != null) {
- bfwG[i].close();
- fwG[i].close();
- bfwG[i] = null;
- fwG[i] = null;
- }
- }
- Runtime.getRuntime().gc();
- numOpenFiles = 0;
- }
-
- if (bfwG[prepos] == null) {
- fwG[prepos] = new FileWriter("Nodes/nodes" + prepos, true);
- bfwG[prepos] = new BufferedWriter(fwG[prepos], bufSize);
- numOpenFiles += 1;
- }
- }
-
- private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray, long outcnt) throws IOException {
- tryCreateWriterForPmer(prepos);
-
- BufferedWriter writer = bfwG[prepos];
-
- writer.write(lineCharArray, substart, subend - substart);
- writer.write("\t" + outcnt);
- writer.newLine();
- }
-
- public long Run() throws Exception {
- long time1 = 0;
- long t1 = System.currentTimeMillis();
- System.out.println("Distribute Nodes Begin!");
- long maxID = DistributeNodes();
- long t2 = System.currentTimeMillis();
- time1 = (t2 - t1) / 1000;
- System.out.println("Time used for distributing nodes: " + time1 + " seconds!");
- return maxID;
- }
-
-}
\ No newline at end of file
diff --git a/src/buildgraph/Replace.java b/src/buildgraph/Replace.java
deleted file mode 100644
index 0c6e6d5..0000000
--- a/src/buildgraph/Replace.java
+++ /dev/null
@@ -1,237 +0,0 @@
-package buildgraph;
-
-import java.io.*;
-
-public class Replace {
-
- private String replaceTableFile;
- private String outputGraphFile;
- private int k;
- private int bufSize;
- private long largestID;
-
- private FileReader fr;
- private BufferedReader bfr;
- private FileWriter fw;
- private BufferedWriter bfw;
-
- private int readLen;
-
- public Replace(String infile, String outfile, int k, int bufferSize, int readLen, long largestID){
- this.replaceTableFile = infile;
- this.outputGraphFile = outfile;
- this.k = k;
- this.bufSize = bufferSize;
- this.readLen = readLen;
- this.largestID = largestID;
- }
-
- private void DoReplace() throws IOException{
- fr = new FileReader(replaceTableFile);
- bfr = new BufferedReader(fr, bufSize);
- fw = new FileWriter(outputGraphFile);
- bfw = new BufferedWriter(fw, bufSize);
-
- long originalID, replaceID;
-
- String str;
- String[] strs = null;
-
- if((str=bfr.readLine())!=null){
- strs = str.split("\t");
- originalID = new Long(strs[0]);
- replaceID = new Long(strs[1]);
- }
- else{
- originalID = Long.MAX_VALUE;
- replaceID = Long.MAX_VALUE;
- }
-
- int modValue = ((readLen-k+1)<<1);
-
- for(long i=0; i 3){
- long rangeEnd = Long.parseLong(strs[4]);
- if(strs[2].equals("+")){
- for(long temp=replaceID+2; temp<=rangeEnd; temp+=2){
- bfw.write(temp + " ");
- }
- }
- else if(strs[2].equals("-")){
- for(long temp=replaceID-2; temp>=rangeEnd; temp-=2){
- bfw.write(temp + " ");
- }
- }
- i = Long.parseLong(strs[3]);
- }
-
- if((str=bfr.readLine())!=null){
- strs = str.split("\t");
- originalID = new Long(strs[0]);
- replaceID = new Long(strs[1]);
- }
- else{
- originalID = Long.MAX_VALUE;
- replaceID = Long.MAX_VALUE;
- }
- }
- else{
- bfw.write(i + " ");
- }
-
- if((i+2) % modValue == 0)
- bfw.newLine();
- }
-
- bfw.close();
- fw.close();
- bfr.close();
- fr.close();
- }
-
- private void DoReplaceBin() throws IOException{
- fr = new FileReader(replaceTableFile);
- bfr = new BufferedReader(fr, bufSize);
- DataOutputStream out = null;
- out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new File(outputGraphFile)), bufSize));
-
- long originalID, replaceID;
-
- String str;
- String[] strs = null;
-
- if((str=bfr.readLine())!=null){
- strs = str.split("\t");
- originalID = new Long(strs[0]);
- replaceID = new Long(strs[1]);
- }
- else{
- originalID = Long.MAX_VALUE;
- replaceID = Long.MAX_VALUE;
- }
-
-
- for(long i=0; i 3){
- long rangeEnd = Long.parseLong(strs[4]);
- if(strs[2].equals("+")){
- for(long temp=replaceID+2; temp<=rangeEnd; temp+=2){
- out.writeLong(temp);
- }
- }
- else if(strs[2].equals("-")){
- for(long temp=replaceID-2; temp>=rangeEnd; temp-=2){
- out.writeLong(temp);
- }
- }
- i = Long.parseLong(strs[3]);
- }
-
- if((str=bfr.readLine())!=null){
- strs = str.split("\t");
- originalID = new Long(strs[0]);
- replaceID = new Long(strs[1]);
- }
- else{
- originalID = Long.MAX_VALUE;
- replaceID = Long.MAX_VALUE;
- }
- }
- else{
- out.writeLong(i);
- }
-
- }
-
- out.close();
- bfr.close();
- fr.close();
- }
-
- public void Run(boolean readable) throws Exception{
-
- long time1=0;
-
- long t1 = System.currentTimeMillis();
- System.out.println("Replace IDs Begin!");
-
- if(readable)
- DoReplace();
- else
- DoReplaceBin();
-
- long t2 = System.currentTimeMillis();
- time1 = (t2-t1)/1000;
- System.out.println("Time used for replacing IDs: " + time1 + " seconds!");
-
- }
-
- public static void main(String[] args){
-
- String infile = "E:\\test.txt";
- String outfile = "E:\\testOut.txt";
- int k = 15, bufferSize = 8192, readLen = 101;
- long largestID = 0;
- boolean readable = false;
-
- if(args[0].equals("-help")){
- System.out.print("Usage: java -jar Replace.jar -in InputTablePath -out outGraphPath -k k -L readLength -m largestID[options]\n" +
- "Options Available: \n" +
- "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" +
- "[-r readable] : (Boolean) Output Format: true means readable text, false means binary. Default: false" + "\n");
- return;
- }
-
- for(int i=0; i min_pos) {
+ min_pos = ordering.findSmallest(lineCharArray, i, i + k);
+ minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength);
+ minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength);
+ frequencies[minValueNormalized] += k;
+ } else if (ordering.compareMmer(currentValue, minValue) < 0) {
+ int lastIndexInWindow = k + i - pivotLength;
+ min_pos = lastIndexInWindow;
+ minValue = currentValue;
+ minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength);
+ frequencies[minValueNormalized] += k;
+ }
+ else
+ frequencies[minValueNormalized]++;
+ }
+ }
+ }
+
+
+ /**
+  * Samples reads from {@code inputFile} and passes each one to
+  * {@code concurrentCounter} until the running total of
+  * {@code read length - k} exceeds {@code statisticsSamples} or the input ends.
+  * Records are consumed as line pairs: a description line, which is skipped,
+  * followed by the read itself. Reads shorter than {@code k} are ignored, and
+  * a trailing description line without a read ends the sampling.
+  *
+  * @throws Exception on I/O failure, or whatever {@code concurrentCounter} throws
+  */
+ protected void initFrequency() throws Exception {
+     long numSampled = 0;
+     // try-with-resources closes the reader even when counting throws
+     try (BufferedReader bfrG = new BufferedReader(new FileReader(inputFile), bufSize)) {
+         while (bfrG.readLine() != null) {
+             String line = bfrG.readLine();
+             if (line == null) {
+                 break; // truncated input: description line without a read
+             }
+             if (line.length() < k) {
+                 continue;
+             }
+             concurrentCounter(line.toCharArray());
+             numSampled += line.length() - k;
+             if (numSampled > statisticsSamples) {
+                 break;
+             }
+         }
+     }
+ }
+
+
+ public long[] getStatistics() {
+ long[] stats = new long[numMmers];
+ for (int i = 0; i < numMmers; i++) {
+ stats[i] = frequencies[i];
+ }
+ return stats;
+ }
+
+
+}
diff --git a/src/dumbo/ExportUtils.java b/src/dumbo/ExportUtils.java
new file mode 100644
index 0000000..d3aee6d
--- /dev/null
+++ b/src/dumbo/ExportUtils.java
@@ -0,0 +1,146 @@
+package dumbo;
+
+import java.io.*;
+import java.util.AbstractMap;
+import java.util.HashMap;
+import java.util.LinkedList;
+
+public class ExportUtils {
+ /**
+  * Writes the given ordering to {@code ranks.txt} in the working directory,
+  * one value per line, in array order.
+  * <p>
+  * The export is best-effort: an {@link IOException} is printed to stderr
+  * and not rethrown, so callers are not interrupted by a failed write.
+  *
+  * @param currentOrdering values to write; element {@code i} becomes line {@code i + 1}
+  */
+ public void exportOrderingForCpp(long[] currentOrdering) {
+     File file = new File("ranks.txt");
+
+     // try-with-resources flushes and closes the writer on every path; the old
+     // finally block called close() on a null writer when the file could not be
+     // opened and silently swallowed the resulting exception.
+     try (BufferedWriter bf = new BufferedWriter(new FileWriter(file))) {
+         for (long rank : currentOrdering) {
+             bf.write(Long.toString(rank));
+             bf.newLine();
+         }
+     } catch (IOException e) {
+         // Also reports failures raised while closing the writer.
+         e.printStackTrace();
+     }
+ }
+
+// public long[] importOrdering(String fileName, int pivotLength) throws Exception {
+// String line;
+// LinkedList ranks = new LinkedList<>();
+//
+// File file = new File(fileName);
+// BufferedReader bfr = null;
+//
+// try {
+// bfr = new BufferedReader(new FileReader(file));
+// while ((line = bfr.readLine()) != null) {
+// ranks.add(Long.getLong(line));
+// }
+//
+// } catch (IOException e) {
+// e.printStackTrace();
+// } finally {
+// bfr.close();
+// }
+//
+// if (ranks.size() != (int) Math.pow(4, pivotLength)) {
+// throw new Exception("rank file of wrong size");
+// }
+// int i = 0;
+// long[] ordering = new long[(int) Math.pow(4, pivotLength)];
+// while (ranks.size() > 0) {
+// ordering[i] = ranks.pop();
+// i++;
+// }
+// return ordering;
+//
+// }
+
+ /**
+  * Writes the given frequencies to {@code freq.txt} in the working directory,
+  * one value per line, in array order.
+  * <p>
+  * The export is best-effort: an {@link IOException} is printed to stderr
+  * and not rethrown, so callers are not interrupted by a failed write.
+  *
+  * @param statFrequency values to write; element {@code i} becomes line {@code i + 1}
+  */
+ public void exportBinningForCpp(long[] statFrequency) {
+     File file = new File("freq.txt");
+
+     // try-with-resources flushes and closes the writer on every path; the old
+     // finally block called close() on a null writer when the file could not be
+     // opened and silently swallowed the resulting exception.
+     try (BufferedWriter bf = new BufferedWriter(new FileWriter(file))) {
+         for (long count : statFrequency) {
+             bf.write(Long.toString(count));
+             bf.newLine();
+         }
+     } catch (IOException e) {
+         // Also reports failures raised while closing the writer.
+         e.printStackTrace();
+     }
+ }
+
+ /** Maps the number left after removing "nodes" from each file name in ./Nodes to that file's length in bytes. */
+ public HashMap<Long, Long> getBytesPerFile() {
+     HashMap<Long, Long> bytesPerFile = new HashMap<>();
+     // listFiles() returns null when ./Nodes is missing or unreadable; report no files then
+     File[] listOfFiles = new File("./Nodes").listFiles();
+     if (listOfFiles == null)
+         return bytesPerFile;
+     for (File f : listOfFiles)
+         if (f.isFile())
+             bytesPerFile.put(Long.parseLong(f.getName().replace("nodes", "")), f.length());
+     return bytesPerFile;
+ }
+
+ /** Writes {@code arr} to {@code fileName} as index:value entries; the LinkedHashMap keeps them in index order. */
+ public void writeToFile(long[] arr, String fileName) {
+     HashMap<Long, Long> map = new java.util.LinkedHashMap<>();
+     for (int i = 0; i < arr.length; i++)
+         map.put((long) i, arr[i]);
+     writeToFile(map, fileName);
+ }
+
+ /**
+  * Writes {@code data} to {@code fileName} in the textual form
+  * <pre>
+  * x = {
+  * key:value,
+  * ...
+  * }
+  * </pre>
+  * with one {@code key:value,} line per entry, in the map's iteration order.
+  * <p>
+  * Best-effort: an {@link IOException} is printed to stderr and not rethrown.
+  *
+  * @param data     entries to write; keys and values are rendered via string concatenation
+  * @param fileName path of the file to create or overwrite
+  */
+ public void writeToFile(AbstractMap<?, ?> data, String fileName) {
+     File file = new File(fileName);
+
+     // try-with-resources closes the writer on every path, including a failed open
+     try (BufferedWriter bf = new BufferedWriter(new FileWriter(file))) {
+         bf.write("x = {");
+         bf.newLine();
+
+         for (java.util.Map.Entry<?, ?> entry : data.entrySet()) {
+             bf.write(entry.getKey() + ":" + entry.getValue() + ",");
+             bf.newLine();
+         }
+         bf.write("}");
+     } catch (IOException e) {
+         e.printStackTrace();
+     }
+ }
+}
diff --git a/src/dumbo/LoadCounter.java b/src/dumbo/LoadCounter.java
new file mode 100644
index 0000000..a1334da
--- /dev/null
+++ b/src/dumbo/LoadCounter.java
@@ -0,0 +1,210 @@
+package dumbo;
+
+import dumbo.Ordering.OrderingBase;
+import dumbo.Ordering.Standard.SignatureUtils;
+import dumbo.StringUtils;
+import net.agkn.hll.HLL;
+import net.agkn.hll.HLLType;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import java.security.MessageDigest;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ThreadPoolExecutor;
+
+public class LoadCounter {
+ private String inputFile;
+ private int readLen;
+ private int bufSize;
+ private int k;
+ private HashMap frequency;
+ private Object[] frequencyLocks;
+ private int[] frequencies;
+ private OrderingBase ordering;
+
+ private StringUtils stringUtils;
+
+ private int pivotLength;
+ private long statisticsSamples;
+
+ private int mask;
+ private int numMmers;
+
+
+
+
+ /** Stores the sampling parameters and allocates per-m-mer state: {@code 4^pivotLength} counters plus lock slots. */
+ public LoadCounter(
+         int pivotLength, String infile, int readLen, int bufSize, int k, long statisticsSamples, OrderingBase ordering) {
+     this.pivotLength = pivotLength;
+     this.statisticsSamples = statisticsSamples;
+     this.inputFile = infile;
+     this.readLen = readLen;
+     this.bufSize = bufSize;
+     this.k = k;
+     this.ordering = ordering;
+     stringUtils = new StringUtils();
+     numMmers = (int) Math.pow(4, pivotLength);
+     mask = numMmers - 1;
+     frequency = new HashMap<>(numMmers);
+     frequencies = new int[numMmers];
+     frequencyLocks = new Object[numMmers + 1];
+     // fill every slot, including the extra one at index numMmers that the old loop left null
+     Arrays.setAll(frequencyLocks, i -> new Object());
+ }
+
+
+ private void concurrentCounter(char[] lineCharArray) throws Exception {
+ int min_pos, minValue, minValueNormalized, currentValue, numSampled = 0;
+
+ String line = new String(lineCharArray);
+
+ if (stringUtils.isReadLegal(lineCharArray)) {
+
+ min_pos = ordering.findSmallest(lineCharArray, 0, k);
+ minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength);
+ minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength);
+ currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k);
+
+ updateStatistics(minValueNormalized, line, 0);
+
+ int bound = readLen - k + 1;
+ for (int i = 1; i < bound; i++) {
+ numSampled++;
+ currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;
+
+ if (i > min_pos) {
+ min_pos = ordering.findSmallest(lineCharArray, i, i + k);
+ minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength);
+ minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength);
+ } else {
+ int lastIndexInWindow = k + i - pivotLength;
+ if (ordering.compareMmer(currentValue, minValue) < 0) {
+ min_pos = lastIndexInWindow;
+ minValue = currentValue;
+ minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength);
+ }
+ }
+ updateStatistics(minValueNormalized, line, i);
+ }
+ }
+ }
+
+
+ /**
+  * Samples fixed-length reads from {@code inputFile} and counts them via
+  * {@code concurrentCounter} until the running total of {@code readLen - k}
+  * per read exceeds {@code statisticsSamples} or the input ends.
+  * <p>
+  * Each record is a description line (skipped) followed by exactly
+  * {@code readLen} characters and one separator character.
+  *
+  * @throws Exception on I/O failure or if counting a read fails
+  */
+ protected void initFrequency() throws Exception {
+     long numSampled = 0;
+     char[] lineCharArray = new char[readLen];
+
+     // try-with-resources closes the reader even when counting throws
+     try (BufferedReader bfrG = new BufferedReader(new FileReader(inputFile), bufSize)) {
+         while (bfrG.readLine() != null) {
+             // read() may deliver fewer characters than requested, so keep
+             // reading until the buffer is full or the input is exhausted
+             int filled = 0;
+             while (filled < readLen) {
+                 int n = bfrG.read(lineCharArray, filled, readLen - filled);
+                 if (n < 0) {
+                     break;
+                 }
+                 filled += n;
+             }
+             if (filled < readLen) {
+                 break; // truncated record: do not count stale buffer contents
+             }
+             bfrG.read(); // consume the separator after the read
+
+             concurrentCounter(lineCharArray);
+             numSampled += readLen - k;
+             if (numSampled > statisticsSamples) {
+                 break;
+             }
+         }
+     }
+ }
+
+ private void updateStatistics(int minValueNormalized, String line, int startPosition) {
+// synchronized (frequencyLocks[minValueNormalized])
+// {
+ if (!frequency.containsKey(minValueNormalized))
+// frequency.put(minValueNormalized, new HLL(11, 5)); /// about 3gb of ram before going to sparse
+ frequency.put(minValueNormalized, new HLL(11, 5, 0, true, HLLType.FULL));
+ frequency.get(minValueNormalized).addRaw(hashString(stringUtils.getCanonical(line.substring(startPosition, k + startPosition))));
+// }
+ //synchronized (frequencyLocks[numMmers])
+ frequencies[minValueNormalized]++;
+
+
+ }
+
+ private long hashString(String s) {
+ long h = 1125899906842597L; // prime
+ int len = s.length();
+ for (int i = 0; i < len; i++) {
+ h = 31 * h + s.charAt(i);
+ }
+ return h;
+ }
+
+
+ public long[] getStatistics() {
+ long[] stats = new long[numMmers];
+ for (int i = 0; i < numMmers; i++) {
+ if (frequency.containsKey(i)) {
+ stats[i] = frequency.get(i).cardinality();
+ }
+// if (i < stringUtils.getReversedMmer(i, pivotLength)) {
+// stats[i] += 1000;
+// }
+ }
+
+
+
+ // pure counters
+// System.out.println("x = [");
+// for (int i = 0; i < stats.length; i++) {
+// System.out.print(stats[i]+ ", ");
+// }
+// System.out.println("]");
+
+
+ // all ratios
+// System.out.println("x = [");
+// for (int j = 0; j < stats.length; j++) {
+// if(frequencies[j] != 0)
+// System.out.print((float) stats[j] / frequencies[j] + ", ");
+// else
+// System.out.print("0, ");
+// }
+// System.out.println("]");
+// ConcurrentLinkedQueuex = new ConcurrentLinkedQueue<>();
+// x.remove()
+
+// long max = Arrays.stream(stats).max().getAsLong();
+// for (int i = 0; i < numMmers; i++) {
+// if (stats[i] > 0 && stats[i] * 1.1 > max) {
+// stats[i] *= 1.1;
+// }
+// }
+ return stats;
+ }
+
+
+}
diff --git a/src/dumbo/MinimizerCounter.java b/src/dumbo/MinimizerCounter.java
new file mode 100644
index 0000000..fbfb63f
--- /dev/null
+++ b/src/dumbo/MinimizerCounter.java
@@ -0,0 +1,84 @@
+package dumbo;
+
+import dumbo.Ordering.OrderingBase;
+
+import java.io.*;
+import java.util.*;
+
+public class MinimizerCounter {
+
+ private int k;
+ private String kmerSetFile;
+ private int pivotLen;
+ private int bufSize;
+
+ private FileReader frG;
+ private BufferedReader bfrG;
+
+ private OrderingBase ordering;
+
+ private StringUtils stringUtils;
+
+ private long[] minimizerCounters;
+
+
+ public MinimizerCounter(int kk, String kmerSetFile, int pivotLength, int bufferSize, OrderingBase ordering) {
+ this.k = kk;
+ this.kmerSetFile = kmerSetFile;
+ this.pivotLen = pivotLength;
+ this.bufSize = bufferSize;
+ this.ordering = ordering;
+ this.stringUtils = new StringUtils();
+ minimizerCounters = new long[(int) Math.pow(4, pivotLength)];
+ }
+
+
+ /**
+  * Counts, for every sequence record in {@code kmerSetFile}, the minimizer
+  * chosen by {@code ordering} and returns a copy of the per-minimizer counters.
+  * <p>
+  * Records are read as line pairs: a description line (skipped) followed by a
+  * sequence line that must be exactly {@code k} characters long. Sequences
+  * rejected by {@code isReadLegal} are not counted.
+  *
+  * @return copy of the counters, indexed by normalized minimizer value
+  * @throws Exception if a sequence line is not of length {@code k}, on I/O
+  *                   failure, or if the ordering fails
+  */
+ private long[] getMinimizersCounters() throws Exception {
+     // try-with-resources closes the reader on every path, including the
+     // length check below
+     try (BufferedReader reader = new BufferedReader(new FileReader(kmerSetFile), bufSize)) {
+         while (reader.readLine() != null) {
+             String line = reader.readLine();
+             if (line == null) {
+                 break; // description line without a sequence: truncated input
+             }
+             if (line.length() != k) {
+                 throw new Exception("Input row is not of length k");
+             }
+             char[] lineCharArray = line.toCharArray();
+
+             if (stringUtils.isReadLegal(lineCharArray)) {
+                 int minPos = ordering.findSmallest(lineCharArray, 0, k);
+                 int minValue = stringUtils.getDecimal(lineCharArray, minPos, minPos + pivotLen);
+                 minimizerCounters[stringUtils.getNormalizedValue(minValue, pivotLen)]++;
+             }
+         }
+     }
+     return minimizerCounters.clone();
+ }
+
+ public long[] Run() throws Exception {
+ long time1 = 0;
+ long t1 = System.currentTimeMillis();
+ System.out.println("Minimizers counting Begin!");
+ long[] counters = getMinimizersCounters();
+
+ long t2 = System.currentTimeMillis();
+ time1 = (t2 - t1) / 1000;
+ System.out.println("Time used for counting minimizers appearances: " + time1 + " seconds!");
+ return counters;
+ }
+
+}
\ No newline at end of file
diff --git a/src/dumbo/Ordering/IterativeOrdering.java b/src/dumbo/Ordering/IterativeOrdering.java
new file mode 100644
index 0000000..7725809
--- /dev/null
+++ b/src/dumbo/Ordering/IterativeOrdering.java
@@ -0,0 +1,225 @@
+package dumbo.Ordering;
+
+import dumbo.Ordering.Standard.SignatureUtils;
+import dumbo.StringUtils;
+
+import java.io.*;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+
+public class IterativeOrdering extends OrderingBase {
+ private String inputFile;
+
+ private int bufSize;
+ private int k;
+ private SignatureUtils signatureUtils;
+ private HashMap> frequency;
+
+ private int statisticsSamples;
+ private int roundSamples;
+ private int rounds;
+ private int elementsToPush;
+
+ private double percentagePunishment;
+
+ private long[] statFrequency;
+
+ private boolean useSignature;
+
+ private boolean initialized;
+
+
+ public IterativeOrdering(
+ int pivotLength, String infile, int bufSize, int k, int roundSamples, int rounds,
+ int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature) {
+ super(pivotLength);
+ this.roundSamples = roundSamples;
+ this.rounds = rounds;
+ this.elementsToPush = elementsToPush;
+ this.statisticsSamples = statisticsSamples;
+ this.percentagePunishment = percentagePunishment;
+ this.useSignature = useSignature;
+ this.inputFile = infile;
+
+ this.bufSize = bufSize;
+ this.k = k;
+ signatureUtils = new SignatureUtils(pivotLength);
+ initialized = false;
+ }
+
+ public IterativeOrdering(
+ int pivotLength, String infile, int bufSize, int k, int roundSamples, int rounds,
+ int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, int[] initialOrdering) {
+ this(pivotLength, infile, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature);
+ mmerRanks = initialOrdering.clone();
+ initialized = true;
+ badArgumentsThrow();
+ }
+
+ public IterativeOrdering(
+ int pivotLength, String infile, int bufSize, int k, int roundSamples, int rounds,
+ int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, OrderingBase initialOrdering) throws IOException {
+ this(pivotLength, infile, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature);
+ mmerRanks = initialOrdering.getRanks().clone();
+ initialized = true;
+ badArgumentsThrow();
+ }
+
+ private void badArgumentsThrow() {
+ if (mmerRanks.length != numMmers)
+ throw new IllegalArgumentException("initialOrdering is not of correct size");
+ if (useSignature)
+ throw new IllegalArgumentException("Can't initialize ordering from outside with useSignature as true");
+ }
+
+
+ protected void initFrequency() throws Exception {
+
+ if (!initialized) {
+ for (int i = 0; i < numMmers; i++) {
+ int canonical = Math.min(i, stringUtils.getReversedMmer(i, pivotLength));
+ mmerRanks[i] = canonical;
+ mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] = canonical;
+ }
+ if (useSignature) {
+ for (int i = 0; i < numMmers; i++) {
+ if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) {
+ mmerRanks[i] += numMmers;
+ mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] += numMmers;
+ }
+ }
+ }
+ }
+
+
+ boolean keepSample = true;
+ int numSampled = 0;
+ int roundNumber = 0;
+
+ FileReader frG = new FileReader(inputFile);
+ BufferedReader bfrG = new BufferedReader(frG, bufSize);
+
+ statFrequency = new long[numMmers];
+ HashMap> pmerFrequency = new HashMap<>(roundSamples * 2);
+
+ String skippedDescribeLine, line;
+ char[] lineCharArray;// = new char[readLen];
+ int readLen;
+
+
+ int min_pos = -1;
+ int minValue, currentValue, minValueNormalized;
+
+ while (keepSample && (skippedDescribeLine = bfrG.readLine()) != null) {
+
+ line = bfrG.readLine();
+ readLen = line.length();
+ lineCharArray = line.toCharArray();
+
+ if(readLen < k)
+ continue;
+
+// bfrG.read(lineCharArray, 0, readLen);
+// bfrG.read();
+// String line = new String(lineCharArray);
+
+ if (stringUtils.isReadLegal(lineCharArray)) {
+
+ min_pos = findSmallest(lineCharArray, 0, k);
+ minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength);
+ minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength);
+ currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k);
+
+ updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, 0);
+
+ int bound = readLen - k + 1;
+ for (int i = 1; i < bound; i++) {
+ numSampled++;
+ currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;
+
+ if (i > min_pos) {
+ min_pos = findSmallest(lineCharArray, i, i + k);
+ minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength);
+ minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength);
+
+ updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i);
+ } else {
+ int lastIndexInWindow = k + i - pivotLength;
+ if (compareMmer(currentValue, minValue) < 0) {
+ min_pos = lastIndexInWindow;
+ minValue = currentValue;
+ minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength);
+
+ updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i);
+ }
+ }
+ updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i);
+ }
+ }
+
+ if (numSampled >= roundSamples) {
+ roundNumber++;
+ if (roundNumber <= rounds) { // TODO: SHOULD THIS BE < and not <=
+ numSampled = 0;
+ adaptOrdering(pmerFrequency);
+ pmerFrequency.clear();
+ if (roundNumber == rounds) {
+ System.out.println("Sampling for binning round");
+ roundSamples = statisticsSamples;
+ }
+ } else {
+ keepSample = false;
+ }
+ }
+ frequency = pmerFrequency;
+
+ }
+ normalize();
+ bfrG.close();
+ frG.close();
+ }
+
+ private void updateStatistics(int roundNumber, HashMap> pmerFrequency, int minValueNormalized, String line, int startPosition) {
+ if (roundNumber == rounds)
+ statFrequency[minValueNormalized]++;
+ else {
+ if (!pmerFrequency.containsKey(minValueNormalized))
+ pmerFrequency.put(minValueNormalized, new HashSet<>());
+ pmerFrequency.get(minValueNormalized).add(stringUtils.getCanonical(line.substring(startPosition, k + startPosition)));
+ }
+ }
+
+
+ /**
+  * Adds {@code numMmers * percentagePunishment} to the rank of the
+  * {@code elementsToPush} m-mers with the largest entry sets in
+  * {@code pmerFrequency}; the reversed m-mer is given the same rank. Stops early
+  * once no positive count is left instead of repeatedly punishing index 0.
+  */
+ private void adaptOrdering(HashMap<Integer, HashSet<String>> pmerFrequency) {
+     int[] counts = new int[numMmers];
+     pmerFrequency.forEach((mmer, kmers) -> counts[mmer] = kmers.size());
+     for (int pushed = 0; pushed < elementsToPush; pushed++) {
+         int top = -1, topCount = 0;
+         for (int m = 0; m < counts.length; m++)
+             if (counts[m] > topCount) { topCount = counts[m]; top = m; }
+         if (top < 0) break; // nothing observed in this round is left to push
+         int reversed = stringUtils.getReversedMmer(top, pivotLength);
+         int newRank = mmerRanks[top] + (int) (numMmers * percentagePunishment);
+         mmerRanks[top] = newRank;
+         mmerRanks[reversed] = newRank; // TODO: might not be necessary to change both.
+         counts[top] = 0;
+         counts[reversed] = 0;
+     }
+ }
+
+
+ @Override
+ public void initializeRanks() throws Exception {
+ isRankInitialized = true;
+ initFrequency();
+ }
+
+}
diff --git a/src/dumbo/Ordering/OrderingBase.java b/src/dumbo/Ordering/OrderingBase.java
new file mode 100644
index 0000000..ecf76db
--- /dev/null
+++ b/src/dumbo/Ordering/OrderingBase.java
@@ -0,0 +1,73 @@
+package dumbo.Ordering;
+
+import dumbo.StringUtils;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+public abstract class OrderingBase {
+
+ protected int pivotLength;
+ protected int numMmers;
+ protected int mask;
+
+ protected StringUtils stringUtils;
+
+ protected int[] mmerRanks;
+ protected boolean isRankInitialized;
+
+ public OrderingBase(int pivotLength) {
+ this.pivotLength = pivotLength;
+ this.numMmers = (int) Math.pow(4, pivotLength);
+ this.mask = numMmers - 1;
+ this.stringUtils = new StringUtils();
+ this.mmerRanks = new int[numMmers];
+ this.isRankInitialized = false;
+ }
+
+
+ public abstract void initializeRanks() throws Exception;
+
+ public int compareMmer(int x, int y) throws Exception {
+ if (!isRankInitialized)
+ throw new Exception("problema - rank not initialized");
+
+ int a = stringUtils.getNormalizedValue(x, pivotLength);
+ int b = stringUtils.getNormalizedValue(y, pivotLength);
+
+ if (a == b) return 0;
+ if (mmerRanks[a] < mmerRanks[b]) return -1;
+ return 1;
+ }
+
+ public int[] getRanks() {
+ return mmerRanks.clone();
+ }
+
+ public int findSmallest(char[] a, int from, int to) throws Exception {
+ int min_pos = from;
+ int minValue = stringUtils.getDecimal(a, min_pos, min_pos + pivotLength);
+ int currentValue = minValue;
+ for (int i = from + 1; i <= to - pivotLength; i++) {
+ currentValue = ((currentValue << 2) + StringUtils.valTable[a[i + pivotLength - 1] - 'A']) & mask;
+ if (compareMmer(minValue, currentValue) > 0) {
+ min_pos = i;
+ minValue = currentValue;
+ }
+ }
+
+ return min_pos;
+ }
+
+ protected void normalize() {
+ Integer[] temp = new Integer[mmerRanks.length];
+ for (int i = 0; i < temp.length; i++)
+ temp[i] = i;
+
+ Arrays.sort(temp, Comparator.comparingLong(a -> mmerRanks[a]));
+ for (int i = 0; i < temp.length; i++) {
+ mmerRanks[temp[i]] = i;
+ }
+ }
+}
diff --git a/src/dumbo/Ordering/Standard/LexicographicOrdering.java b/src/dumbo/Ordering/Standard/LexicographicOrdering.java
new file mode 100644
index 0000000..4a318e0
--- /dev/null
+++ b/src/dumbo/Ordering/Standard/LexicographicOrdering.java
@@ -0,0 +1,36 @@
+package dumbo.Ordering.Standard;
+
+
+import dumbo.Ordering.OrderingBase;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+public class LexicographicOrdering extends OrderingBase {
+
+    /** Creates an ordering over m-mers of length {@code pivotLength}; ranks are not computed yet. */
+    public LexicographicOrdering(int pivotLength) {
+        super(pivotLength);
+    }
+
+    /** Ranks every m-mer by its position in the {@link #rawCompareMmer(int, int)} order and marks the ranks initialized. */
+    @Override
+    public void initializeRanks() throws IOException {
+        Integer[] sorted = new Integer[numMmers];
+        Arrays.setAll(sorted, value -> value);
+        Arrays.sort(sorted, (left, right) -> rawCompareMmer(left, right));
+        int rank = 0;
+        for (Integer mmer : sorted) {
+            mmerRanks[mmer] = rank++;
+        }
+        System.out.println("finish init rank");
+        isRankInitialized = true;
+    }
+
+    /** Compares two m-mers by the integer value of their normalized forms. */
+    protected int rawCompareMmer(int x, int y) {
+        int normalizedX = stringUtils.getNormalizedValue(x, pivotLength);
+        int normalizedY = stringUtils.getNormalizedValue(y, pivotLength);
+        return Integer.compare(normalizedX, normalizedY);
+    }
+}
diff --git a/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java b/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java
new file mode 100644
index 0000000..f5e93aa
--- /dev/null
+++ b/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java
@@ -0,0 +1,54 @@
+package dumbo.Ordering.Standard;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+
+public class LexicographicSignatureOrdering extends LexicographicOrdering {
+ protected SignatureUtils signatureUtils;
+
+ public LexicographicSignatureOrdering(int pivotLen) throws IOException {
+ super(pivotLen);
+ signatureUtils = new SignatureUtils(pivotLen);
+ }
+
+ @Override
+ public void initializeRanks() throws IOException {
+ Arrays.fill(mmerRanks, Integer.MAX_VALUE);
+
+ HashSet normalizedAllowedMmers = new HashSet<>();
+ for (int i = 0; i < numMmers; i++) {
+ if (signatureUtils.isAllowed(stringUtils.getNormalizedValue(i, pivotLength)))
+ normalizedAllowedMmers.add(stringUtils.getNormalizedValue(i, pivotLength));
+ }
+
+ Integer[] mmers = new Integer[normalizedAllowedMmers.size()];
+ normalizedAllowedMmers.toArray(mmers);
+
+ Arrays.sort(mmers);
+
+ for (int i = 0; i < mmers.length; i++) {
+ mmerRanks[mmers[i]] = i;
+ }
+ normalize();
+ System.out.println("finish init rank");
+ isRankInitialized = true;
+ }
+
+ @Override
+ protected int rawCompareMmer(int x, int y) {
+ int a = stringUtils.getNormalizedValue(x, pivotLength);
+ int b = stringUtils.getNormalizedValue(y, pivotLength);
+
+ boolean aAllowed = signatureUtils.isAllowed(a);
+ boolean bAllowed = signatureUtils.isAllowed(b);
+
+ if (!aAllowed && bAllowed) {
+ return 1;
+ } else if (!bAllowed && aAllowed) {
+ return -1;
+ }
+
+ return Integer.compare(a, b);
+ }
+}
diff --git a/src/dumbo/Ordering/Standard/RandomOrdering.java b/src/dumbo/Ordering/Standard/RandomOrdering.java
new file mode 100644
index 0000000..bb0a0c5
--- /dev/null
+++ b/src/dumbo/Ordering/Standard/RandomOrdering.java
@@ -0,0 +1,41 @@
+package dumbo.Ordering.Standard;
+
+import dumbo.Ordering.OrderingBase;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+public class RandomOrdering extends OrderingBase {
+    /** Mask XOR-ed into each normalized m-mer value before comparison. */
+    private int xor;
+
+    /** Creates the ordering for m-mers of length {@code pivotLen}, scrambled by {@code xor}. */
+    public RandomOrdering(int pivotLen, int xor) {
+        super(pivotLen);
+        this.xor = xor;
+    }
+
+    /**
+     * Ranks every m-mer by its position in the order defined by
+     * {@link #rawCompareMmer(int, int)} and marks the ranks as initialized.
+     */
+    @Override
+    public void initializeRanks() throws IOException {
+        Integer[] sorted = new Integer[numMmers];
+        Arrays.setAll(sorted, value -> value);
+        Arrays.sort(sorted, (left, right) -> rawCompareMmer(left, right));
+        int rank = 0;
+        for (Integer mmer : sorted) {
+            mmerRanks[mmer] = rank++;
+        }
+        System.out.println("finish init rank");
+        isRankInitialized = true;
+    }
+
+    /** Compares the normalized values of {@code x} and {@code y} after XOR-ing each with the mask. */
+    protected int rawCompareMmer(int x, int y) {
+        int scrambledX = stringUtils.getNormalizedValue(x, pivotLength) ^ xor;
+        int scrambledY = stringUtils.getNormalizedValue(y, pivotLength) ^ xor;
+        return Integer.compare(scrambledX, scrambledY);
+    }
+}
\ No newline at end of file
diff --git a/src/dumbo/Ordering/Standard/SignatureUtils.java b/src/dumbo/Ordering/Standard/SignatureUtils.java
new file mode 100644
index 0000000..86898ed
--- /dev/null
+++ b/src/dumbo/Ordering/Standard/SignatureUtils.java
@@ -0,0 +1,95 @@
+package dumbo.Ordering.Standard;
+
+public class SignatureUtils {
+
+    private int len;
+    /** Per-m-mer memo: 0 = not evaluated yet, 1 = allowed, -1 = not allowed. */
+    protected byte[] isPmerAllowed;
+
+    /** Creates an empty memo table with one entry per m-mer of length {@code len}. */
+    public SignatureUtils(int len) {
+        this.len = len;
+        isPmerAllowed = new byte[(int) Math.pow(4, len)];
+    }
+
+    /**
+     * Tells whether the m-mer with the given integer encoding passes the
+     * signature rules, memoizing the verdict in {@code isPmerAllowed}.
+     * <p>
+     * The rule scan shifts its working copy of {@code mmer}, so the verdict is
+     * stored under the original value. Storing it under the shifted value, as
+     * this method used to, planted wrong verdicts in other m-mers' slots.
+     *
+     * @param mmer encoded m-mer; also the index into the memo table
+     * @return {@code true} if no rule rejects the m-mer
+     */
+    public boolean isAllowed(int mmer) {
+        int cached = isPmerAllowed[mmer];
+        if (cached != 0) {
+            return cached == 1;
+        }
+        boolean allowed = passesSignatureRules(mmer);
+        isPmerAllowed[mmer] = (byte) (allowed ? 1 : -1);
+        return allowed;
+    }
+
+    /** Applies the signature rules to one encoded m-mer without touching the memo table. */
+    private boolean passesSignatureRules(int mmer) {
+        if ((mmer & 0x3f) == 0x3f) { // TTT suffix
+            return false;
+        }
+        if ((mmer & 0x3f) == 0x3b) { // TGT suffix
+            return false;
+        }
+        // Original note: "TG* suffix !!!! consider issue #152".
+        // NOTE(review): the mask tests bits 2-5 all set; confirm the intended pattern.
+        if ((mmer & 0x3c) == 0x3c) {
+            return false;
+        }
+        for (int j = 0; j < len - 3; ++j) {
+            if ((mmer & 0xf) == 0) { // AA inside
+                return false;
+            }
+            mmer >>= 2;
+        }
+        if (mmer == 0) { // AAA prefix
+            return false;
+        }
+        if (mmer == 0x04) { // ACA prefix
+            return false;
+        }
+        return (mmer & 0xf) != 0; // false for *AA prefix
+    }
+
+    /**
+     * Character-based variant of the check for the m-mer that starts at
+     * {@code a[from]}; the verdict is memoized under {@code aDecimal}.
+     *
+     * @param a        characters containing the m-mer
+     * @param from     index of the m-mer's first character in {@code a}
+     * @param aDecimal memo-table index for this m-mer, supplied by the caller
+     * @return {@code true} if none of the character rules rejects the m-mer
+     */
+    public boolean isAllowed(char[] a, int from, int aDecimal) {
+        int cached = isPmerAllowed[aDecimal];
+        if (cached != 0) {
+            return cached == 1;
+        }
+
+        int lastIndex = from + len - 1;
+        boolean rejected = false;
+        if (a[from] == 'A' && a[from + 2] == 'A') {
+            rejected = a[from + 1] <= 'C'; // C or A
+        } else if (a[lastIndex] == 'T' && a[lastIndex - 2] == 'T') {
+            rejected = a[lastIndex - 1] >= 'G'; // G or T
+        }
+        // scan for "AA" starting two characters into the m-mer
+        for (int i = from + 2; !rejected && i < lastIndex; i++) {
+            rejected = a[i] == 'A' && a[i + 1] == 'A';
+        }
+
+        isPmerAllowed[aDecimal] = (byte) (rejected ? -1 : 1);
+        return !rejected;
+    }
+
+}
diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java
new file mode 100644
index 0000000..2adbb9e
--- /dev/null
+++ b/src/dumbo/OrderingOptimizer.java
@@ -0,0 +1,157 @@
+package dumbo;
+
+import dumbo.Ordering.*;
+import dumbo.Ordering.Standard.FrequencyOrdering;
+import dumbo.Ordering.Standard.LexicographicOrdering;
+import dumbo.Ordering.Standard.RandomOrdering;
+import dumbo.Ordering.Standard.LexicographicSignatureOrdering;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Random;
+
+public class OrderingOptimizer {
+
+    /**
+     * Builds the ordering selected with {@code -v}, exports its ranks, then optionally counts
+     * minimizer appearances ({@code -kmers-file}) and collects bin sizes ({@code -statSamples}).
+     *
+     * @param args {@code -flag value} pairs, or {@code -help} as the first argument
+     * @throws Exception if building the ordering, exporting, or collecting statistics fails
+     */
+    public static void main(String[] args) throws Exception {
+
+        String infile = null;
+
+        int k = 60, pivot_len = 8, bufferSize = 81920;
+        int readLen = 124;
+        String orderingName = "iterativeOrdering";
+        int numRounds = 0, elementsToPush = 0, samplesPerRound = 0;
+        long statSamples = 0;
+        double punishPercentage = 1;
+        String version = "10";
+        String kmerSetFile = null;
+
+        if (args.length > 0 && args[0].equals("-help")) {
+            System.out.print("Usage: java -jar BuildDeBruijnGraph.jar -in InputPath -k k -L readLength[options]\n" +
+                    "Options Available: \n" +
+                    "[-p pivotLength] : (Integer) Pivot Length. Default: 12" + "\n" +
+                    "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" +
+                    "[-o order] : lexico or sig or uhs or uhs_sig" + "\n");
+            return;
+        }
+
+        for (int i = 0; i < args.length; i += 2) {
+            String flag = args[i];
+            if (i + 1 >= args.length) {
+                // A trailing flag has no value; report it instead of indexing past the array.
+                System.out.println("Wrong with arguments. Abort!");
+                System.out.println(flag);
+                return;
+            }
+            String value = args[i + 1];
+            switch (flag) {
+                case "-in": infile = value; break;
+                case "-v": version = value; break;
+                case "-k": k = Integer.parseInt(value); break;
+                case "-kmers-file": kmerSetFile = value; break;
+                case "-p": pivot_len = Integer.parseInt(value); break;
+                case "-b": bufferSize = Integer.parseInt(value); break;
+                case "-L": readLen = Integer.parseInt(value); break;
+                case "-rounds": numRounds = Integer.parseInt(value); break;
+                case "-samples": samplesPerRound = Integer.parseInt(value); break;
+                case "-elementsToPush": elementsToPush = Integer.parseInt(value); break;
+                case "-statSamples": statSamples = Long.parseLong(value); break;
+                case "-punishPercentage": punishPercentage = Double.parseDouble(value); break;
+                default:
+                    System.out.println("Wrong with arguments. Abort!");
+                    System.out.println(flag);
+                    return;
+            }
+        }
+
+        System.out.println("Optimizing an ordering:");
+        System.out.print("Input File: " + infile + "\n" +
+                "Kmer Length: " + k + "\n" +
+                "Pivot Length: " + pivot_len + "\n" +
+                "R/W Buffer Size: " + bufferSize + "\n" +
+                "Read length: " + readLen + "\n" +
+                "Ordering: " + orderingName + "\n");
+
+        OrderingBase ordering = null;
+        System.out.println(version);
+        switch (version) {
+            case "9-normalized":
+                IterativeOrdering iterative = new IterativeOrdering(pivot_len, infile, bufferSize, k,
+                        samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false);
+                iterative.initializeRanks();
+                ordering = iterative;
+                break;
+            case "9-normalized-signature":
+                IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, bufferSize, k,
+                        samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, true);
+                iterativeSignature.initializeRanks();
+                ordering = iterativeSignature;
+                break;
+            case "signature":
+                ordering = new LexicographicSignatureOrdering(pivot_len);
+                ordering.initializeRanks();
+                break;
+            case "lexicographic":
+                ordering = new LexicographicOrdering(pivot_len);
+                ordering.initializeRanks();
+                break;
+            case "random":
+                Random r = new Random();
+                ordering = new RandomOrdering(pivot_len, r.nextInt());
+                ordering.initializeRanks();
+                break;
+            default:
+                // Without this branch an unrecognised -v (including the default "10") left the
+                // ordering null and the export below failed with a NullPointerException.
+                System.out.println("Unknown ordering version (-v): " + version);
+                return;
+        }
+
+        ExportUtils exportUtils = new ExportUtils();
+
+        // exportOrderingForCpp is called with a long[], so widen the int ranks element by element.
+        int[] ranks = ordering.getRanks();
+        long[] longRanks = new long[ranks.length];
+        for (int i = 0; i < ranks.length; i++) {
+            longRanks[i] = ranks[i];
+        }
+
+        exportUtils.exportOrderingForCpp(longRanks);
+
+        long[] counters;
+        if (kmerSetFile != null) {
+            try {
+                System.out.println("Counting minimizer appearances:");
+                System.out.print("Input File: " + kmerSetFile + "\n" +
+                        "Kmer Length: " + k + "\n" +
+                        "Pivot Length: " + pivot_len + "\n" +
+                        "Ordering: " + orderingName + "\n");
+
+                MinimizerCounter minimizerCounter = new MinimizerCounter(k, kmerSetFile, pivot_len, bufferSize, ordering);
+                counters = minimizerCounter.Run();
+
+                exportUtils.writeToFile(counters, orderingName + pivot_len + "_" + "kmers");
+            } catch (Exception E) {
+                // Best effort: a failed k-mer count is reported but does not stop the stats step.
+                System.out.println("Exception caught!");
+                E.printStackTrace();
+            }
+        }
+        if (statSamples > 0) {
+            System.out.println("Collecting stats for binning");
+
+            BinSizeCounter counter = new BinSizeCounter(pivot_len, infile, bufferSize, k, statSamples, ordering);
+            counter.initFrequency();
+
+            counters = counter.getStatistics();
+            exportUtils.exportBinningForCpp(counters);
+        }
+    }
+
+}
diff --git a/src/buildgraph/StringUtils.java b/src/dumbo/StringUtils.java
similarity index 51%
rename from src/buildgraph/StringUtils.java
rename to src/dumbo/StringUtils.java
index 450dd19..98f57b4 100644
--- a/src/buildgraph/StringUtils.java
+++ b/src/dumbo/StringUtils.java
@@ -1,4 +1,4 @@
-package buildgraph;
+package dumbo;
public class StringUtils {
@@ -17,6 +17,18 @@ public int getDecimal(char[] a, int from, int to){
return val;
}
+ public long getLDecimal(char[] a, int from, int to){
+
+ long val=0;
+
+ for(int i=from; i>= 2;
+ }
+ return rev;
+ }
+
+ /**
+  * Chooses between {@code line} and the string built from {@code getReversedRead} applied to
+  * its characters: whichever has the smaller character at the first position where the two
+  * differ. When no position differs, the string built from {@code getReversedRead} is returned.
+  */
+ public String getCanonical(String line) {
+     final String reversed = new String(getReversedRead(line.toCharArray()));
+     final int n = line.length();
+     int pos = 0;
+     while (pos < n) {
+         final char fwd = line.charAt(pos);
+         final char rev = reversed.charAt(pos);
+         if (fwd != rev) {
+             // The first mismatch decides the result.
+             return fwd < rev ? line : reversed;
+         }
+         pos++;
+     }
+     return reversed;
+ }
+
+ /**
+  * Returns the smaller of {@code minValue} and {@code getReversedMmer(minValue, pivotLength)}.
+  */
+ public int getNormalizedValue(int minValue, int pivotLength) {
+     final int reversed = getReversedMmer(minValue, pivotLength);
+     return minValue <= reversed ? minValue : reversed;
+ }
}