From 58d2aa462f7b7eec33dae16c8b2139fe04a9d0fc Mon Sep 17 00:00:00 2001 From: danflomin Date: Mon, 22 Feb 2021 17:33:15 +0200 Subject: [PATCH 01/44] FIXED BUGSSS IN UHS --- src/buildgraph/BuildDeBruijnGraph.java | 43 ++-- .../Ordering/IterativeOrdering.java | 228 ++++++++++++++++++ .../Ordering/IterativeOrdering2.java | 224 +++++++++++++++++ .../Ordering/IterativeOrdering3.java | 155 ++++++++++++ src/buildgraph/Ordering/SignatureUtils.java | 53 ++++ .../UHS/UHSFrequencySignatureOrdering.java | 6 +- .../Ordering/UHS/UHSOrderingBase.java | 44 +++- .../Ordering/UHS/UHSSignatureOrdering.java | 41 +++- src/buildgraph/Partition.java | 12 +- 9 files changed, 764 insertions(+), 42 deletions(-) create mode 100644 src/buildgraph/Ordering/IterativeOrdering.java create mode 100644 src/buildgraph/Ordering/IterativeOrdering2.java create mode 100644 src/buildgraph/Ordering/IterativeOrdering3.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 00ea1eb..59b954b 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -8,6 +8,7 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.lang.reflect.Array; import java.util.AbstractMap; import java.util.HashMap; @@ -21,13 +22,13 @@ public static void main(String[] args) throws IOException { // String infile = "/home/gaga/data-scratch/yaelbenari/datas/beeData.fastq"; // String infile = "/home/gaga/data-scratch/yaelbenari/datas/workspace/72.fastq"; - int k = 60, pivot_len = 7, bufferSize = 8192, numThreads = 1, hsmapCapacity = 10000000; + int k = 60, pivot_len = 8, bufferSize = 8192, numThreads = 20, hsmapCapacity = 10000000; // int readLen = 124; // int readLen = 101; int readLen = 100; int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// boolean readable = false; - String orderingName = "uhs_freq_sig"; + String orderingName = "uhs_sig_freq"; int xor = 0; //11101101; if (args.length > 0 && args[0].equals("-help")) { @@ -49,9 +50,9 @@ else if(args[i].equals("-k")) k = new Integer(args[i+1]); else if(args[i].equals("-NB")) numBlocks = new Integer(args[i+1]); - else - if(args[i].equals("-o")) - orderingName = args[i+1]; +// else +// if(args[i].equals("-o")) +// orderingName = args[i+1]; else if(args[i].equals("-p")) pivot_len = new Integer(args[i+1]); else if(args[i].equals("-b")) @@ -69,18 +70,6 @@ else if(args[i].equals("-r")) } - IOrdering ordering; - switch (orderingName) - { - case "lexico": - ordering = new LexicographicOrdering(pivot_len); - break;; - case "uhs": - ordering = new UHSSignatureOrdering(xor, pivot_len, false, true); - case "random": - //ordering = - break; - } // UHSFrequencySignatureOrdering uhs_freq_sig = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, true); // uhs_freq_sig.initRank(); @@ -93,8 +82,25 @@ else if(args[i].equals("-r")) // }}; - IOrdering ordering = orderingNames.get(orderingName); +// IOrdering ordering = orderingNames.get(orderingName); // IOrdering ordering = new LexicographicSignatureOrdering(pivot_len); + +// orderingName = "iterativeOrdering"; + //IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k); /// this is the first version 100000, 10000, 1 +// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 30000, 1, 10); +// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, 10); +// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, (int)Math.pow(4,pivot_len)/100); + +// IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k); + +// IterativeOrdering2 ordering = new IterativeOrdering2(pivot_len, infile, readLen, bufferSize, k, 100000, 10000, 5, (int)Math.pow(4,pivot_len)/100); + +// ordering.initFrequency(); + + UHSFrequencySignatureOrdering ordering = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true); + ordering.initRank(); + + Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity); @@ -116,6 +122,7 @@ else if(args[i].equals("-r")) AbstractMap distinctKmersPerPartition = map.Run(numThreads); BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); + System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); diff --git a/src/buildgraph/Ordering/IterativeOrdering.java b/src/buildgraph/Ordering/IterativeOrdering.java new file mode 100644 index 0000000..23244e4 --- /dev/null +++ b/src/buildgraph/Ordering/IterativeOrdering.java @@ -0,0 +1,228 @@ +package buildgraph.Ordering; + +import buildgraph.Ordering.UHS.UHSOrderingBase; +import buildgraph.Ordering.UHS.UHSSignatureOrdering; +import buildgraph.StringUtils; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.lang.reflect.Array; +import java.util.Arrays; +import java.util.HashSet; + +public class IterativeOrdering implements IOrdering { + private String inputFile; + private int readLen; + private int bufSize; + private int pivotLength; + private int k; + private long[] currentOrdering; + private StringUtils stringUtils; + + private int roundSamples; + private int rounds; + private int elementsToPush; + private int pushBy; + + public IterativeOrdering(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering, int roundSamples, int rounds, int elementsToPush, int pushBy) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.pushBy = pushBy; + stringUtils = new StringUtils(); + } + + public IterativeOrdering(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)], 100000, 10000, 1, (int) Math.pow(4, pivotLength)); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + } + + public IterativeOrdering(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int pushBy) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)], roundSamples, rounds, elementsToPush, pushBy); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + } + + + public void initFrequency() throws IOException { + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; + + String describeline; + char[] lineCharArray = new char[readLen]; + char[] currentArray; + + + int prepos, min_pos = -1; + int[] flag = new int[1]; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + + if (stringUtils.isReadLegal(lineCharArray)) { + int len = readLen; + char[] revCharArray = stringUtils.getReversedRead(lineCharArray); + + min_pos = findPosOfMin(lineCharArray, revCharArray, 0, k, flag); + //int initialMinPos = min_pos; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + + if (i > (flag[0] == 0 ? min_pos : len - min_pos - pivotLength)) { + currentArray = flag[0] == 0 ? lineCharArray : revCharArray; + int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); + + min_pos = findPosOfMin(lineCharArray, revCharArray, i, i + k, flag); + //initialMinPos = min_pos; + + if (temp != (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLength) : calPosNew(revCharArray, min_pos, min_pos + pivotLength))) { + prepos = temp; + pmerFrequency[prepos]++; + } + + } else { + + if (strcmp(lineCharArray, revCharArray, k + i - pivotLength, len - i - k, pivotLength) < 0) { + if (strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLength, min_pos, pivotLength) < 0) { + + currentArray = flag[0] == 0 ? lineCharArray : revCharArray; + int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); + + min_pos = k + i - pivotLength; + if (temp != calPosNew(lineCharArray, min_pos, min_pos + pivotLength)) { + prepos = temp; + pmerFrequency[prepos]++; + } + + flag[0] = 0; + + } + } else { + if (strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLength) < 0) { + + currentArray = flag[0] == 0 ? lineCharArray : revCharArray; + int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); + + min_pos = -k - i + len; + + if (temp != calPosNew(revCharArray, min_pos, min_pos + pivotLength)) { + prepos = temp; + pmerFrequency[prepos]++; + } + flag[0] = 1; + } + } + } + } + currentArray = flag[0] == 0 ? lineCharArray : revCharArray; + prepos = calPosNew(currentArray, min_pos, min_pos + pivotLength); + pmerFrequency[prepos]++; + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber == rounds) + keepSample = false; + else + numSampled = 0; + adaptOrdering(pmerFrequency); + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + } + + + } + bfrG.close(); + frG.close(); + } + + private void adaptOrdering(long[] pmerFrequency) { + for (int i = 0; i < elementsToPush; i++) { + long biggest = Arrays.stream(pmerFrequency).max().getAsLong(); + for (int j = 0; j < pmerFrequency.length; j++) { + if (pmerFrequency[j] == biggest) { + long newRank = currentOrdering[j] + pushBy; + currentOrdering[j] = newRank; + currentOrdering[getReversed(j)] = newRank; + pmerFrequency[j] = 0; + pmerFrequency[getReversed(j)] = 0; + } + } + } + } + + private int calPosNew(char[] a, int from, int to) { + return stringUtils.getDecimal(a, from, to); + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throws IOException { + int len = a.length; + int pos1 = findSmallest(a, from, to); + int pos2 = findSmallest(b, len - to, len - from); + + if (strcmp(a, b, pos1, pos2, pivotLength) < 0) { + flag[0] = 0; + return pos1; + } else { + flag[0] = 1; + return pos2; + } + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } +} diff --git a/src/buildgraph/Ordering/IterativeOrdering2.java b/src/buildgraph/Ordering/IterativeOrdering2.java new file mode 100644 index 0000000..fbf8dbd --- /dev/null +++ b/src/buildgraph/Ordering/IterativeOrdering2.java @@ -0,0 +1,224 @@ +package buildgraph.Ordering; + +import buildgraph.StringUtils; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.Arrays; + +public class IterativeOrdering2 implements IOrdering { + private String inputFile; + private int readLen; + private int bufSize; + private int pivotLength; + private int k; + private long[] currentOrdering; + private StringUtils stringUtils; + + private int roundSamples; + private int rounds; + private int elementsToPush; + private int pushBy; + + public IterativeOrdering2(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering, int roundSamples, int rounds, int elementsToPush, int pushBy) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.pushBy = pushBy; + stringUtils = new StringUtils(); + } + + public IterativeOrdering2(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)], 100000, 10000, 1, (int) Math.pow(4, pivotLength)); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + } + + public IterativeOrdering2(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int pushBy) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)], roundSamples, rounds, elementsToPush, pushBy); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + } + + + public void initFrequency() throws IOException { + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; + + String describeline; + char[] lineCharArray = new char[readLen]; + char[] currentArray; + + + int prepos, min_pos = -1; + int[] flag = new int[1]; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + + if (stringUtils.isReadLegal(lineCharArray)) { + int len = readLen; + char[] revCharArray = stringUtils.getReversedRead(lineCharArray); + + min_pos = findPosOfMin(lineCharArray, revCharArray, 0, k, flag); + //int initialMinPos = min_pos; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + + if (i > (flag[0] == 0 ? min_pos : len - min_pos - pivotLength)) { + currentArray = flag[0] == 0 ? lineCharArray : revCharArray; + int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); + + min_pos = findPosOfMin(lineCharArray, revCharArray, i, i + k, flag); + //initialMinPos = min_pos; + + if (temp != (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLength) : calPosNew(revCharArray, min_pos, min_pos + pivotLength))) { + prepos = temp; + pmerFrequency[prepos]++; + } + + } else { + + if (strcmp(lineCharArray, revCharArray, k + i - pivotLength, len - i - k, pivotLength) < 0) { + if (strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLength, min_pos, pivotLength) < 0) { + + currentArray = flag[0] == 0 ? lineCharArray : revCharArray; + int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); + + min_pos = k + i - pivotLength; + if (temp != calPosNew(lineCharArray, min_pos, min_pos + pivotLength)) { + prepos = temp; + pmerFrequency[prepos]++; + } + + flag[0] = 0; + + } + } else { + if (strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLength) < 0) { + + currentArray = flag[0] == 0 ? lineCharArray : revCharArray; + int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); + + min_pos = -k - i + len; + + if (temp != calPosNew(revCharArray, min_pos, min_pos + pivotLength)) { + prepos = temp; + pmerFrequency[prepos]++; + } + flag[0] = 1; + } + } + } + } + currentArray = flag[0] == 0 ? lineCharArray : revCharArray; + prepos = calPosNew(currentArray, min_pos, min_pos + pivotLength); + pmerFrequency[prepos]++; + } + + if (Arrays.stream(pmerFrequency).max().getAsLong() > 25-Math.min(roundNumber/10, 15)) { + System.out.println("round number = "+roundNumber); + roundNumber++; + if (roundNumber == rounds) + keepSample = false; + else + numSampled = 0; + adaptOrdering(pmerFrequency); + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + } + + + } + bfrG.close(); + frG.close(); + } + + private void adaptOrdering(long[] pmerFrequency) { + for (int i = 0; i < elementsToPush; i++) { + long biggest = Arrays.stream(pmerFrequency).max().getAsLong(); + for (int j = 0; j < pmerFrequency.length; j++) { + if (pmerFrequency[j] == biggest) { + long newRank = currentOrdering[j] + pushBy; + currentOrdering[j] = newRank; + currentOrdering[getReversed(j)] = newRank; + pmerFrequency[j] = 0; + pmerFrequency[getReversed(j)] = 0; + } + } + } + } + + private int calPosNew(char[] a, int from, int to) { + return stringUtils.getDecimal(a, from, to); + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throws IOException { + int len = a.length; + int pos1 = findSmallest(a, from, to); + int pos2 = findSmallest(b, len - to, len - from); + + if (strcmp(a, b, pos1, pos2, pivotLength) < 0) { + flag[0] = 0; + return pos1; + } else { + flag[0] = 1; + return pos2; + } + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } +} diff --git a/src/buildgraph/Ordering/IterativeOrdering3.java b/src/buildgraph/Ordering/IterativeOrdering3.java new file mode 100644 index 0000000..44c2634 --- /dev/null +++ b/src/buildgraph/Ordering/IterativeOrdering3.java @@ -0,0 +1,155 @@ +package buildgraph.Ordering; + +import buildgraph.StringUtils; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.Arrays; + +public class IterativeOrdering3 implements IOrdering { + private String inputFile; + private int readLen; + private int bufSize; + private int pivotLength; + private int k; + private long[] currentOrdering; + private StringUtils stringUtils; + + private int roundSamples = 100000; + private int rounds = 10000; + private int elementsToPush = 1; + + public IterativeOrdering3(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + stringUtils = new StringUtils(); + } + + public IterativeOrdering3(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + } + + + public void initFrequency() throws IOException { + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + + int min_pos = -1; + int minValue; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos+pivotLength); + pmerFrequency[minValue]++; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos+pivotLength); + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(lineCharArray, lineCharArray, lastIndexInWindow, min_pos, pivotLength) < 0) { + min_pos = lastIndexInWindow; + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos+pivotLength); + } + } + pmerFrequency[minValue]++; + } + } + pmerFrequency[min_pos]++; + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber == rounds) + keepSample = false; + else + numSampled = 0; + adaptOrdering(pmerFrequency); + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + } + } + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(long[] pmerFrequency) { + for (int i = 0; i < elementsToPush; i++) { + long biggest = Arrays.stream(pmerFrequency).max().getAsLong(); + for (int j = 0; j < pmerFrequency.length; j++) { + if (pmerFrequency[j] == biggest) { + long newRank = currentOrdering[j] + (int) Math.pow(4, pivotLength)/100; + currentOrdering[j] = newRank; + currentOrdering[getReversed(j)] = newRank; + pmerFrequency[j] = 0; + pmerFrequency[getReversed(j)] = 0; + } + } + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } +} diff --git a/src/buildgraph/Ordering/SignatureUtils.java b/src/buildgraph/Ordering/SignatureUtils.java index 8212663..fa11bb7 100644 --- a/src/buildgraph/Ordering/SignatureUtils.java +++ b/src/buildgraph/Ordering/SignatureUtils.java @@ -12,6 +12,59 @@ public SignatureUtils(int len){ isPmerAllowed = new byte[(int)Math.pow(4, len)]; } + public boolean isAllowed(int mmer) + { + int isAllowed = isPmerAllowed[mmer]; + if(isAllowed != 0){ + return isAllowed == 1; + } + + if ((mmer & 0x3f) == 0x3f) // TTT suffix + { + isPmerAllowed[mmer] = -1; + return false; + } + + if ((mmer & 0x3f) == 0x3b) // TGT suffix + { + isPmerAllowed[mmer] = -1; + return false; + } + if ((mmer & 0x3c) == 0x3c) // TG* suffix !!!! consider issue #152 + { + isPmerAllowed[mmer] = -1; + return false; + } + + for (int j = 0; j < len - 3; ++j) + if ((mmer & 0xf) == 0) // AA inside + { + isPmerAllowed[mmer] = -1; + return false; + } + else + mmer >>= 2; + + if (mmer == 0) // AAA prefix + { + isPmerAllowed[mmer] = -1; + return false; + } + if (mmer == 0x04) // ACA prefix + { + isPmerAllowed[mmer] = -1; + return false; + } + if ((mmer & 0xf) == 0) // *AA prefix + { + isPmerAllowed[mmer] = -1; + return false; + } + + isPmerAllowed[mmer] = 1; + return true; + } + public boolean isAllowed(char[] a, int from, int aDecimal) { int isAllowed = isPmerAllowed[aDecimal]; if(isAllowed != 0){ diff --git a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java index a0a0231..3b8b214 100644 --- a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java +++ b/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java @@ -9,8 +9,8 @@ public class UHSFrequencySignatureOrdering extends UHSSignatureOrdering { private long[] pmerFrequency; private boolean isInit; - public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature, boolean useCache) throws IOException { - super(0, pivotLen, useSignature, useCache); + public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature) throws IOException { + super(0, pivotLen, useSignature); this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; @@ -27,7 +27,7 @@ public void initRank() throws IOException { protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) throws IOException { int baseCompareValue = strcmpBase(x, y); - if (baseCompareValue != BOTH_IN_UHS) { + if (baseCompareValue != BOTH_IN_UHS && baseCompareValue != BOTH_NOT_IN_UHS) { return baseCompareValue; } diff --git a/src/buildgraph/Ordering/UHS/UHSOrderingBase.java b/src/buildgraph/Ordering/UHS/UHSOrderingBase.java index 3820b30..3307108 100644 --- a/src/buildgraph/Ordering/UHS/UHSOrderingBase.java +++ b/src/buildgraph/Ordering/UHS/UHSOrderingBase.java @@ -16,6 +16,8 @@ public abstract class UHSOrderingBase implements IOrdering { protected StringUtils stringUtils; protected static final int BOTH_IN_UHS = 824; + protected static final int BOTH_NOT_IN_UHS = 1001; + protected int pivotLen; protected int[] rankOfPmer; @@ -32,6 +34,7 @@ public UHSOrderingBase(int pivotLen) throws IOException { } protected abstract int calculateStrcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException; + protected abstract int calculateStrcmp(int x, int y) throws IOException; public boolean isInUHS(int pmerDecimal) { @@ -53,12 +56,14 @@ protected int strcmpBase(int x, int y) { boolean xInUHS = isInUHS(x); boolean yInUHS = isInUHS(y); - if (xInUHS && !yInUHS) { - return -1; - } else if (!xInUHS && yInUHS) { - return 1; + if(xInUHS) + { + if(!yInUHS) return -1; + return BOTH_IN_UHS; } - return BOTH_IN_UHS; + if(yInUHS) + return 1; + return BOTH_NOT_IN_UHS; } private byte[] uhsBitSet(int pivotLen) throws IOException { @@ -91,23 +96,44 @@ private byte[] uhsBitSet(int pivotLen) throws IOException { public void initRank() throws IOException { System.out.println("start init rank"); - HashSet pmers = getPmersInUHS(); - char[][] pmersArr = new char[pmers.size()][pivotLen]; + HashSet pmers = new HashSet<>(); + for(int i = 0; i <(int)Math.pow(4, pivotLen); i++) {if(isInUHS(i)) pmers.add(i);}; + + Integer[] pmersArr = new Integer[pmers.size()]; pmers.toArray(pmersArr); Arrays.sort(pmersArr, (o1, o2) -> { try { - return calculateStrcmp(o1, o2, 0, 0, pivotLen); + return calculateStrcmp(o1, o2); } catch (IOException e) { e.printStackTrace(); } return 0; }); for (int i = 0; i < pmersArr.length; i++) { - rankOfPmer[stringUtils.getDecimal(pmersArr[i], 0, pivotLen)] = i; + rankOfPmer[pmersArr[i]] = i; } System.out.println("finish init rank"); } +// public void initRank() throws IOException { +// System.out.println("start init rank"); +// HashSet pmers = getPmersInUHS(); +// char[][] pmersArr = new char[pmers.size()][pivotLen]; +// pmers.toArray(pmersArr); +// Arrays.sort(pmersArr, (o1, o2) -> { +// try { +// return calculateStrcmp(o1, o2, 0, 0, pivotLen); +// } catch (IOException e) { +// e.printStackTrace(); +// } +// return 0; +// }); +// for (int i = 0; i < pmersArr.length; i++) { +// rankOfPmer[stringUtils.getDecimal(pmersArr[i], 0, pivotLen)] = i; +// } +// System.out.println("finish init rank"); +// } + private HashSet getPmersInUHS() { HashSet pmers = new HashSet<>(); StringBuilder sb = new StringBuilder(pivotLen); diff --git a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java b/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java index 136aa5b..efde074 100644 --- a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java +++ b/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java @@ -8,20 +8,24 @@ public class UHSSignatureOrdering extends UHSOrderingBase { private SignatureUtils signatureUtils; protected boolean useSignature; - private boolean useCache; protected int xor; - public UHSSignatureOrdering(int xor, int pivotLen, boolean useSignature, boolean useCache) throws IOException { + public UHSSignatureOrdering(int xor, int pivotLen, boolean useSignature) throws IOException { super(pivotLen); this.xor = xor; this.useSignature = useSignature; - this.useCache = useCache; signatureUtils = new SignatureUtils(pivotLen); } - public UHSSignatureOrdering(int pivotLen, boolean useSignature, boolean useCache) throws IOException { - this(0, pivotLen, useSignature, useCache); + public UHSSignatureOrdering(int pivotLen, boolean useSignature) throws IOException { + this(0, pivotLen, useSignature); + } + + @Override + public void initRank() throws IOException { + super.initRank(); + isRankInit = true; } @@ -47,22 +51,23 @@ public int findSmallest(char[] a, int from, int to) throws IOException { int min_pos = from; int j = stringUtils.getDecimal(a, min_pos, min_pos + pivotLen); int prev = j; - boolean prevAllowed = signatureUtils.isAllowed(a, min_pos, prev), jAllowed = true; + //boolean prevAllowed = signatureUtils.isAllowed(a, min_pos, prev), jAllowed = true; int hexRepresentation = pivotLengthToHexRepresentation.get(pivotLen); for (int i = from + 1; i <= to - pivotLen; i++) { j = ((j * 4) ^ (StringUtils.valTable[a[i + pivotLen - 1] - 'A'])) & hexRepresentation; - if (useSignature) - jAllowed = signatureUtils.isAllowed(a, i, j); +// if (useSignature) +// jAllowed = signatureUtils.isAllowed(a, i, j); if (isInUHS(j)) { - if (strcmpSignature(prev, j, prevAllowed, jAllowed) > 0) { + if(rankOfPmer[j] < rankOfPmer[prev]){//if (strcmpSignature(prev, j, prevAllowed, jAllowed) > 0) { // TODO: SHOULD USE RANKOFPMER min_pos = i; prev = j; + //prevAllowed = jAllowed; // TODO: SHOULD BE HERE? } } - prevAllowed = jAllowed; +// prevAllowed = jAllowed; // TODO: POSSIBLE BUG } return min_pos; } @@ -82,9 +87,23 @@ protected int calculateStrcmp(char[] a, char[] b, int froma, int fromb, int len) return strcmpSignature(x, y, aAllowed, bAllowed); } + protected int calculateStrcmp(int x, int y) throws IOException { + if (x == y) return 0; + + boolean aAllowed = true, bAllowed = true; + if (useSignature) { + aAllowed = signatureUtils.isAllowed(x); + bAllowed = signatureUtils.isAllowed(y); + } + + return strcmpSignature(x, y, aAllowed, bAllowed); + } + + + protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) throws IOException { int baseCompareValue = strcmpBase(x, y); - if (baseCompareValue != BOTH_IN_UHS) { + if (baseCompareValue != BOTH_IN_UHS && baseCompareValue != BOTH_NOT_IN_UHS) { return baseCompareValue; } // from down here - both in UHS diff --git a/src/buildgraph/Partition.java b/src/buildgraph/Partition.java index 5a43374..d5b1d5b 100644 --- a/src/buildgraph/Partition.java +++ b/src/buildgraph/Partition.java @@ -55,7 +55,7 @@ private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throw } private int calPosNew(char[] a, int from, int to) { - return stringUtils.getDecimal(a, from, to) % numOfBlocks; + return Integer.hashCode(stringUtils.getDecimal(a, from, to)) % numOfBlocks; } private long DistributeNodes() throws IOException { @@ -66,6 +66,8 @@ private long DistributeNodes() throws IOException { String describeline; + int numSuperKmers = 0; + int prepos, substart = 0, subend, min_pos = -1; char[] lineCharArray = new char[readLen]; @@ -115,6 +117,7 @@ private long DistributeNodes() throws IOException { writeToFile(prepos, substart, subend, lineCharArray, outcnt); + numSuperKmers++; substart = i; outcnt = cnt; @@ -140,6 +143,8 @@ private long DistributeNodes() throws IOException { subend = i - 1 + k; writeToFile(prepos, substart, subend, lineCharArray, outcnt); + numSuperKmers++; + substart = i; outcnt = cnt; @@ -166,6 +171,8 @@ private long DistributeNodes() throws IOException { subend = i - 1 + k; writeToFile(prepos, substart, subend, lineCharArray, outcnt); + numSuperKmers++; + substart = i; outcnt = cnt; @@ -182,10 +189,13 @@ private long DistributeNodes() throws IOException { prepos = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); writeToFile(prepos, substart, subend, lineCharArray, outcnt); + numSuperKmers++; + } } System.out.println("Largest ID is " + cnt); + System.out.println("Num superkmers is = " + numSuperKmers); for (int i = 0; i < bfwG.length; i++) { if (bfwG[i] != null) { From 904ea4f3502c71ad3bcc0097ee03d0e9316c634d Mon Sep 17 00:00:00 2001 From: danflomin Date: Tue, 23 Feb 2021 15:47:59 +0200 Subject: [PATCH 02/44] add random ordering --- src/buildgraph/Ordering/RandomOrdering.java | 38 +++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/buildgraph/Ordering/RandomOrdering.java diff --git a/src/buildgraph/Ordering/RandomOrdering.java b/src/buildgraph/Ordering/RandomOrdering.java new file mode 100644 index 0000000..33ad4cb --- /dev/null +++ b/src/buildgraph/Ordering/RandomOrdering.java @@ -0,0 +1,38 @@ +package buildgraph.Ordering; + +import buildgraph.StringUtils; + +import java.io.IOException; + +public class RandomOrdering implements IOrdering { + + protected StringUtils stringUtils; + private int pivotLen; + + public RandomOrdering(int pivotLen) { + this.pivotLen = pivotLen; + stringUtils = new StringUtils(); + } + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLen; i++) { + if (strcmp(a, a, min_pos, i, pivotLen) > 0) + min_pos = i; + } + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException { + int x = stringUtils.getDecimal(a, froma, froma + pivotLen); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen); + int t = 11101101; + if ((x ^ t) < (y ^ t)) + return -1; + else if ((x ^ t) > (y ^ t)) + return 1; + return 0; + } +} \ No newline at end of file From ca32615901cf3cd2b4835372f78450cc89cb68b1 Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 24 Feb 2021 13:42:13 +0200 Subject: [PATCH 03/44] lolz --- src/buildgraph/BuildDeBruijnGraph.java | 108 +++++++-------- .../Ordering/IterativeOrdering.java | 40 +++++- .../Ordering/IterativeOrdering3.java | 19 +++ .../Ordering/UHS/UHSSignatureOrdering.java | 8 +- .../Ordering/UHS/YaelUHSOrdering.java | 128 ++++++++++++++++++ 5 files changed, 232 insertions(+), 71 deletions(-) create mode 100644 src/buildgraph/Ordering/UHS/YaelUHSOrdering.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 59b954b..c6e5612 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -71,22 +71,9 @@ else if(args[i].equals("-r")) -// UHSFrequencySignatureOrdering uhs_freq_sig = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, true); -// uhs_freq_sig.initRank(); -// HashMap orderingNames = new HashMap() {{ -// put("lexico", new LexicographicOrdering(pivot_len)); -// put("sig", new LexicographicSignatureOrdering(pivot_len)); -// put("uhs_sig", new UHSSignatureOrdering(xor, pivot_len, false, true)); -// put("uhs_freq", new UniversalFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, false)); -// put("uhs_freq_sig", uhs_freq_sig); -// }}; - - -// IOrdering ordering = orderingNames.get(orderingName); -// IOrdering ordering = new LexicographicSignatureOrdering(pivot_len); - -// orderingName = "iterativeOrdering"; - //IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k); /// this is the first version 100000, 10000, 1 + orderingName = "iterativeOrdering"; + IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k); /// this is the first version 100000, 10000, 1 + ordering.initFrequency(); // IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 30000, 1, 10); // IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, 10); // IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, (int)Math.pow(4,pivot_len)/100); @@ -97,54 +84,57 @@ else if(args[i].equals("-r")) // ordering.initFrequency(); - UHSFrequencySignatureOrdering ordering = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true); - ordering.initRank(); +// UHSFrequencySignatureOrdering ordering = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true); +// ordering.initRank(); + ordering.exportOrderingForCpp(); + ordering.exportBinningForCpp(); - Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); - Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity); - - try { - - System.out.println("Program Configuration:"); - System.out.print("Input File: " + infile + "\n" + - "Kmer Length: " + k + "\n" + - "Read Length: " + readLen + "\n" + - "# Of Blocks: " + numBlocks + "\n" + - "Pivot Length: " + pivot_len + "\n" + - "# Of Threads: " + numThreads + "\n" + - "R/W Buffer Size: " + bufferSize + "\n" + - "Ordering: " + orderingName + "\n" + - "x xor: " + xor + "\n" + - "Output Format: " + (readable == true ? "Text" : "Binary") + "\n"); - - long maxID = partition.Run(); - - AbstractMap distinctKmersPerPartition = map.Run(numThreads); - BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); - System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); - - HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); - BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); +// try { // +// System.out.println("Program Configuration:"); +// System.out.print("Input File: " + infile + "\n" + +// "Kmer Length: " + k + "\n" + +// "Read Length: " + readLen + "\n" + +// "# Of Blocks: " + numBlocks + "\n" + +// "Pivot Length: " + pivot_len + "\n" + +// "# Of Threads: " + numThreads + "\n" + +// "R/W Buffer Size: " + bufferSize + "\n" + +// "Ordering: " + orderingName + "\n" + +// "x xor: " + xor + "\n" + +// "Output Format: " + (readable == true ? "Text" : "Binary") + "\n"); // -// long time1 = 0; -// long t1 = System.currentTimeMillis(); -// System.out.println("Merge IDReplaceTables Begin!"); -// String sortcmd = "sort -t $\'\t\' -o IDReplaceTable +0 -1 -n -m Maps/maps*"; -// Runtime.getRuntime().exec(new String[]{"/bin/sh", "-c", sortcmd}, null, null).waitFor(); -// long t2 = System.currentTimeMillis(); -// time1 = (t2 - t1) / 1000; -// System.out.println("Time used for merging: " + time1 + " seconds!"); +// Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); +// Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity); // -// Replace replace = new Replace("IDReplaceTable", "OutGraph", k, bufferSize, readLen, maxID); -// replace.Run(readable); - - - } catch (Exception E) { - System.out.println("Exception caught!"); - E.printStackTrace(); - } +// +// long maxID = partition.Run(); +// +// AbstractMap distinctKmersPerPartition = map.Run(numThreads); +// BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); +// System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); +// +// HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); +// BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); +//// +//// +//// long time1 = 0; +//// long t1 = System.currentTimeMillis(); +//// System.out.println("Merge IDReplaceTables Begin!"); +//// String sortcmd = "sort -t $\'\t\' -o IDReplaceTable +0 -1 -n -m Maps/maps*"; +//// Runtime.getRuntime().exec(new String[]{"/bin/sh", "-c", sortcmd}, null, null).waitFor(); +//// long t2 = System.currentTimeMillis(); +//// time1 = (t2 - t1) / 1000; +//// System.out.println("Time used for merging: " + time1 + " seconds!"); +//// +//// Replace replace = new Replace("IDReplaceTable", "OutGraph", k, bufferSize, readLen, maxID); +//// replace.Run(readable); +// +// +// } catch (Exception E) { +// System.out.println("Exception caught!"); +// E.printStackTrace(); +// } } diff --git a/src/buildgraph/Ordering/IterativeOrdering.java b/src/buildgraph/Ordering/IterativeOrdering.java index 23244e4..1a1435e 100644 --- a/src/buildgraph/Ordering/IterativeOrdering.java +++ b/src/buildgraph/Ordering/IterativeOrdering.java @@ -11,6 +11,7 @@ import java.lang.reflect.Array; import java.util.Arrays; import java.util.HashSet; +import java.util.LinkedList; public class IterativeOrdering implements IOrdering { private String inputFile; @@ -20,6 +21,7 @@ public class IterativeOrdering implements IOrdering { private int k; private long[] currentOrdering; private StringUtils stringUtils; + private long[] frequency; private int roundSamples; private int rounds; @@ -146,12 +148,19 @@ public void initFrequency() throws IOException { if (numSampled >= roundSamples) { roundNumber++; - if (roundNumber == rounds) - keepSample = false; - else + if (roundNumber <= rounds) { numSampled = 0; - adaptOrdering(pmerFrequency); - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + adaptOrdering(pmerFrequency); + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) + { + System.out.println("Sampling for binning round"); + roundSamples = 100*rounds*roundSamples; + } + } else { + keepSample = false; + frequency = pmerFrequency; + } } @@ -225,4 +234,25 @@ public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { if (currentOrdering[x] < currentOrdering[y]) return -1; return 1; } + + public void exportOrderingForCpp() { + System.out.print("{"); + for (int i = 0; i < currentOrdering.length; i++) { + System.out.print(currentOrdering[i] + ","); + } + System.out.print("}"); + System.out.println(); + } + + public void exportBinningForCpp() { + System.out.print("{"); + for (int i = 0; i < frequency.length; i++) { + System.out.print(frequency[i] + ","); + } + System.out.print("}"); + System.out.println(); + } + + } + diff --git a/src/buildgraph/Ordering/IterativeOrdering3.java b/src/buildgraph/Ordering/IterativeOrdering3.java index 44c2634..e26e61b 100644 --- a/src/buildgraph/Ordering/IterativeOrdering3.java +++ b/src/buildgraph/Ordering/IterativeOrdering3.java @@ -15,6 +15,7 @@ public class IterativeOrdering3 implements IOrdering { private int k; private long[] currentOrdering; private StringUtils stringUtils; + private long[] frequency; private int roundSamples = 100000; private int rounds = 10000; @@ -152,4 +153,22 @@ public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { if (currentOrdering[x] < currentOrdering[y]) return -1; return 1; } + + public void exportOrderingForCpp() { + System.out.print("{"); + for (int i = 0; i < currentOrdering.length; i++) { + System.out.print(currentOrdering[i] + ","); + } + System.out.print("}"); + System.out.println(); + } + + public void exportBinningForCpp() { + System.out.print("{"); + for (int i = 0; i < frequency.length; i++) { + System.out.print(frequency[i] + ","); + } + System.out.print("}"); + System.out.println(); + } } diff --git a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java b/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java index efde074..c442853 100644 --- a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java +++ b/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java @@ -51,23 +51,17 @@ public int findSmallest(char[] a, int from, int to) throws IOException { int min_pos = from; int j = stringUtils.getDecimal(a, min_pos, min_pos + pivotLen); int prev = j; - //boolean prevAllowed = signatureUtils.isAllowed(a, min_pos, prev), jAllowed = true; int hexRepresentation = pivotLengthToHexRepresentation.get(pivotLen); for (int i = from + 1; i <= to - pivotLen; i++) { j = ((j * 4) ^ (StringUtils.valTable[a[i + pivotLen - 1] - 'A'])) & hexRepresentation; -// if (useSignature) -// jAllowed = signatureUtils.isAllowed(a, i, j); - if (isInUHS(j)) { - if(rankOfPmer[j] < rankOfPmer[prev]){//if (strcmpSignature(prev, j, prevAllowed, jAllowed) > 0) { // TODO: SHOULD USE RANKOFPMER + if(rankOfPmer[j] < rankOfPmer[prev]){ min_pos = i; prev = j; - //prevAllowed = jAllowed; // TODO: SHOULD BE HERE? } } -// prevAllowed = jAllowed; // TODO: POSSIBLE BUG } return min_pos; } diff --git a/src/buildgraph/Ordering/UHS/YaelUHSOrdering.java b/src/buildgraph/Ordering/UHS/YaelUHSOrdering.java new file mode 100644 index 0000000..be3815f --- /dev/null +++ b/src/buildgraph/Ordering/UHS/YaelUHSOrdering.java @@ -0,0 +1,128 @@ +package buildgraph.Ordering.UHS; + +import buildgraph.Ordering.IOrdering; +import buildgraph.StringUtils; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; + +public class YaelUHSOrdering implements IOrdering { + + private final StringUtils stringUtils; + private int pivotLength; + private byte[] uhs_bits; + private int xor; + private int mask; + + public YaelUHSOrdering(int pivotLength, int xor) throws IOException { + this.xor = xor; + this.stringUtils = new StringUtils(); + this.pivotLength = pivotLength; + uhs_bits = uhsBitSet(pivotLength); + mask = pivotLengthToHexRepresentation.get(pivotLength); + } + + @Override + public int findSmallest(char[] a, int from, int to){ + int min_pos = from; + int j = stringUtils.getDecimal(a, min_pos, min_pos+pivotLength); + int prev = j; + for(int i=from+1; i<=to-pivotLength; i++){ + j = ((j * 4) ^ (StringUtils.valTable[a[i+pivotLength-1]])) & mask; + if(((this.uhs_bits[j >> 3] >> (j & 0b111)) & 1) == 1) { + if(strcmp(prev, j)>0) { + min_pos = i; + prev = j; + } + } + } + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len){ + + int x = stringUtils.getDecimal(a, froma, froma+pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb+pivotLength); + int xdiv8 = x >> 3; int xmod8 = x & 0b111; + int ydiv8 = y >> 3; int ymod8 = y & 0b111; + if ((((this.uhs_bits[xdiv8] >> (xmod8)) & 1) ^ ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) == 0) { + if((x ^ xor) < (y ^ xor)) + return -1; + else //if((x ^ 11101101) > (y ^ 11101101)) + return 1; + } + + if (((this.uhs_bits[xdiv8] >> (xmod8)) & 1) > ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) + return -1; + if (((this.uhs_bits[xdiv8] >> (xmod8)) & 1) < ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) + return 1; + + return 0; + } + + private int strcmp(int x, int y){ + int xdiv8 = x >> 3; int xmod8 = x & 0b111; + int ydiv8 = y >> 3; int ymod8 = y & 0b111; + if ((((this.uhs_bits[xdiv8] >> (xmod8)) & 1) ^ ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) == 0) { + if((x ^ xor) < (y ^ xor)) + return -1; + else if((x ^ xor) > (y ^ xor)) + return 1; + } + + if (((this.uhs_bits[xdiv8] >> (xmod8)) & 1) > ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) + return -1; + if (((this.uhs_bits[xdiv8] >> (xmod8)) & 1) < ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) + return 1; + + return 0; + } + + + private byte[] uhsBitSet(int pivotLen) throws IOException { + int n = (int) Math.pow(4, pivotLen) / 8; + int i = 0; + byte[] bits = new byte[n]; + + String DocksFile = "res_" + pivotLen + ".txt"; + FileReader frG = new FileReader(DocksFile); + int count = 0; + + BufferedReader reader; + try { + reader = new BufferedReader(frG); + String line; + while ((line = reader.readLine()) != null) { + i = stringUtils.getDecimal(line.toCharArray(), 0, pivotLen); + bits[i / 8] |= 1 << (i % 8); + count++; + } + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + System.out.println(count); + frG.close(); + + return bits; + } + + protected static HashMap pivotLengthToHexRepresentation = new HashMap() { + { + put(5, 0x3ff); + put(6, 0xfff); + put(7, 0x3fff); + put(8, 0xffff); + put(10, 0xfffff); + put(11, 0x3fffff); + put(12, 0xffffff); + put(13, 0x3ffffff); + put(14, 0xfffffff); + } + + }; + +} From 5634668545cd2cda6640de565a77a806e2fe57c0 Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 24 Feb 2021 18:58:48 +0200 Subject: [PATCH 04/44] checkpoint for runs with yael - checking out to dev-runs3 --- src/buildgraph/BuildDeBruijnGraph.java | 155 ++++++++---------- .../Ordering/UHS/YaelUHSOrdering.java | 16 +- src/buildgraph/Partition.java | 11 +- 3 files changed, 91 insertions(+), 91 deletions(-) diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index c6e5612..c81384a 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -3,6 +3,7 @@ import buildgraph.Ordering.*; import buildgraph.Ordering.UHS.UHSFrequencySignatureOrdering; import buildgraph.Ordering.UHS.UHSSignatureOrdering; +import buildgraph.Ordering.UHS.YaelUHSOrdering; import java.io.BufferedWriter; import java.io.File; @@ -26,7 +27,7 @@ public static void main(String[] args) throws IOException { // int readLen = 124; // int readLen = 101; int readLen = 100; - int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// + int numBlocks = (int) Math.pow(4, pivot_len);//256; 1000;// boolean readable = false; String orderingName = "uhs_sig_freq"; int xor = 0; //11101101; @@ -44,97 +45,80 @@ public static void main(String[] args) throws IOException { } for (int i = 0; i < args.length; i += 2) { - if(args[i].equals("-in")) - infile = args[i+1]; - else if(args[i].equals("-k")) - k = new Integer(args[i+1]); - else if(args[i].equals("-NB")) - numBlocks = new Integer(args[i+1]); -// else -// if(args[i].equals("-o")) -// orderingName = args[i+1]; - else if(args[i].equals("-p")) - pivot_len = new Integer(args[i+1]); - else if(args[i].equals("-b")) - bufferSize = new Integer(args[i+1]); - else if(args[i].equals("-L")) - readLen = new Integer(args[i+1]); - else if(args[i].equals("-t")) - numThreads = new Integer(args[i+1]); - else if(args[i].equals("-r")) - readable = new Boolean(args[i+1]); - else{ + if (args[i].equals("-in")) + infile = args[i + 1]; + else if (args[i].equals("-k")) + k = new Integer(args[i + 1]); + else if (args[i].equals("-NB")) + numBlocks = new Integer(args[i + 1]); + else if(args[i].equals("-o")) + orderingName = args[i+1]; + else if (args[i].equals("-p")) + pivot_len = new Integer(args[i + 1]); + else if (args[i].equals("-b")) + bufferSize = new Integer(args[i + 1]); + else if (args[i].equals("-L")) + readLen = new Integer(args[i + 1]); + else if (args[i].equals("-t")) + numThreads = new Integer(args[i + 1]); + else if (args[i].equals("-r")) + readable = new Boolean(args[i + 1]); + else if (args[i].equals("-x")) + xor = new Integer(args[i + 1]); + else { System.out.println("Wrong with arguments. Abort!"); return; } } + IOrdering ordering = null; + switch (orderingName) { + case "lexico": + ordering = new LexicographicOrdering(pivot_len); + break; + case "uhs": + ordering = new YaelUHSOrdering(pivot_len, xor); + break; + case "random": + ordering = new RandomOrdering(pivot_len); + break; + default: + System.out.println("ordering name not recognized - goes with lexico"); + ordering = new LexicographicOrdering(pivot_len); + } + + + try { + + System.out.println("Program Configuration:"); + System.out.print("Input File: " + infile + "\n" + + "Kmer Length: " + k + "\n" + + "Read Length: " + readLen + "\n" + + "# Of Blocks: " + numBlocks + "\n" + + "Pivot Length: " + pivot_len + "\n" + + "# Of Threads: " + numThreads + "\n" + + "R/W Buffer Size: " + bufferSize + "\n" + + "Ordering: " + orderingName + "\n" + + "x xor: " + xor + "\n" + + "Output Format: " + (readable == true ? "Text" : "Binary") + "\n"); + + Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); + Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity); - orderingName = "iterativeOrdering"; - IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k); /// this is the first version 100000, 10000, 1 - ordering.initFrequency(); -// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 30000, 1, 10); -// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, 10); -// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, (int)Math.pow(4,pivot_len)/100); - -// IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k); - -// IterativeOrdering2 ordering = new IterativeOrdering2(pivot_len, infile, readLen, bufferSize, k, 100000, 10000, 5, (int)Math.pow(4,pivot_len)/100); - -// ordering.initFrequency(); - -// UHSFrequencySignatureOrdering ordering = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true); -// ordering.initRank(); - - ordering.exportOrderingForCpp(); - ordering.exportBinningForCpp(); - -// try { -// -// System.out.println("Program Configuration:"); -// System.out.print("Input File: " + infile + "\n" + -// "Kmer Length: " + k + "\n" + -// "Read Length: " + readLen + "\n" + -// "# Of Blocks: " + numBlocks + "\n" + -// "Pivot Length: " + pivot_len + "\n" + -// "# Of Threads: " + numThreads + "\n" + -// "R/W Buffer Size: " + bufferSize + "\n" + -// "Ordering: " + orderingName + "\n" + -// "x xor: " + xor + "\n" + -// "Output Format: " + (readable == true ? "Text" : "Binary") + "\n"); -// -// Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); -// Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity); -// -// -// long maxID = partition.Run(); -// -// AbstractMap distinctKmersPerPartition = map.Run(numThreads); -// BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); -// System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); -// -// HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); -// BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); -//// -//// -//// long time1 = 0; -//// long t1 = System.currentTimeMillis(); -//// System.out.println("Merge IDReplaceTables Begin!"); -//// String sortcmd = "sort -t $\'\t\' -o IDReplaceTable +0 -1 -n -m Maps/maps*"; -//// Runtime.getRuntime().exec(new String[]{"/bin/sh", "-c", sortcmd}, null, null).waitFor(); -//// long t2 = System.currentTimeMillis(); -//// time1 = (t2 - t1) / 1000; -//// System.out.println("Time used for merging: " + time1 + " seconds!"); -//// -//// Replace replace = new Replace("IDReplaceTable", "OutGraph", k, bufferSize, readLen, maxID); -//// replace.Run(readable); -// -// -// } catch (Exception E) { -// System.out.println("Exception caught!"); -// E.printStackTrace(); -// } + long maxID = partition.Run(); + + AbstractMap distinctKmersPerPartition = map.Run(numThreads); + BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); + System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); + + HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); + BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); + + } catch (Exception E) { + System.out.println("Exception caught!"); + E.printStackTrace(); + } } @@ -185,3 +169,4 @@ public static void writeToFile(AbstractMap data, String fileName) { } } + diff --git a/src/buildgraph/Ordering/UHS/YaelUHSOrdering.java b/src/buildgraph/Ordering/UHS/YaelUHSOrdering.java index be3815f..ccca731 100644 --- a/src/buildgraph/Ordering/UHS/YaelUHSOrdering.java +++ b/src/buildgraph/Ordering/UHS/YaelUHSOrdering.java @@ -22,6 +22,7 @@ public YaelUHSOrdering(int pivotLength, int xor) throws IOException { this.pivotLength = pivotLength; uhs_bits = uhsBitSet(pivotLength); mask = pivotLengthToHexRepresentation.get(pivotLength); + System.out.println("YAEL UHS"); } @Override @@ -30,7 +31,7 @@ public int findSmallest(char[] a, int from, int to){ int j = stringUtils.getDecimal(a, min_pos, min_pos+pivotLength); int prev = j; for(int i=from+1; i<=to-pivotLength; i++){ - j = ((j * 4) ^ (StringUtils.valTable[a[i+pivotLength-1]])) & mask; + j = ((j * 4) ^ (StringUtils.valTable[a[i+pivotLength-1] - 'A'])) & mask; if(((this.uhs_bits[j >> 3] >> (j & 0b111)) & 1) == 1) { if(strcmp(prev, j)>0) { min_pos = i; @@ -125,4 +126,17 @@ private byte[] uhsBitSet(int pivotLen) throws IOException { }; + public boolean isInUHS(int pmerDecimal) { + int pmerDecimalDiv8 = pmerDecimal >> 3; + int pmerDecimalMod8 = pmerDecimal & 0b111; + if (((this.uhs_bits[pmerDecimalDiv8] >> (pmerDecimalMod8)) & 1) == 1) { + return true; + } + return false; + } + + public boolean isInUHS(char[] a, int from, int to) { + return isInUHS(stringUtils.getDecimal(a, from, to)); + } + } diff --git a/src/buildgraph/Partition.java b/src/buildgraph/Partition.java index d5b1d5b..72e1eb4 100644 --- a/src/buildgraph/Partition.java +++ b/src/buildgraph/Partition.java @@ -2,6 +2,7 @@ import buildgraph.Ordering.IOrdering; import buildgraph.Ordering.UHS.UHSOrderingBase; +import buildgraph.Ordering.UHS.YaelUHSOrdering; import java.io.*; @@ -55,7 +56,7 @@ private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throw } private int calPosNew(char[] a, int from, int to) { - return Integer.hashCode(stringUtils.getDecimal(a, from, to)) % numOfBlocks; + return stringUtils.getDecimal(a, from, to) % numOfBlocks; } private long DistributeNodes() throws IOException { @@ -128,8 +129,8 @@ private long DistributeNodes() throws IOException { if (ordering.strcmp(lineCharArray, revCharArray, k + i - pivotLen, len - i - k, pivotLen) < 0) { if (ordering.strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLen, min_pos, pivotLen) < 0) { boolean enter = true; - if (ordering instanceof UHSOrderingBase) { - if (!((UHSOrderingBase) ordering).isInUHS(lineCharArray, k + i - pivotLen, k + i)) { + if (ordering instanceof YaelUHSOrdering) { + if (!((YaelUHSOrdering) ordering).isInUHS(lineCharArray, k + i - pivotLen, k + i)) { enter = false; } } @@ -156,8 +157,8 @@ private long DistributeNodes() throws IOException { } else { if (ordering.strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLen) < 0) { boolean enter = true; - if (ordering instanceof UHSOrderingBase) { - if (!((UHSOrderingBase) ordering).isInUHS(revCharArray, len - i - k, len - i - k + pivotLen)) { + if (ordering instanceof YaelUHSOrdering) { + if (!((YaelUHSOrdering) ordering).isInUHS(revCharArray, len - i - k, len - i - k + pivotLen)) { enter = false; } } From 8f1f935c720c15e80b2635a2e69d9b317096d4aa Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 24 Feb 2021 19:01:08 +0200 Subject: [PATCH 05/44] add print for orderings --- src/buildgraph/BuildDeBruijnGraph.java | 155 ++++++++++++++----------- 1 file changed, 85 insertions(+), 70 deletions(-) diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index c81384a..c6e5612 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -3,7 +3,6 @@ import buildgraph.Ordering.*; import buildgraph.Ordering.UHS.UHSFrequencySignatureOrdering; import buildgraph.Ordering.UHS.UHSSignatureOrdering; -import buildgraph.Ordering.UHS.YaelUHSOrdering; import java.io.BufferedWriter; import java.io.File; @@ -27,7 +26,7 @@ public static void main(String[] args) throws IOException { // int readLen = 124; // int readLen = 101; int readLen = 100; - int numBlocks = (int) Math.pow(4, pivot_len);//256; 1000;// + int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// boolean readable = false; String orderingName = "uhs_sig_freq"; int xor = 0; //11101101; @@ -45,80 +44,97 @@ public static void main(String[] args) throws IOException { } for (int i = 0; i < args.length; i += 2) { - if (args[i].equals("-in")) - infile = args[i + 1]; - else if (args[i].equals("-k")) - k = new Integer(args[i + 1]); - else if (args[i].equals("-NB")) - numBlocks = new Integer(args[i + 1]); - else if(args[i].equals("-o")) - orderingName = args[i+1]; - else if (args[i].equals("-p")) - pivot_len = new Integer(args[i + 1]); - else if (args[i].equals("-b")) - bufferSize = new Integer(args[i + 1]); - else if (args[i].equals("-L")) - readLen = new Integer(args[i + 1]); - else if (args[i].equals("-t")) - numThreads = new Integer(args[i + 1]); - else if (args[i].equals("-r")) - readable = new Boolean(args[i + 1]); - else if (args[i].equals("-x")) - xor = new Integer(args[i + 1]); - else { + if(args[i].equals("-in")) + infile = args[i+1]; + else if(args[i].equals("-k")) + k = new Integer(args[i+1]); + else if(args[i].equals("-NB")) + numBlocks = new Integer(args[i+1]); +// else +// if(args[i].equals("-o")) +// orderingName = args[i+1]; + else if(args[i].equals("-p")) + pivot_len = new Integer(args[i+1]); + else if(args[i].equals("-b")) + bufferSize = new Integer(args[i+1]); + else if(args[i].equals("-L")) + readLen = new Integer(args[i+1]); + else if(args[i].equals("-t")) + numThreads = new Integer(args[i+1]); + else if(args[i].equals("-r")) + readable = new Boolean(args[i+1]); + else{ System.out.println("Wrong with arguments. Abort!"); return; } } - IOrdering ordering = null; - switch (orderingName) { - case "lexico": - ordering = new LexicographicOrdering(pivot_len); - break; - case "uhs": - ordering = new YaelUHSOrdering(pivot_len, xor); - break; - case "random": - ordering = new RandomOrdering(pivot_len); - break; - default: - System.out.println("ordering name not recognized - goes with lexico"); - ordering = new LexicographicOrdering(pivot_len); - } - - - try { - - System.out.println("Program Configuration:"); - System.out.print("Input File: " + infile + "\n" + - "Kmer Length: " + k + "\n" + - "Read Length: " + readLen + "\n" + - "# Of Blocks: " + numBlocks + "\n" + - "Pivot Length: " + pivot_len + "\n" + - "# Of Threads: " + numThreads + "\n" + - "R/W Buffer Size: " + bufferSize + "\n" + - "Ordering: " + orderingName + "\n" + - "x xor: " + xor + "\n" + - "Output Format: " + (readable == true ? "Text" : "Binary") + "\n"); - - Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); - Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity); - long maxID = partition.Run(); - - AbstractMap distinctKmersPerPartition = map.Run(numThreads); - BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); - System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); - - HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); - BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); - - } catch (Exception E) { - System.out.println("Exception caught!"); - E.printStackTrace(); - } + orderingName = "iterativeOrdering"; + IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k); /// this is the first version 100000, 10000, 1 + ordering.initFrequency(); +// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 30000, 1, 10); +// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, 10); +// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, (int)Math.pow(4,pivot_len)/100); + +// IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k); + +// IterativeOrdering2 ordering = new IterativeOrdering2(pivot_len, infile, readLen, bufferSize, k, 100000, 10000, 5, (int)Math.pow(4,pivot_len)/100); + +// ordering.initFrequency(); + +// UHSFrequencySignatureOrdering ordering = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true); +// ordering.initRank(); + + ordering.exportOrderingForCpp(); + ordering.exportBinningForCpp(); + +// try { +// +// System.out.println("Program Configuration:"); +// System.out.print("Input File: " + infile + "\n" + +// "Kmer Length: " + k + "\n" + +// "Read Length: " + readLen + "\n" + +// "# Of Blocks: " + numBlocks + "\n" + +// "Pivot Length: " + pivot_len + "\n" + +// "# Of Threads: " + numThreads + "\n" + +// "R/W Buffer Size: " + bufferSize + "\n" + +// "Ordering: " + orderingName + "\n" + +// "x xor: " + xor + "\n" + +// "Output Format: " + (readable == true ? "Text" : "Binary") + "\n"); +// +// Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); +// Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity); +// +// +// long maxID = partition.Run(); +// +// AbstractMap distinctKmersPerPartition = map.Run(numThreads); +// BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); +// System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); +// +// HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); +// BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); +//// +//// +//// long time1 = 0; +//// long t1 = System.currentTimeMillis(); +//// System.out.println("Merge IDReplaceTables Begin!"); +//// String sortcmd = "sort -t $\'\t\' -o IDReplaceTable +0 -1 -n -m Maps/maps*"; +//// Runtime.getRuntime().exec(new String[]{"/bin/sh", "-c", sortcmd}, null, null).waitFor(); +//// long t2 = System.currentTimeMillis(); +//// time1 = (t2 - t1) / 1000; +//// System.out.println("Time used for merging: " + time1 + " seconds!"); +//// +//// Replace replace = new Replace("IDReplaceTable", "OutGraph", k, bufferSize, readLen, maxID); +//// replace.Run(readable); +// +// +// } catch (Exception E) { +// System.out.println("Exception caught!"); +// E.printStackTrace(); +// } } @@ -169,4 +185,3 @@ public static void writeToFile(AbstractMap data, String fileName) { } } - From d473d6020603527cd4d6f37111b328ae83fe82f7 Mon Sep 17 00:00:00 2001 From: danflomin Date: Thu, 25 Feb 2021 10:28:10 +0200 Subject: [PATCH 06/44] checkpoint --- src/buildgraph/BuildDeBruijnGraph.java | 7 +- .../Ordering/IterativeOrdering.java | 4 +- .../Ordering/IterativeOrdering3.java | 103 +++++++++++++----- 3 files changed, 83 insertions(+), 31 deletions(-) diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index c6e5612..8cf0436 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -72,8 +72,8 @@ else if(args[i].equals("-r")) orderingName = "iterativeOrdering"; - IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k); /// this is the first version 100000, 10000, 1 - ordering.initFrequency(); +// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k); /// this is the first version 100000, 10000, 1 +// ordering.initFrequency(); // IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 30000, 1, 10); // IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, 10); // IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, (int)Math.pow(4,pivot_len)/100); @@ -87,6 +87,9 @@ else if(args[i].equals("-r")) // UHSFrequencySignatureOrdering ordering = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true); // ordering.initRank(); + IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k); + + ordering.initFrequency(); ordering.exportOrderingForCpp(); ordering.exportBinningForCpp(); diff --git a/src/buildgraph/Ordering/IterativeOrdering.java b/src/buildgraph/Ordering/IterativeOrdering.java index 1a1435e..ff8640b 100644 --- a/src/buildgraph/Ordering/IterativeOrdering.java +++ b/src/buildgraph/Ordering/IterativeOrdering.java @@ -155,13 +155,13 @@ public void initFrequency() throws IOException { if (roundNumber == rounds) { System.out.println("Sampling for binning round"); - roundSamples = 100*rounds*roundSamples; + roundSamples = Integer.MAX_VALUE;//100*rounds*roundSamples; } } else { keepSample = false; - frequency = pmerFrequency; } } + frequency = pmerFrequency; } diff --git a/src/buildgraph/Ordering/IterativeOrdering3.java b/src/buildgraph/Ordering/IterativeOrdering3.java index e26e61b..41907dd 100644 --- a/src/buildgraph/Ordering/IterativeOrdering3.java +++ b/src/buildgraph/Ordering/IterativeOrdering3.java @@ -2,9 +2,7 @@ import buildgraph.StringUtils; -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; +import java.io.*; import java.util.Arrays; public class IterativeOrdering3 implements IOrdering { @@ -57,9 +55,8 @@ public void initFrequency() throws IOException { int len = readLen; - int min_pos = -1; - int minValue; + int minValue, currentValue; while (keepSample && (describeline = bfrG.readLine()) != null) { @@ -69,37 +66,49 @@ public void initFrequency() throws IOException { if (stringUtils.isReadLegal(lineCharArray)) { min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos+pivotLength); - pmerFrequency[minValue]++; + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + ; + pmerFrequency[minValue] += k; int bound = len - k + 1; for (int i = 1; i < bound; i++) { numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & 0xffff; if (i > min_pos) { min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos+pivotLength); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + pmerFrequency[minValue] += k; } else { int lastIndexInWindow = k + i - pivotLength; - if (strcmp(lineCharArray, lineCharArray, lastIndexInWindow, min_pos, pivotLength) < 0) { + if (strcmp(currentValue, minValue) < 0) { min_pos = lastIndexInWindow; - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos+pivotLength); + minValue = currentValue; + pmerFrequency[minValue] += k; } } + pmerFrequency[minValue]++; } } - pmerFrequency[min_pos]++; if (numSampled >= roundSamples) { roundNumber++; - if (roundNumber == rounds) - keepSample = false; - else + if (roundNumber <= rounds) { numSampled = 0; - adaptOrdering(pmerFrequency); - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + adaptOrdering(pmerFrequency); + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = 100 * rounds * roundSamples; + } + } else { + keepSample = false; + } } + frequency = pmerFrequency; + } bfrG.close(); frG.close(); @@ -111,7 +120,7 @@ private void adaptOrdering(long[] pmerFrequency) { long biggest = Arrays.stream(pmerFrequency).max().getAsLong(); for (int j = 0; j < pmerFrequency.length; j++) { if (pmerFrequency[j] == biggest) { - long newRank = currentOrdering[j] + (int) Math.pow(4, pivotLength)/100; + long newRank = currentOrdering[j] + (int) Math.pow(4, pivotLength) / 100; currentOrdering[j] = newRank; currentOrdering[getReversed(j)] = newRank; pmerFrequency[j] = 0; @@ -154,21 +163,61 @@ public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { return 1; } + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + public void exportOrderingForCpp() { - System.out.print("{"); - for (int i = 0; i < currentOrdering.length; i++) { - System.out.print(currentOrdering[i] + ","); + File file = new File("rank.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < currentOrdering.length; i++) { + bf.write(Long.toString(currentOrdering[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } } - System.out.print("}"); - System.out.println(); } public void exportBinningForCpp() { - System.out.print("{"); - for (int i = 0; i < frequency.length; i++) { - System.out.print(frequency[i] + ","); + File file = new File("freq.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < frequency.length; i++) { + bf.write(Long.toString(frequency[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } } - System.out.print("}"); - System.out.println(); } } From c5059a2add2520bce835d7927e0a36234fbe1b8b Mon Sep 17 00:00:00 2001 From: danflomin Date: Mon, 1 Mar 2021 10:08:23 +0200 Subject: [PATCH 07/44] nothing --- src/buildgraph/BuildDeBruijnGraph.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 8cf0436..f81384a 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -18,14 +18,14 @@ public static void main(String[] args) throws IOException { // String infile = "/home/gaga/data-scratch/yaelbenari/datas/chr14.fastq"; // String infile = "/home/gaga/data-scratch/yaelbenari/datas/smalldata.fastq"; - String infile = "/home/gaga/data-scratch/yaelbenari/datas/breastCancer.fastq"; -// String infile = "/home/gaga/data-scratch/yaelbenari/datas/beeData.fastq"; +// String infile = "/home/gaga/data-scratch/yaelbenari/datas/breastCancer.fastq"; + String infile = "/home/gaga/data-scratch/yaelbenari/datas/beeData.fastq"; // String infile = "/home/gaga/data-scratch/yaelbenari/datas/workspace/72.fastq"; int k = 60, pivot_len = 8, bufferSize = 8192, numThreads = 20, hsmapCapacity = 10000000; -// int readLen = 124; + int readLen = 124; // int readLen = 101; - int readLen = 100; +// int readLen = 100; int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// boolean readable = false; String orderingName = "uhs_sig_freq"; From 35d6ac155aa556666738f2decc14e077de3913cc Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 3 Mar 2021 11:10:41 +0200 Subject: [PATCH 08/44] nothing --- src/buildgraph/BuildDeBruijnGraph.java | 147 +++++++----------- .../Ordering/IterativeOrdering3.java | 18 ++- 2 files changed, 70 insertions(+), 95 deletions(-) diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index f81384a..475266e 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -26,10 +26,11 @@ public static void main(String[] args) throws IOException { int readLen = 124; // int readLen = 101; // int readLen = 100; - int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// - boolean readable = false; +// int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// +// boolean readable = false; String orderingName = "uhs_sig_freq"; int xor = 0; //11101101; + int numRounds=0, elementsToPush=0, samplesPerRound=0, statSamples=0; if (args.length > 0 && args[0].equals("-help")) { System.out.print("Usage: java -jar BuildDeBruijnGraph.jar -in InputPath -k k -L readLength[options]\n" + @@ -48,8 +49,8 @@ public static void main(String[] args) throws IOException { infile = args[i+1]; else if(args[i].equals("-k")) k = new Integer(args[i+1]); - else if(args[i].equals("-NB")) - numBlocks = new Integer(args[i+1]); +// else if(args[i].equals("-NB")) +// numBlocks = new Integer(args[i+1]); // else // if(args[i].equals("-o")) // orderingName = args[i+1]; @@ -61,9 +62,17 @@ else if(args[i].equals("-L")) readLen = new Integer(args[i+1]); else if(args[i].equals("-t")) numThreads = new Integer(args[i+1]); - else if(args[i].equals("-r")) - readable = new Boolean(args[i+1]); - else{ +// else if(args[i].equals("-r")) +// readable = new Boolean(args[i+1]); + else if(args[i].equals("-rounds")) + numRounds = new Integer(args[i+1]); + else if(args[i].equals("-samples")) + samplesPerRound = new Integer(args[i+1]); + else if(args[i].equals("-elementsToPush")) + elementsToPush = new Integer(args[i+1]); + else if(args[i].equals("-statSamples")) + statSamples = new Integer(args[i+1]); + else{ System.out.println("Wrong with arguments. Abort!"); return; } @@ -87,104 +96,58 @@ else if(args[i].equals("-r")) // UHSFrequencySignatureOrdering ordering = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true); // ordering.initRank(); - IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k); + IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples); ordering.initFrequency(); ordering.exportOrderingForCpp(); ordering.exportBinningForCpp(); -// try { + } + +// public static HashMap getBytesPerFile() { +// File folder = new File("./Nodes"); +// File[] listOfFiles = folder.listFiles(); // -// System.out.println("Program Configuration:"); -// System.out.print("Input File: " + infile + "\n" + -// "Kmer Length: " + k + "\n" + -// "Read Length: " + readLen + "\n" + -// "# Of Blocks: " + numBlocks + "\n" + -// "Pivot Length: " + pivot_len + "\n" + -// "# Of Threads: " + numThreads + "\n" + -// "R/W Buffer Size: " + bufferSize + "\n" + -// "Ordering: " + orderingName + "\n" + -// "x xor: " + xor + "\n" + -// "Output Format: " + (readable == true ? "Text" : "Binary") + "\n"); +// HashMap bytesPerFile = new HashMap<>(); // -// Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); -// Map map = new Map(k, numBlocks, bufferSize, hsmapCapacity); +// for (int i = 0; i < listOfFiles.length; i++) { +// if (listOfFiles[i].isFile()) +// bytesPerFile.put(Long.parseLong(listOfFiles[i].getName().replace("nodes", "")), listOfFiles[i].length()); +// } +// return bytesPerFile; +// } // +// public static void writeToFile(AbstractMap data, String fileName) { +// File file = new File(fileName); // -// long maxID = partition.Run(); +// BufferedWriter bf = null; +// ; // -// AbstractMap distinctKmersPerPartition = map.Run(numThreads); -// BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); -// System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); +// try { +// bf = new BufferedWriter(new FileWriter(file)); +// +// bf.write("x = {"); +// bf.newLine(); // -// HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); -// BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); -//// -//// -//// long time1 = 0; -//// long t1 = System.currentTimeMillis(); -//// System.out.println("Merge IDReplaceTables Begin!"); -//// String sortcmd = "sort -t $\'\t\' -o IDReplaceTable +0 -1 -n -m Maps/maps*"; -//// Runtime.getRuntime().exec(new String[]{"/bin/sh", "-c", sortcmd}, null, null).waitFor(); -//// long t2 = System.currentTimeMillis(); -//// time1 = (t2 - t1) / 1000; -//// System.out.println("Time used for merging: " + time1 + " seconds!"); -//// -//// Replace replace = new Replace("IDReplaceTable", "OutGraph", k, bufferSize, readLen, maxID); -//// replace.Run(readable); +// //iterate map entries +// for (java.util.Map.Entry entry : data.entrySet()) { +// bf.write(entry.getKey() + ":" + entry.getValue() + ","); +// bf.newLine(); +// } +// bf.write("}"); +// bf.flush(); // +// } catch (IOException e) { +// e.printStackTrace(); +// } finally { // -// } catch (Exception E) { -// System.out.println("Exception caught!"); -// E.printStackTrace(); +// try { +// //always close the writer +// bf.close(); +// } catch (Exception e) { +// } // } - - } - - public static HashMap getBytesPerFile() { - File folder = new File("./Nodes"); - File[] listOfFiles = folder.listFiles(); - - HashMap bytesPerFile = new HashMap<>(); - - for (int i = 0; i < listOfFiles.length; i++) { - if (listOfFiles[i].isFile()) - bytesPerFile.put(Long.parseLong(listOfFiles[i].getName().replace("nodes", "")), listOfFiles[i].length()); - } - return bytesPerFile; - } - - public static void writeToFile(AbstractMap data, String fileName) { - File file = new File(fileName); - - BufferedWriter bf = null; - ; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - bf.write("x = {"); - bf.newLine(); - - //iterate map entries - for (java.util.Map.Entry entry : data.entrySet()) { - bf.write(entry.getKey() + ":" + entry.getValue() + ","); - bf.newLine(); - } - bf.write("}"); - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - - } +// +// } } diff --git a/src/buildgraph/Ordering/IterativeOrdering3.java b/src/buildgraph/Ordering/IterativeOrdering3.java index 41907dd..eb1299d 100644 --- a/src/buildgraph/Ordering/IterativeOrdering3.java +++ b/src/buildgraph/Ordering/IterativeOrdering3.java @@ -15,9 +15,10 @@ public class IterativeOrdering3 implements IOrdering { private StringUtils stringUtils; private long[] frequency; - private int roundSamples = 100000; - private int rounds = 10000; - private int elementsToPush = 1; + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; public IterativeOrdering3(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { this.inputFile = infile; @@ -36,6 +37,17 @@ public IterativeOrdering3(int pivotLength, String infile, int readLen, int bufSi currentOrdering[i] = canonical; currentOrdering[getReversed(i)] = canonical; } + roundSamples = 100000; + rounds = 10000; + elementsToPush = 1; + } + + public IterativeOrdering3(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples) { + this(pivotLength, infile, readLen, bufSize, k); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; } From 948103f7fb408664ee23ca945105f283c18d1d37 Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 3 Mar 2021 23:26:46 +0200 Subject: [PATCH 09/44] add pruning --- src/buildgraph/Ordering/IterativeOrdering3.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/buildgraph/Ordering/IterativeOrdering3.java b/src/buildgraph/Ordering/IterativeOrdering3.java index eb1299d..a25c916 100644 --- a/src/buildgraph/Ordering/IterativeOrdering3.java +++ b/src/buildgraph/Ordering/IterativeOrdering3.java @@ -137,6 +137,7 @@ private void adaptOrdering(long[] pmerFrequency) { currentOrdering[getReversed(j)] = newRank; pmerFrequency[j] = 0; pmerFrequency[getReversed(j)] = 0; + break; } } } From f93531cbaa005608bece35768aa74def38edeaf9 Mon Sep 17 00:00:00 2001 From: danflomin Date: Mon, 8 Mar 2021 14:02:40 +0200 Subject: [PATCH 10/44] checkpoint --- src/buildgraph/BuildDeBruijnGraph.java | 66 ++-- .../Ordering/IterativeOrdering3.java | 2 +- .../Ordering/IterativeOrdering4.java | 250 ++++++++++++++ .../Ordering/IterativeOrdering5.java | 264 +++++++++++++++ .../Ordering/IterativeOrdering6.java | 268 +++++++++++++++ .../Ordering/IterativeOrdering8.java | 266 +++++++++++++++ .../Ordering/IterativeOrdering9.java | 285 ++++++++++++++++ .../Ordering/IterativeUHSOrdering8.java | 313 ++++++++++++++++++ .../Ordering/IterativeUHSOrdering9.java | 309 +++++++++++++++++ 9 files changed, 1989 insertions(+), 34 deletions(-) create mode 100644 src/buildgraph/Ordering/IterativeOrdering4.java create mode 100644 src/buildgraph/Ordering/IterativeOrdering5.java create mode 100644 src/buildgraph/Ordering/IterativeOrdering6.java create mode 100644 src/buildgraph/Ordering/IterativeOrdering8.java create mode 100644 src/buildgraph/Ordering/IterativeOrdering9.java create mode 100644 src/buildgraph/Ordering/IterativeUHSOrdering8.java create mode 100644 src/buildgraph/Ordering/IterativeUHSOrdering9.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 475266e..6c6893b 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -16,21 +16,16 @@ public class BuildDeBruijnGraph { public static void main(String[] args) throws IOException { -// String infile = "/home/gaga/data-scratch/yaelbenari/datas/chr14.fastq"; -// String infile = "/home/gaga/data-scratch/yaelbenari/datas/smalldata.fastq"; -// String infile = "/home/gaga/data-scratch/yaelbenari/datas/breastCancer.fastq"; - String infile = "/home/gaga/data-scratch/yaelbenari/datas/beeData.fastq"; -// String infile = "/home/gaga/data-scratch/yaelbenari/datas/workspace/72.fastq"; + String infile = null; int k = 60, pivot_len = 8, bufferSize = 8192, numThreads = 20, hsmapCapacity = 10000000; - int readLen = 124; -// int readLen = 101; -// int readLen = 100; + int readLen = 124; // int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// // boolean readable = false; String orderingName = "uhs_sig_freq"; int xor = 0; //11101101; - int numRounds=0, elementsToPush=0, samplesPerRound=0, statSamples=0; + int numRounds = 0, elementsToPush = 0, samplesPerRound = 0, statSamples = 0; + double punishPercentage = 1; if (args.length > 0 && args[0].equals("-help")) { System.out.print("Usage: java -jar BuildDeBruijnGraph.jar -in InputPath -k k -L readLength[options]\n" + @@ -45,41 +40,42 @@ public static void main(String[] args) throws IOException { } for (int i = 0; i < args.length; i += 2) { - if(args[i].equals("-in")) - infile = args[i+1]; - else if(args[i].equals("-k")) - k = new Integer(args[i+1]); + if (args[i].equals("-in")) + infile = args[i + 1]; + else if (args[i].equals("-k")) + k = new Integer(args[i + 1]); // else if(args[i].equals("-NB")) // numBlocks = new Integer(args[i+1]); // else // if(args[i].equals("-o")) // orderingName = args[i+1]; - else if(args[i].equals("-p")) - pivot_len = new Integer(args[i+1]); - else if(args[i].equals("-b")) - bufferSize = new Integer(args[i+1]); - else if(args[i].equals("-L")) - readLen = new Integer(args[i+1]); - else if(args[i].equals("-t")) - numThreads = new Integer(args[i+1]); + else if (args[i].equals("-p")) + pivot_len = new Integer(args[i + 1]); + else if (args[i].equals("-b")) + bufferSize = new Integer(args[i + 1]); + else if (args[i].equals("-L")) + readLen = new Integer(args[i + 1]); + else if (args[i].equals("-t")) + numThreads = new Integer(args[i + 1]); // else if(args[i].equals("-r")) // readable = new Boolean(args[i+1]); - else if(args[i].equals("-rounds")) - numRounds = new Integer(args[i+1]); - else if(args[i].equals("-samples")) - samplesPerRound = new Integer(args[i+1]); - else if(args[i].equals("-elementsToPush")) - elementsToPush = new Integer(args[i+1]); - else if(args[i].equals("-statSamples")) - statSamples = new Integer(args[i+1]); - else{ + else if (args[i].equals("-rounds")) + numRounds = new Integer(args[i + 1]); + else if (args[i].equals("-samples")) + samplesPerRound = new Integer(args[i + 1]); + else if (args[i].equals("-elementsToPush")) + elementsToPush = new Integer(args[i + 1]); + else if (args[i].equals("-statSamples")) + statSamples = new Integer(args[i + 1]); + else if (args[i].equals("-punishPercentage")) + punishPercentage = new Double(args[i + 1]); + else { System.out.println("Wrong with arguments. Abort!"); return; } } - orderingName = "iterativeOrdering"; // IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k); /// this is the first version 100000, 10000, 1 // ordering.initFrequency(); @@ -96,8 +92,12 @@ else if(args[i].equals("-statSamples")) // UHSFrequencySignatureOrdering ordering = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true); // ordering.initRank(); - IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples); - +// IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples); +// IterativeOrdering4 ordering = new IterativeOrdering4(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples, maskRatio, punishPercentage); +// IterativeOrdering6 ordering = new IterativeOrdering6(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples, maskRatio, punishPercentage); +// IterativeUHSOrdering8 ordering = new IterativeUHSOrdering8(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples, maskRatio, punishPercentage); +// IterativeOrdering8 ordering = new IterativeOrdering8(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); + IterativeOrdering9 ordering = new IterativeOrdering9(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); ordering.initFrequency(); ordering.exportOrderingForCpp(); ordering.exportBinningForCpp(); diff --git a/src/buildgraph/Ordering/IterativeOrdering3.java b/src/buildgraph/Ordering/IterativeOrdering3.java index a25c916..4361184 100644 --- a/src/buildgraph/Ordering/IterativeOrdering3.java +++ b/src/buildgraph/Ordering/IterativeOrdering3.java @@ -113,7 +113,7 @@ public void initFrequency() throws IOException { pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements if (roundNumber == rounds) { System.out.println("Sampling for binning round"); - roundSamples = 100 * rounds * roundSamples; + roundSamples = statisticsSamples; } } else { keepSample = false; diff --git a/src/buildgraph/Ordering/IterativeOrdering4.java b/src/buildgraph/Ordering/IterativeOrdering4.java new file mode 100644 index 0000000..a4366bc --- /dev/null +++ b/src/buildgraph/Ordering/IterativeOrdering4.java @@ -0,0 +1,250 @@ +package buildgraph.Ordering; + +import buildgraph.StringUtils; + +import java.io.*; +import java.util.Arrays; + +public class IterativeOrdering4 implements IOrdering { + private String inputFile; + private int readLen; + private int bufSize; + private int pivotLength; + private int k; + private long[] currentOrdering; + private StringUtils stringUtils; + private long[] frequency; + + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; + + private double maskRatio; + private double percentagePunishment; + + public IterativeOrdering4(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + stringUtils = new StringUtils(); + } + + public IterativeOrdering4(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + roundSamples = 100000; + rounds = 10000; + elementsToPush = 1; + } + + public IterativeOrdering4(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double maskRatio, double percentagePunishment) { + this(pivotLength, infile, readLen, bufSize, k); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.maskRatio = maskRatio; + this.percentagePunishment = percentagePunishment; + } + + + public void initFrequency() throws IOException { + + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + ; + pmerFrequency[minValue] += k; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & 0x3fff;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + pmerFrequency[minValue] += k; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + pmerFrequency[minValue] += k; + } + } + + pmerFrequency[minValue]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(long[] pmerFrequency) { + boolean[] mask = new boolean[pmerFrequency.length]; + for(int i = 0 ; i biggest) { + biggest = pmerFrequency[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int)((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + pmerFrequency[biggestIndex] = 0; + pmerFrequency[getReversed(biggestIndex)] = 0; + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public void exportOrderingForCpp() { + File file = new File("ranks.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < currentOrdering.length; i++) { + bf.write(Long.toString(currentOrdering[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } + + public void exportBinningForCpp() { + File file = new File("freq.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < frequency.length; i++) { + bf.write(Long.toString(frequency[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } +} diff --git a/src/buildgraph/Ordering/IterativeOrdering5.java b/src/buildgraph/Ordering/IterativeOrdering5.java new file mode 100644 index 0000000..fac839d --- /dev/null +++ b/src/buildgraph/Ordering/IterativeOrdering5.java @@ -0,0 +1,264 @@ +package buildgraph.Ordering; + +import buildgraph.StringUtils; + +import java.io.*; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; + +public class IterativeOrdering5 implements IOrdering { + private String inputFile; + private int readLen; + private int bufSize; + private int pivotLength; + private int k; + private long[] currentOrdering; + private StringUtils stringUtils; + private long[] frequency; + + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; + + private double maskRatio; + private double percentagePunishment; + + public IterativeOrdering5(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + stringUtils = new StringUtils(); + } + + public IterativeOrdering5(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + roundSamples = 100000; + rounds = 10000; + elementsToPush = 1; + } + + public IterativeOrdering5(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double maskRatio, double percentagePunishment) { + this(pivotLength, infile, readLen, bufSize, k); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.maskRatio = maskRatio; + this.percentagePunishment = percentagePunishment; + } + + + public void initFrequency() throws IOException { + + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + ; + pmerFrequency[minValue] += 1; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & 0x3fff;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + pmerFrequency[minValue] += 1; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + pmerFrequency[minValue] += 1; + } + } + + pmerFrequency[minValue]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(long[] pmerFrequency) { + boolean[] mask = new boolean[pmerFrequency.length]; + for (int i = 0; i < mask.length; i++) { + if (Math.random() < 1 - maskRatio) + mask[i] = true; + } +// TODO : if biggest is smaller than (samples / 4^(m-1))/5 + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < pmerFrequency.length; k++) { + if (mask[k] && pmerFrequency[k] > biggest) { + biggest = pmerFrequency[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + pmerFrequency[biggestIndex] = 0; + pmerFrequency[getReversed(biggestIndex)] = 0; + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + Integer[] temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for(int i = 0 ; i min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + pmerFrequency[minValue] += 1; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + pmerFrequency[minValue] += 1; + } + } + + pmerFrequency[minValue]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(long[] pmerFrequency) { + boolean[] mask = new boolean[pmerFrequency.length]; + for (int i = 0; i < mask.length; i++) { + if (Math.random() < 1 - maskRatio) + mask[i] = true; + } +// TODO : if biggest is smaller than (samples / 4^(m-1))/5 + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < pmerFrequency.length; k++) { + if (mask[k] && pmerFrequency[k] > biggest) { + biggest = pmerFrequency[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + pmerFrequency[biggestIndex] = 0; + pmerFrequency[getReversed(biggestIndex)] = 0; + } + normalize(); + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if(temp == null) + { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for(int i = 0 ; i min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + pmerFrequency[minValue] += 1; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + pmerFrequency[minValue] += 1; + } + } + + pmerFrequency[minValue]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); + if(roundNumber % 100 == 0) { + percentagePunishment *= 0.996; + normalize(); + } + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(long[] pmerFrequency) { +// TODO : if biggest is smaller than (samples / 4^(m-1))/5 + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < pmerFrequency.length; k++) { + if (pmerFrequency[k] > biggest) { + biggest = pmerFrequency[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + pmerFrequency[biggestIndex] = 0; + pmerFrequency[getReversed(biggestIndex)] = 0; + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if(temp == null) + { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for(int i = 0 ; i> frequency; + + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; + + private double percentagePunishment; + + private Integer[] temp = null; + private int mask; + private long[] statFrequency; + + public IterativeOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + stringUtils = new StringUtils(); + } + + public IterativeOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + roundSamples = 100000; + rounds = 10000; + elementsToPush = 1; + } + + public IterativeOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { + this(pivotLength, infile, readLen, bufSize, k); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.percentagePunishment = percentagePunishment; + this.mask = (int)Math.pow(4, pivotLength) - 1; + } + + + public void initFrequency() throws IOException { + + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + statFrequency = new long[(int) Math.pow(4, pivotLength)]; +// HashSet[] pmerFrequency; +// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; + HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + ; + if(!pmerFrequency.containsKey( minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(0, k)); // += 1; + if(roundNumber == rounds) statFrequency[minValue]++; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + + if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(roundNumber == rounds) statFrequency[minValue]++; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + + if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(roundNumber == rounds) statFrequency[minValue]++; + } + } + + pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(roundNumber == rounds) statFrequency[minValue]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); +// if(roundNumber % 100 == 0) { +// percentagePunishment *= 0.996; +// normalize(); +// } + pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(HashMap> pmerFrequency) { + int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; + for(Integer i : pmerFrequency.keySet()){ + frequencies[i] = pmerFrequency.get(i).size(); + } + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < frequencies.length; k++) { + if (frequencies[k] > biggest) { + biggest = frequencies[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + frequencies[biggestIndex] = 0; + frequencies[getReversed(biggestIndex)] = 0; + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if(temp == null) + { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for(int i = 0 ; i min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + pmerFrequency[minValue] += 1; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + pmerFrequency[minValue] += 1; + } + } + + pmerFrequency[minValue]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); + if(roundNumber % 100 == 0) + percentagePunishment *= 0.996; + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + for(int i = 0 ; i biggest) { + biggest = pmerFrequency[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) (sizeOfUHS * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + pmerFrequency[biggestIndex] = 0; + pmerFrequency[getReversed(biggestIndex)] = 0; + } + + //normalize(); + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if(temp == null) + { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for(int i = 0 ; i min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + pmerFrequency[minValue] += 1; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + pmerFrequency[minValue] += 1; + } + } + + pmerFrequency[minValue]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); + if(roundNumber % 100 == 0) + percentagePunishment *= 0.996; + pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + for(int i = 0 ; i biggest) { + biggest = pmerFrequency[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) (sizeOfUHS * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + pmerFrequency[biggestIndex] = 0; + pmerFrequency[getReversed(biggestIndex)] = 0; + } + + //normalize(); + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if(temp == null) + { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for(int i = 0 ; i Date: Sun, 14 Mar 2021 17:57:37 +0200 Subject: [PATCH 11/44] remove ordering10 --- src/buildgraph/BuildDeBruijnGraph.java | 17 +- .../Ordering/FrequencyOrdering.java | 256 +++++++++++++++ .../Ordering/IterativeOrdering10.java | 296 ++++++++++++++++++ 3 files changed, 566 insertions(+), 3 deletions(-) create mode 100644 src/buildgraph/Ordering/FrequencyOrdering.java create mode 100644 src/buildgraph/Ordering/IterativeOrdering10.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 6c6893b..d0834fb 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -97,11 +97,22 @@ else if (args[i].equals("-punishPercentage")) // IterativeOrdering6 ordering = new IterativeOrdering6(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples, maskRatio, punishPercentage); // IterativeUHSOrdering8 ordering = new IterativeUHSOrdering8(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples, maskRatio, punishPercentage); // IterativeOrdering8 ordering = new IterativeOrdering8(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); - IterativeOrdering9 ordering = new IterativeOrdering9(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); - ordering.initFrequency(); + +// GOOD +// IterativeOrdering9 ordering = new IterativeOrdering9(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); +// ordering.initFrequency(); +// ordering.exportOrderingForCpp(); +// ordering.exportBinningForCpp(); +// END GOOD + +// FREQUENCY SUCKS +// FrequencyOrdering ordering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds*samplesPerRound, statSamples, k); +// ordering.initFrequency(); +// END FREQUENCY + + IterativeOrdering10 ordering = new IterativeOrdering10(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); ordering.exportOrderingForCpp(); ordering.exportBinningForCpp(); - } // public static HashMap getBytesPerFile() { diff --git a/src/buildgraph/Ordering/FrequencyOrdering.java b/src/buildgraph/Ordering/FrequencyOrdering.java new file mode 100644 index 0000000..6496bca --- /dev/null +++ b/src/buildgraph/Ordering/FrequencyOrdering.java @@ -0,0 +1,256 @@ +package buildgraph.Ordering; + +import buildgraph.Ordering.UHS.UHSSignatureOrdering; +import buildgraph.StringUtils; + +import java.io.*; +import java.net.Inet4Address; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; + +public class FrequencyOrdering implements IOrdering { + private int pivotLength; + private String inputFile; + private int readLen; + private int bufSize; + private long[] pmerFrequency; + private long[] statsFrequency; + private int numSamples; + private int numStats; + private int k; + private StringUtils stringUtils; + private int mask; + + public FrequencyOrdering(int pivotLen, String infile, int readLen, int bufSize, int numSamples, int numStats, int k) { + pivotLength = pivotLen; + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + pmerFrequency = new long[(int)Math.pow(4, pivotLen)]; + this.numSamples = numSamples; + this.numStats = numStats; + this.k = k; + stringUtils = new StringUtils(); + mask = (int)Math.pow(4, pivotLen) - 1; + } + + public void initFrequency() throws IOException { + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + int counter = 0; + + String describeline; + + char[] lineCharArray = new char[readLen]; + + + while ((describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + + if (stringUtils.isReadLegal(lineCharArray)) { + char[] revCharArray = stringUtils.getReversedRead(lineCharArray); + for (int i = 0; i < lineCharArray.length-pivotLength; i++) { + + int lineValue = stringUtils.getDecimal(lineCharArray, i, i+pivotLength); + pmerFrequency[lineValue] += 1; + + int revValue = stringUtils.getDecimal(revCharArray, i, i+pivotLength); + pmerFrequency[revValue] += 1; + + counter++; + } + if(counter > numSamples){ + break; + } + } + } + + normalize(); + initStats(bfrG); + + bfrG.close(); + frG.close(); + } + + private void initStats(BufferedReader bfrG) throws IOException { + + int numSampled = 0; + boolean keepSample = true; + + statsFrequency = new long[(int) Math.pow(4, pivotLength)]; + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + + statsFrequency[minValue]++; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + + + statsFrequency[minValue]++; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + + statsFrequency[minValue]++; + } + } + statsFrequency[minValue]++; + } + if(numSampled > numStats) keepSample = false; + } + } + } + + public long[] getRawOrdering() + { + return pmerFrequency.clone(); + } + + + private void normalize() { + Integer[] temp = new Integer[pmerFrequency.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + + Arrays.sort(temp, this::strcmp); + for(int i = 0 ; i>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + return strcmp(x,y); + } + + public int strcmp(int x, int y) + { + if (x == y) return 0; + if (pmerFrequency[x] == pmerFrequency[y]) { + if(x> frequency; + + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; + + private double percentagePunishment; + + private Integer[] temp = null; + private int mask; + private long[] statFrequency; + + public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + stringUtils = new StringUtils(); + } + + public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + roundSamples = 100000; + rounds = 10000; + elementsToPush = 1; + } + + public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { + this(pivotLength, infile, readLen, bufSize, k); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.percentagePunishment = percentagePunishment; + this.mask = (int)Math.pow(4, pivotLength) - 1; + } + + + + public void initFrequency() throws IOException { + + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + statFrequency = new long[(int) Math.pow(4, pivotLength)]; + byte[] currentHits = new byte[(int) Math.pow(4, pivotLength)]; +// HashSet[] pmerFrequency; +// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; + HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + ; + addMyCount(currentHits, pmerFrequency, minValue, line, 0); + if(roundNumber == rounds) statFrequency[minValue]++; + + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + + addMyCount(currentHits, pmerFrequency, minValue, line, i); + if(roundNumber == rounds) statFrequency[minValue]++; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + + addMyCount(currentHits, pmerFrequency, minValue, line, i); + if(roundNumber == rounds) statFrequency[minValue]++; + } + } + + addMyCount(currentHits, pmerFrequency, minValue, line, i); + if(roundNumber == rounds) statFrequency[minValue]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); +// if(roundNumber % 100 == 0) { +// percentagePunishment *= 0.996; +// normalize(); +// } + pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements + for(int lol=0; lol> pmerFrequency, int minValue, String line, int i2) { + currentHits[minValue]++; + if (currentHits[minValue] > 1) { + if (!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(i2, i2 + k)); // += 1; + } + } + + + private void adaptOrdering(HashMap> pmerFrequency) { + int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; + for(Integer i : pmerFrequency.keySet()){ + frequencies[i] = pmerFrequency.get(i).size(); + } + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < frequencies.length; k++) { + if (frequencies[k] > biggest) { + biggest = frequencies[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + frequencies[biggestIndex] = 0; + frequencies[getReversed(biggestIndex)] = 0; + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if(temp == null) + { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for(int i = 0 ; i Date: Wed, 17 Mar 2021 11:46:39 +0200 Subject: [PATCH 12/44] add multiple pass over data where each pass opens only 10K files --- src/buildgraph/BuildDeBruijnGraph.java | 130 +++++--- .../Ordering/IterativeOrdering10.java | 296 ------------------ src/buildgraph/Partition.java | 24 +- 3 files changed, 94 insertions(+), 356 deletions(-) delete mode 100644 src/buildgraph/Ordering/IterativeOrdering10.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index d0834fb..535a101 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -99,66 +99,92 @@ else if (args[i].equals("-punishPercentage")) // IterativeOrdering8 ordering = new IterativeOrdering8(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); // GOOD -// IterativeOrdering9 ordering = new IterativeOrdering9(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); -// ordering.initFrequency(); -// ordering.exportOrderingForCpp(); -// ordering.exportBinningForCpp(); + IterativeOrdering9 ordering = new IterativeOrdering9(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); + ordering.initFrequency(); + ordering.exportOrderingForCpp(); + ordering.exportBinningForCpp(); // END GOOD // FREQUENCY SUCKS // FrequencyOrdering ordering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds*samplesPerRound, statSamples, k); // ordering.initFrequency(); // END FREQUENCY + try { - IterativeOrdering10 ordering = new IterativeOrdering10(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); - ordering.exportOrderingForCpp(); - ordering.exportBinningForCpp(); + System.out.println("Program Configuration:"); + System.out.print("Input File: " + infile + "\n" + + "Kmer Length: " + k + "\n" + + "Read Length: " + readLen + "\n" + + "Pivot Length: " + pivot_len + "\n" + + "# Of Threads: " + numThreads + "\n" + + "R/W Buffer Size: " + bufferSize + "\n" + + "Ordering: " + orderingName + "\n"); + + Partition partition = new Partition(k, infile, (int)Math.pow(4, pivot_len), pivot_len, bufferSize, readLen, ordering); + Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); + + + partition.Run(); + + AbstractMap distinctKmersPerPartition = map.Run(numThreads); + BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); + System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); + + HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); + BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); + + } catch (Exception E) { + System.out.println("Exception caught!"); + E.printStackTrace(); + } + + + } + + public static HashMap getBytesPerFile() { + File folder = new File("./Nodes"); + File[] listOfFiles = folder.listFiles(); + + HashMap bytesPerFile = new HashMap<>(); + + for (int i = 0; i < listOfFiles.length; i++) { + if (listOfFiles[i].isFile()) + bytesPerFile.put(Long.parseLong(listOfFiles[i].getName().replace("nodes", "")), listOfFiles[i].length()); + } + return bytesPerFile; } -// public static HashMap getBytesPerFile() { -// File folder = new File("./Nodes"); -// File[] listOfFiles = folder.listFiles(); -// -// HashMap bytesPerFile = new HashMap<>(); -// -// for (int i = 0; i < listOfFiles.length; i++) { -// if (listOfFiles[i].isFile()) -// bytesPerFile.put(Long.parseLong(listOfFiles[i].getName().replace("nodes", "")), listOfFiles[i].length()); -// } -// return bytesPerFile; -// } -// -// public static void writeToFile(AbstractMap data, String fileName) { -// File file = new File(fileName); -// -// BufferedWriter bf = null; -// ; -// -// try { -// bf = new BufferedWriter(new FileWriter(file)); -// -// bf.write("x = {"); -// bf.newLine(); -// -// //iterate map entries -// for (java.util.Map.Entry entry : data.entrySet()) { -// bf.write(entry.getKey() + ":" + entry.getValue() + ","); -// bf.newLine(); -// } -// bf.write("}"); -// bf.flush(); -// -// } catch (IOException e) { -// e.printStackTrace(); -// } finally { -// -// try { -// //always close the writer -// bf.close(); -// } catch (Exception e) { -// } -// } -// -// } + public static void writeToFile(AbstractMap data, String fileName) { + File file = new File(fileName); + + BufferedWriter bf = null; + ; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + bf.write("x = {"); + bf.newLine(); + + //iterate map entries + for (java.util.Map.Entry entry : data.entrySet()) { + bf.write(entry.getKey() + ":" + entry.getValue() + ","); + bf.newLine(); + } + bf.write("}"); + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + + } } diff --git a/src/buildgraph/Ordering/IterativeOrdering10.java b/src/buildgraph/Ordering/IterativeOrdering10.java deleted file mode 100644 index 59c5f90..0000000 --- a/src/buildgraph/Ordering/IterativeOrdering10.java +++ /dev/null @@ -1,296 +0,0 @@ -package buildgraph.Ordering; - -import buildgraph.StringUtils; - -import java.io.*; -import java.lang.reflect.Array; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; - -// HERE we store a cache for hits and count distinct elements of minimizers with high occurences -public class IterativeOrdering10 implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - private HashMap> frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - private double percentagePunishment; - - private Integer[] temp = null; - private int mask; - private long[] statFrequency; - - public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.percentagePunishment = percentagePunishment; - this.mask = (int)Math.pow(4, pivotLength) - 1; - } - - - - public void initFrequency() throws IOException { - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - statFrequency = new long[(int) Math.pow(4, pivotLength)]; - byte[] currentHits = new byte[(int) Math.pow(4, pivotLength)]; -// HashSet[] pmerFrequency; -// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; - HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - String line = new String(lineCharArray); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - addMyCount(currentHits, pmerFrequency, minValue, line, 0); - if(roundNumber == rounds) statFrequency[minValue]++; - - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - - addMyCount(currentHits, pmerFrequency, minValue, line, i); - if(roundNumber == rounds) statFrequency[minValue]++; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - - addMyCount(currentHits, pmerFrequency, minValue, line, i); - if(roundNumber == rounds) statFrequency[minValue]++; - } - } - - addMyCount(currentHits, pmerFrequency, minValue, line, i); - if(roundNumber == rounds) statFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); -// if(roundNumber % 100 == 0) { -// percentagePunishment *= 0.996; -// normalize(); -// } - pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements - for(int lol=0; lol> pmerFrequency, int minValue, String line, int i2) { - currentHits[minValue]++; - if (currentHits[minValue] > 1) { - if (!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(i2, i2 + k)); // += 1; - } - } - - - private void adaptOrdering(HashMap> pmerFrequency) { - int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; - for(Integer i : pmerFrequency.keySet()){ - frequencies[i] = pmerFrequency.get(i).size(); - } - for (int i = 0; i < elementsToPush; i++) { - long biggest = -1; - int biggestIndex = -1; - for (int k = 0; k < frequencies.length; k++) { - if (frequencies[k] > biggest) { - biggest = frequencies[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - frequencies[biggestIndex] = 0; - frequencies[getReversed(biggestIndex)] = 0; - } - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - if(temp == null) - { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for(int i = 0 ; i Date: Mon, 22 Mar 2021 16:58:25 +0200 Subject: [PATCH 13/44] add stats and normalization to universal-frequency --- .../UHS/UHSFrequencySignatureOrdering.java | 130 +++++++++++++++++- .../Ordering/UHS/UHSSignatureOrdering.java | 12 ++ src/buildgraph/Partition.java | 1 - 3 files changed, 141 insertions(+), 2 deletions(-) diff --git a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java index 3b8b214..4838a41 100644 --- a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java +++ b/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java @@ -1,21 +1,33 @@ package buildgraph.Ordering.UHS; +import buildgraph.StringUtils; + import java.io.*; +import java.util.Arrays; +import java.util.Comparator; public class UHSFrequencySignatureOrdering extends UHSSignatureOrdering { private String inputFile; private int readLen; private int bufSize; private long[] pmerFrequency; + private int k; + private int numStats; private boolean isInit; - public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature) throws IOException { + private long[] statsFrequency; + private int mask; + + public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature, int k, int numStats) throws IOException { super(0, pivotLen, useSignature); this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; pmerFrequency = new long[(int)Math.pow(4, pivotLen)]; + this.k = k; + this.numStats = numStats; isInit = false; + mask = (int)Math.pow(4, pivotLen) - 1; } @Override @@ -88,10 +100,126 @@ private void initFrequency() throws IOException { } } } + initStats(bfrG); bfrG.close(); frG.close(); } + private void initStats(BufferedReader bfrG) throws IOException { + + int numSampled = 0; + boolean keepSample = true; + + statsFrequency = new long[(int) Math.pow(4, pivotLen)]; + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLen); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLen, k); + + statsFrequency[minValue]++; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLen); + + + statsFrequency[minValue]++; + } else { + int lastIndexInWindow = k + i - pivotLen; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + + statsFrequency[minValue]++; + } + } + statsFrequency[minValue]++; + } + if(numSampled > numStats) keepSample = false; + } + } + } + +// public int[] getNormalizedForm() +// { +// int[] ret = rankOfPmer.clone(); +// return ret; +// } + + public void exportOrderingForCpp() { + File file = new File("ranks.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < rankOfPmer.length; i++) { + bf.write(Long.toString(rankOfPmer[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } + + public void exportBinningForCpp() { + File file = new File("freq.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < statsFrequency.length; i++) { + bf.write(Long.toString(statsFrequency[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } + diff --git a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java b/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java index c442853..6e78e38 100644 --- a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java +++ b/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java @@ -46,6 +46,18 @@ public int strcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOEx return 1; } + public int strcmp(int x, int y) throws IOException { + if(!isRankInit) throw new IOException("rank not initialized yet"); + + if (x == y) return 0; + + // isRankInit = true here + if (rankOfPmer[x] < rankOfPmer[y]) { + return -1; + } + return 1; + } + @Override public int findSmallest(char[] a, int from, int to) throws IOException { int min_pos = from; diff --git a/src/buildgraph/Partition.java b/src/buildgraph/Partition.java index 5fe3a71..3c45452 100644 --- a/src/buildgraph/Partition.java +++ b/src/buildgraph/Partition.java @@ -223,7 +223,6 @@ private void tryCreateWriterForPmer(int prepos) throws IOException { fwG[i] = null; } } - Runtime.getRuntime().gc(); numOpenFiles = 0; } From a7e8afdcab1a7fd6fd5a52f8e5022639758476c4 Mon Sep 17 00:00:00 2001 From: danflomin Date: Mon, 22 Mar 2021 20:33:28 +0200 Subject: [PATCH 14/44] add hyperloglog to ordering9 as ordering10. included in stats sampling phase --- src/buildgraph/BuildDeBruijnGraph.java | 87 +++-- .../Ordering/IterativeOrdering10.java | 333 ++++++++++++++++++ .../Ordering/IterativeSignatureOrdering9.java | 296 ++++++++++++++++ src/buildgraph/StringUtils.java | 12 + 4 files changed, 698 insertions(+), 30 deletions(-) create mode 100644 src/buildgraph/Ordering/IterativeOrdering10.java create mode 100644 src/buildgraph/Ordering/IterativeSignatureOrdering9.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 535a101..6ef82f3 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -99,44 +99,71 @@ else if (args[i].equals("-punishPercentage")) // IterativeOrdering8 ordering = new IterativeOrdering8(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); // GOOD - IterativeOrdering9 ordering = new IterativeOrdering9(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); - ordering.initFrequency(); - ordering.exportOrderingForCpp(); - ordering.exportBinningForCpp(); + + String version = ""; + switch(version) + { + case "9": // good version + IterativeOrdering9 ordering = new IterativeOrdering9(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); + ordering.initFrequency(); + ordering.exportOrderingForCpp(); + ordering.exportBinningForCpp(); + break; + case "10": + IterativeOrdering10 ordering10 = new IterativeOrdering10(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); + ordering10.initFrequency(); + ordering10.exportOrderingForCpp(); + ordering10.exportBinningForCpp(); + break; + case "universal-frequency-signature": + UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile,readLen, bufferSize, true, k, statSamples);; + universalFrequencySignature.initRank(); + universalFrequencySignature.exportOrderingForCpp(); + universalFrequencySignature.exportBinningForCpp(); + break; + case "universal-frequency": + UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile,readLen, bufferSize, false, k, statSamples);; + universalFrequency.initRank(); + universalFrequency.exportOrderingForCpp(); + universalFrequency.exportBinningForCpp(); + break; + } // END GOOD // FREQUENCY SUCKS // FrequencyOrdering ordering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds*samplesPerRound, statSamples, k); // ordering.initFrequency(); // END FREQUENCY - try { - - System.out.println("Program Configuration:"); - System.out.print("Input File: " + infile + "\n" + - "Kmer Length: " + k + "\n" + - "Read Length: " + readLen + "\n" + - "Pivot Length: " + pivot_len + "\n" + - "# Of Threads: " + numThreads + "\n" + - "R/W Buffer Size: " + bufferSize + "\n" + - "Ordering: " + orderingName + "\n"); - - Partition partition = new Partition(k, infile, (int)Math.pow(4, pivot_len), pivot_len, bufferSize, readLen, ordering); - Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); - partition.Run(); - - AbstractMap distinctKmersPerPartition = map.Run(numThreads); - BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); - System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); - - HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); - BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); - - } catch (Exception E) { - System.out.println("Exception caught!"); - E.printStackTrace(); - } +// try { +// +// System.out.println("Program Configuration:"); +// System.out.print("Input File: " + infile + "\n" + +// "Kmer Length: " + k + "\n" + +// "Read Length: " + readLen + "\n" + +// "Pivot Length: " + pivot_len + "\n" + +// "# Of Threads: " + numThreads + "\n" + +// "R/W Buffer Size: " + bufferSize + "\n" + +// "Ordering: " + orderingName + "\n"); +// +// Partition partition = new Partition(k, infile, (int)Math.pow(4, pivot_len), pivot_len, bufferSize, readLen, ordering); +// Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); +// +// +// partition.Run(); +// +// AbstractMap distinctKmersPerPartition = map.Run(numThreads); +// BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); +// System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); +// +// HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); +// BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); +// +// } catch (Exception E) { +// System.out.println("Exception caught!"); +// E.printStackTrace(); +// } } diff --git a/src/buildgraph/Ordering/IterativeOrdering10.java b/src/buildgraph/Ordering/IterativeOrdering10.java new file mode 100644 index 0000000..3904114 --- /dev/null +++ b/src/buildgraph/Ordering/IterativeOrdering10.java @@ -0,0 +1,333 @@ +package buildgraph.Ordering; + +import buildgraph.StringUtils; + +import java.io.*; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import net.agkn.hll.*; + +public class IterativeOrdering10 implements IOrdering { + private String inputFile; + private int readLen; + private int bufSize; + private int pivotLength; + private int k; + private long[] currentOrdering; + private StringUtils stringUtils; + private HashMap> frequency; + + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; + + private double percentagePunishment; + + private Integer[] temp = null; + private int mask; + private HashMap statFrequency; + +// private class Counter{ +// private HashSet set; +// private HLL hll; +// private boolean isHll; +// +// public Counter () +// { +// hll = null; +// set = new HashSet<>(); +// isHll = false; +// } +// +// public void add(char[] a, int start, int end, String line) +// { +// if(!isHll) +// { +// set.add(line.substring(start, end)); +// } +// } +// +// } + + public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + stringUtils = new StringUtils(); + } + + public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + roundSamples = 100000; + rounds = 10000; + elementsToPush = 1; + } + + public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { + this(pivotLength, infile, readLen, bufSize, k); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.percentagePunishment = percentagePunishment; + this.mask = (int)Math.pow(4, pivotLength) - 1; + } + + + public void initFrequency() throws IOException { + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + statFrequency = new HashMap<>(); +// HashSet[] pmerFrequency; +// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; + HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + ; + if(!pmerFrequency.containsKey( minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(0, k)); // += 1; + if(roundNumber == rounds) + { + if(!statFrequency.containsKey(minValue)) + statFrequency.put(minValue, new HLL(11, 5)); + statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0,k)); + } + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + + if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(roundNumber == rounds) + { + if(!statFrequency.containsKey(minValue)) + statFrequency.put(minValue, new HLL(11, 5)); + statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0,k)); + } + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + + if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(roundNumber == rounds) + { + if(!statFrequency.containsKey(minValue)) + statFrequency.put(minValue, new HLL(11, 5)); + statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0,k)); + } + } + } + + pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(roundNumber == rounds) + { + if(!statFrequency.containsKey(minValue)) + statFrequency.put(minValue, new HLL(11, 5)); + statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0,k)); + } + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); +// if(roundNumber % 100 == 0) { +// percentagePunishment *= 0.996; +// normalize(); +// } + pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(HashMap> pmerFrequency) { + int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; + for(Integer i : pmerFrequency.keySet()){ + frequencies[i] = pmerFrequency.get(i).size(); + } + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < frequencies.length; k++) { + if (frequencies[k] > biggest) { + biggest = frequencies[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + frequencies[biggestIndex] = 0; + frequencies[getReversed(biggestIndex)] = 0; + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if(temp == null) + { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for(int i = 0 ; i> frequency; + + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; + + private double percentagePunishment; + + private Integer[] temp = null; + private int mask; + private long[] statFrequency; + + private SignatureUtils signatureUtils; + + public IterativeSignatureOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + stringUtils = new StringUtils(); + } + + public IterativeSignatureOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + roundSamples = 100000; + rounds = 10000; + elementsToPush = 1; + } + + public IterativeSignatureOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { + this(pivotLength, infile, readLen, bufSize, k); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.percentagePunishment = percentagePunishment; + this.mask = (int)Math.pow(4, pivotLength) - 1; + signatureUtils = new SignatureUtils(pivotLength); + } + + + public void initFrequency() throws IOException { + + for(int i = 0; i<=mask; i++) /// init as signature + { + if(!signatureUtils.isAllowed(i)) + { + currentOrdering[i] += mask + 1; + } + } + + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + statFrequency = new long[(int) Math.pow(4, pivotLength)]; +// HashSet[] pmerFrequency; +// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; + HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + ; + if(!pmerFrequency.containsKey( minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(0, k)); // += 1; + if(roundNumber == rounds) statFrequency[minValue]++; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + + if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(roundNumber == rounds) statFrequency[minValue]++; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + + if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); + pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(roundNumber == rounds) statFrequency[minValue]++; + } + } + + pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(roundNumber == rounds) statFrequency[minValue]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); +// if(roundNumber % 100 == 0) { +// percentagePunishment *= 0.996; +// normalize(); +// } + pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(HashMap> pmerFrequency) { + int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; + for(Integer i : pmerFrequency.keySet()){ + frequencies[i] = pmerFrequency.get(i).size(); + } + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < frequencies.length; k++) { + if (frequencies[k] > biggest) { + biggest = frequencies[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + frequencies[biggestIndex] = 0; + frequencies[getReversed(biggestIndex)] = 0; + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if(temp == null) + { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for(int i = 0 ; i Date: Mon, 29 Mar 2021 13:23:15 +0300 Subject: [PATCH 15/44] checkpoint before refactor ordering10 which is not normalized wrt reverse complement --- src/META-INF/MANIFEST.MF | 1 + src/buildgraph/BuildDeBruijnGraph.java | 94 +++--- .../Ordering/IterativeOrdering10.java | 46 +-- ...rativeOrdering9_WithCounterNormalized.java | 302 ++++++++++++++++++ src/buildgraph/StringUtils.java | 5 + 5 files changed, 387 insertions(+), 61 deletions(-) create mode 100644 src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized.java diff --git a/src/META-INF/MANIFEST.MF b/src/META-INF/MANIFEST.MF index 85846ea..0fb9411 100644 --- a/src/META-INF/MANIFEST.MF +++ b/src/META-INF/MANIFEST.MF @@ -1,3 +1,4 @@ Manifest-Version: 1.0 Main-Class: buildgraph.BuildDeBruijnGraph + diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 6ef82f3..3a9a81b 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -26,6 +26,7 @@ public static void main(String[] args) throws IOException { int xor = 0; //11101101; int numRounds = 0, elementsToPush = 0, samplesPerRound = 0, statSamples = 0; double punishPercentage = 1; + String version = "10"; if (args.length > 0 && args[0].equals("-help")) { System.out.print("Usage: java -jar BuildDeBruijnGraph.jar -in InputPath -k k -L readLength[options]\n" + @@ -42,6 +43,8 @@ public static void main(String[] args) throws IOException { for (int i = 0; i < args.length; i += 2) { if (args[i].equals("-in")) infile = args[i + 1]; + else if (args[i].equals("-v")) + version = args[i + 1]; else if (args[i].equals("-k")) k = new Integer(args[i + 1]); // else if(args[i].equals("-NB")) @@ -71,6 +74,7 @@ else if (args[i].equals("-punishPercentage")) punishPercentage = new Double(args[i + 1]); else { System.out.println("Wrong with arguments. Abort!"); + System.out.println(args[i]); return; } } @@ -100,70 +104,80 @@ else if (args[i].equals("-punishPercentage")) // GOOD - String version = ""; + IOrdering ordering = null; + System.out.println(version); switch(version) { case "9": // good version - IterativeOrdering9 ordering = new IterativeOrdering9(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); - ordering.initFrequency(); - ordering.exportOrderingForCpp(); - ordering.exportBinningForCpp(); + IterativeOrdering9 ordering9 = new IterativeOrdering9(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); + ordering9.initFrequency(); + ordering9.exportOrderingForCpp(); + ordering9.exportBinningForCpp(); + ordering = ordering9; + break; + case "9-normalized": // good version + IterativeOrdering9_WithCounterNormalized ordering9_withCounterNormalized = new IterativeOrdering9_WithCounterNormalized(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); + ordering9_withCounterNormalized.initFrequency(); + ordering9_withCounterNormalized.exportOrderingForCpp(); + ordering9_withCounterNormalized.exportBinningForCpp(); + ordering = ordering9_withCounterNormalized; break; case "10": IterativeOrdering10 ordering10 = new IterativeOrdering10(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); ordering10.initFrequency(); ordering10.exportOrderingForCpp(); ordering10.exportBinningForCpp(); + ordering = ordering10; break; case "universal-frequency-signature": UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile,readLen, bufferSize, true, k, statSamples);; universalFrequencySignature.initRank(); universalFrequencySignature.exportOrderingForCpp(); universalFrequencySignature.exportBinningForCpp(); + ordering = universalFrequencySignature; break; case "universal-frequency": UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile,readLen, bufferSize, false, k, statSamples);; universalFrequency.initRank(); universalFrequency.exportOrderingForCpp(); universalFrequency.exportBinningForCpp(); + ordering = universalFrequency; + break; + case "frequency": // FREQUENCY SUCKS + FrequencyOrdering frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds*samplesPerRound, statSamples, k); + frequencyOrdering.initFrequency(); + ordering = frequencyOrdering; break; } -// END GOOD -// FREQUENCY SUCKS -// FrequencyOrdering ordering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds*samplesPerRound, statSamples, k); -// ordering.initFrequency(); -// END FREQUENCY - - -// try { -// -// System.out.println("Program Configuration:"); -// System.out.print("Input File: " + infile + "\n" + -// "Kmer Length: " + k + "\n" + -// "Read Length: " + readLen + "\n" + -// "Pivot Length: " + pivot_len + "\n" + -// "# Of Threads: " + numThreads + "\n" + -// "R/W Buffer Size: " + bufferSize + "\n" + -// "Ordering: " + orderingName + "\n"); -// -// Partition partition = new Partition(k, infile, (int)Math.pow(4, pivot_len), pivot_len, bufferSize, readLen, ordering); -// Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); -// -// -// partition.Run(); -// -// AbstractMap distinctKmersPerPartition = map.Run(numThreads); -// BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); -// System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); -// -// HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); -// BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); -// -// } catch (Exception E) { -// System.out.println("Exception caught!"); -// E.printStackTrace(); -// } + try { + + System.out.println("Program Configuration:"); + System.out.print("Input File: " + infile + "\n" + + "Kmer Length: " + k + "\n" + + "Read Length: " + readLen + "\n" + + "Pivot Length: " + pivot_len + "\n" + + "# Of Threads: " + numThreads + "\n" + + "R/W Buffer Size: " + bufferSize + "\n" + + "Ordering: " + orderingName + "\n"); + + Partition partition = new Partition(k, infile, (int)Math.pow(4, pivot_len), pivot_len, bufferSize, readLen, ordering); + Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); + + + partition.Run(); + + AbstractMap distinctKmersPerPartition = map.Run(numThreads); + BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); + System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); + + HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); + BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); + + } catch (Exception E) { + System.out.println("Exception caught!"); + E.printStackTrace(); + } } diff --git a/src/buildgraph/Ordering/IterativeOrdering10.java b/src/buildgraph/Ordering/IterativeOrdering10.java index 3904114..1b8d191 100644 --- a/src/buildgraph/Ordering/IterativeOrdering10.java +++ b/src/buildgraph/Ordering/IterativeOrdering10.java @@ -105,7 +105,8 @@ public void initFrequency() throws IOException { int min_pos = -1; - int minValue, currentValue; + int minValue, currentValue, minValueNormalized; + while (keepSample && (describeline = bfrG.readLine()) != null) { @@ -117,15 +118,15 @@ public void initFrequency() throws IOException { min_pos = findSmallest(lineCharArray, 0, k); minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + minValueNormalized = Math.min(minValue, getReversed(minValue)); + + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); // TODO: getDecimal with reverse compliment ; - if(!pmerFrequency.containsKey( minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(0, k)); // += 1; + if(!pmerFrequency.containsKey( minValueNormalized)) pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(line.substring(0, k)); // += 1; // TODO: Add with reversecompliment if(roundNumber == rounds) { - if(!statFrequency.containsKey(minValue)) - statFrequency.put(minValue, new HLL(11, 5)); - statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0,k)); + addToHll(lineCharArray, minValueNormalized); } int bound = len - k + 1; @@ -136,38 +137,34 @@ public void initFrequency() throws IOException { if (i > min_pos) { min_pos = findSmallest(lineCharArray, i, i + k); minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = Math.min(minValue, getReversed(minValue)); - if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(!pmerFrequency.containsKey(minValueNormalized)) pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(line.substring(i, k+i)); // += 1; if(roundNumber == rounds) { - if(!statFrequency.containsKey(minValue)) - statFrequency.put(minValue, new HLL(11, 5)); - statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0,k)); + addToHll(lineCharArray, minValueNormalized); } } else { int lastIndexInWindow = k + i - pivotLength; if (strcmp(currentValue, minValue) < 0) { min_pos = lastIndexInWindow; minValue = currentValue; + minValueNormalized = Math.min(minValue, getReversed(minValue)); - if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + if(!pmerFrequency.containsKey(minValueNormalized)) pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(line.substring(i, k+i)); // += 1; if(roundNumber == rounds) { - if(!statFrequency.containsKey(minValue)) - statFrequency.put(minValue, new HLL(11, 5)); - statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0,k)); + addToHll(lineCharArray, minValueNormalized); } } } - pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; + pmerFrequency.get(minValueNormalized).add(line.substring(i, k+i)); // += 1; if(roundNumber == rounds) { - if(!statFrequency.containsKey(minValue)) - statFrequency.put(minValue, new HLL(11, 5)); - statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0,k)); + addToHll(lineCharArray, minValueNormalized); } } } @@ -197,6 +194,13 @@ public void initFrequency() throws IOException { frG.close(); } + private void addToHll(char[] lineCharArray, int minValue) { +// TODO: Add with reversecompliment + if (!statFrequency.containsKey(minValue)) + statFrequency.put(minValue, new HLL(8, 5)); + statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0, k)); + } + private void adaptOrdering(HashMap> pmerFrequency) { int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; diff --git a/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized.java b/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized.java new file mode 100644 index 0000000..a22efd5 --- /dev/null +++ b/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized.java @@ -0,0 +1,302 @@ +package buildgraph.Ordering; + +import buildgraph.StringUtils; + +import java.io.*; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; + +public class IterativeOrdering9_WithCounterNormalized implements IOrdering { + private String inputFile; + private int readLen; + private int bufSize; + private int pivotLength; + private int k; + private long[] currentOrdering; + private StringUtils stringUtils; + private HashMap> frequency; + + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; + + private double percentagePunishment; + + private Integer[] temp = null; + private int mask; + private long[] statFrequency; + + public IterativeOrdering9_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + stringUtils = new StringUtils(); + } + + public IterativeOrdering9_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + roundSamples = 100000; + rounds = 10000; + elementsToPush = 1; + } + + public IterativeOrdering9_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { + this(pivotLength, infile, readLen, bufSize, k); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.percentagePunishment = percentagePunishment; + this.mask = (int) Math.pow(4, pivotLength) - 1; + } + + public String getCanon(String line) { + String x = new String(stringUtils.getReversedRead(line.toCharArray())); + for (int i = 0; i < line.length(); i++) { + if (line.charAt(i) < x.charAt(i)) + return line; + else if (line.charAt(i) > x.charAt(i)) + return x; + } + return x; + } + + + public void initFrequency() throws IOException { + + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + statFrequency = new long[(int) Math.pow(4, pivotLength)]; +// HashSet[] pmerFrequency; +// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; + HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue, minValueNormalized; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = Math.min(minValue, getReversed(minValue)); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + ; + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(0, k))); // += 1; + + if (roundNumber == rounds) statFrequency[minValueNormalized]++; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = Math.min(minValue, getReversed(minValue)); + + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; + if (roundNumber == rounds) statFrequency[minValueNormalized]++; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = Math.min(minValue, getReversed(minValue)); + + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; + if (roundNumber == rounds) statFrequency[minValueNormalized]++; + } + } + + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; + if (roundNumber == rounds) statFrequency[minValueNormalized]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { + numSampled = 0; + adaptOrdering(pmerFrequency); +// if(roundNumber % 100 == 0) { +// percentagePunishment *= 0.996; +// normalize(); +// } + pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(HashMap> pmerFrequency) { + int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; + for (Integer i : pmerFrequency.keySet()) { + frequencies[i] = pmerFrequency.get(i).size(); + } + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < frequencies.length; k++) { + if (frequencies[k] > biggest) { + biggest = frequencies[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + frequencies[biggestIndex] = 0; + frequencies[getReversed(biggestIndex)] = 0; + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + public int strcmp(int x, int y) { + if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if (temp == null) { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for (int i = 0; i < temp.length; i++) { + currentOrdering[i] = temp[i]; + } + } + + + public void exportOrderingForCpp() { + File file = new File("ranks.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < currentOrdering.length; i++) { + bf.write(Long.toString(currentOrdering[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } + + public void exportBinningForCpp() { + File file = new File("freq.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < statFrequency.length; i++) { + bf.write(Long.toString(statFrequency[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } +} diff --git a/src/buildgraph/StringUtils.java b/src/buildgraph/StringUtils.java index 14e97f1..ad60c7b 100644 --- a/src/buildgraph/StringUtils.java +++ b/src/buildgraph/StringUtils.java @@ -17,6 +17,11 @@ public int getDecimal(char[] a, int from, int to){ return val; } + public int getRDecimal(char[] a, char[] b, int from, int to){ + + return Math.min(getDecimal(a, from, to), getDecimal(b, from, to)); + } + public long getLDecimal(char[] a, int from, int to){ long val=0; From bc737d27be19a746028410cc67c0187a6931166f Mon Sep 17 00:00:00 2001 From: danflomin Date: Mon, 29 Mar 2021 13:30:12 +0300 Subject: [PATCH 16/44] remove ordering10 which had a bug in it --- src/buildgraph/BuildDeBruijnGraph.java | 2 +- ...tiveOrdering10_WithCounterNormalized.java} | 105 ++++++++---------- 2 files changed, 49 insertions(+), 58 deletions(-) rename src/buildgraph/Ordering/{IterativeOrdering10.java => IterativeOrdering10_WithCounterNormalized.java} (80%) diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 3a9a81b..3f57d33 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -123,7 +123,7 @@ else if (args[i].equals("-punishPercentage")) ordering = ordering9_withCounterNormalized; break; case "10": - IterativeOrdering10 ordering10 = new IterativeOrdering10(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); + IterativeOrdering10_WithCounterNormalized ordering10 = new IterativeOrdering10_WithCounterNormalized(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); ordering10.initFrequency(); ordering10.exportOrderingForCpp(); ordering10.exportBinningForCpp(); diff --git a/src/buildgraph/Ordering/IterativeOrdering10.java b/src/buildgraph/Ordering/IterativeOrdering10_WithCounterNormalized.java similarity index 80% rename from src/buildgraph/Ordering/IterativeOrdering10.java rename to src/buildgraph/Ordering/IterativeOrdering10_WithCounterNormalized.java index 1b8d191..af6d028 100644 --- a/src/buildgraph/Ordering/IterativeOrdering10.java +++ b/src/buildgraph/Ordering/IterativeOrdering10_WithCounterNormalized.java @@ -1,15 +1,16 @@ package buildgraph.Ordering; +import buildgraph.Partition; import buildgraph.StringUtils; +import net.agkn.hll.HLL; import java.io.*; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; -import net.agkn.hll.*; -public class IterativeOrdering10 implements IOrdering { +public class IterativeOrdering10_WithCounterNormalized implements IOrdering { private String inputFile; private int readLen; private int bufSize; @@ -30,29 +31,7 @@ public class IterativeOrdering10 implements IOrdering { private int mask; private HashMap statFrequency; -// private class Counter{ -// private HashSet set; -// private HLL hll; -// private boolean isHll; -// -// public Counter () -// { -// hll = null; -// set = new HashSet<>(); -// isHll = false; -// } -// -// public void add(char[] a, int start, int end, String line) -// { -// if(!isHll) -// { -// set.add(line.substring(start, end)); -// } -// } -// -// } - - public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + public IterativeOrdering10_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; @@ -62,7 +41,7 @@ public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufS stringUtils = new StringUtils(); } - public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k) { + public IterativeOrdering10_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k) { this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { int canonical = Math.min(i, getReversed(i)); @@ -74,18 +53,37 @@ public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufS elementsToPush = 1; } - public IterativeOrdering10(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { + public IterativeOrdering10_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { this(pivotLength, infile, readLen, bufSize, k); this.roundSamples = roundSamples; this.rounds = rounds; this.elementsToPush = elementsToPush; this.statisticsSamples = statisticsSamples; this.percentagePunishment = percentagePunishment; - this.mask = (int)Math.pow(4, pivotLength) - 1; + this.mask = (int) Math.pow(4, pivotLength) - 1; + } + + public String getCanon(String line) { + String x = new String(stringUtils.getReversedRead(line.toCharArray())); + for (int i = 0; i < line.length(); i++) { + if (line.charAt(i) < x.charAt(i)) + return line; + else if (line.charAt(i) > x.charAt(i)) + return x; + } + return x; + } + + private void addToHll(char[] lineCharArray, int minValue) { +// TODO: Add with reversecompliment + if (!statFrequency.containsKey(minValue)) + statFrequency.put(minValue, new HLL(8, 5)); + statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0, k)); } public void initFrequency() throws IOException { + boolean keepSample = true; int numSampled = 0; int roundNumber = 0; @@ -107,7 +105,6 @@ public void initFrequency() throws IOException { int min_pos = -1; int minValue, currentValue, minValueNormalized; - while (keepSample && (describeline = bfrG.readLine()) != null) { bfrG.read(lineCharArray, 0, readLen); @@ -119,14 +116,14 @@ public void initFrequency() throws IOException { min_pos = findSmallest(lineCharArray, 0, k); minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); minValueNormalized = Math.min(minValue, getReversed(minValue)); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); // TODO: getDecimal with reverse compliment - ; - if(!pmerFrequency.containsKey( minValueNormalized)) pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(line.substring(0, k)); // += 1; // TODO: Add with reversecompliment - if(roundNumber == rounds) - { + if (roundNumber == rounds) { addToHll(lineCharArray, minValueNormalized); + } else { + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(0, k))); // += 1; } int bound = len - k + 1; @@ -139,11 +136,12 @@ public void initFrequency() throws IOException { minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); minValueNormalized = Math.min(minValue, getReversed(minValue)); - if(!pmerFrequency.containsKey(minValueNormalized)) pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(line.substring(i, k+i)); // += 1; - if(roundNumber == rounds) - { + if (roundNumber == rounds) { addToHll(lineCharArray, minValueNormalized); + } else { + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; } } else { int lastIndexInWindow = k + i - pivotLength; @@ -152,19 +150,20 @@ public void initFrequency() throws IOException { minValue = currentValue; minValueNormalized = Math.min(minValue, getReversed(minValue)); - if(!pmerFrequency.containsKey(minValueNormalized)) pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(line.substring(i, k+i)); // += 1; - if(roundNumber == rounds) - { + if (roundNumber == rounds) { addToHll(lineCharArray, minValueNormalized); + } else { + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; } } } - pmerFrequency.get(minValueNormalized).add(line.substring(i, k+i)); // += 1; - if(roundNumber == rounds) - { + if (roundNumber == rounds) { addToHll(lineCharArray, minValueNormalized); + } else { + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; } } } @@ -194,17 +193,10 @@ public void initFrequency() throws IOException { frG.close(); } - private void addToHll(char[] lineCharArray, int minValue) { -// TODO: Add with reversecompliment - if (!statFrequency.containsKey(minValue)) - statFrequency.put(minValue, new HLL(8, 5)); - statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0, k)); - } - private void adaptOrdering(HashMap> pmerFrequency) { int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; - for(Integer i : pmerFrequency.keySet()){ + for (Integer i : pmerFrequency.keySet()) { frequencies[i] = pmerFrequency.get(i).size(); } for (int i = 0; i < elementsToPush; i++) { @@ -265,13 +257,12 @@ public int strcmp(int x, int y) { private void normalize() { // currentOrdering - if(temp == null) - { + if (temp == null) { temp = new Integer[currentOrdering.length]; for (int i = 0; i < temp.length; temp[i] = i, i++) ; } Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for(int i = 0 ; i Date: Mon, 29 Mar 2021 17:45:17 +0300 Subject: [PATCH 17/44] remove several old iterative orderings --- src/buildgraph/BuildDeBruijnGraph.java | 56 ++-- .../Ordering/IterativeOrdering.java | 258 ----------------- .../Ordering/IterativeOrdering2.java | 224 --------------- .../Ordering/IterativeOrdering3.java | 236 ---------------- .../Ordering/IterativeOrdering4.java | 250 ----------------- .../Ordering/IterativeOrdering5.java | 264 ------------------ 6 files changed, 28 insertions(+), 1260 deletions(-) delete mode 100644 src/buildgraph/Ordering/IterativeOrdering.java delete mode 100644 src/buildgraph/Ordering/IterativeOrdering2.java delete mode 100644 src/buildgraph/Ordering/IterativeOrdering3.java delete mode 100644 src/buildgraph/Ordering/IterativeOrdering4.java delete mode 100644 src/buildgraph/Ordering/IterativeOrdering5.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 3f57d33..d2b7b50 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -150,34 +150,34 @@ else if (args[i].equals("-punishPercentage")) break; } - try { - - System.out.println("Program Configuration:"); - System.out.print("Input File: " + infile + "\n" + - "Kmer Length: " + k + "\n" + - "Read Length: " + readLen + "\n" + - "Pivot Length: " + pivot_len + "\n" + - "# Of Threads: " + numThreads + "\n" + - "R/W Buffer Size: " + bufferSize + "\n" + - "Ordering: " + orderingName + "\n"); - - Partition partition = new Partition(k, infile, (int)Math.pow(4, pivot_len), pivot_len, bufferSize, readLen, ordering); - Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); - - - partition.Run(); - - AbstractMap distinctKmersPerPartition = map.Run(numThreads); - BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); - System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); - - HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); - BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); - - } catch (Exception E) { - System.out.println("Exception caught!"); - E.printStackTrace(); - } +// try { +// +// System.out.println("Program Configuration:"); +// System.out.print("Input File: " + infile + "\n" + +// "Kmer Length: " + k + "\n" + +// "Read Length: " + readLen + "\n" + +// "Pivot Length: " + pivot_len + "\n" + +// "# Of Threads: " + numThreads + "\n" + +// "R/W Buffer Size: " + bufferSize + "\n" + +// "Ordering: " + orderingName + "\n"); +// +// Partition partition = new Partition(k, infile, (int)Math.pow(4, pivot_len), pivot_len, bufferSize, readLen, ordering); +// Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); +// +// +// partition.Run(); +// +// AbstractMap distinctKmersPerPartition = map.Run(numThreads); +// BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); +// System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); +// +// HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); +// BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); +// +// } catch (Exception E) { +// System.out.println("Exception caught!"); +// E.printStackTrace(); +// } } diff --git a/src/buildgraph/Ordering/IterativeOrdering.java b/src/buildgraph/Ordering/IterativeOrdering.java deleted file mode 100644 index ff8640b..0000000 --- a/src/buildgraph/Ordering/IterativeOrdering.java +++ /dev/null @@ -1,258 +0,0 @@ -package buildgraph.Ordering; - -import buildgraph.Ordering.UHS.UHSOrderingBase; -import buildgraph.Ordering.UHS.UHSSignatureOrdering; -import buildgraph.StringUtils; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.lang.reflect.Array; -import java.util.Arrays; -import java.util.HashSet; -import java.util.LinkedList; - -public class IterativeOrdering implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - private long[] frequency; - - private int roundSamples; - private int rounds; - private int elementsToPush; - private int pushBy; - - public IterativeOrdering(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering, int roundSamples, int rounds, int elementsToPush, int pushBy) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.pushBy = pushBy; - stringUtils = new StringUtils(); - } - - public IterativeOrdering(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)], 100000, 10000, 1, (int) Math.pow(4, pivotLength)); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - } - - public IterativeOrdering(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int pushBy) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)], roundSamples, rounds, elementsToPush, pushBy); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - } - - - public void initFrequency() throws IOException { - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; - - String describeline; - char[] lineCharArray = new char[readLen]; - char[] currentArray; - - - int prepos, min_pos = -1; - int[] flag = new int[1]; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - int len = readLen; - char[] revCharArray = stringUtils.getReversedRead(lineCharArray); - - min_pos = findPosOfMin(lineCharArray, revCharArray, 0, k, flag); - //int initialMinPos = min_pos; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - - if (i > (flag[0] == 0 ? min_pos : len - min_pos - pivotLength)) { - currentArray = flag[0] == 0 ? lineCharArray : revCharArray; - int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); - - min_pos = findPosOfMin(lineCharArray, revCharArray, i, i + k, flag); - //initialMinPos = min_pos; - - if (temp != (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLength) : calPosNew(revCharArray, min_pos, min_pos + pivotLength))) { - prepos = temp; - pmerFrequency[prepos]++; - } - - } else { - - if (strcmp(lineCharArray, revCharArray, k + i - pivotLength, len - i - k, pivotLength) < 0) { - if (strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLength, min_pos, pivotLength) < 0) { - - currentArray = flag[0] == 0 ? lineCharArray : revCharArray; - int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); - - min_pos = k + i - pivotLength; - if (temp != calPosNew(lineCharArray, min_pos, min_pos + pivotLength)) { - prepos = temp; - pmerFrequency[prepos]++; - } - - flag[0] = 0; - - } - } else { - if (strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLength) < 0) { - - currentArray = flag[0] == 0 ? lineCharArray : revCharArray; - int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); - - min_pos = -k - i + len; - - if (temp != calPosNew(revCharArray, min_pos, min_pos + pivotLength)) { - prepos = temp; - pmerFrequency[prepos]++; - } - flag[0] = 1; - } - } - } - } - currentArray = flag[0] == 0 ? lineCharArray : revCharArray; - prepos = calPosNew(currentArray, min_pos, min_pos + pivotLength); - pmerFrequency[prepos]++; - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) - { - System.out.println("Sampling for binning round"); - roundSamples = Integer.MAX_VALUE;//100*rounds*roundSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - - } - bfrG.close(); - frG.close(); - } - - private void adaptOrdering(long[] pmerFrequency) { - for (int i = 0; i < elementsToPush; i++) { - long biggest = Arrays.stream(pmerFrequency).max().getAsLong(); - for (int j = 0; j < pmerFrequency.length; j++) { - if (pmerFrequency[j] == biggest) { - long newRank = currentOrdering[j] + pushBy; - currentOrdering[j] = newRank; - currentOrdering[getReversed(j)] = newRank; - pmerFrequency[j] = 0; - pmerFrequency[getReversed(j)] = 0; - } - } - } - } - - private int calPosNew(char[] a, int from, int to) { - return stringUtils.getDecimal(a, from, to); - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throws IOException { - int len = a.length; - int pos1 = findSmallest(a, from, to); - int pos2 = findSmallest(b, len - to, len - from); - - if (strcmp(a, b, pos1, pos2, pivotLength) < 0) { - flag[0] = 0; - return pos1; - } else { - flag[0] = 1; - return pos2; - } - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public void exportOrderingForCpp() { - System.out.print("{"); - for (int i = 0; i < currentOrdering.length; i++) { - System.out.print(currentOrdering[i] + ","); - } - System.out.print("}"); - System.out.println(); - } - - public void exportBinningForCpp() { - System.out.print("{"); - for (int i = 0; i < frequency.length; i++) { - System.out.print(frequency[i] + ","); - } - System.out.print("}"); - System.out.println(); - } - - -} - diff --git a/src/buildgraph/Ordering/IterativeOrdering2.java b/src/buildgraph/Ordering/IterativeOrdering2.java deleted file mode 100644 index fbf8dbd..0000000 --- a/src/buildgraph/Ordering/IterativeOrdering2.java +++ /dev/null @@ -1,224 +0,0 @@ -package buildgraph.Ordering; - -import buildgraph.StringUtils; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.Arrays; - -public class IterativeOrdering2 implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - - private int roundSamples; - private int rounds; - private int elementsToPush; - private int pushBy; - - public IterativeOrdering2(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering, int roundSamples, int rounds, int elementsToPush, int pushBy) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.pushBy = pushBy; - stringUtils = new StringUtils(); - } - - public IterativeOrdering2(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)], 100000, 10000, 1, (int) Math.pow(4, pivotLength)); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - } - - public IterativeOrdering2(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int pushBy) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)], roundSamples, rounds, elementsToPush, pushBy); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - } - - - public void initFrequency() throws IOException { - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; - - String describeline; - char[] lineCharArray = new char[readLen]; - char[] currentArray; - - - int prepos, min_pos = -1; - int[] flag = new int[1]; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - int len = readLen; - char[] revCharArray = stringUtils.getReversedRead(lineCharArray); - - min_pos = findPosOfMin(lineCharArray, revCharArray, 0, k, flag); - //int initialMinPos = min_pos; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - - if (i > (flag[0] == 0 ? min_pos : len - min_pos - pivotLength)) { - currentArray = flag[0] == 0 ? lineCharArray : revCharArray; - int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); - - min_pos = findPosOfMin(lineCharArray, revCharArray, i, i + k, flag); - //initialMinPos = min_pos; - - if (temp != (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLength) : calPosNew(revCharArray, min_pos, min_pos + pivotLength))) { - prepos = temp; - pmerFrequency[prepos]++; - } - - } else { - - if (strcmp(lineCharArray, revCharArray, k + i - pivotLength, len - i - k, pivotLength) < 0) { - if (strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLength, min_pos, pivotLength) < 0) { - - currentArray = flag[0] == 0 ? lineCharArray : revCharArray; - int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); - - min_pos = k + i - pivotLength; - if (temp != calPosNew(lineCharArray, min_pos, min_pos + pivotLength)) { - prepos = temp; - pmerFrequency[prepos]++; - } - - flag[0] = 0; - - } - } else { - if (strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLength) < 0) { - - currentArray = flag[0] == 0 ? lineCharArray : revCharArray; - int temp = calPosNew(currentArray, min_pos, min_pos + pivotLength); - - min_pos = -k - i + len; - - if (temp != calPosNew(revCharArray, min_pos, min_pos + pivotLength)) { - prepos = temp; - pmerFrequency[prepos]++; - } - flag[0] = 1; - } - } - } - } - currentArray = flag[0] == 0 ? lineCharArray : revCharArray; - prepos = calPosNew(currentArray, min_pos, min_pos + pivotLength); - pmerFrequency[prepos]++; - } - - if (Arrays.stream(pmerFrequency).max().getAsLong() > 25-Math.min(roundNumber/10, 15)) { - System.out.println("round number = "+roundNumber); - roundNumber++; - if (roundNumber == rounds) - keepSample = false; - else - numSampled = 0; - adaptOrdering(pmerFrequency); - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements - } - - - } - bfrG.close(); - frG.close(); - } - - private void adaptOrdering(long[] pmerFrequency) { - for (int i = 0; i < elementsToPush; i++) { - long biggest = Arrays.stream(pmerFrequency).max().getAsLong(); - for (int j = 0; j < pmerFrequency.length; j++) { - if (pmerFrequency[j] == biggest) { - long newRank = currentOrdering[j] + pushBy; - currentOrdering[j] = newRank; - currentOrdering[getReversed(j)] = newRank; - pmerFrequency[j] = 0; - pmerFrequency[getReversed(j)] = 0; - } - } - } - } - - private int calPosNew(char[] a, int from, int to) { - return stringUtils.getDecimal(a, from, to); - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throws IOException { - int len = a.length; - int pos1 = findSmallest(a, from, to); - int pos2 = findSmallest(b, len - to, len - from); - - if (strcmp(a, b, pos1, pos2, pivotLength) < 0) { - flag[0] = 0; - return pos1; - } else { - flag[0] = 1; - return pos2; - } - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } -} diff --git a/src/buildgraph/Ordering/IterativeOrdering3.java b/src/buildgraph/Ordering/IterativeOrdering3.java deleted file mode 100644 index 4361184..0000000 --- a/src/buildgraph/Ordering/IterativeOrdering3.java +++ /dev/null @@ -1,236 +0,0 @@ -package buildgraph.Ordering; - -import buildgraph.StringUtils; - -import java.io.*; -import java.util.Arrays; - -public class IterativeOrdering3 implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - private long[] frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - public IterativeOrdering3(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeOrdering3(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeOrdering3(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - } - - - public void initFrequency() throws IOException { - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - pmerFrequency[minValue] += k; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & 0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - pmerFrequency[minValue] += k; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - pmerFrequency[minValue] += k; - } - } - - pmerFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - } - - - private void adaptOrdering(long[] pmerFrequency) { - for (int i = 0; i < elementsToPush; i++) { - long biggest = Arrays.stream(pmerFrequency).max().getAsLong(); - for (int j = 0; j < pmerFrequency.length; j++) { - if (pmerFrequency[j] == biggest) { - long newRank = currentOrdering[j] + (int) Math.pow(4, pivotLength) / 100; - currentOrdering[j] = newRank; - currentOrdering[getReversed(j)] = newRank; - pmerFrequency[j] = 0; - pmerFrequency[getReversed(j)] = 0; - break; - } - } - } - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public void exportOrderingForCpp() { - File file = new File("rank.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < currentOrdering.length; i++) { - bf.write(Long.toString(currentOrdering[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } - - public void exportBinningForCpp() { - File file = new File("freq.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < frequency.length; i++) { - bf.write(Long.toString(frequency[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } -} diff --git a/src/buildgraph/Ordering/IterativeOrdering4.java b/src/buildgraph/Ordering/IterativeOrdering4.java deleted file mode 100644 index a4366bc..0000000 --- a/src/buildgraph/Ordering/IterativeOrdering4.java +++ /dev/null @@ -1,250 +0,0 @@ -package buildgraph.Ordering; - -import buildgraph.StringUtils; - -import java.io.*; -import java.util.Arrays; - -public class IterativeOrdering4 implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - private long[] frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - private double maskRatio; - private double percentagePunishment; - - public IterativeOrdering4(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeOrdering4(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeOrdering4(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double maskRatio, double percentagePunishment) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.maskRatio = maskRatio; - this.percentagePunishment = percentagePunishment; - } - - - public void initFrequency() throws IOException { - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - pmerFrequency[minValue] += k; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & 0x3fff;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - pmerFrequency[minValue] += k; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - pmerFrequency[minValue] += k; - } - } - - pmerFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - } - - - private void adaptOrdering(long[] pmerFrequency) { - boolean[] mask = new boolean[pmerFrequency.length]; - for(int i = 0 ; i biggest) { - biggest = pmerFrequency[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int)((int) Math.pow(4, pivotLength) * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - pmerFrequency[biggestIndex] = 0; - pmerFrequency[getReversed(biggestIndex)] = 0; - } - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public void exportOrderingForCpp() { - File file = new File("ranks.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < currentOrdering.length; i++) { - bf.write(Long.toString(currentOrdering[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } - - public void exportBinningForCpp() { - File file = new File("freq.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < frequency.length; i++) { - bf.write(Long.toString(frequency[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } -} diff --git a/src/buildgraph/Ordering/IterativeOrdering5.java b/src/buildgraph/Ordering/IterativeOrdering5.java deleted file mode 100644 index fac839d..0000000 --- a/src/buildgraph/Ordering/IterativeOrdering5.java +++ /dev/null @@ -1,264 +0,0 @@ -package buildgraph.Ordering; - -import buildgraph.StringUtils; - -import java.io.*; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; - -public class IterativeOrdering5 implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - private long[] frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - private double maskRatio; - private double percentagePunishment; - - public IterativeOrdering5(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeOrdering5(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeOrdering5(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double maskRatio, double percentagePunishment) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.maskRatio = maskRatio; - this.percentagePunishment = percentagePunishment; - } - - - public void initFrequency() throws IOException { - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - pmerFrequency[minValue] += 1; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & 0x3fff;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - pmerFrequency[minValue] += 1; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - pmerFrequency[minValue] += 1; - } - } - - pmerFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - } - - - private void adaptOrdering(long[] pmerFrequency) { - boolean[] mask = new boolean[pmerFrequency.length]; - for (int i = 0; i < mask.length; i++) { - if (Math.random() < 1 - maskRatio) - mask[i] = true; - } -// TODO : if biggest is smaller than (samples / 4^(m-1))/5 - for (int i = 0; i < elementsToPush; i++) { - long biggest = -1; - int biggestIndex = -1; - for (int k = 0; k < pmerFrequency.length; k++) { - if (mask[k] && pmerFrequency[k] > biggest) { - biggest = pmerFrequency[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - pmerFrequency[biggestIndex] = 0; - pmerFrequency[getReversed(biggestIndex)] = 0; - } - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - Integer[] temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for(int i = 0 ; i Date: Wed, 31 Mar 2021 12:26:04 +0300 Subject: [PATCH 18/44] too many tries to fix partitioning - trying it in new branch --- src/buildgraph/BuildDeBruijnGraph.java | 78 +++-- src/buildgraph/Map.java | 132 ++----- src/buildgraph/MapOld.java | 321 ++++++++++++++++++ ...g9_WithCounterNormalized_AndSignature.java | 306 +++++++++++++++++ src/buildgraph/Partition.java | 2 +- src/buildgraph/PartitionNew.java | 288 ++++++++++++++++ src/buildgraph/PartitionTrunc.java | 166 +++++++++ src/buildgraph/StringUtils.java | 11 + 8 files changed, 1160 insertions(+), 144 deletions(-) create mode 100644 src/buildgraph/MapOld.java create mode 100644 src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java create mode 100644 src/buildgraph/PartitionNew.java create mode 100644 src/buildgraph/PartitionTrunc.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index d2b7b50..6a00863 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -18,9 +18,9 @@ public static void main(String[] args) throws IOException { String infile = null; - int k = 60, pivot_len = 8, bufferSize = 8192, numThreads = 20, hsmapCapacity = 10000000; + int k = 60, pivot_len = 8, bufferSize = 81920, numThreads = 20, hsmapCapacity = 10000000; int readLen = 124; -// int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// + int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// // boolean readable = false; String orderingName = "uhs_sig_freq"; int xor = 0; //11101101; @@ -47,8 +47,8 @@ else if (args[i].equals("-v")) version = args[i + 1]; else if (args[i].equals("-k")) k = new Integer(args[i + 1]); -// else if(args[i].equals("-NB")) -// numBlocks = new Integer(args[i+1]); + else if(args[i].equals("-NB")) + numBlocks = new Integer(args[i+1]); // else // if(args[i].equals("-o")) // orderingName = args[i+1]; @@ -122,6 +122,14 @@ else if (args[i].equals("-punishPercentage")) ordering9_withCounterNormalized.exportBinningForCpp(); ordering = ordering9_withCounterNormalized; break; + case "9-normalized-signature": // + IterativeOrdering9_WithCounterNormalized_AndSignature ordering9_withCounterNormalized_andSignature = new IterativeOrdering9_WithCounterNormalized_AndSignature(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); + ordering9_withCounterNormalized_andSignature.initFrequency(); + ordering9_withCounterNormalized_andSignature.exportOrderingForCpp(); + ordering9_withCounterNormalized_andSignature.exportBinningForCpp(); + ordering = ordering9_withCounterNormalized_andSignature; + System.out.println("lolz asdasd"); + break; case "10": IterativeOrdering10_WithCounterNormalized ordering10 = new IterativeOrdering10_WithCounterNormalized(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); ordering10.initFrequency(); @@ -148,36 +156,42 @@ else if (args[i].equals("-punishPercentage")) frequencyOrdering.initFrequency(); ordering = frequencyOrdering; break; + case "signature": + LexicographicSignatureOrdering signatureOrdering = new LexicographicSignatureOrdering(pivot_len); + ordering = signatureOrdering; + break; } -// try { -// -// System.out.println("Program Configuration:"); -// System.out.print("Input File: " + infile + "\n" + -// "Kmer Length: " + k + "\n" + -// "Read Length: " + readLen + "\n" + -// "Pivot Length: " + pivot_len + "\n" + -// "# Of Threads: " + numThreads + "\n" + -// "R/W Buffer Size: " + bufferSize + "\n" + -// "Ordering: " + orderingName + "\n"); -// -// Partition partition = new Partition(k, infile, (int)Math.pow(4, pivot_len), pivot_len, bufferSize, readLen, ordering); -// Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); -// -// -// partition.Run(); -// -// AbstractMap distinctKmersPerPartition = map.Run(numThreads); -// BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); -// System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); -// -// HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); -// BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); -// -// } catch (Exception E) { -// System.out.println("Exception caught!"); -// E.printStackTrace(); -// } + try { + + System.out.println("Program Configuration:"); + System.out.print("Input File: " + infile + "\n" + + "Kmer Length: " + k + "\n" + + "Read Length: " + readLen + "\n" + + "Pivot Length: " + pivot_len + "\n" + + "# Of Threads: " + numThreads + "\n" + + "R/W Buffer Size: " + bufferSize + "\n" + + "Ordering: " + orderingName + "\n"); + +// Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); + PartitionTrunc partition = new PartitionTrunc(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); + Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); +// MapTrunc map = new MapTrunc(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); + + + partition.Run(); + + AbstractMap distinctKmersPerPartition = map.Run(numThreads); + BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); + System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); + + HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); + BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); + + } catch (Exception E) { + System.out.println("Exception caught!"); + E.printStackTrace(); + } } diff --git a/src/buildgraph/Map.java b/src/buildgraph/Map.java index a98a9d0..1663a9f 100644 --- a/src/buildgraph/Map.java +++ b/src/buildgraph/Map.java @@ -20,6 +20,7 @@ public class Map{ private long forAndVal; private long forAndVal32; + private StringUtils stringUtils; private static int[] valTable = StringUtils.valTable; @@ -31,6 +32,8 @@ public Map(int kk, int numberOfBlocks, int bufferSize, int HScapacity){ this.blockID = 0; this.forAndVal = (long)Math.pow(2, 2*(k-32)) - 1; this.forAndVal32 = (long)Math.pow(2, 2*k) - 1; + stringUtils = new StringUtils(); + } public class MyThread extends Thread{ @@ -51,16 +54,10 @@ public void run(){ FileReader fr; BufferedReader bfr; - FileWriter fw; - BufferedWriter bfw; - - + String line; int p,j; - long cnt; - Kmer64 k1, k1_rev; - try{ File dir = new File("Maps"); @@ -82,131 +79,44 @@ public void run(){ fr = new FileReader("Nodes/nodes"+p); bfr = new BufferedReader(fr, bufSize); - fw = new FileWriter("Maps/maps"+p); - bfw = new BufferedWriter(fw, bufSize); + - HashMap nodes = new HashMap(capacity); + HashSet nodes = new HashSet(capacity); while((line = bfr.readLine()) != null){ String[] strs = line.split("\t"); - cnt = Long.parseLong(strs[1]); - - long preOriginal = -1, preReplace = -1, Original = -1, Replace = -1; - long diff = -1; - boolean newOut = true, next = false; - - Long ReplaceObj, Replace_revObj; - + + char[] lineCharArray = strs[0].toCharArray(); - k1 = new Kmer64(lineCharArray,0,k,false); - k1_rev = new Kmer64(lineCharArray,0,k,true); - + String revLine = new String(stringUtils.getReversedRead(lineCharArray)); + int bound = strs[0].length() - k + 1; for(j = 0; j < bound; j++){ - - if(j != 0){ - if(k > 32){ - k1 = new Kmer64((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A'], ((k1.high<<2) + valTable[lineCharArray[k+j-33]-'A']) & forAndVal); - k1_rev = new Kmer64((k1_rev.low>>>2) + ((k1_rev.high&3)<<62), (k1_rev.high>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-33)<<1))); - } - else{ - k1 = new Kmer64(((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A']) & forAndVal32, 0); - k1_rev = new Kmer64((k1_rev.low>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-1)<<1)), 0); - } - } - - ReplaceObj = nodes.get(k1); - Replace_revObj = nodes.get(k1_rev); - - if(ReplaceObj == null && Replace_revObj == null){ - nodes.put(k1, cnt+j*2); - - if(!newOut && !next){ - bfw.write(preOriginal+"\t"+preReplace); - bfw.newLine(); - - newOut = true; - } - - } - else{ - if(ReplaceObj!=null){ - Original = cnt+j*2; - Replace = ReplaceObj; - } - else{ - Original = cnt+j*2; - Replace = Replace_revObj+1; - } - - if(newOut){ - bfw.write(Original+"\t"+Replace+"\t"); - newOut = false; - next = true; - } - - else if(Original-preOriginal==2){ - if(next){ - diff = Replace - preReplace; - if(diff==2){ - bfw.write("+\t"); - next = false; - } - else if(diff==-2){ - bfw.write("-\t"); - next = false; - } - else{ - bfw.write("\n"+Original+"\t"+Replace+"\t"); - } - } - else{ - if(Replace - preReplace != diff){ - bfw.write(preOriginal+"\t"+preReplace); - bfw.newLine(); - - bfw.write(Original+"\t"+Replace+"\t"); - next = true; - } - } - } - - else if(next==true){ - - bfw.write("\n"+Original+"\t"+Replace+"\t"); - } - - preOriginal = Original; - preReplace = Replace; - } - - } - - if(!newOut && !next){ - bfw.write(preOriginal+"\t"+preReplace); - bfw.newLine(); - } - else if(next){ - bfw.newLine(); + + nodes.add(strs[0].substring(j, j + k)); + nodes.add(revLine.substring(j, j + k)); } + } if(p%100 == 0) System.out.println(p); - distinctKmersPerPartition.put((long)p, (long)nodes.size()); + + distinctKmersPerPartition.put((long)p, (long)nodes.size() / 2); nodes.clear(); nodes = null; - bfw.close(); - fw.close(); + bfr.close(); fr.close(); - bfw = null; - fw = null; bfr = null; fr = null; + + File myObj = new File("Nodes/nodes"+p); + if (!myObj.delete()) + System.out.println("Failed to delete the file." + p); } }catch(Exception E){ diff --git a/src/buildgraph/MapOld.java b/src/buildgraph/MapOld.java new file mode 100644 index 0000000..d889b4a --- /dev/null +++ b/src/buildgraph/MapOld.java @@ -0,0 +1,321 @@ +package buildgraph; + +import java.io.*; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CountDownLatch; + + +public class MapOld { + + private int k; + private int numOfBlocks; + private int bufSize; + + private Object lock_blocks = new Object(); + + private int capacity; + + private int blockID; + + private long forAndVal; + private long forAndVal32; + + private static int[] valTable = StringUtils.valTable; + + public MapOld(int kk, int numberOfBlocks, int bufferSize, int HScapacity){ + this.k = kk; + this.numOfBlocks = numberOfBlocks; + this.bufSize = bufferSize; + this.capacity = HScapacity; + this.blockID = 0; + this.forAndVal = (long)Math.pow(2, 2*(k-32)) - 1; + this.forAndVal32 = (long)Math.pow(2, 2*k) - 1; + } + + public class MyThread extends Thread{ + private CountDownLatch threadsSignal; + private HashSet fileNames; + private ConcurrentHashMap distinctKmersPerPartition; + + public MyThread(CountDownLatch threadsSignal, HashSet fileNames, ConcurrentHashMap distinctKmersPerPartition){ + super(); + this.threadsSignal = threadsSignal; + this.fileNames = fileNames; + this.distinctKmersPerPartition = distinctKmersPerPartition; + } + + @Override + public void run(){ + System.out.println(Thread.currentThread().getName() + "Start..."); + + FileReader fr; + BufferedReader bfr; + FileWriter fw; + BufferedWriter bfw; + + + String line; + + int p,j; + long cnt; + Kmer64 k1, k1_rev; + + + try{ + File dir = new File("Maps"); + if(!dir.exists()) + dir.mkdir(); + + while(blockID nodes = new HashMap(capacity); + + while((line = bfr.readLine()) != null){ + + String[] strs = line.split("\t"); + cnt = Long.parseLong(strs[1]); + + long preOriginal = -1, preReplace = -1, Original = -1, Replace = -1; + long diff = -1; + boolean newOut = true, next = false; + + Long ReplaceObj, Replace_revObj; + + char[] lineCharArray = strs[0].toCharArray(); + k1 = new Kmer64(lineCharArray,0,k,false); + k1_rev = new Kmer64(lineCharArray,0,k,true); + + int bound = strs[0].length() - k + 1; + + for(j = 0; j < bound; j++){ + + if(j != 0){ + if(k > 32){ + k1 = new Kmer64((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A'], ((k1.high<<2) + valTable[lineCharArray[k+j-33]-'A']) & forAndVal); + k1_rev = new Kmer64((k1_rev.low>>>2) + ((k1_rev.high&3)<<62), (k1_rev.high>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-33)<<1))); + } + else{ + k1 = new Kmer64(((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A']) & forAndVal32, 0); + k1_rev = new Kmer64((k1_rev.low>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-1)<<1)), 0); + } + } + + ReplaceObj = nodes.get(k1); + Replace_revObj = nodes.get(k1_rev); + + if(ReplaceObj == null && Replace_revObj == null){ + nodes.put(k1, cnt+j*2); + + if(!newOut && !next){ + bfw.write(preOriginal+"\t"+preReplace); + bfw.newLine(); + + newOut = true; + } + + } + else{ + if(ReplaceObj!=null){ + Original = cnt+j*2; + Replace = ReplaceObj; + } + else{ + Original = cnt+j*2; + Replace = Replace_revObj+1; + } + + if(newOut){ + bfw.write(Original+"\t"+Replace+"\t"); + newOut = false; + next = true; + } + + else if(Original-preOriginal==2){ + if(next){ + diff = Replace - preReplace; + if(diff==2){ + bfw.write("+\t"); + next = false; + } + else if(diff==-2){ + bfw.write("-\t"); + next = false; + } + else{ + bfw.write("\n"+Original+"\t"+Replace+"\t"); + } + } + else{ + if(Replace - preReplace != diff){ + bfw.write(preOriginal+"\t"+preReplace); + bfw.newLine(); + + bfw.write(Original+"\t"+Replace+"\t"); + next = true; + } + } + } + + else if(next==true){ + + bfw.write("\n"+Original+"\t"+Replace+"\t"); + } + + preOriginal = Original; + preReplace = Replace; + } + + } + + if(!newOut && !next){ + bfw.write(preOriginal+"\t"+preReplace); + bfw.newLine(); + } + else if(next){ + bfw.newLine(); + } + } + + if(p%100 == 0) System.out.println(p); + + distinctKmersPerPartition.put((long)p, (long)nodes.size()); + + nodes.clear(); + nodes = null; + + bfw.close(); + fw.close(); + bfr.close(); + fr.close(); + bfw = null; + fw = null; + bfr = null; + fr = null; + + File myObj = new File("Nodes/nodes"+p); + if (!myObj.delete()) + System.out.println("Failed to delete the file." + p); + } + + }catch(Exception E){ + System.out.println("Exception caught!"); + E.printStackTrace(); + } + + threadsSignal.countDown(); + System.out.println(Thread.currentThread().getName() + "End. Remaining" + threadsSignal.getCount() + " threads"); + + } + } + + + private AbstractMap BuildMap(int threadNum, HashSet fileNames) throws Exception{ + CountDownLatch threadSignal = new CountDownLatch(threadNum); + + ConcurrentHashMap distinctKmersPerPartition = new ConcurrentHashMap<>(); + + for(int i=0;i Run(int numThreads) throws Exception{ + long time1=0; + + HashSet fileNames = getNodesFileNames(); + + long t1 = System.currentTimeMillis(); + System.out.println("Build Maps Begin!"); + AbstractMap distinctKmersPerPartition= BuildMap(numThreads, fileNames); + long t2 = System.currentTimeMillis(); + time1 = (t2-t1)/1000; + System.out.println("Time used for building maps: " + time1 + " seconds!"); + + return distinctKmersPerPartition; + + } + + private HashSet getNodesFileNames(){ + File[] files = (new File("./Nodes")).listFiles(); + List fileNames = new LinkedList<>(); + for(File file : files){ + if(file.isFile()){ + fileNames.add(file.getName()); + } + } + return new HashSet<>(fileNames); + } + + public static void main(String[] args){ + + int k = 15, numBlocks = 256, numThreads = 1, bufferSize = 8192, hsmapCapacity = 1000000; + + if(args[0].equals("-help")){ + System.out.print("Usage: java -jar Map.jar -k k -NB numOfBlocks [options]\n" + + "Options Available: \n" + + "[-t numOfThreads] : (Integer) Number Of Threads. Default: 1" + "\n" + + "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" + + "[-c capacity] : (Integer) Hashmap Capacity. Default: 1000000" + "\n"); + return; + } + + for(int i=0; i> frequency; + + private int statisticsSamples; + private int roundSamples; + private int rounds; + private int elementsToPush; + + private double percentagePunishment; + + private Integer[] temp = null; + private int mask; + private long[] statFrequency; + + public IterativeOrdering9_WithCounterNormalized_AndSignature(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.pivotLength = pivotLength; + this.k = k; + this.currentOrdering = initialOrdering.clone(); + stringUtils = new StringUtils(); + signatureUtils = new SignatureUtils(pivotLength); + } + + public IterativeOrdering9_WithCounterNormalized_AndSignature(int pivotLength, String infile, int readLen, int bufSize, int k) { + this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); + for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + int canonical = Math.min(i, getReversed(i)); + currentOrdering[i] = canonical; + currentOrdering[getReversed(i)] = canonical; + } + roundSamples = 100000; + rounds = 10000; + elementsToPush = 1; + } + + public IterativeOrdering9_WithCounterNormalized_AndSignature(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { + this(pivotLength, infile, readLen, bufSize, k); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.percentagePunishment = percentagePunishment; + this.mask = (int) Math.pow(4, pivotLength) - 1; + } + + public String getCanon(String line) { + String x = new String(stringUtils.getReversedRead(line.toCharArray())); + for (int i = 0; i < line.length(); i++) { + if (line.charAt(i) < x.charAt(i)) + return line; + else if (line.charAt(i) > x.charAt(i)) + return x; + } + return x; + } + + + public void initFrequency() throws IOException { + int numMmers = (int)Math.pow(4, pivotLength); + for (int i = 0; i < numMmers; i++) { + if(!signatureUtils.isAllowed(i) && i < getReversed(i)) + { + currentOrdering[i] += numMmers; + currentOrdering[getReversed(i)] += numMmers; + } + } + + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + statFrequency = new long[(int) Math.pow(4, pivotLength)]; + HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue, minValueNormalized; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = Math.min(minValue, getReversed(minValue)); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + ; + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(0, k))); // += 1; + + if (roundNumber == rounds) statFrequency[minValueNormalized]++; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = Math.min(minValue, getReversed(minValue)); + + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; + if (roundNumber == rounds) statFrequency[minValueNormalized]++; + } else { + int lastIndexInWindow = k + i - pivotLength; + if (strcmp(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = Math.min(minValue, getReversed(minValue)); + + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; + if (roundNumber == rounds) statFrequency[minValueNormalized]++; + } + } + + pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; + if (roundNumber == rounds) statFrequency[minValueNormalized]++; + } + } + + if (numSampled >= roundSamples) { + roundNumber++; + if (roundNumber <= rounds) { // TODO: SHOULD THIS BE < and not <= + numSampled = 0; + adaptOrdering(pmerFrequency); + pmerFrequency.clear(); + if (roundNumber == rounds) { + System.out.println("Sampling for binning round"); + roundSamples = statisticsSamples; + } + } else { + keepSample = false; + } + } + frequency = pmerFrequency; + + } + normalize(); + bfrG.close(); + frG.close(); + } + + + private void adaptOrdering(HashMap> pmerFrequency) { + int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; + for (Integer i : pmerFrequency.keySet()) { + frequencies[i] = pmerFrequency.get(i).size(); + } + for (int i = 0; i < elementsToPush; i++) { + long biggest = -1; + int biggestIndex = -1; + for (int k = 0; k < frequencies.length; k++) { + if (frequencies[k] > biggest) { + biggest = frequencies[k]; + biggestIndex = k; + } + } + long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + currentOrdering[biggestIndex] = newRank; + currentOrdering[getReversed(biggestIndex)] = newRank; + frequencies[biggestIndex] = 0; + frequencies[getReversed(biggestIndex)] = 0; + } + } + + private int getReversed(int x) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < pivotLength; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } + + + @Override + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + for (int i = from + 1; i <= to - pivotLength; i++) { + if (strcmp(a, a, min_pos, i, pivotLength) > 0) + min_pos = i; + } + + return min_pos; + } + + @Override + public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { + int x = stringUtils.getDecimal(a, froma, froma + pivotLength); + int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); + + return strcmp(x,y); + } + + public int strcmp(int x, int y) { + if (x == y || y == getReversed(x)) return 0; +// if (x == y) return 0; + if (currentOrdering[x] < currentOrdering[y]) return -1; + return 1; + } + + private void normalize() { +// currentOrdering + if (temp == null) { + temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; temp[i] = i, i++) ; + } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for (int i = 0; i < temp.length; i++) { + currentOrdering[temp[i]] = i; // TODO: FIXED THIS + } + } + + + public void exportOrderingForCpp() { + File file = new File("ranks.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < currentOrdering.length; i++) { + bf.write(Long.toString(currentOrdering[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } + + public void exportBinningForCpp() { + File file = new File("freq.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < statFrequency.length; i++) { + bf.write(Long.toString(statFrequency[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } +} diff --git a/src/buildgraph/Partition.java b/src/buildgraph/Partition.java index 3c45452..cd9e714 100644 --- a/src/buildgraph/Partition.java +++ b/src/buildgraph/Partition.java @@ -250,7 +250,7 @@ public void Run() throws Exception { long time1 = 0; long t1 = System.currentTimeMillis(); System.out.println("Distribute Nodes Begin!"); - for(minFile = 0, maxFile=10000; minFile < (int)Math.pow(4, pivotLen); minFile+= 10000, maxFile += 10000) + for(minFile = 0, maxFile=10000; minFile < numOfBlocks; minFile+= 10000, maxFile += 10000) { System.out.println("hi"); DistributeNodes(); diff --git a/src/buildgraph/PartitionNew.java b/src/buildgraph/PartitionNew.java new file mode 100644 index 0000000..a54de75 --- /dev/null +++ b/src/buildgraph/PartitionNew.java @@ -0,0 +1,288 @@ +package buildgraph; + +import buildgraph.Ordering.IOrdering; +import buildgraph.Ordering.UHS.YaelUHSOrdering; + +import java.io.*; +import java.util.HashSet; + +public class PartitionNew { + + private int k; + private String inputfile; + private int numOfBlocks; + private int pivotLen; + private int bufSize; + + private FileReader frG; + private BufferedReader bfrG; + private FileWriter[] fwG; + private BufferedWriter[] bfwG; + + private int readLen; + private IOrdering ordering; + + private StringUtils stringUtils; + + private int numOpenFiles; + private int minFile; + private int maxFile; + + private int[] finishedMinimizers; + private HashSet currentMinimizers; + private boolean shouldContinue; + + + public PartitionNew(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrdering ordering) { + this.k = kk; + this.inputfile = infile; + this.numOfBlocks = numberOfBlocks; + this.pivotLen = pivotLength; + this.bufSize = bufferSize; + this.readLen = readLen; + this.ordering = ordering; + this.stringUtils = new StringUtils(); + this.numOpenFiles = 0; + finishedMinimizers = new int[numberOfBlocks]; + currentMinimizers = new HashSet<>(); + shouldContinue = true; + } + + + private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throws IOException { + + int len = a.length; + int pos1 = ordering.findSmallest(a, from, to); + int pos2 = ordering.findSmallest(b, len - to, len - from); + + if (ordering.strcmp(a, b, pos1, pos2, pivotLen) < 0) { + flag[0] = 0; + return pos1; + } else { + flag[0] = 1; + return pos2; + } + } + + private int calPosNew(char[] a, int from, int to) { + return stringUtils.getDecimal(a, from, to) % numOfBlocks; + } + + private long DistributeNodes() throws IOException { + frG = new FileReader(inputfile); + bfrG = new BufferedReader(frG, bufSize); + fwG = new FileWriter[numOfBlocks]; + bfwG = new BufferedWriter[numOfBlocks]; + + String describeline; + + int numSuperKmers = 0; + + int prepos, substart = 0, subend, min_pos = -1; + + char[] lineCharArray = new char[readLen]; + + int[] flag = new int[1]; + + long cnt = 0, outcnt = 0; + + File dir = new File("Nodes"); + if (!dir.exists()) + dir.mkdir(); + + + while ((describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + + prepos = -1; + if (stringUtils.isReadLegal(lineCharArray)) { + + substart = 0; + + outcnt = cnt; + + int len = readLen; + + char[] revCharArray = stringUtils.getReversedRead(lineCharArray); + + min_pos = findPosOfMin(lineCharArray, revCharArray, 0, k, flag); + + cnt += 2; + + int bound = len - k + 1; + + for (int i = 1; i < bound; i++) { + + if (i > (flag[0] == 0 ? min_pos : len - min_pos - pivotLen)) { + + int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); + + min_pos = findPosOfMin(lineCharArray, revCharArray, i, i + k, flag); + + if (temp != (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen))) { + prepos = temp; + subend = i - 1 + k; + + + writeToFile(prepos, substart, subend, lineCharArray, outcnt); + numSuperKmers++; + + substart = i; + outcnt = cnt; + } + + } else { + + if (ordering.strcmp(lineCharArray, revCharArray, k + i - pivotLen, len - i - k, pivotLen) < 0) { + if (ordering.strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLen, min_pos, pivotLen) < 0) { + boolean enter = true; + if (ordering instanceof YaelUHSOrdering) { + if (!((YaelUHSOrdering) ordering).isInUHS(lineCharArray, k + i - pivotLen, k + i)) { + enter = false; + } + } + if (enter) { + int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); + + min_pos = k + i - pivotLen; + + if (temp != calPosNew(lineCharArray, min_pos, min_pos + pivotLen)) { + prepos = temp; + subend = i - 1 + k; + + writeToFile(prepos, substart, subend, lineCharArray, outcnt); + numSuperKmers++; + + + substart = i; + outcnt = cnt; + } + + flag[0] = 0; + } + } + } else { + if (ordering.strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLen) < 0) { + boolean enter = true; + if (ordering instanceof YaelUHSOrdering) { + if (!((YaelUHSOrdering) ordering).isInUHS(revCharArray, len - i - k, len - i - k + pivotLen)) { + enter = false; + } + } + if (enter) { + int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); + + min_pos = -k - i + len; + + if (temp != calPosNew(revCharArray, min_pos, min_pos + pivotLen)) { + prepos = temp; + subend = i - 1 + k; + + writeToFile(prepos, substart, subend, lineCharArray, outcnt); + numSuperKmers++; + + + substart = i; + outcnt = cnt; + } + flag[0] = 1; + } + } + } + } + + cnt += 2; + } + subend = len; + prepos = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); + + writeToFile(prepos, substart, subend, lineCharArray, outcnt); + numSuperKmers++; + + } + } + + System.out.println("Largest ID is " + cnt); + System.out.println("Num superkmers is = " + numSuperKmers); + + for (int i = 0; i < bfwG.length; i++) { + if (bfwG[i] != null) { + bfwG[i].close(); + fwG[i].close(); + } + } + if(currentMinimizers.size() <15000) + { + shouldContinue = false; + } + else{ + for (Integer i : currentMinimizers) { + finishedMinimizers[i] = 1; + } + currentMinimizers.clear(); + } + bfrG.close(); + frG.close(); + + return cnt; + } + + private void tryCreateWriterForPmer(int prepos) throws IOException { + if (numOpenFiles == 16000) { + for (int i = 0; i < bfwG.length; i++) { + if (bfwG[i] != null) { + bfwG[i].close(); + fwG[i].close(); + bfwG[i] = null; + fwG[i] = null; + } + } + numOpenFiles = 0; + } + + if (bfwG[prepos] == null) { + fwG[prepos] = new FileWriter("Nodes/nodes" + prepos, true); + bfwG[prepos] = new BufferedWriter(fwG[prepos], bufSize); + numOpenFiles += 1; + } + } + + private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray, long outcnt) throws IOException { + if(finishedMinimizers[prepos] == 0 && currentMinimizers.size() < 15000){ + currentMinimizers.add(prepos); + } + if(currentMinimizers.contains(prepos)) + //if(minFile <= prepos && prepos < maxFile) + { + tryCreateWriterForPmer(prepos); + + BufferedWriter writer = bfwG[prepos]; + + writer.write(lineCharArray, substart, subend - substart); + writer.write("\t" + outcnt); + writer.newLine(); + } + } + + public void Run() throws Exception { + long time1 = 0; + long t1 = System.currentTimeMillis(); + System.out.println("Distribute Nodes Begin!"); + while (shouldContinue) + { + System.out.println("hi"); + DistributeNodes(); + } +// for(minFile = 0, maxFile=10000; minFile < numOfBlocks; minFile+= 10000, maxFile += 10000) +// { +// System.out.println("hi"); +// DistributeNodes(); +// } + long t2 = System.currentTimeMillis(); + time1 = (t2 - t1) / 1000; + System.out.println("Time used for distributing nodes: " + time1 + " seconds!"); + } + +} \ No newline at end of file diff --git a/src/buildgraph/PartitionTrunc.java b/src/buildgraph/PartitionTrunc.java new file mode 100644 index 0000000..4665633 --- /dev/null +++ b/src/buildgraph/PartitionTrunc.java @@ -0,0 +1,166 @@ +package buildgraph; + +import buildgraph.Ordering.IOrdering; +import buildgraph.Ordering.UHS.YaelUHSOrdering; + +import java.io.*; + +public class PartitionTrunc { + + private int k; + private String inputfile; + private int numOfBlocks; + private int pivotLen; + private int bufSize; + + private FileReader frG; + private BufferedReader bfrG; + private FileWriter[] fwG; + private BufferedWriter[] bfwG; + + private int readLen; + private IOrdering ordering; + + private StringUtils stringUtils; + + private int numOpenFiles; + private int minFile; + private int maxFile; + + + public PartitionTrunc(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrdering ordering) { + this.k = kk; + this.inputfile = infile; + this.numOfBlocks = numberOfBlocks; + this.pivotLen = pivotLength; + this.bufSize = bufferSize; + this.readLen = readLen; + this.ordering = ordering; + this.stringUtils = new StringUtils(); + this.numOpenFiles = 0; + } + + + private int findPosOfMin(char[] a, int from, int to) throws IOException { + + int len = a.length; + int pos1 = ordering.findSmallest(a, from, to); + return pos1; + + } + + private int calPosNew(char[] a, int from, int to) { + return Math.min(stringUtils.getDecimal(a, from, to), stringUtils.getReversedMmer(stringUtils.getDecimal(a, from, to), pivotLen)) % numOfBlocks; + } + + private long DistributeNodes() throws IOException { + frG = new FileReader(inputfile); + bfrG = new BufferedReader(frG, bufSize); + fwG = new FileWriter[numOfBlocks]; + bfwG = new BufferedWriter[numOfBlocks]; + + String describeline; + + int numSuperKmers = 0; + + int prepos, substart = 0, subend, min_pos = -1; + + char[] lineCharArray = new char[readLen]; + + long cnt = 0, outcnt = 0; + + File dir = new File("Nodes"); + if (!dir.exists()) + dir.mkdir(); + + + while ((describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + + prepos = -1; + if (stringUtils.isReadLegal(lineCharArray)) { + + outcnt = cnt; + + int len = readLen; + + cnt += 2; + + int bound = len - k + 1; + + for (int i = 0; i < bound; i++) { + min_pos = findPosOfMin(lineCharArray, i, i + k); + prepos = calPosNew(lineCharArray, min_pos, min_pos+pivotLen); + writeToFile(prepos, i, i+k, lineCharArray, outcnt); + numSuperKmers++; + + outcnt = cnt; + cnt += 2; + } + } + } + + System.out.println("Largest ID is " + cnt); + System.out.println("Num superkmers is = " + numSuperKmers); + + for (int i = 0; i < bfwG.length; i++) { + if (bfwG[i] != null) { + bfwG[i].close(); + fwG[i].close(); + } + } + + bfrG.close(); + frG.close(); + + return cnt; + } + + private void tryCreateWriterForPmer(int prepos) throws IOException { + if (numOpenFiles == 16000) { + for (int i = 0; i < bfwG.length; i++) { + if (bfwG[i] != null) { + bfwG[i].close(); + fwG[i].close(); + bfwG[i] = null; + fwG[i] = null; + } + } + numOpenFiles = 0; + } + + if (bfwG[prepos] == null) { + fwG[prepos] = new FileWriter("Nodes/nodes" + prepos, true); + bfwG[prepos] = new BufferedWriter(fwG[prepos], bufSize); + numOpenFiles += 1; + } + } + + private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray, long outcnt) throws IOException { + if (minFile <= prepos && prepos < maxFile) { + tryCreateWriterForPmer(prepos); + + BufferedWriter writer = bfwG[prepos]; + + writer.write(lineCharArray, substart, subend - substart); + writer.write("\t" + outcnt); + writer.newLine(); + } + } + + public void Run() throws Exception { + long time1 = 0; + long t1 = System.currentTimeMillis(); + System.out.println("Distribute Nodes Begin!"); + for (minFile = 0, maxFile = 10000; minFile < numOfBlocks; minFile += 10000, maxFile += 10000) { + System.out.println("hi"); + DistributeNodes(); + } + long t2 = System.currentTimeMillis(); + time1 = (t2 - t1) / 1000; + System.out.println("Time used for distributing nodes: " + time1 + " seconds!"); + } + +} \ No newline at end of file diff --git a/src/buildgraph/StringUtils.java b/src/buildgraph/StringUtils.java index ad60c7b..e775d17 100644 --- a/src/buildgraph/StringUtils.java +++ b/src/buildgraph/StringUtils.java @@ -51,4 +51,15 @@ public char[] getReversedRead(char[] lineCharArray){ } return revCharArray; } + + public int getReversedMmer(int x, int length) { + int rev = 0; + int immer = ~x; + for (int i = 0; i < length; ++i) { + rev <<= 2; + rev |= immer & 0x3; + immer >>= 2; + } + return rev; + } } From 2c22407c25ade3a7804dee104d8f1ff7a234cd8f Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 31 Mar 2021 13:53:26 +0300 Subject: [PATCH 19/44] I fixed itttt :)))) --- src/buildgraph/BuildDeBruijnGraph.java | 2 +- src/buildgraph/Map.java | 30 +++++++++-- src/buildgraph/Ordering/IOrderingPP.java | 8 +++ ...g9_WithCounterNormalized_AndSignature.java | 12 +++-- src/buildgraph/PartitionTrunc.java | 52 +++++++++++++------ 5 files changed, 81 insertions(+), 23 deletions(-) create mode 100644 src/buildgraph/Ordering/IOrderingPP.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index 6a00863..c61ac0c 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -174,7 +174,7 @@ else if (args[i].equals("-punishPercentage")) "Ordering: " + orderingName + "\n"); // Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); - PartitionTrunc partition = new PartitionTrunc(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); + PartitionTrunc partition = new PartitionTrunc(k, infile, numBlocks, pivot_len, bufferSize, readLen, (IOrderingPP) ordering); Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); // MapTrunc map = new MapTrunc(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); diff --git a/src/buildgraph/Map.java b/src/buildgraph/Map.java index 1663a9f..5a00bb5 100644 --- a/src/buildgraph/Map.java +++ b/src/buildgraph/Map.java @@ -94,16 +94,38 @@ public void run(){ int bound = strs[0].length() - k + 1; for(j = 0; j < bound; j++){ - - nodes.add(strs[0].substring(j, j + k)); - nodes.add(revLine.substring(j, j + k)); + String reg = strs[0].substring(j, j + k); + String rev = new String(stringUtils.getReversedRead(reg.toCharArray())); + if(reg.equals(rev)) + { + nodes.add(rev); + } + else{ + boolean didAdd = false; + for (int i = 0; i < k; i++) { + if(rev.charAt(i) < reg.charAt(i)) + { + nodes.add(rev); + didAdd = true; + break; + } + else if(reg.charAt(i) < rev.charAt(i)) + { + nodes.add(rev); + didAdd = true; + break; + } + } + if(!didAdd) + nodes.add(reg); + } } } if(p%100 == 0) System.out.println(p); - distinctKmersPerPartition.put((long)p, (long)nodes.size() / 2); + distinctKmersPerPartition.put((long)p, (long)nodes.size()); nodes.clear(); nodes = null; diff --git a/src/buildgraph/Ordering/IOrderingPP.java b/src/buildgraph/Ordering/IOrderingPP.java new file mode 100644 index 0000000..80dcfb6 --- /dev/null +++ b/src/buildgraph/Ordering/IOrderingPP.java @@ -0,0 +1,8 @@ +package buildgraph.Ordering; + +import java.io.IOException; + +public interface IOrderingPP extends IOrdering { + long getRank(int mmer); + int strcmp(int x, int y); +} diff --git a/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java b/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java index 7dd18ee..053c674 100644 --- a/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java +++ b/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java @@ -8,7 +8,7 @@ import java.util.HashMap; import java.util.HashSet; -public class IterativeOrdering9_WithCounterNormalized_AndSignature implements IOrdering { +public class IterativeOrdering9_WithCounterNormalized_AndSignature implements IOrderingPP { private String inputFile; private int readLen; private int bufSize; @@ -232,9 +232,10 @@ public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { return strcmp(x,y); } + @Override public int strcmp(int x, int y) { - if (x == y || y == getReversed(x)) return 0; -// if (x == y) return 0; +// if (x == y || y == getReversed(x)) return 0; + if (x == y) return 0; if (currentOrdering[x] < currentOrdering[y]) return -1; return 1; } @@ -303,4 +304,9 @@ public void exportBinningForCpp() { } } } + + @Override + public long getRank(int mmer) { + return currentOrdering[mmer]; + } } diff --git a/src/buildgraph/PartitionTrunc.java b/src/buildgraph/PartitionTrunc.java index 4665633..5f63136 100644 --- a/src/buildgraph/PartitionTrunc.java +++ b/src/buildgraph/PartitionTrunc.java @@ -1,9 +1,11 @@ package buildgraph; import buildgraph.Ordering.IOrdering; +import buildgraph.Ordering.IOrderingPP; import buildgraph.Ordering.UHS.YaelUHSOrdering; import java.io.*; +import java.util.HashSet; public class PartitionTrunc { @@ -19,7 +21,7 @@ public class PartitionTrunc { private BufferedWriter[] bfwG; private int readLen; - private IOrdering ordering; + private IOrderingPP ordering; private StringUtils stringUtils; @@ -27,8 +29,10 @@ public class PartitionTrunc { private int minFile; private int maxFile; + private int mask; - public PartitionTrunc(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrdering ordering) { + + public PartitionTrunc(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrderingPP ordering) { this.k = kk; this.inputfile = infile; this.numOfBlocks = numberOfBlocks; @@ -38,6 +42,8 @@ public PartitionTrunc(int kk, String infile, int numberOfBlocks, int pivotLength this.ordering = ordering; this.stringUtils = new StringUtils(); this.numOpenFiles = 0; + this.mask = (int) Math.pow(4, pivotLength) - 1; + } @@ -66,6 +72,8 @@ private long DistributeNodes() throws IOException { int prepos, substart = 0, subend, min_pos = -1; char[] lineCharArray = new char[readLen]; + int len = readLen; + long cnt = 0, outcnt = 0; @@ -73,7 +81,7 @@ private long DistributeNodes() throws IOException { if (!dir.exists()) dir.mkdir(); - + int minValue, minValueNormalized, currentValue, start; while ((describeline = bfrG.readLine()) != null) { bfrG.read(lineCharArray, 0, readLen); @@ -82,27 +90,41 @@ private long DistributeNodes() throws IOException { prepos = -1; if (stringUtils.isReadLegal(lineCharArray)) { - outcnt = cnt; + min_pos = ordering.findSmallest(lineCharArray, 0, k); + start = 0; + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLen); + minValueNormalized = Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLen)) % numOfBlocks; + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLen, k); - int len = readLen; + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - cnt += 2; + if (i > min_pos) { + writeToFile(minValueNormalized, start, min_pos+k,lineCharArray, 0); + + min_pos = ordering.findSmallest(lineCharArray, i, i + k); + start = i; + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLen); + minValueNormalized = Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLen)) % numOfBlocks; - int bound = len - k + 1; - for (int i = 0; i < bound; i++) { - min_pos = findPosOfMin(lineCharArray, i, i + k); - prepos = calPosNew(lineCharArray, min_pos, min_pos+pivotLen); - writeToFile(prepos, i, i+k, lineCharArray, outcnt); - numSuperKmers++; + } else { + int lastIndexInWindow = k + i - pivotLen; + if (ordering.strcmp(currentValue, minValue) < 0) { + writeToFile(minValueNormalized, start, lastIndexInWindow+pivotLen - 1,lineCharArray, 0); - outcnt = cnt; - cnt += 2; + start = lastIndexInWindow + pivotLen - k; + min_pos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLen)) % numOfBlocks; + } + } } + writeToFile(minValueNormalized, start, len, lineCharArray, 0); } } - System.out.println("Largest ID is " + cnt); System.out.println("Num superkmers is = " + numSuperKmers); for (int i = 0; i < bfwG.length; i++) { From 5001b885e21bddc72f1d73f56d11a358cf876cd1 Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 31 Mar 2021 14:07:45 +0300 Subject: [PATCH 20/44] add mechanism for multiple pass to not open many files at once --- src/buildgraph/BuildDeBruijnGraph.java | 4 +- src/buildgraph/MapOld.java | 321 ------------------ ...ativeOrdering10_WithCounterNormalized.java | 1 - src/buildgraph/Partition.java | 179 ++++------ src/buildgraph/PartitionNew.java | 288 ---------------- src/buildgraph/PartitionTrunc.java | 188 ---------- 6 files changed, 59 insertions(+), 922 deletions(-) delete mode 100644 src/buildgraph/MapOld.java delete mode 100644 src/buildgraph/PartitionNew.java delete mode 100644 src/buildgraph/PartitionTrunc.java diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/buildgraph/BuildDeBruijnGraph.java index c61ac0c..b8a0805 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/buildgraph/BuildDeBruijnGraph.java @@ -2,13 +2,11 @@ import buildgraph.Ordering.*; import buildgraph.Ordering.UHS.UHSFrequencySignatureOrdering; -import buildgraph.Ordering.UHS.UHSSignatureOrdering; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; -import java.lang.reflect.Array; import java.util.AbstractMap; import java.util.HashMap; @@ -174,7 +172,7 @@ else if (args[i].equals("-punishPercentage")) "Ordering: " + orderingName + "\n"); // Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); - PartitionTrunc partition = new PartitionTrunc(k, infile, numBlocks, pivot_len, bufferSize, readLen, (IOrderingPP) ordering); + Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, (IOrderingPP) ordering); Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); // MapTrunc map = new MapTrunc(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); diff --git a/src/buildgraph/MapOld.java b/src/buildgraph/MapOld.java deleted file mode 100644 index d889b4a..0000000 --- a/src/buildgraph/MapOld.java +++ /dev/null @@ -1,321 +0,0 @@ -package buildgraph; - -import java.io.*; -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.CountDownLatch; - - -public class MapOld { - - private int k; - private int numOfBlocks; - private int bufSize; - - private Object lock_blocks = new Object(); - - private int capacity; - - private int blockID; - - private long forAndVal; - private long forAndVal32; - - private static int[] valTable = StringUtils.valTable; - - public MapOld(int kk, int numberOfBlocks, int bufferSize, int HScapacity){ - this.k = kk; - this.numOfBlocks = numberOfBlocks; - this.bufSize = bufferSize; - this.capacity = HScapacity; - this.blockID = 0; - this.forAndVal = (long)Math.pow(2, 2*(k-32)) - 1; - this.forAndVal32 = (long)Math.pow(2, 2*k) - 1; - } - - public class MyThread extends Thread{ - private CountDownLatch threadsSignal; - private HashSet fileNames; - private ConcurrentHashMap distinctKmersPerPartition; - - public MyThread(CountDownLatch threadsSignal, HashSet fileNames, ConcurrentHashMap distinctKmersPerPartition){ - super(); - this.threadsSignal = threadsSignal; - this.fileNames = fileNames; - this.distinctKmersPerPartition = distinctKmersPerPartition; - } - - @Override - public void run(){ - System.out.println(Thread.currentThread().getName() + "Start..."); - - FileReader fr; - BufferedReader bfr; - FileWriter fw; - BufferedWriter bfw; - - - String line; - - int p,j; - long cnt; - Kmer64 k1, k1_rev; - - - try{ - File dir = new File("Maps"); - if(!dir.exists()) - dir.mkdir(); - - while(blockID nodes = new HashMap(capacity); - - while((line = bfr.readLine()) != null){ - - String[] strs = line.split("\t"); - cnt = Long.parseLong(strs[1]); - - long preOriginal = -1, preReplace = -1, Original = -1, Replace = -1; - long diff = -1; - boolean newOut = true, next = false; - - Long ReplaceObj, Replace_revObj; - - char[] lineCharArray = strs[0].toCharArray(); - k1 = new Kmer64(lineCharArray,0,k,false); - k1_rev = new Kmer64(lineCharArray,0,k,true); - - int bound = strs[0].length() - k + 1; - - for(j = 0; j < bound; j++){ - - if(j != 0){ - if(k > 32){ - k1 = new Kmer64((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A'], ((k1.high<<2) + valTable[lineCharArray[k+j-33]-'A']) & forAndVal); - k1_rev = new Kmer64((k1_rev.low>>>2) + ((k1_rev.high&3)<<62), (k1_rev.high>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-33)<<1))); - } - else{ - k1 = new Kmer64(((k1.low<<2) + valTable[lineCharArray[k+j-1]-'A']) & forAndVal32, 0); - k1_rev = new Kmer64((k1_rev.low>>>2) + ((long)((valTable[lineCharArray[k+j-1]-'A']^3))<<((k-1)<<1)), 0); - } - } - - ReplaceObj = nodes.get(k1); - Replace_revObj = nodes.get(k1_rev); - - if(ReplaceObj == null && Replace_revObj == null){ - nodes.put(k1, cnt+j*2); - - if(!newOut && !next){ - bfw.write(preOriginal+"\t"+preReplace); - bfw.newLine(); - - newOut = true; - } - - } - else{ - if(ReplaceObj!=null){ - Original = cnt+j*2; - Replace = ReplaceObj; - } - else{ - Original = cnt+j*2; - Replace = Replace_revObj+1; - } - - if(newOut){ - bfw.write(Original+"\t"+Replace+"\t"); - newOut = false; - next = true; - } - - else if(Original-preOriginal==2){ - if(next){ - diff = Replace - preReplace; - if(diff==2){ - bfw.write("+\t"); - next = false; - } - else if(diff==-2){ - bfw.write("-\t"); - next = false; - } - else{ - bfw.write("\n"+Original+"\t"+Replace+"\t"); - } - } - else{ - if(Replace - preReplace != diff){ - bfw.write(preOriginal+"\t"+preReplace); - bfw.newLine(); - - bfw.write(Original+"\t"+Replace+"\t"); - next = true; - } - } - } - - else if(next==true){ - - bfw.write("\n"+Original+"\t"+Replace+"\t"); - } - - preOriginal = Original; - preReplace = Replace; - } - - } - - if(!newOut && !next){ - bfw.write(preOriginal+"\t"+preReplace); - bfw.newLine(); - } - else if(next){ - bfw.newLine(); - } - } - - if(p%100 == 0) System.out.println(p); - - distinctKmersPerPartition.put((long)p, (long)nodes.size()); - - nodes.clear(); - nodes = null; - - bfw.close(); - fw.close(); - bfr.close(); - fr.close(); - bfw = null; - fw = null; - bfr = null; - fr = null; - - File myObj = new File("Nodes/nodes"+p); - if (!myObj.delete()) - System.out.println("Failed to delete the file." + p); - } - - }catch(Exception E){ - System.out.println("Exception caught!"); - E.printStackTrace(); - } - - threadsSignal.countDown(); - System.out.println(Thread.currentThread().getName() + "End. Remaining" + threadsSignal.getCount() + " threads"); - - } - } - - - private AbstractMap BuildMap(int threadNum, HashSet fileNames) throws Exception{ - CountDownLatch threadSignal = new CountDownLatch(threadNum); - - ConcurrentHashMap distinctKmersPerPartition = new ConcurrentHashMap<>(); - - for(int i=0;i Run(int numThreads) throws Exception{ - long time1=0; - - HashSet fileNames = getNodesFileNames(); - - long t1 = System.currentTimeMillis(); - System.out.println("Build Maps Begin!"); - AbstractMap distinctKmersPerPartition= BuildMap(numThreads, fileNames); - long t2 = System.currentTimeMillis(); - time1 = (t2-t1)/1000; - System.out.println("Time used for building maps: " + time1 + " seconds!"); - - return distinctKmersPerPartition; - - } - - private HashSet getNodesFileNames(){ - File[] files = (new File("./Nodes")).listFiles(); - List fileNames = new LinkedList<>(); - for(File file : files){ - if(file.isFile()){ - fileNames.add(file.getName()); - } - } - return new HashSet<>(fileNames); - } - - public static void main(String[] args){ - - int k = 15, numBlocks = 256, numThreads = 1, bufferSize = 8192, hsmapCapacity = 1000000; - - if(args[0].equals("-help")){ - System.out.print("Usage: java -jar Map.jar -k k -NB numOfBlocks [options]\n" + - "Options Available: \n" + - "[-t numOfThreads] : (Integer) Number Of Threads. Default: 1" + "\n" + - "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" + - "[-c capacity] : (Integer) Hashmap Capacity. Default: 1000000" + "\n"); - return; - } - - for(int i=0; i currentMinimizers; + private byte[] finishedMinimizers; + private int maxMinimizersPerPass; + private boolean keepPassing; - public Partition(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrdering ordering) { + private final int mask; + + + public Partition(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrderingPP ordering) { this.k = kk; this.inputfile = infile; this.numOfBlocks = numberOfBlocks; @@ -39,43 +47,31 @@ public Partition(int kk, String infile, int numberOfBlocks, int pivotLength, int this.ordering = ordering; this.stringUtils = new StringUtils(); this.numOpenFiles = 0; + this.mask = (int) Math.pow(4, pivotLength) - 1; + finishedMinimizers = new byte[numOfBlocks]; + currentMinimizers = new HashSet<>(); + maxMinimizersPerPass = 10000; + keepPassing = true; } - private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throws IOException { - - int len = a.length; - int pos1 = ordering.findSmallest(a, from, to); - int pos2 = ordering.findSmallest(b, len - to, len - from); - - if (ordering.strcmp(a, b, pos1, pos2, pivotLen) < 0) { - flag[0] = 0; - return pos1; - } else { - flag[0] = 1; - return pos2; - } - } - - private int calPosNew(char[] a, int from, int to) { - return stringUtils.getDecimal(a, from, to) % numOfBlocks; - } - private long DistributeNodes() throws IOException { frG = new FileReader(inputfile); bfrG = new BufferedReader(frG, bufSize); fwG = new FileWriter[numOfBlocks]; bfwG = new BufferedWriter[numOfBlocks]; + currentMinimizers.clear(); + String describeline; int numSuperKmers = 0; - int prepos, substart = 0, subend, min_pos = -1; + int minPos = -1; char[] lineCharArray = new char[readLen]; + int len = readLen; - int[] flag = new int[1]; long cnt = 0, outcnt = 0; @@ -83,121 +79,49 @@ private long DistributeNodes() throws IOException { if (!dir.exists()) dir.mkdir(); - + int minValue, minValueNormalized, currentValue, start; while ((describeline = bfrG.readLine()) != null) { bfrG.read(lineCharArray, 0, readLen); bfrG.read(); - prepos = -1; if (stringUtils.isReadLegal(lineCharArray)) { - substart = 0; - - outcnt = cnt; - - int len = readLen; - - char[] revCharArray = stringUtils.getReversedRead(lineCharArray); - - min_pos = findPosOfMin(lineCharArray, revCharArray, 0, k, flag); - - cnt += 2; + minPos = ordering.findSmallest(lineCharArray, 0, k); + start = 0; + minValue = stringUtils.getDecimal(lineCharArray, minPos, minPos + pivotLen); + minValueNormalized = getNormalizedValue(minValue); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLen, k); int bound = len - k + 1; - for (int i = 1; i < bound; i++) { + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - if (i > (flag[0] == 0 ? min_pos : len - min_pos - pivotLen)) { - - int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - min_pos = findPosOfMin(lineCharArray, revCharArray, i, i + k, flag); - - if (temp != (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen))) { - prepos = temp; - subend = i - 1 + k; + if (i > minPos) { + writeToFile(minValueNormalized, start, minPos + k, lineCharArray, 0); + minPos = ordering.findSmallest(lineCharArray, i, i + k); + start = i; + minValue = stringUtils.getDecimal(lineCharArray, minPos, minPos + pivotLen); + minValueNormalized = getNormalizedValue(minValue); - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - numSuperKmers++; - - substart = i; - outcnt = cnt; - } } else { - - if (ordering.strcmp(lineCharArray, revCharArray, k + i - pivotLen, len - i - k, pivotLen) < 0) { - if (ordering.strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLen, min_pos, pivotLen) < 0) { - boolean enter = true; - if (ordering instanceof YaelUHSOrdering) { - if (!((YaelUHSOrdering) ordering).isInUHS(lineCharArray, k + i - pivotLen, k + i)) { - enter = false; - } - } - if (enter) { - int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - min_pos = k + i - pivotLen; - - if (temp != calPosNew(lineCharArray, min_pos, min_pos + pivotLen)) { - prepos = temp; - subend = i - 1 + k; - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - numSuperKmers++; - - - substart = i; - outcnt = cnt; - } - - flag[0] = 0; - } - } - } else { - if (ordering.strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLen) < 0) { - boolean enter = true; - if (ordering instanceof YaelUHSOrdering) { - if (!((YaelUHSOrdering) ordering).isInUHS(revCharArray, len - i - k, len - i - k + pivotLen)) { - enter = false; - } - } - if (enter) { - int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - min_pos = -k - i + len; - - if (temp != calPosNew(revCharArray, min_pos, min_pos + pivotLen)) { - prepos = temp; - subend = i - 1 + k; - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - numSuperKmers++; - - - substart = i; - outcnt = cnt; - } - flag[0] = 1; - } - } + int lastIndexInWindow = k + i - pivotLen; + if (ordering.strcmp(currentValue, minValue) < 0) { + writeToFile(minValueNormalized, start, lastIndexInWindow + pivotLen - 1, lineCharArray, 0); + + start = lastIndexInWindow + pivotLen - k; + minPos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = getNormalizedValue(minValue); } } - - cnt += 2; } - subend = len; - prepos = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - numSuperKmers++; - + writeToFile(minValueNormalized, start, len, lineCharArray, 0); } } - System.out.println("Largest ID is " + cnt); System.out.println("Num superkmers is = " + numSuperKmers); for (int i = 0; i < bfwG.length; i++) { @@ -206,6 +130,12 @@ private long DistributeNodes() throws IOException { fwG[i].close(); } } + for (Integer i : currentMinimizers) { + finishedMinimizers[i] = 1; + } + if(currentMinimizers.size() < maxMinimizersPerPass) + keepPassing = false; + currentMinimizers.clear(); bfrG.close(); frG.close(); @@ -213,6 +143,10 @@ private long DistributeNodes() throws IOException { return cnt; } + private int getNormalizedValue(int minValue) { + return Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLen)) % numOfBlocks; + } + private void tryCreateWriterForPmer(int prepos) throws IOException { if (numOpenFiles == 16000) { for (int i = 0; i < bfwG.length; i++) { @@ -234,8 +168,12 @@ private void tryCreateWriterForPmer(int prepos) throws IOException { } private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray, long outcnt) throws IOException { - if(minFile <= prepos && prepos < maxFile) + if(finishedMinimizers[prepos] == 0 && currentMinimizers.size() < maxMinimizersPerPass) { + currentMinimizers.add(prepos); + } + + if (currentMinimizers.contains(prepos)) { tryCreateWriterForPmer(prepos); BufferedWriter writer = bfwG[prepos]; @@ -250,8 +188,7 @@ public void Run() throws Exception { long time1 = 0; long t1 = System.currentTimeMillis(); System.out.println("Distribute Nodes Begin!"); - for(minFile = 0, maxFile=10000; minFile < numOfBlocks; minFile+= 10000, maxFile += 10000) - { + while (keepPassing){ System.out.println("hi"); DistributeNodes(); } diff --git a/src/buildgraph/PartitionNew.java b/src/buildgraph/PartitionNew.java deleted file mode 100644 index a54de75..0000000 --- a/src/buildgraph/PartitionNew.java +++ /dev/null @@ -1,288 +0,0 @@ -package buildgraph; - -import buildgraph.Ordering.IOrdering; -import buildgraph.Ordering.UHS.YaelUHSOrdering; - -import java.io.*; -import java.util.HashSet; - -public class PartitionNew { - - private int k; - private String inputfile; - private int numOfBlocks; - private int pivotLen; - private int bufSize; - - private FileReader frG; - private BufferedReader bfrG; - private FileWriter[] fwG; - private BufferedWriter[] bfwG; - - private int readLen; - private IOrdering ordering; - - private StringUtils stringUtils; - - private int numOpenFiles; - private int minFile; - private int maxFile; - - private int[] finishedMinimizers; - private HashSet currentMinimizers; - private boolean shouldContinue; - - - public PartitionNew(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrdering ordering) { - this.k = kk; - this.inputfile = infile; - this.numOfBlocks = numberOfBlocks; - this.pivotLen = pivotLength; - this.bufSize = bufferSize; - this.readLen = readLen; - this.ordering = ordering; - this.stringUtils = new StringUtils(); - this.numOpenFiles = 0; - finishedMinimizers = new int[numberOfBlocks]; - currentMinimizers = new HashSet<>(); - shouldContinue = true; - } - - - private int findPosOfMin(char[] a, char[] b, int from, int to, int[] flag) throws IOException { - - int len = a.length; - int pos1 = ordering.findSmallest(a, from, to); - int pos2 = ordering.findSmallest(b, len - to, len - from); - - if (ordering.strcmp(a, b, pos1, pos2, pivotLen) < 0) { - flag[0] = 0; - return pos1; - } else { - flag[0] = 1; - return pos2; - } - } - - private int calPosNew(char[] a, int from, int to) { - return stringUtils.getDecimal(a, from, to) % numOfBlocks; - } - - private long DistributeNodes() throws IOException { - frG = new FileReader(inputfile); - bfrG = new BufferedReader(frG, bufSize); - fwG = new FileWriter[numOfBlocks]; - bfwG = new BufferedWriter[numOfBlocks]; - - String describeline; - - int numSuperKmers = 0; - - int prepos, substart = 0, subend, min_pos = -1; - - char[] lineCharArray = new char[readLen]; - - int[] flag = new int[1]; - - long cnt = 0, outcnt = 0; - - File dir = new File("Nodes"); - if (!dir.exists()) - dir.mkdir(); - - - while ((describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - prepos = -1; - if (stringUtils.isReadLegal(lineCharArray)) { - - substart = 0; - - outcnt = cnt; - - int len = readLen; - - char[] revCharArray = stringUtils.getReversedRead(lineCharArray); - - min_pos = findPosOfMin(lineCharArray, revCharArray, 0, k, flag); - - cnt += 2; - - int bound = len - k + 1; - - for (int i = 1; i < bound; i++) { - - if (i > (flag[0] == 0 ? min_pos : len - min_pos - pivotLen)) { - - int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - min_pos = findPosOfMin(lineCharArray, revCharArray, i, i + k, flag); - - if (temp != (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen))) { - prepos = temp; - subend = i - 1 + k; - - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - numSuperKmers++; - - substart = i; - outcnt = cnt; - } - - } else { - - if (ordering.strcmp(lineCharArray, revCharArray, k + i - pivotLen, len - i - k, pivotLen) < 0) { - if (ordering.strcmp(lineCharArray, flag[0] == 0 ? lineCharArray : revCharArray, k + i - pivotLen, min_pos, pivotLen) < 0) { - boolean enter = true; - if (ordering instanceof YaelUHSOrdering) { - if (!((YaelUHSOrdering) ordering).isInUHS(lineCharArray, k + i - pivotLen, k + i)) { - enter = false; - } - } - if (enter) { - int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - min_pos = k + i - pivotLen; - - if (temp != calPosNew(lineCharArray, min_pos, min_pos + pivotLen)) { - prepos = temp; - subend = i - 1 + k; - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - numSuperKmers++; - - - substart = i; - outcnt = cnt; - } - - flag[0] = 0; - } - } - } else { - if (ordering.strcmp(revCharArray, flag[0] == 0 ? lineCharArray : revCharArray, len - i - k, min_pos, pivotLen) < 0) { - boolean enter = true; - if (ordering instanceof YaelUHSOrdering) { - if (!((YaelUHSOrdering) ordering).isInUHS(revCharArray, len - i - k, len - i - k + pivotLen)) { - enter = false; - } - } - if (enter) { - int temp = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - min_pos = -k - i + len; - - if (temp != calPosNew(revCharArray, min_pos, min_pos + pivotLen)) { - prepos = temp; - subend = i - 1 + k; - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - numSuperKmers++; - - - substart = i; - outcnt = cnt; - } - flag[0] = 1; - } - } - } - } - - cnt += 2; - } - subend = len; - prepos = (flag[0] == 0 ? calPosNew(lineCharArray, min_pos, min_pos + pivotLen) : calPosNew(revCharArray, min_pos, min_pos + pivotLen)); - - writeToFile(prepos, substart, subend, lineCharArray, outcnt); - numSuperKmers++; - - } - } - - System.out.println("Largest ID is " + cnt); - System.out.println("Num superkmers is = " + numSuperKmers); - - for (int i = 0; i < bfwG.length; i++) { - if (bfwG[i] != null) { - bfwG[i].close(); - fwG[i].close(); - } - } - if(currentMinimizers.size() <15000) - { - shouldContinue = false; - } - else{ - for (Integer i : currentMinimizers) { - finishedMinimizers[i] = 1; - } - currentMinimizers.clear(); - } - bfrG.close(); - frG.close(); - - return cnt; - } - - private void tryCreateWriterForPmer(int prepos) throws IOException { - if (numOpenFiles == 16000) { - for (int i = 0; i < bfwG.length; i++) { - if (bfwG[i] != null) { - bfwG[i].close(); - fwG[i].close(); - bfwG[i] = null; - fwG[i] = null; - } - } - numOpenFiles = 0; - } - - if (bfwG[prepos] == null) { - fwG[prepos] = new FileWriter("Nodes/nodes" + prepos, true); - bfwG[prepos] = new BufferedWriter(fwG[prepos], bufSize); - numOpenFiles += 1; - } - } - - private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray, long outcnt) throws IOException { - if(finishedMinimizers[prepos] == 0 && currentMinimizers.size() < 15000){ - currentMinimizers.add(prepos); - } - if(currentMinimizers.contains(prepos)) - //if(minFile <= prepos && prepos < maxFile) - { - tryCreateWriterForPmer(prepos); - - BufferedWriter writer = bfwG[prepos]; - - writer.write(lineCharArray, substart, subend - substart); - writer.write("\t" + outcnt); - writer.newLine(); - } - } - - public void Run() throws Exception { - long time1 = 0; - long t1 = System.currentTimeMillis(); - System.out.println("Distribute Nodes Begin!"); - while (shouldContinue) - { - System.out.println("hi"); - DistributeNodes(); - } -// for(minFile = 0, maxFile=10000; minFile < numOfBlocks; minFile+= 10000, maxFile += 10000) -// { -// System.out.println("hi"); -// DistributeNodes(); -// } - long t2 = System.currentTimeMillis(); - time1 = (t2 - t1) / 1000; - System.out.println("Time used for distributing nodes: " + time1 + " seconds!"); - } - -} \ No newline at end of file diff --git a/src/buildgraph/PartitionTrunc.java b/src/buildgraph/PartitionTrunc.java deleted file mode 100644 index 5f63136..0000000 --- a/src/buildgraph/PartitionTrunc.java +++ /dev/null @@ -1,188 +0,0 @@ -package buildgraph; - -import buildgraph.Ordering.IOrdering; -import buildgraph.Ordering.IOrderingPP; -import buildgraph.Ordering.UHS.YaelUHSOrdering; - -import java.io.*; -import java.util.HashSet; - -public class PartitionTrunc { - - private int k; - private String inputfile; - private int numOfBlocks; - private int pivotLen; - private int bufSize; - - private FileReader frG; - private BufferedReader bfrG; - private FileWriter[] fwG; - private BufferedWriter[] bfwG; - - private int readLen; - private IOrderingPP ordering; - - private StringUtils stringUtils; - - private int numOpenFiles; - private int minFile; - private int maxFile; - - private int mask; - - - public PartitionTrunc(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrderingPP ordering) { - this.k = kk; - this.inputfile = infile; - this.numOfBlocks = numberOfBlocks; - this.pivotLen = pivotLength; - this.bufSize = bufferSize; - this.readLen = readLen; - this.ordering = ordering; - this.stringUtils = new StringUtils(); - this.numOpenFiles = 0; - this.mask = (int) Math.pow(4, pivotLength) - 1; - - } - - - private int findPosOfMin(char[] a, int from, int to) throws IOException { - - int len = a.length; - int pos1 = ordering.findSmallest(a, from, to); - return pos1; - - } - - private int calPosNew(char[] a, int from, int to) { - return Math.min(stringUtils.getDecimal(a, from, to), stringUtils.getReversedMmer(stringUtils.getDecimal(a, from, to), pivotLen)) % numOfBlocks; - } - - private long DistributeNodes() throws IOException { - frG = new FileReader(inputfile); - bfrG = new BufferedReader(frG, bufSize); - fwG = new FileWriter[numOfBlocks]; - bfwG = new BufferedWriter[numOfBlocks]; - - String describeline; - - int numSuperKmers = 0; - - int prepos, substart = 0, subend, min_pos = -1; - - char[] lineCharArray = new char[readLen]; - int len = readLen; - - - long cnt = 0, outcnt = 0; - - File dir = new File("Nodes"); - if (!dir.exists()) - dir.mkdir(); - - int minValue, minValueNormalized, currentValue, start; - while ((describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - prepos = -1; - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = ordering.findSmallest(lineCharArray, 0, k); - start = 0; - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLen); - minValueNormalized = Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLen)) % numOfBlocks; - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLen, k); - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - - if (i > min_pos) { - writeToFile(minValueNormalized, start, min_pos+k,lineCharArray, 0); - - min_pos = ordering.findSmallest(lineCharArray, i, i + k); - start = i; - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLen); - minValueNormalized = Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLen)) % numOfBlocks; - - - } else { - int lastIndexInWindow = k + i - pivotLen; - if (ordering.strcmp(currentValue, minValue) < 0) { - writeToFile(minValueNormalized, start, lastIndexInWindow+pivotLen - 1,lineCharArray, 0); - - start = lastIndexInWindow + pivotLen - k; - min_pos = lastIndexInWindow; - minValue = currentValue; - minValueNormalized = Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLen)) % numOfBlocks; - } - } - } - writeToFile(minValueNormalized, start, len, lineCharArray, 0); - } - } - - System.out.println("Num superkmers is = " + numSuperKmers); - - for (int i = 0; i < bfwG.length; i++) { - if (bfwG[i] != null) { - bfwG[i].close(); - fwG[i].close(); - } - } - - bfrG.close(); - frG.close(); - - return cnt; - } - - private void tryCreateWriterForPmer(int prepos) throws IOException { - if (numOpenFiles == 16000) { - for (int i = 0; i < bfwG.length; i++) { - if (bfwG[i] != null) { - bfwG[i].close(); - fwG[i].close(); - bfwG[i] = null; - fwG[i] = null; - } - } - numOpenFiles = 0; - } - - if (bfwG[prepos] == null) { - fwG[prepos] = new FileWriter("Nodes/nodes" + prepos, true); - bfwG[prepos] = new BufferedWriter(fwG[prepos], bufSize); - numOpenFiles += 1; - } - } - - private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray, long outcnt) throws IOException { - if (minFile <= prepos && prepos < maxFile) { - tryCreateWriterForPmer(prepos); - - BufferedWriter writer = bfwG[prepos]; - - writer.write(lineCharArray, substart, subend - substart); - writer.write("\t" + outcnt); - writer.newLine(); - } - } - - public void Run() throws Exception { - long time1 = 0; - long t1 = System.currentTimeMillis(); - System.out.println("Distribute Nodes Begin!"); - for (minFile = 0, maxFile = 10000; minFile < numOfBlocks; minFile += 10000, maxFile += 10000) { - System.out.println("hi"); - DistributeNodes(); - } - long t2 = System.currentTimeMillis(); - time1 = (t2 - t1) / 1000; - System.out.println("Time used for distributing nodes: " + time1 + " seconds!"); - } - -} \ No newline at end of file From 2b3749d67bd330b3097b0534bb84716c9ad1e946 Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 31 Mar 2021 14:19:36 +0300 Subject: [PATCH 21/44] refator --- src/META-INF/MANIFEST.MF | 2 +- src/buildgraph/Kmer64.java | 78 ------ src/buildgraph/Map.java | 248 ------------------ src/buildgraph/Replace.java | 237 ----------------- src/dumbo/Map.java | 181 +++++++++++++ .../Ordering/FrequencyOrdering.java | 9 +- .../Ordering/IOrdering.java | 2 +- .../Ordering/IOrderingPP.java | 4 +- ...ativeOrdering10_WithCounterNormalized.java | 4 +- .../Ordering/IterativeOrdering6.java | 4 +- .../Ordering/IterativeOrdering8.java | 4 +- .../Ordering/IterativeOrdering9.java | 4 +- ...rativeOrdering9_WithCounterNormalized.java | 4 +- ...g9_WithCounterNormalized_AndSignature.java | 4 +- .../Ordering/IterativeSignatureOrdering9.java | 4 +- .../Ordering/IterativeUHSOrdering8.java | 4 +- .../Ordering/IterativeUHSOrdering9.java | 4 +- .../Ordering/LexicographicOrdering.java | 2 +- .../LexicographicSignatureOrdering.java | 4 +- .../Ordering/RandomOrdering.java | 4 +- .../Ordering/SignatureUtils.java | 4 +- .../UHS/UHSFrequencySignatureOrdering.java | 6 +- .../Ordering/UHS/UHSOrderingBase.java | 6 +- .../Ordering/UHS/UHSSignatureOrdering.java | 6 +- .../Ordering/UHS/YaelUHSOrdering.java | 6 +- .../OrderingOptimizer.java} | 39 +-- src/{buildgraph => dumbo}/Partition.java | 42 +-- src/{buildgraph => dumbo}/StringUtils.java | 2 +- 28 files changed, 241 insertions(+), 677 deletions(-) delete mode 100644 src/buildgraph/Kmer64.java delete mode 100644 src/buildgraph/Map.java delete mode 100644 src/buildgraph/Replace.java create mode 100644 src/dumbo/Map.java rename src/{buildgraph => dumbo}/Ordering/FrequencyOrdering.java (96%) rename src/{buildgraph => dumbo}/Ordering/IOrdering.java (88%) rename src/{buildgraph => dumbo}/Ordering/IOrderingPP.java (65%) rename src/{buildgraph => dumbo}/Ordering/IterativeOrdering10_WithCounterNormalized.java (99%) rename src/{buildgraph => dumbo}/Ordering/IterativeOrdering6.java (99%) rename src/{buildgraph => dumbo}/Ordering/IterativeOrdering8.java (99%) rename src/{buildgraph => dumbo}/Ordering/IterativeOrdering9.java (99%) rename src/{buildgraph => dumbo}/Ordering/IterativeOrdering9_WithCounterNormalized.java (99%) rename src/{buildgraph => dumbo}/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java (99%) rename src/{buildgraph => dumbo}/Ordering/IterativeSignatureOrdering9.java (99%) rename src/{buildgraph => dumbo}/Ordering/IterativeUHSOrdering8.java (99%) rename src/{buildgraph => dumbo}/Ordering/IterativeUHSOrdering9.java (99%) rename src/{buildgraph => dumbo}/Ordering/LexicographicOrdering.java (96%) rename src/{buildgraph => dumbo}/Ordering/LexicographicSignatureOrdering.java (95%) rename src/{buildgraph => dumbo}/Ordering/RandomOrdering.java (94%) rename src/{buildgraph => dumbo}/Ordering/SignatureUtils.java (97%) rename src/{buildgraph => dumbo}/Ordering/UHS/UHSFrequencySignatureOrdering.java (98%) rename src/{buildgraph => dumbo}/Ordering/UHS/UHSOrderingBase.java (98%) rename src/{buildgraph => dumbo}/Ordering/UHS/UHSSignatureOrdering.java (97%) rename src/{buildgraph => dumbo}/Ordering/UHS/YaelUHSOrdering.java (97%) rename src/{buildgraph/BuildDeBruijnGraph.java => dumbo/OrderingOptimizer.java} (77%) rename src/{buildgraph => dumbo}/Partition.java (83%) rename src/{buildgraph => dumbo}/StringUtils.java (98%) diff --git a/src/META-INF/MANIFEST.MF b/src/META-INF/MANIFEST.MF index 0fb9411..76e33bd 100644 --- a/src/META-INF/MANIFEST.MF +++ b/src/META-INF/MANIFEST.MF @@ -1,4 +1,4 @@ Manifest-Version: 1.0 -Main-Class: buildgraph.BuildDeBruijnGraph +Main-Class: dumbo.OrderingOptimizer diff --git a/src/buildgraph/Kmer64.java b/src/buildgraph/Kmer64.java deleted file mode 100644 index 9e6ac27..0000000 --- a/src/buildgraph/Kmer64.java +++ /dev/null @@ -1,78 +0,0 @@ -package buildgraph; - -public class Kmer64 extends Object { - - public long high; - public long low; - - private final static char[] baseDic = {'A', 'C', 'G', 'T'}; - private final static int[] intDic = {0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3}; - - private final int base2int(char base) { - return intDic[base - 'A']; - } - - - public Kmer64(char[] str, int start, int end, boolean rev) { - - this.high = this.low = 0; - - int len = end - start; - - if (!rev) { - if (len <= 32) { - for (int i = start; i <= end - 1; i++) { - this.low = (this.low << 2) + base2int(str[i]); - } - } else { - for (int i = end - 32; i <= end - 1; i++) { - this.low = (this.low << 2) + base2int(str[i]); - } - - for (int i = start; i <= end - 33; i++) { - this.high = (this.high << 2) + base2int(str[i]); - } - } - } else { - if (len <= 32) { - for (int i = end - 1; i >= start; i--) { - this.low = (this.low << 2) + 3 ^ base2int(str[i]); - } - } else { - for (int i = start + 31; i >= start; i--) { - this.low = (this.low << 2) + 3 ^ base2int(str[i]); - } - - for (int i = end - 1; i >= start + 32; i--) { - this.high = (this.high << 2) + 3 ^ base2int(str[i]); - } - } - } - - } - - public Kmer64(long low, long high) { - this.low = low; - this.high = high; - } - - @Override - public boolean equals(Object another) { - Kmer64 k = (Kmer64) another; - if (this.high == k.high && this.low == k.low) - return true; - else - return false; - } - - @Override - public int hashCode() { - return (int) ((low ^ (low >>> 32)) ^ (high ^ (high >>> 32))); - } - - - public String toString() { - return this.high + "," + this.low; - } -} - diff --git a/src/buildgraph/Map.java b/src/buildgraph/Map.java deleted file mode 100644 index 5a00bb5..0000000 --- a/src/buildgraph/Map.java +++ /dev/null @@ -1,248 +0,0 @@ -package buildgraph; - -import java.io.*; -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.CountDownLatch; - - -public class Map{ - - private int k; - private int numOfBlocks; - private int bufSize; - - private Object lock_blocks = new Object(); - - private int capacity; - - private int blockID; - - private long forAndVal; - private long forAndVal32; - private StringUtils stringUtils; - - private static int[] valTable = StringUtils.valTable; - - public Map(int kk, int numberOfBlocks, int bufferSize, int HScapacity){ - this.k = kk; - this.numOfBlocks = numberOfBlocks; - this.bufSize = bufferSize; - this.capacity = HScapacity; - this.blockID = 0; - this.forAndVal = (long)Math.pow(2, 2*(k-32)) - 1; - this.forAndVal32 = (long)Math.pow(2, 2*k) - 1; - stringUtils = new StringUtils(); - - } - - public class MyThread extends Thread{ - private CountDownLatch threadsSignal; - private HashSet fileNames; - private ConcurrentHashMap distinctKmersPerPartition; - - public MyThread(CountDownLatch threadsSignal, HashSet fileNames, ConcurrentHashMap distinctKmersPerPartition){ - super(); - this.threadsSignal = threadsSignal; - this.fileNames = fileNames; - this.distinctKmersPerPartition = distinctKmersPerPartition; - } - - @Override - public void run(){ - System.out.println(Thread.currentThread().getName() + "Start..."); - - FileReader fr; - BufferedReader bfr; - - String line; - - int p,j; - - try{ - File dir = new File("Maps"); - if(!dir.exists()) - dir.mkdir(); - - while(blockID nodes = new HashSet(capacity); - - while((line = bfr.readLine()) != null){ - - String[] strs = line.split("\t"); - - - char[] lineCharArray = strs[0].toCharArray(); - String revLine = new String(stringUtils.getReversedRead(lineCharArray)); - - int bound = strs[0].length() - k + 1; - - for(j = 0; j < bound; j++){ - String reg = strs[0].substring(j, j + k); - String rev = new String(stringUtils.getReversedRead(reg.toCharArray())); - if(reg.equals(rev)) - { - nodes.add(rev); - } - else{ - boolean didAdd = false; - for (int i = 0; i < k; i++) { - if(rev.charAt(i) < reg.charAt(i)) - { - nodes.add(rev); - didAdd = true; - break; - } - else if(reg.charAt(i) < rev.charAt(i)) - { - nodes.add(rev); - didAdd = true; - break; - } - } - if(!didAdd) - nodes.add(reg); - } - } - - } - - if(p%100 == 0) System.out.println(p); - - distinctKmersPerPartition.put((long)p, (long)nodes.size()); - - nodes.clear(); - nodes = null; - - - bfr.close(); - fr.close(); - bfr = null; - fr = null; - - File myObj = new File("Nodes/nodes"+p); - if (!myObj.delete()) - System.out.println("Failed to delete the file." + p); - } - - }catch(Exception E){ - System.out.println("Exception caught!"); - E.printStackTrace(); - } - - threadsSignal.countDown(); - System.out.println(Thread.currentThread().getName() + "End. Remaining" + threadsSignal.getCount() + " threads"); - - } - } - - - private AbstractMap BuildMap(int threadNum, HashSet fileNames) throws Exception{ - CountDownLatch threadSignal = new CountDownLatch(threadNum); - - ConcurrentHashMap distinctKmersPerPartition = new ConcurrentHashMap<>(); - - for(int i=0;i Run(int numThreads) throws Exception{ - long time1=0; - - HashSet fileNames = getNodesFileNames(); - - long t1 = System.currentTimeMillis(); - System.out.println("Build Maps Begin!"); - AbstractMap distinctKmersPerPartition= BuildMap(numThreads, fileNames); - long t2 = System.currentTimeMillis(); - time1 = (t2-t1)/1000; - System.out.println("Time used for building maps: " + time1 + " seconds!"); - - return distinctKmersPerPartition; - - } - - private HashSet getNodesFileNames(){ - File[] files = (new File("./Nodes")).listFiles(); - List fileNames = new LinkedList<>(); - for(File file : files){ - if(file.isFile()){ - fileNames.add(file.getName()); - } - } - return new HashSet<>(fileNames); - } - - public static void main(String[] args){ - - int k = 15, numBlocks = 256, numThreads = 1, bufferSize = 8192, hsmapCapacity = 1000000; - - if(args[0].equals("-help")){ - System.out.print("Usage: java -jar Map.jar -k k -NB numOfBlocks [options]\n" + - "Options Available: \n" + - "[-t numOfThreads] : (Integer) Number Of Threads. Default: 1" + "\n" + - "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" + - "[-c capacity] : (Integer) Hashmap Capacity. Default: 1000000" + "\n"); - return; - } - - for(int i=0; i 3){ - long rangeEnd = Long.parseLong(strs[4]); - if(strs[2].equals("+")){ - for(long temp=replaceID+2; temp<=rangeEnd; temp+=2){ - bfw.write(temp + " "); - } - } - else if(strs[2].equals("-")){ - for(long temp=replaceID-2; temp>=rangeEnd; temp-=2){ - bfw.write(temp + " "); - } - } - i = Long.parseLong(strs[3]); - } - - if((str=bfr.readLine())!=null){ - strs = str.split("\t"); - originalID = new Long(strs[0]); - replaceID = new Long(strs[1]); - } - else{ - originalID = Long.MAX_VALUE; - replaceID = Long.MAX_VALUE; - } - } - else{ - bfw.write(i + " "); - } - - if((i+2) % modValue == 0) - bfw.newLine(); - } - - bfw.close(); - fw.close(); - bfr.close(); - fr.close(); - } - - private void DoReplaceBin() throws IOException{ - fr = new FileReader(replaceTableFile); - bfr = new BufferedReader(fr, bufSize); - DataOutputStream out = null; - out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new File(outputGraphFile)), bufSize)); - - long originalID, replaceID; - - String str; - String[] strs = null; - - if((str=bfr.readLine())!=null){ - strs = str.split("\t"); - originalID = new Long(strs[0]); - replaceID = new Long(strs[1]); - } - else{ - originalID = Long.MAX_VALUE; - replaceID = Long.MAX_VALUE; - } - - - for(long i=0; i 3){ - long rangeEnd = Long.parseLong(strs[4]); - if(strs[2].equals("+")){ - for(long temp=replaceID+2; temp<=rangeEnd; temp+=2){ - out.writeLong(temp); - } - } - else if(strs[2].equals("-")){ - for(long temp=replaceID-2; temp>=rangeEnd; temp-=2){ - out.writeLong(temp); - } - } - i = Long.parseLong(strs[3]); - } - - if((str=bfr.readLine())!=null){ - strs = str.split("\t"); - originalID = new Long(strs[0]); - replaceID = new Long(strs[1]); - } - else{ - originalID = Long.MAX_VALUE; - replaceID = Long.MAX_VALUE; - } - } - else{ - out.writeLong(i); - } - - } - - out.close(); - bfr.close(); - fr.close(); - } - - public void Run(boolean readable) throws Exception{ - - long time1=0; - - long t1 = System.currentTimeMillis(); - System.out.println("Replace IDs Begin!"); - - if(readable) - DoReplace(); - else - DoReplaceBin(); - - long t2 = System.currentTimeMillis(); - time1 = (t2-t1)/1000; - System.out.println("Time used for replacing IDs: " + time1 + " seconds!"); - - } - - public static void main(String[] args){ - - String infile = "E:\\test.txt"; - String outfile = "E:\\testOut.txt"; - int k = 15, bufferSize = 8192, readLen = 101; - long largestID = 0; - boolean readable = false; - - if(args[0].equals("-help")){ - System.out.print("Usage: java -jar Replace.jar -in InputTablePath -out outGraphPath -k k -L readLength -m largestID[options]\n" + - "Options Available: \n" + - "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" + - "[-r readable] : (Boolean) Output Format: true means readable text, false means binary. Default: false" + "\n"); - return; - } - - for(int i=0; i fileNames; + private ConcurrentHashMap distinctKmersPerPartition; + + public MyThread(CountDownLatch threadsSignal, HashSet fileNames, ConcurrentHashMap distinctKmersPerPartition) { + super(); + this.threadsSignal = threadsSignal; + this.fileNames = fileNames; + this.distinctKmersPerPartition = distinctKmersPerPartition; + } + + @Override + public void run() { + System.out.println(Thread.currentThread().getName() + "Start..."); + + FileReader fr; + BufferedReader bfr; + + String line; + + int p, j; + + try { + File dir = new File("Maps"); + if (!dir.exists()) + dir.mkdir(); + + while (blockID < numOfBlocks) { + + synchronized (lock_blocks) { + p = blockID; + blockID++; + } + + String filename = "nodes" + p; + if (!fileNames.contains(filename)) { + continue; + } + + + fr = new FileReader("Nodes/nodes" + p); + bfr = new BufferedReader(fr, bufSize); + + + HashSet nodes = new HashSet(capacity); + + while ((line = bfr.readLine()) != null) { + + int bound = line.length() - k + 1; + + for (j = 0; j < bound; j++) { + String reg = line.substring(j, j + k); + String rev = new String(stringUtils.getReversedRead(reg.toCharArray())); + if (reg.equals(rev)) { + nodes.add(rev); + } else { + boolean didAdd = false; + for (int i = 0; i < k; i++) { + if (rev.charAt(i) < reg.charAt(i)) { + nodes.add(rev); + didAdd = true; + break; + } else if (reg.charAt(i) < rev.charAt(i)) { + nodes.add(rev); + didAdd = true; + break; + } + } + if (!didAdd) + nodes.add(reg); + } + } + + } + + if (p % 100 == 0) System.out.println(p); + + distinctKmersPerPartition.put((long) p, (long) nodes.size()); + + nodes.clear(); + nodes = null; + + + bfr.close(); + fr.close(); + bfr = null; + fr = null; + + File myObj = new File("Nodes/nodes" + p); + if (!myObj.delete()) + System.out.println("Failed to delete the file." + p); + } + + } catch (Exception E) { + System.out.println("Exception caught!"); + E.printStackTrace(); + } + + threadsSignal.countDown(); + System.out.println(Thread.currentThread().getName() + "End. Remaining" + threadsSignal.getCount() + " threads"); + + } + } + + + private AbstractMap BuildMap(int threadNum, HashSet fileNames) throws Exception { + CountDownLatch threadSignal = new CountDownLatch(threadNum); + + ConcurrentHashMap distinctKmersPerPartition = new ConcurrentHashMap<>(); + + for (int i = 0; i < threadNum; i++) { + Thread t = new MyThread(threadSignal, fileNames, distinctKmersPerPartition); + t.start(); + } + threadSignal.await(); + System.out.println(Thread.currentThread().getName() + "End."); + return distinctKmersPerPartition; + } + + public AbstractMap Run(int numThreads) throws Exception { + long time1 = 0; + + HashSet fileNames = getNodesFileNames(); + + long t1 = System.currentTimeMillis(); + System.out.println("Build Maps Begin!"); + AbstractMap distinctKmersPerPartition = BuildMap(numThreads, fileNames); + long t2 = System.currentTimeMillis(); + time1 = (t2 - t1) / 1000; + System.out.println("Time used for building maps: " + time1 + " seconds!"); + + return distinctKmersPerPartition; + + } + + private HashSet getNodesFileNames() { + File[] files = (new File("./Nodes")).listFiles(); + List fileNames = new LinkedList<>(); + for (File file : files) { + if (file.isFile()) { + fileNames.add(file.getName()); + } + } + return new HashSet<>(fileNames); + } + +} \ No newline at end of file diff --git a/src/buildgraph/Ordering/FrequencyOrdering.java b/src/dumbo/Ordering/FrequencyOrdering.java similarity index 96% rename from src/buildgraph/Ordering/FrequencyOrdering.java rename to src/dumbo/Ordering/FrequencyOrdering.java index 6496bca..2046a99 100644 --- a/src/buildgraph/Ordering/FrequencyOrdering.java +++ b/src/dumbo/Ordering/FrequencyOrdering.java @@ -1,14 +1,9 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.Ordering.UHS.UHSSignatureOrdering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; -import java.net.Inet4Address; import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; public class FrequencyOrdering implements IOrdering { private int pivotLength; diff --git a/src/buildgraph/Ordering/IOrdering.java b/src/dumbo/Ordering/IOrdering.java similarity index 88% rename from src/buildgraph/Ordering/IOrdering.java rename to src/dumbo/Ordering/IOrdering.java index c6ae276..1f29130 100644 --- a/src/buildgraph/Ordering/IOrdering.java +++ b/src/dumbo/Ordering/IOrdering.java @@ -1,4 +1,4 @@ -package buildgraph.Ordering; +package dumbo.Ordering; import java.io.IOException; diff --git a/src/buildgraph/Ordering/IOrderingPP.java b/src/dumbo/Ordering/IOrderingPP.java similarity index 65% rename from src/buildgraph/Ordering/IOrderingPP.java rename to src/dumbo/Ordering/IOrderingPP.java index 80dcfb6..42e110c 100644 --- a/src/buildgraph/Ordering/IOrderingPP.java +++ b/src/dumbo/Ordering/IOrderingPP.java @@ -1,6 +1,4 @@ -package buildgraph.Ordering; - -import java.io.IOException; +package dumbo.Ordering; public interface IOrderingPP extends IOrdering { long getRank(int mmer); diff --git a/src/buildgraph/Ordering/IterativeOrdering10_WithCounterNormalized.java b/src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.java similarity index 99% rename from src/buildgraph/Ordering/IterativeOrdering10_WithCounterNormalized.java rename to src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.java index c9297a7..48fc108 100644 --- a/src/buildgraph/Ordering/IterativeOrdering10_WithCounterNormalized.java +++ b/src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import net.agkn.hll.HLL; import java.io.*; diff --git a/src/buildgraph/Ordering/IterativeOrdering6.java b/src/dumbo/Ordering/IterativeOrdering6.java similarity index 99% rename from src/buildgraph/Ordering/IterativeOrdering6.java rename to src/dumbo/Ordering/IterativeOrdering6.java index 6e0ed43..ee04c2d 100644 --- a/src/buildgraph/Ordering/IterativeOrdering6.java +++ b/src/dumbo/Ordering/IterativeOrdering6.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; import java.util.Arrays; diff --git a/src/buildgraph/Ordering/IterativeOrdering8.java b/src/dumbo/Ordering/IterativeOrdering8.java similarity index 99% rename from src/buildgraph/Ordering/IterativeOrdering8.java rename to src/dumbo/Ordering/IterativeOrdering8.java index f5e6cbd..0392e05 100644 --- a/src/buildgraph/Ordering/IterativeOrdering8.java +++ b/src/dumbo/Ordering/IterativeOrdering8.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; import java.util.Arrays; diff --git a/src/buildgraph/Ordering/IterativeOrdering9.java b/src/dumbo/Ordering/IterativeOrdering9.java similarity index 99% rename from src/buildgraph/Ordering/IterativeOrdering9.java rename to src/dumbo/Ordering/IterativeOrdering9.java index e3ead4c..5fdd8f3 100644 --- a/src/buildgraph/Ordering/IterativeOrdering9.java +++ b/src/dumbo/Ordering/IterativeOrdering9.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; import java.util.Arrays; diff --git a/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized.java b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized.java similarity index 99% rename from src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized.java rename to src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized.java index a22efd5..e0a49c0 100644 --- a/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized.java +++ b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; import java.util.Arrays; diff --git a/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java similarity index 99% rename from src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java rename to src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java index 053c674..7e7e42f 100644 --- a/src/buildgraph/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java +++ b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; import java.util.Arrays; diff --git a/src/buildgraph/Ordering/IterativeSignatureOrdering9.java b/src/dumbo/Ordering/IterativeSignatureOrdering9.java similarity index 99% rename from src/buildgraph/Ordering/IterativeSignatureOrdering9.java rename to src/dumbo/Ordering/IterativeSignatureOrdering9.java index cc2eaaa..81d8c76 100644 --- a/src/buildgraph/Ordering/IterativeSignatureOrdering9.java +++ b/src/dumbo/Ordering/IterativeSignatureOrdering9.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; import java.util.Arrays; diff --git a/src/buildgraph/Ordering/IterativeUHSOrdering8.java b/src/dumbo/Ordering/IterativeUHSOrdering8.java similarity index 99% rename from src/buildgraph/Ordering/IterativeUHSOrdering8.java rename to src/dumbo/Ordering/IterativeUHSOrdering8.java index efb5a01..feb5702 100644 --- a/src/buildgraph/Ordering/IterativeUHSOrdering8.java +++ b/src/dumbo/Ordering/IterativeUHSOrdering8.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; import java.util.Arrays; diff --git a/src/buildgraph/Ordering/IterativeUHSOrdering9.java b/src/dumbo/Ordering/IterativeUHSOrdering9.java similarity index 99% rename from src/buildgraph/Ordering/IterativeUHSOrdering9.java rename to src/dumbo/Ordering/IterativeUHSOrdering9.java index b89ee13..87a4933 100644 --- a/src/buildgraph/Ordering/IterativeUHSOrdering9.java +++ b/src/dumbo/Ordering/IterativeUHSOrdering9.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; import java.util.Arrays; diff --git a/src/buildgraph/Ordering/LexicographicOrdering.java b/src/dumbo/Ordering/LexicographicOrdering.java similarity index 96% rename from src/buildgraph/Ordering/LexicographicOrdering.java rename to src/dumbo/Ordering/LexicographicOrdering.java index b606d5b..b046792 100644 --- a/src/buildgraph/Ordering/LexicographicOrdering.java +++ b/src/dumbo/Ordering/LexicographicOrdering.java @@ -1,4 +1,4 @@ -package buildgraph.Ordering; +package dumbo.Ordering; public class LexicographicOrdering implements IOrdering { diff --git a/src/buildgraph/Ordering/LexicographicSignatureOrdering.java b/src/dumbo/Ordering/LexicographicSignatureOrdering.java similarity index 95% rename from src/buildgraph/Ordering/LexicographicSignatureOrdering.java rename to src/dumbo/Ordering/LexicographicSignatureOrdering.java index 7b816c4..25b6a5e 100644 --- a/src/buildgraph/Ordering/LexicographicSignatureOrdering.java +++ b/src/dumbo/Ordering/LexicographicSignatureOrdering.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.IOException; diff --git a/src/buildgraph/Ordering/RandomOrdering.java b/src/dumbo/Ordering/RandomOrdering.java similarity index 94% rename from src/buildgraph/Ordering/RandomOrdering.java rename to src/dumbo/Ordering/RandomOrdering.java index 33ad4cb..b11d5c3 100644 --- a/src/buildgraph/Ordering/RandomOrdering.java +++ b/src/dumbo/Ordering/RandomOrdering.java @@ -1,6 +1,6 @@ -package buildgraph.Ordering; +package dumbo.Ordering; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.IOException; diff --git a/src/buildgraph/Ordering/SignatureUtils.java b/src/dumbo/Ordering/SignatureUtils.java similarity index 97% rename from src/buildgraph/Ordering/SignatureUtils.java rename to src/dumbo/Ordering/SignatureUtils.java index fa11bb7..879e99f 100644 --- a/src/buildgraph/Ordering/SignatureUtils.java +++ b/src/dumbo/Ordering/SignatureUtils.java @@ -1,6 +1,4 @@ -package buildgraph.Ordering; - -import java.util.HashMap; +package dumbo.Ordering; public class SignatureUtils { diff --git a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java similarity index 98% rename from src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java rename to src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java index 4838a41..613c6cf 100644 --- a/src/buildgraph/Ordering/UHS/UHSFrequencySignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java @@ -1,10 +1,8 @@ -package buildgraph.Ordering.UHS; +package dumbo.Ordering.UHS; -import buildgraph.StringUtils; +import dumbo.StringUtils; import java.io.*; -import java.util.Arrays; -import java.util.Comparator; public class UHSFrequencySignatureOrdering extends UHSSignatureOrdering { private String inputFile; diff --git a/src/buildgraph/Ordering/UHS/UHSOrderingBase.java b/src/dumbo/Ordering/UHS/UHSOrderingBase.java similarity index 98% rename from src/buildgraph/Ordering/UHS/UHSOrderingBase.java rename to src/dumbo/Ordering/UHS/UHSOrderingBase.java index 3307108..d7d1422 100644 --- a/src/buildgraph/Ordering/UHS/UHSOrderingBase.java +++ b/src/dumbo/Ordering/UHS/UHSOrderingBase.java @@ -1,7 +1,7 @@ -package buildgraph.Ordering.UHS; +package dumbo.Ordering.UHS; -import buildgraph.Ordering.IOrdering; -import buildgraph.StringUtils; +import dumbo.Ordering.IOrdering; +import dumbo.StringUtils; import java.io.BufferedReader; import java.io.FileReader; diff --git a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java similarity index 97% rename from src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java rename to src/dumbo/Ordering/UHS/UHSSignatureOrdering.java index 6e78e38..83b6aaf 100644 --- a/src/buildgraph/Ordering/UHS/UHSSignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java @@ -1,7 +1,7 @@ -package buildgraph.Ordering.UHS; +package dumbo.Ordering.UHS; -import buildgraph.Ordering.SignatureUtils; -import buildgraph.StringUtils; +import dumbo.Ordering.SignatureUtils; +import dumbo.StringUtils; import java.io.IOException; diff --git a/src/buildgraph/Ordering/UHS/YaelUHSOrdering.java b/src/dumbo/Ordering/UHS/YaelUHSOrdering.java similarity index 97% rename from src/buildgraph/Ordering/UHS/YaelUHSOrdering.java rename to src/dumbo/Ordering/UHS/YaelUHSOrdering.java index ccca731..643b4ab 100644 --- a/src/buildgraph/Ordering/UHS/YaelUHSOrdering.java +++ b/src/dumbo/Ordering/UHS/YaelUHSOrdering.java @@ -1,7 +1,7 @@ -package buildgraph.Ordering.UHS; +package dumbo.Ordering.UHS; -import buildgraph.Ordering.IOrdering; -import buildgraph.StringUtils; +import dumbo.Ordering.IOrdering; +import dumbo.StringUtils; import java.io.BufferedReader; import java.io.FileReader; diff --git a/src/buildgraph/BuildDeBruijnGraph.java b/src/dumbo/OrderingOptimizer.java similarity index 77% rename from src/buildgraph/BuildDeBruijnGraph.java rename to src/dumbo/OrderingOptimizer.java index b8a0805..8106f9b 100644 --- a/src/buildgraph/BuildDeBruijnGraph.java +++ b/src/dumbo/OrderingOptimizer.java @@ -1,7 +1,7 @@ -package buildgraph; +package dumbo; -import buildgraph.Ordering.*; -import buildgraph.Ordering.UHS.UHSFrequencySignatureOrdering; +import dumbo.Ordering.*; +import dumbo.Ordering.UHS.UHSFrequencySignatureOrdering; import java.io.BufferedWriter; import java.io.File; @@ -10,7 +10,7 @@ import java.util.AbstractMap; import java.util.HashMap; -public class BuildDeBruijnGraph { +public class OrderingOptimizer { public static void main(String[] args) throws IOException { @@ -79,28 +79,7 @@ else if (args[i].equals("-punishPercentage")) orderingName = "iterativeOrdering"; -// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k); /// this is the first version 100000, 10000, 1 -// ordering.initFrequency(); -// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 30000, 1, 10); -// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, 10); -// IterativeOrdering ordering = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, 25000, 100000, 1, (int)Math.pow(4,pivot_len)/100); -// IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k); - -// IterativeOrdering2 ordering = new IterativeOrdering2(pivot_len, infile, readLen, bufferSize, k, 100000, 10000, 5, (int)Math.pow(4,pivot_len)/100); - -// ordering.initFrequency(); - -// UHSFrequencySignatureOrdering ordering = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true); -// ordering.initRank(); - -// IterativeOrdering3 ordering = new IterativeOrdering3(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples); -// IterativeOrdering4 ordering = new IterativeOrdering4(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples, maskRatio, punishPercentage); -// IterativeOrdering6 ordering = new IterativeOrdering6(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples, maskRatio, punishPercentage); -// IterativeUHSOrdering8 ordering = new IterativeUHSOrdering8(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds,elementsToPush, statSamples, maskRatio, punishPercentage); -// IterativeOrdering8 ordering = new IterativeOrdering8(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); - -// GOOD IOrdering ordering = null; System.out.println(version); @@ -171,20 +150,18 @@ else if (args[i].equals("-punishPercentage")) "R/W Buffer Size: " + bufferSize + "\n" + "Ordering: " + orderingName + "\n"); -// Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, ordering); Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, (IOrderingPP) ordering); Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); -// MapTrunc map = new MapTrunc(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); partition.Run(); AbstractMap distinctKmersPerPartition = map.Run(numThreads); - BuildDeBruijnGraph.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); + OrderingOptimizer.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); - HashMap bytesPerFile = BuildDeBruijnGraph.getBytesPerFile(); - BuildDeBruijnGraph.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); + HashMap bytesPerFile = OrderingOptimizer.getBytesPerFile(); + OrderingOptimizer.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); } catch (Exception E) { System.out.println("Exception caught!"); @@ -211,7 +188,7 @@ public static void writeToFile(AbstractMap data, String fileName) { File file = new File(fileName); BufferedWriter bf = null; - ; + try { bf = new BufferedWriter(new FileWriter(file)); diff --git a/src/buildgraph/Partition.java b/src/dumbo/Partition.java similarity index 83% rename from src/buildgraph/Partition.java rename to src/dumbo/Partition.java index af550f6..b98a66f 100644 --- a/src/buildgraph/Partition.java +++ b/src/dumbo/Partition.java @@ -1,8 +1,6 @@ -package buildgraph; +package dumbo; -import buildgraph.Ordering.IOrdering; -import buildgraph.Ordering.IOrderingPP; -import buildgraph.Ordering.UHS.YaelUHSOrdering; +import dumbo.Ordering.IOrderingPP; import java.io.*; import java.util.HashSet; @@ -25,9 +23,6 @@ public class Partition { private StringUtils stringUtils; - private int numOpenFiles; - private int minFile; - private int maxFile; private HashSet currentMinimizers; private byte[] finishedMinimizers; @@ -46,12 +41,11 @@ public Partition(int kk, String infile, int numberOfBlocks, int pivotLength, int this.readLen = readLen; this.ordering = ordering; this.stringUtils = new StringUtils(); - this.numOpenFiles = 0; this.mask = (int) Math.pow(4, pivotLength) - 1; - finishedMinimizers = new byte[numOfBlocks]; - currentMinimizers = new HashSet<>(); - maxMinimizersPerPass = 10000; - keepPassing = true; + this.finishedMinimizers = new byte[numOfBlocks]; + this.currentMinimizers = new HashSet<>(); + this.maxMinimizersPerPass = 1000; + this.keepPassing = true; } @@ -95,10 +89,10 @@ private long DistributeNodes() throws IOException { int bound = len - k + 1; for (int i = 1; i < bound; i++) { - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; if (i > minPos) { - writeToFile(minValueNormalized, start, minPos + k, lineCharArray, 0); + writeToFile(minValueNormalized, start, minPos + k, lineCharArray); minPos = ordering.findSmallest(lineCharArray, i, i + k); start = i; @@ -109,7 +103,7 @@ private long DistributeNodes() throws IOException { } else { int lastIndexInWindow = k + i - pivotLen; if (ordering.strcmp(currentValue, minValue) < 0) { - writeToFile(minValueNormalized, start, lastIndexInWindow + pivotLen - 1, lineCharArray, 0); + writeToFile(minValueNormalized, start, lastIndexInWindow + pivotLen - 1, lineCharArray); start = lastIndexInWindow + pivotLen - k; minPos = lastIndexInWindow; @@ -118,7 +112,7 @@ private long DistributeNodes() throws IOException { } } } - writeToFile(minValueNormalized, start, len, lineCharArray, 0); + writeToFile(minValueNormalized, start, len, lineCharArray); } } @@ -148,26 +142,13 @@ private int getNormalizedValue(int minValue) { } private void tryCreateWriterForPmer(int prepos) throws IOException { - if (numOpenFiles == 16000) { - for (int i = 0; i < bfwG.length; i++) { - if (bfwG[i] != null) { - bfwG[i].close(); - fwG[i].close(); - bfwG[i] = null; - fwG[i] = null; - } - } - numOpenFiles = 0; - } - if (bfwG[prepos] == null) { fwG[prepos] = new FileWriter("Nodes/nodes" + prepos, true); bfwG[prepos] = new BufferedWriter(fwG[prepos], bufSize); - numOpenFiles += 1; } } - private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray, long outcnt) throws IOException { + private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray) throws IOException { if(finishedMinimizers[prepos] == 0 && currentMinimizers.size() < maxMinimizersPerPass) { currentMinimizers.add(prepos); @@ -179,7 +160,6 @@ private void writeToFile(int prepos, int substart, int subend, char[] lineCharAr BufferedWriter writer = bfwG[prepos]; writer.write(lineCharArray, substart, subend - substart); - writer.write("\t" + outcnt); writer.newLine(); } } diff --git a/src/buildgraph/StringUtils.java b/src/dumbo/StringUtils.java similarity index 98% rename from src/buildgraph/StringUtils.java rename to src/dumbo/StringUtils.java index e775d17..f222d0b 100644 --- a/src/buildgraph/StringUtils.java +++ b/src/dumbo/StringUtils.java @@ -1,4 +1,4 @@ -package buildgraph; +package dumbo; public class StringUtils { From 1790197a2cc6b76a3505d2cfcb5a1cb6eaaeb807 Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 31 Mar 2021 14:51:04 +0300 Subject: [PATCH 22/44] remove old and not necessary orderings --- src/dumbo/Ordering/IterativeOrdering6.java | 268 ---------------- src/dumbo/Ordering/IterativeOrdering8.java | 266 ---------------- src/dumbo/Ordering/IterativeOrdering9.java | 285 ----------------- .../Ordering/IterativeSignatureOrdering9.java | 296 ------------------ src/dumbo/OrderingOptimizer.java | 8 +- 5 files changed, 1 insertion(+), 1122 deletions(-) delete mode 100644 src/dumbo/Ordering/IterativeOrdering6.java delete mode 100644 src/dumbo/Ordering/IterativeOrdering8.java delete mode 100644 src/dumbo/Ordering/IterativeOrdering9.java delete mode 100644 src/dumbo/Ordering/IterativeSignatureOrdering9.java diff --git a/src/dumbo/Ordering/IterativeOrdering6.java b/src/dumbo/Ordering/IterativeOrdering6.java deleted file mode 100644 index ee04c2d..0000000 --- a/src/dumbo/Ordering/IterativeOrdering6.java +++ /dev/null @@ -1,268 +0,0 @@ -package dumbo.Ordering; - -import dumbo.StringUtils; - -import java.io.*; -import java.util.Arrays; -import java.util.Comparator; - -public class IterativeOrdering6 implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - private long[] frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - private double maskRatio; - private double percentagePunishment; - - Integer[] temp = null; - - public IterativeOrdering6(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeOrdering6(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeOrdering6(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double maskRatio, double percentagePunishment) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.maskRatio = maskRatio; - this.percentagePunishment = percentagePunishment; - } - - - public void initFrequency() throws IOException { - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - pmerFrequency[minValue] += 1; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & 0xffff;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - pmerFrequency[minValue] += 1; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - pmerFrequency[minValue] += 1; - } - } - - pmerFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - } - - - private void adaptOrdering(long[] pmerFrequency) { - boolean[] mask = new boolean[pmerFrequency.length]; - for (int i = 0; i < mask.length; i++) { - if (Math.random() < 1 - maskRatio) - mask[i] = true; - } -// TODO : if biggest is smaller than (samples / 4^(m-1))/5 - for (int i = 0; i < elementsToPush; i++) { - long biggest = -1; - int biggestIndex = -1; - for (int k = 0; k < pmerFrequency.length; k++) { - if (mask[k] && pmerFrequency[k] > biggest) { - biggest = pmerFrequency[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - pmerFrequency[biggestIndex] = 0; - pmerFrequency[getReversed(biggestIndex)] = 0; - } - normalize(); - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - if(temp == null) - { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for(int i = 0 ; i min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - pmerFrequency[minValue] += 1; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - pmerFrequency[minValue] += 1; - } - } - - pmerFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); - if(roundNumber % 100 == 0) { - percentagePunishment *= 0.996; - normalize(); - } - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - } - - - private void adaptOrdering(long[] pmerFrequency) { -// TODO : if biggest is smaller than (samples / 4^(m-1))/5 - for (int i = 0; i < elementsToPush; i++) { - long biggest = -1; - int biggestIndex = -1; - for (int k = 0; k < pmerFrequency.length; k++) { - if (pmerFrequency[k] > biggest) { - biggest = pmerFrequency[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - pmerFrequency[biggestIndex] = 0; - pmerFrequency[getReversed(biggestIndex)] = 0; - } - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - if(temp == null) - { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for(int i = 0 ; i> frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - private double percentagePunishment; - - private Integer[] temp = null; - private int mask; - private long[] statFrequency; - - public IterativeOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.percentagePunishment = percentagePunishment; - this.mask = (int)Math.pow(4, pivotLength) - 1; - } - - - public void initFrequency() throws IOException { - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - statFrequency = new long[(int) Math.pow(4, pivotLength)]; -// HashSet[] pmerFrequency; -// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; - HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - String line = new String(lineCharArray); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - if(!pmerFrequency.containsKey( minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(0, k)); // += 1; - if(roundNumber == rounds) statFrequency[minValue]++; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - - if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; - if(roundNumber == rounds) statFrequency[minValue]++; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - - if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; - if(roundNumber == rounds) statFrequency[minValue]++; - } - } - - pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; - if(roundNumber == rounds) statFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); -// if(roundNumber % 100 == 0) { -// percentagePunishment *= 0.996; -// normalize(); -// } - pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - } - - - private void adaptOrdering(HashMap> pmerFrequency) { - int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; - for(Integer i : pmerFrequency.keySet()){ - frequencies[i] = pmerFrequency.get(i).size(); - } - for (int i = 0; i < elementsToPush; i++) { - long biggest = -1; - int biggestIndex = -1; - for (int k = 0; k < frequencies.length; k++) { - if (frequencies[k] > biggest) { - biggest = frequencies[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - frequencies[biggestIndex] = 0; - frequencies[getReversed(biggestIndex)] = 0; - } - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - if(temp == null) - { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for(int i = 0 ; i> frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - private double percentagePunishment; - - private Integer[] temp = null; - private int mask; - private long[] statFrequency; - - private SignatureUtils signatureUtils; - - public IterativeSignatureOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeSignatureOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeSignatureOrdering9(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.percentagePunishment = percentagePunishment; - this.mask = (int)Math.pow(4, pivotLength) - 1; - signatureUtils = new SignatureUtils(pivotLength); - } - - - public void initFrequency() throws IOException { - - for(int i = 0; i<=mask; i++) /// init as signature - { - if(!signatureUtils.isAllowed(i)) - { - currentOrdering[i] += mask + 1; - } - } - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - statFrequency = new long[(int) Math.pow(4, pivotLength)]; -// HashSet[] pmerFrequency; -// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; - HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - String line = new String(lineCharArray); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - if(!pmerFrequency.containsKey( minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(0, k)); // += 1; - if(roundNumber == rounds) statFrequency[minValue]++; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - - if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; - if(roundNumber == rounds) statFrequency[minValue]++; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - - if(!pmerFrequency.containsKey(minValue)) pmerFrequency.put(minValue, new HashSet<>()); - pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; - if(roundNumber == rounds) statFrequency[minValue]++; - } - } - - pmerFrequency.get(minValue).add(line.substring(i, k+i)); // += 1; - if(roundNumber == rounds) statFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); -// if(roundNumber % 100 == 0) { -// percentagePunishment *= 0.996; -// normalize(); -// } - pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - } - - - private void adaptOrdering(HashMap> pmerFrequency) { - int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; - for(Integer i : pmerFrequency.keySet()){ - frequencies[i] = pmerFrequency.get(i).size(); - } - for (int i = 0; i < elementsToPush; i++) { - long biggest = -1; - int biggestIndex = -1; - for (int k = 0; k < frequencies.length; k++) { - if (frequencies[k] > biggest) { - biggest = frequencies[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - frequencies[biggestIndex] = 0; - frequencies[getReversed(biggestIndex)] = 0; - } - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - if(temp == null) - { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for(int i = 0 ; i Date: Wed, 31 Mar 2021 16:05:49 +0300 Subject: [PATCH 23/44] stable version of partition and IterativeOrdering9_WithCounterNormalized_AndSignature --- src/dumbo/ExportUtils.java | 108 ++++++++++++ src/dumbo/Ordering/IOrderingPP.java | 6 +- ...g9_WithCounterNormalized_AndSignature.java | 165 +++++------------- src/dumbo/OrderingOptimizer.java | 128 +++++--------- src/dumbo/StringUtils.java | 16 +- 5 files changed, 209 insertions(+), 214 deletions(-) create mode 100644 src/dumbo/ExportUtils.java diff --git a/src/dumbo/ExportUtils.java b/src/dumbo/ExportUtils.java new file mode 100644 index 0000000..e69f84d --- /dev/null +++ b/src/dumbo/ExportUtils.java @@ -0,0 +1,108 @@ +package dumbo; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.AbstractMap; +import java.util.HashMap; + +public class ExportUtils { + public void exportOrderingForCpp(long[] currentOrdering) { + File file = new File("ranks.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < currentOrdering.length; i++) { + bf.write(Long.toString(currentOrdering[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } + + public void exportBinningForCpp(long[] statFrequency) { + File file = new File("freq.txt"); + + BufferedWriter bf = null; + + try { + bf = new BufferedWriter(new FileWriter(file)); + + for (int i = 0; i < statFrequency.length; i++) { + bf.write(Long.toString(statFrequency[i])); + bf.newLine(); + } + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + } + + public HashMap getBytesPerFile() { + File folder = new File("./Nodes"); + File[] listOfFiles = folder.listFiles(); + + HashMap bytesPerFile = new HashMap<>(); + + for (int i = 0; i < listOfFiles.length; i++) { + if (listOfFiles[i].isFile()) + bytesPerFile.put(Long.parseLong(listOfFiles[i].getName().replace("nodes", "")), listOfFiles[i].length()); + } + return bytesPerFile; + } + + public void writeToFile(AbstractMap data, String fileName) { + File file = new File(fileName); + + BufferedWriter bf = null; + + + try { + bf = new BufferedWriter(new FileWriter(file)); + + bf.write("x = {"); + bf.newLine(); + + //iterate map entries + for (java.util.Map.Entry entry : data.entrySet()) { + bf.write(entry.getKey() + ":" + entry.getValue() + ","); + bf.newLine(); + } + bf.write("}"); + bf.flush(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + + try { + //always close the writer + bf.close(); + } catch (Exception e) { + } + } + + } +} diff --git a/src/dumbo/Ordering/IOrderingPP.java b/src/dumbo/Ordering/IOrderingPP.java index 42e110c..b5e1cfe 100644 --- a/src/dumbo/Ordering/IOrderingPP.java +++ b/src/dumbo/Ordering/IOrderingPP.java @@ -1,6 +1,8 @@ package dumbo.Ordering; -public interface IOrderingPP extends IOrdering { - long getRank(int mmer); +import java.io.IOException; + +public interface IOrderingPP { int strcmp(int x, int y); + int findSmallest(char[] a, int from, int to) throws IOException; } diff --git a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java index 7e7e42f..b370654 100644 --- a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java +++ b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java @@ -26,7 +26,6 @@ public class IterativeOrdering9_WithCounterNormalized_AndSignature implements IO private double percentagePunishment; - private Integer[] temp = null; private int mask; private long[] statFrequency; @@ -44,9 +43,9 @@ public IterativeOrdering9_WithCounterNormalized_AndSignature(int pivotLength, St public IterativeOrdering9_WithCounterNormalized_AndSignature(int pivotLength, String infile, int readLen, int bufSize, int k) { this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); + int canonical = Math.min(i, stringUtils.getReversedMmer(i, pivotLength)); currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; + currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] = canonical; } roundSamples = 100000; rounds = 10000; @@ -63,25 +62,13 @@ public IterativeOrdering9_WithCounterNormalized_AndSignature(int pivotLength, St this.mask = (int) Math.pow(4, pivotLength) - 1; } - public String getCanon(String line) { - String x = new String(stringUtils.getReversedRead(line.toCharArray())); - for (int i = 0; i < line.length(); i++) { - if (line.charAt(i) < x.charAt(i)) - return line; - else if (line.charAt(i) > x.charAt(i)) - return x; - } - return x; - } - public void initFrequency() throws IOException { - int numMmers = (int)Math.pow(4, pivotLength); + int numMmers = (int) Math.pow(4, pivotLength); for (int i = 0; i < numMmers; i++) { - if(!signatureUtils.isAllowed(i) && i < getReversed(i)) - { + if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { currentOrdering[i] += numMmers; - currentOrdering[getReversed(i)] += numMmers; + currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; } } @@ -114,45 +101,33 @@ public void initFrequency() throws IOException { min_pos = findSmallest(lineCharArray, 0, k); minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = Math.min(minValue, getReversed(minValue)); + minValueNormalized = getNormalizedValue(minValue); currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(0, k))); // += 1; - if (roundNumber == rounds) statFrequency[minValueNormalized]++; + updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, 0); int bound = len - k + 1; for (int i = 1; i < bound; i++) { numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; if (i > min_pos) { min_pos = findSmallest(lineCharArray, i, i + k); minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = Math.min(minValue, getReversed(minValue)); + minValueNormalized = getNormalizedValue(minValue); - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; - if (roundNumber == rounds) statFrequency[minValueNormalized]++; + updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i); } else { int lastIndexInWindow = k + i - pivotLength; if (strcmp(currentValue, minValue) < 0) { min_pos = lastIndexInWindow; minValue = currentValue; - minValueNormalized = Math.min(minValue, getReversed(minValue)); + minValueNormalized = getNormalizedValue(minValue); - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; - if (roundNumber == rounds) statFrequency[minValueNormalized]++; + updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i); } } - - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; - if (roundNumber == rounds) statFrequency[minValueNormalized]++; + updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i); } } @@ -178,6 +153,20 @@ public void initFrequency() throws IOException { frG.close(); } + private void updateStatistics(int roundNumber, HashMap> pmerFrequency, int minValueNormalized, String line, int startPosition) { + if (roundNumber == rounds) + statFrequency[minValueNormalized]++; + else { + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(stringUtils.getCanon(line.substring(startPosition, k + startPosition))); + } + } + + private int getNormalizedValue(int minValue) { + return Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLength)); + } + private void adaptOrdering(HashMap> pmerFrequency) { int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; @@ -195,118 +184,48 @@ private void adaptOrdering(HashMap> pmerFrequency) { } long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; + currentOrdering[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = newRank; frequencies[biggestIndex] = 0; - frequencies[getReversed(biggestIndex)] = 0; + frequencies[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = 0; } } - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - @Override public int findSmallest(char[] a, int from, int to) throws IOException { int min_pos = from; + int minValue = stringUtils.getDecimal(a, min_pos, min_pos + pivotLength); + int currentValue = minValue; for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) + currentValue = ((currentValue << 2) + StringUtils.valTable[a[i + pivotLength - 1] - 'A']) & mask; + if (strcmp(minValue, currentValue) > 0) + { min_pos = i; + minValue = currentValue; + } } return min_pos; } - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - return strcmp(x,y); - } @Override public int strcmp(int x, int y) { -// if (x == y || y == getReversed(x)) return 0; if (x == y) return 0; if (currentOrdering[x] < currentOrdering[y]) return -1; return 1; } private void normalize() { -// currentOrdering - if (temp == null) { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for (int i = 0; i < temp.length; i++) { - currentOrdering[temp[i]] = i; // TODO: FIXED THIS - } - } - - - public void exportOrderingForCpp() { - File file = new File("ranks.txt"); + Integer[] temp = new Integer[currentOrdering.length]; + for (int i = 0; i < temp.length; i++) + temp[i] = i; - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < currentOrdering.length; i++) { - bf.write(Long.toString(currentOrdering[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } + Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); + for (int i = 0; i < temp.length; i++) { + currentOrdering[temp[i]] = i; } } - public void exportBinningForCpp() { - File file = new File("freq.txt"); - - BufferedWriter bf = null; - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < statFrequency.length; i++) { - bf.write(Long.toString(statFrequency[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } - - @Override - public long getRank(int mmer) { - return currentOrdering[mmer]; - } } diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index d1eb03f..d44fd24 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -18,13 +18,14 @@ public static void main(String[] args) throws IOException { int k = 60, pivot_len = 8, bufferSize = 81920, numThreads = 20, hsmapCapacity = 10000000; int readLen = 124; - int numBlocks = (int)Math.pow(4, pivot_len);//256; 1000;// + int numBlocks = (int) Math.pow(4, pivot_len);//256; 1000;// // boolean readable = false; String orderingName = "uhs_sig_freq"; int xor = 0; //11101101; int numRounds = 0, elementsToPush = 0, samplesPerRound = 0, statSamples = 0; double punishPercentage = 1; String version = "10"; + int partitionData = 0; if (args.length > 0 && args[0].equals("-help")) { System.out.print("Usage: java -jar BuildDeBruijnGraph.jar -in InputPath -k k -L readLength[options]\n" + @@ -45,8 +46,10 @@ else if (args[i].equals("-v")) version = args[i + 1]; else if (args[i].equals("-k")) k = new Integer(args[i + 1]); - else if(args[i].equals("-NB")) - numBlocks = new Integer(args[i+1]); + else if (args[i].equals("-NB")) + numBlocks = new Integer(args[i + 1]); + else if (args[i].equals("-partition")) + partitionData = new Integer(args[i + 1]); // else // if(args[i].equals("-o")) // orderingName = args[i+1]; @@ -81,23 +84,22 @@ else if (args[i].equals("-punishPercentage")) orderingName = "iterativeOrdering"; - IOrdering ordering = null; + IOrderingPP ordering = null; System.out.println(version); - switch(version) - { + switch (version) { case "9-normalized": // good version IterativeOrdering9_WithCounterNormalized ordering9_withCounterNormalized = new IterativeOrdering9_WithCounterNormalized(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); ordering9_withCounterNormalized.initFrequency(); ordering9_withCounterNormalized.exportOrderingForCpp(); ordering9_withCounterNormalized.exportBinningForCpp(); - ordering = ordering9_withCounterNormalized; +// ordering = ordering9_withCounterNormalized; break; case "9-normalized-signature": // IterativeOrdering9_WithCounterNormalized_AndSignature ordering9_withCounterNormalized_andSignature = new IterativeOrdering9_WithCounterNormalized_AndSignature(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); ordering9_withCounterNormalized_andSignature.initFrequency(); - ordering9_withCounterNormalized_andSignature.exportOrderingForCpp(); - ordering9_withCounterNormalized_andSignature.exportBinningForCpp(); +// ordering9_withCounterNormalized_andSignature.exportOrderingForCpp(); +// ordering9_withCounterNormalized_andSignature.exportBinningForCpp(); ordering = ordering9_withCounterNormalized_andSignature; System.out.println("lolz asdasd"); break; @@ -106,109 +108,67 @@ else if (args[i].equals("-punishPercentage")) ordering10.initFrequency(); ordering10.exportOrderingForCpp(); ordering10.exportBinningForCpp(); - ordering = ordering10; +// ordering = ordering10; break; case "universal-frequency-signature": - UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile,readLen, bufferSize, true, k, statSamples);; + UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, k, statSamples); + ; universalFrequencySignature.initRank(); universalFrequencySignature.exportOrderingForCpp(); universalFrequencySignature.exportBinningForCpp(); - ordering = universalFrequencySignature; +// ordering = universalFrequencySignature; break; case "universal-frequency": - UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile,readLen, bufferSize, false, k, statSamples);; + UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, k, statSamples); + ; universalFrequency.initRank(); universalFrequency.exportOrderingForCpp(); universalFrequency.exportBinningForCpp(); - ordering = universalFrequency; +// ordering = universalFrequency; break; case "frequency": // FREQUENCY SUCKS - FrequencyOrdering frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds*samplesPerRound, statSamples, k); + FrequencyOrdering frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound, statSamples, k); frequencyOrdering.initFrequency(); - ordering = frequencyOrdering; +// ordering = frequencyOrdering; break; case "signature": LexicographicSignatureOrdering signatureOrdering = new LexicographicSignatureOrdering(pivot_len); - ordering = signatureOrdering; +// ordering = signatureOrdering; break; } - try { - - System.out.println("Program Configuration:"); - System.out.print("Input File: " + infile + "\n" + - "Kmer Length: " + k + "\n" + - "Read Length: " + readLen + "\n" + - "Pivot Length: " + pivot_len + "\n" + - "# Of Threads: " + numThreads + "\n" + - "R/W Buffer Size: " + bufferSize + "\n" + - "Ordering: " + orderingName + "\n"); - - Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, (IOrderingPP) ordering); - Map map = new Map(k, (int)Math.pow(4, pivot_len), bufferSize, hsmapCapacity); - - - partition.Run(); - - AbstractMap distinctKmersPerPartition = map.Run(numThreads); - OrderingOptimizer.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); - System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); - - HashMap bytesPerFile = OrderingOptimizer.getBytesPerFile(); - OrderingOptimizer.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); + if (partitionData == 1) { + try { + ExportUtils exportUtils = new ExportUtils(); + System.out.println("Program Configuration:"); + System.out.print("Input File: " + infile + "\n" + + "Kmer Length: " + k + "\n" + + "Read Length: " + readLen + "\n" + + "Pivot Length: " + pivot_len + "\n" + + "# Of Threads: " + numThreads + "\n" + + "R/W Buffer Size: " + bufferSize + "\n" + + "Ordering: " + orderingName + "\n"); - } catch (Exception E) { - System.out.println("Exception caught!"); - E.printStackTrace(); - } + Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, (IOrderingPP) ordering); + Map map = new Map(k, (int) Math.pow(4, pivot_len), bufferSize, hsmapCapacity); - } + partition.Run(); - public static HashMap getBytesPerFile() { - File folder = new File("./Nodes"); - File[] listOfFiles = folder.listFiles(); + AbstractMap distinctKmersPerPartition = map.Run(numThreads); + exportUtils.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); + System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); - HashMap bytesPerFile = new HashMap<>(); + HashMap bytesPerFile = exportUtils.getBytesPerFile(); + exportUtils.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); - for (int i = 0; i < listOfFiles.length; i++) { - if (listOfFiles[i].isFile()) - bytesPerFile.put(Long.parseLong(listOfFiles[i].getName().replace("nodes", "")), listOfFiles[i].length()); + } catch (Exception E) { + System.out.println("Exception caught!"); + E.printStackTrace(); + } } - return bytesPerFile; } - public static void writeToFile(AbstractMap data, String fileName) { - File file = new File(fileName); - - BufferedWriter bf = null; - try { - bf = new BufferedWriter(new FileWriter(file)); - - bf.write("x = {"); - bf.newLine(); - - //iterate map entries - for (java.util.Map.Entry entry : data.entrySet()) { - bf.write(entry.getKey() + ":" + entry.getValue() + ","); - bf.newLine(); - } - bf.write("}"); - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - - } - } diff --git a/src/dumbo/StringUtils.java b/src/dumbo/StringUtils.java index f222d0b..2ee9fcc 100644 --- a/src/dumbo/StringUtils.java +++ b/src/dumbo/StringUtils.java @@ -17,11 +17,6 @@ public int getDecimal(char[] a, int from, int to){ return val; } - public int getRDecimal(char[] a, char[] b, int from, int to){ - - return Math.min(getDecimal(a, from, to), getDecimal(b, from, to)); - } - public long getLDecimal(char[] a, int from, int to){ long val=0; @@ -62,4 +57,15 @@ public int getReversedMmer(int x, int length) { } return rev; } + + public String getCanon(String line) { + String x = new String(getReversedRead(line.toCharArray())); + for (int i = 0; i < line.length(); i++) { + if (line.charAt(i) < x.charAt(i)) + return line; + else if (line.charAt(i) > x.charAt(i)) + return x; + } + return x; + } } From 86b335d296827c6d08c577f2f2729d011e3ca41a Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 31 Mar 2021 16:35:29 +0300 Subject: [PATCH 24/44] add MinimizerCounter for knowing load of each minimizer --- src/dumbo/Map.java | 4 + src/dumbo/MinimizerCounter.java | 77 +++++++++++++++++++ ...g9_WithCounterNormalized_AndSignature.java | 71 +++++++++-------- src/dumbo/Partition.java | 2 +- src/dumbo/StringUtils.java | 6 +- 5 files changed, 125 insertions(+), 35 deletions(-) create mode 100644 src/dumbo/MinimizerCounter.java diff --git a/src/dumbo/Map.java b/src/dumbo/Map.java index 24c5064..3cc12a1 100644 --- a/src/dumbo/Map.java +++ b/src/dumbo/Map.java @@ -111,6 +111,10 @@ public void run() { distinctKmersPerPartition.put((long) p, (long) nodes.size()); + for (String s: nodes) { + + } + nodes.clear(); nodes = null; diff --git a/src/dumbo/MinimizerCounter.java b/src/dumbo/MinimizerCounter.java new file mode 100644 index 0000000..8b62948 --- /dev/null +++ b/src/dumbo/MinimizerCounter.java @@ -0,0 +1,77 @@ +package dumbo; + +import dumbo.Ordering.IOrderingPP; + +import java.io.*; + +public class MinimizerCounter { + + private int k; + private String kmerSetFile; + private int pivotLen; + private int bufSize; + + private FileReader frG; + private BufferedReader bfrG; + + private IOrderingPP ordering; + + private StringUtils stringUtils; + + private long[] minimizerCounters; + + + public MinimizerCounter(int kk, String kmerSetFile, int pivotLength, int bufferSize, IOrderingPP ordering) { + this.k = kk; + this.kmerSetFile = kmerSetFile; + this.pivotLen = pivotLength; + this.bufSize = bufferSize; + this.ordering = ordering; + this.stringUtils = new StringUtils(); + minimizerCounters = new long[(int) Math.pow(4, pivotLength)]; + } + + + private long[] getMinimizersCounters() throws IOException { + frG = new FileReader(kmerSetFile); + bfrG = new BufferedReader(frG, bufSize); + + String describeline; + + int minPos; + char[] lineCharArray = new char[k]; + + + int minValue, minValueNormalized, currentValue, start; + while ((describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, k); + bfrG.read(); + + if (stringUtils.isReadLegal(lineCharArray)) { + minPos = ordering.findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, minPos, minPos + pivotLen); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLen); + minimizerCounters[minValueNormalized]++; + } + } + + bfrG.close(); + frG.close(); + + return minimizerCounters.clone(); + } + + public void Run() throws Exception { + long time1 = 0; + long t1 = System.currentTimeMillis(); + System.out.println("Minimizers counting Begin!"); + System.out.println("hi"); + getMinimizersCounters(); + + long t2 = System.currentTimeMillis(); + time1 = (t2 - t1) / 1000; + System.out.println("Time used for counting minimizers appearances: " + time1 + " seconds!"); + } + +} \ No newline at end of file diff --git a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java index b370654..dfa2cc5 100644 --- a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java +++ b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java @@ -29,42 +29,49 @@ public class IterativeOrdering9_WithCounterNormalized_AndSignature implements IO private int mask; private long[] statFrequency; - public IterativeOrdering9_WithCounterNormalized_AndSignature(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { + private int numMmers; + + + public IterativeOrdering9_WithCounterNormalized_AndSignature( + int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, + int elementsToPush, int statisticsSamples, double percentagePunishment) + { + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.statisticsSamples = statisticsSamples; + this.percentagePunishment = percentagePunishment; + numMmers = (int) Math.pow(4, pivotLength); + this.mask = numMmers - 1; this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; this.pivotLength = pivotLength; this.k = k; - this.currentOrdering = initialOrdering.clone(); stringUtils = new StringUtils(); signatureUtils = new SignatureUtils(pivotLength); + currentOrdering = new long[(int) Math.pow(4, pivotLength)]; } - public IterativeOrdering9_WithCounterNormalized_AndSignature(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { + public IterativeOrdering9_WithCounterNormalized_AndSignature( + int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, + int elementsToPush, int statisticsSamples, double percentagePunishment, long[] initialOrdering) + { + this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment); + currentOrdering = initialOrdering.clone(); + if(currentOrdering.length != numMmers) + throw new IllegalArgumentException("initialOrdering is not of correct size"); + } + + + public void initFrequency() throws IOException { + + for (int i = 0; i < numMmers; i++) { int canonical = Math.min(i, stringUtils.getReversedMmer(i, pivotLength)); currentOrdering[i] = canonical; currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] = canonical; } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeOrdering9_WithCounterNormalized_AndSignature(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.percentagePunishment = percentagePunishment; - this.mask = (int) Math.pow(4, pivotLength) - 1; - } - - public void initFrequency() throws IOException { - int numMmers = (int) Math.pow(4, pivotLength); for (int i = 0; i < numMmers; i++) { if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { currentOrdering[i] += numMmers; @@ -79,8 +86,8 @@ public void initFrequency() throws IOException { FileReader frG = new FileReader(inputFile); BufferedReader bfrG = new BufferedReader(frG, bufSize); - statFrequency = new long[(int) Math.pow(4, pivotLength)]; - HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); + statFrequency = new long[numMmers]; + HashMap> pmerFrequency = new HashMap<>(numMmers); String describeline; char[] lineCharArray = new char[readLen]; @@ -101,7 +108,7 @@ public void initFrequency() throws IOException { min_pos = findSmallest(lineCharArray, 0, k); minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = getNormalizedValue(minValue); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, 0); @@ -114,7 +121,7 @@ public void initFrequency() throws IOException { if (i > min_pos) { min_pos = findSmallest(lineCharArray, i, i + k); minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = getNormalizedValue(minValue); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i); } else { @@ -122,7 +129,7 @@ public void initFrequency() throws IOException { if (strcmp(currentValue, minValue) < 0) { min_pos = lastIndexInWindow; minValue = currentValue; - minValueNormalized = getNormalizedValue(minValue); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i); } @@ -159,17 +166,15 @@ private void updateStatistics(int roundNumber, HashMap> else { if (!pmerFrequency.containsKey(minValueNormalized)) pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(stringUtils.getCanon(line.substring(startPosition, k + startPosition))); + pmerFrequency.get(minValueNormalized).add(stringUtils.getCanonical(line.substring(startPosition, k + startPosition))); } } - private int getNormalizedValue(int minValue) { - return Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLength)); - } + private void adaptOrdering(HashMap> pmerFrequency) { - int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; + int[] frequencies = new int[numMmers]; for (Integer i : pmerFrequency.keySet()) { frequencies[i] = pmerFrequency.get(i).size(); } @@ -182,7 +187,7 @@ private void adaptOrdering(HashMap> pmerFrequency) { biggestIndex = k; } } - long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); + long newRank = currentOrdering[biggestIndex] + (int) (numMmers * percentagePunishment); currentOrdering[biggestIndex] = newRank; currentOrdering[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = newRank; frequencies[biggestIndex] = 0; diff --git a/src/dumbo/Partition.java b/src/dumbo/Partition.java index b98a66f..24e0930 100644 --- a/src/dumbo/Partition.java +++ b/src/dumbo/Partition.java @@ -138,7 +138,7 @@ private long DistributeNodes() throws IOException { } private int getNormalizedValue(int minValue) { - return Math.min(minValue, stringUtils.getReversedMmer(minValue, pivotLen)) % numOfBlocks; + return stringUtils.getNormalizedValue(minValue, pivotLen) % numOfBlocks; } private void tryCreateWriterForPmer(int prepos) throws IOException { diff --git a/src/dumbo/StringUtils.java b/src/dumbo/StringUtils.java index 2ee9fcc..98f57b4 100644 --- a/src/dumbo/StringUtils.java +++ b/src/dumbo/StringUtils.java @@ -58,7 +58,7 @@ public int getReversedMmer(int x, int length) { return rev; } - public String getCanon(String line) { + public String getCanonical(String line) { String x = new String(getReversedRead(line.toCharArray())); for (int i = 0; i < line.length(); i++) { if (line.charAt(i) < x.charAt(i)) @@ -68,4 +68,8 @@ else if (line.charAt(i) > x.charAt(i)) } return x; } + + public int getNormalizedValue(int minValue, int pivotLength) { + return Math.min(minValue, getReversedMmer(minValue, pivotLength)); + } } From c5fbfb8357aa6fc8dd20c25233b75acdb158f2a5 Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 31 Mar 2021 16:59:59 +0300 Subject: [PATCH 25/44] checkpoint for very good version --- src/dumbo/ExportUtils.java | 46 +++++++++++++++++++++++++++++--- src/dumbo/MinimizerCounter.java | 5 ++-- src/dumbo/OrderingOptimizer.java | 42 ++++++++--------------------- 3 files changed, 56 insertions(+), 37 deletions(-) diff --git a/src/dumbo/ExportUtils.java b/src/dumbo/ExportUtils.java index e69f84d..d3aee6d 100644 --- a/src/dumbo/ExportUtils.java +++ b/src/dumbo/ExportUtils.java @@ -1,11 +1,9 @@ package dumbo; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; +import java.io.*; import java.util.AbstractMap; import java.util.HashMap; +import java.util.LinkedList; public class ExportUtils { public void exportOrderingForCpp(long[] currentOrdering) { @@ -34,6 +32,38 @@ public void exportOrderingForCpp(long[] currentOrdering) { } } +// public long[] importOrdering(String fileName, int pivotLength) throws Exception { +// String line; +// LinkedList ranks = new LinkedList<>(); +// +// File file = new File(fileName); +// BufferedReader bfr = null; +// +// try { +// bfr = new BufferedReader(new FileReader(file)); +// while ((line = bfr.readLine()) != null) { +// ranks.add(Long.getLong(line)); +// } +// +// } catch (IOException e) { +// e.printStackTrace(); +// } finally { +// bfr.close(); +// } +// +// if (ranks.size() != (int) Math.pow(4, pivotLength)) { +// throw new Exception("rank file of wrong size"); +// } +// int i = 0; +// long[] ordering = new long[(int) Math.pow(4, pivotLength)]; +// while (ranks.size() > 0) { +// ordering[i] = ranks.pop(); +// i++; +// } +// return ordering; +// +// } + public void exportBinningForCpp(long[] statFrequency) { File file = new File("freq.txt"); @@ -73,6 +103,14 @@ public HashMap getBytesPerFile() { return bytesPerFile; } + public void writeToFile(long[] arr, String fileName) { + HashMap map = new HashMap<>(); + for (long i = 0; i < arr.length; i++) { + map.put(i, arr[(int)i]); + } + writeToFile(map, fileName); + } + public void writeToFile(AbstractMap data, String fileName) { File file = new File(fileName); diff --git a/src/dumbo/MinimizerCounter.java b/src/dumbo/MinimizerCounter.java index 8b62948..2b73c7b 100644 --- a/src/dumbo/MinimizerCounter.java +++ b/src/dumbo/MinimizerCounter.java @@ -62,16 +62,17 @@ private long[] getMinimizersCounters() throws IOException { return minimizerCounters.clone(); } - public void Run() throws Exception { + public long[] Run() throws Exception { long time1 = 0; long t1 = System.currentTimeMillis(); System.out.println("Minimizers counting Begin!"); System.out.println("hi"); - getMinimizersCounters(); + long[] counters = getMinimizersCounters(); long t2 = System.currentTimeMillis(); time1 = (t2 - t1) / 1000; System.out.println("Time used for counting minimizers appearances: " + time1 + " seconds!"); + return counters; } } \ No newline at end of file diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index d44fd24..83da21a 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -8,6 +8,7 @@ import java.io.FileWriter; import java.io.IOException; import java.util.AbstractMap; +import java.util.Arrays; import java.util.HashMap; public class OrderingOptimizer { @@ -16,26 +17,20 @@ public static void main(String[] args) throws IOException { String infile = null; - int k = 60, pivot_len = 8, bufferSize = 81920, numThreads = 20, hsmapCapacity = 10000000; + int k = 60, pivot_len = 8, bufferSize = 81920; int readLen = 124; - int numBlocks = (int) Math.pow(4, pivot_len);//256; 1000;// -// boolean readable = false; String orderingName = "uhs_sig_freq"; - int xor = 0; //11101101; int numRounds = 0, elementsToPush = 0, samplesPerRound = 0, statSamples = 0; double punishPercentage = 1; String version = "10"; - int partitionData = 0; + String kmerSetFile = null; if (args.length > 0 && args[0].equals("-help")) { System.out.print("Usage: java -jar BuildDeBruijnGraph.jar -in InputPath -k k -L readLength[options]\n" + "Options Available: \n" + - "[-NB numOfBlocks] : (Integer) Number Of Kmer Blocks. Default: 256" + "\n" + "[-p pivotLength] : (Integer) Pivot Length. Default: 12" + "\n" + - "[-t numOfThreads] : (Integer) Number Of Threads. Default: 1" + "\n" + "[-b bufferSize] : (Integer) Read/Writer Buffer Size. Default: 8192" + "\n" + - "[-o order] : lexico or sig or uhs or uhs_sig" + "\n" + - "[-r readable] : (Boolean) Output Format: true means readable text, false means binary. Default: false" + "\n"); + "[-o order] : lexico or sig or uhs or uhs_sig" + "\n"); return; } @@ -46,10 +41,8 @@ else if (args[i].equals("-v")) version = args[i + 1]; else if (args[i].equals("-k")) k = new Integer(args[i + 1]); - else if (args[i].equals("-NB")) - numBlocks = new Integer(args[i + 1]); - else if (args[i].equals("-partition")) - partitionData = new Integer(args[i + 1]); + else if (args[i].equals("-kmers-file")) + kmerSetFile = args[i + 1]; // else // if(args[i].equals("-o")) // orderingName = args[i+1]; @@ -59,10 +52,6 @@ else if (args[i].equals("-b")) bufferSize = new Integer(args[i + 1]); else if (args[i].equals("-L")) readLen = new Integer(args[i + 1]); - else if (args[i].equals("-t")) - numThreads = new Integer(args[i + 1]); -// else if(args[i].equals("-r")) -// readable = new Boolean(args[i+1]); else if (args[i].equals("-rounds")) numRounds = new Integer(args[i + 1]); else if (args[i].equals("-samples")) @@ -137,31 +126,22 @@ else if (args[i].equals("-punishPercentage")) break; } - if (partitionData == 1) { + if (kmerSetFile != null) { try { ExportUtils exportUtils = new ExportUtils(); System.out.println("Program Configuration:"); System.out.print("Input File: " + infile + "\n" + "Kmer Length: " + k + "\n" + - "Read Length: " + readLen + "\n" + "Pivot Length: " + pivot_len + "\n" + - "# Of Threads: " + numThreads + "\n" + "R/W Buffer Size: " + bufferSize + "\n" + "Ordering: " + orderingName + "\n"); - Partition partition = new Partition(k, infile, numBlocks, pivot_len, bufferSize, readLen, (IOrderingPP) ordering); - Map map = new Map(k, (int) Math.pow(4, pivot_len), bufferSize, hsmapCapacity); + MinimizerCounter minimizerCounter = new MinimizerCounter(k, kmerSetFile, pivot_len, bufferSize, ordering); + long[] counters = minimizerCounter.Run(); + exportUtils.writeToFile(counters, orderingName + pivot_len + "_" + "kmers"); + System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + Arrays.stream(counters).sum()); - partition.Run(); - - AbstractMap distinctKmersPerPartition = map.Run(numThreads); - exportUtils.writeToFile(distinctKmersPerPartition, orderingName + pivot_len + "_" + "kmers"); - System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + distinctKmersPerPartition.values().stream().mapToLong(Long::longValue).sum()); - - HashMap bytesPerFile = exportUtils.getBytesPerFile(); - exportUtils.writeToFile(bytesPerFile, orderingName + pivot_len + "_" + "bytes"); - } catch (Exception E) { System.out.println("Exception caught!"); E.printStackTrace(); From 04c413245603b560891b9db2ce8a2728f1ad085e Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 31 Mar 2021 18:13:55 +0300 Subject: [PATCH 26/44] remove IterativeOrdering9_WithCounterNormalized.java, it is contained in IterativeOrdering9_WithCounterNormalized_AndSignature --- ...rativeOrdering9_WithCounterNormalized.java | 302 ------------------ ...g9_WithCounterNormalized_AndSignature.java | 29 +- src/dumbo/OrderingOptimizer.java | 29 +- 3 files changed, 32 insertions(+), 328 deletions(-) delete mode 100644 src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized.java diff --git a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized.java b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized.java deleted file mode 100644 index e0a49c0..0000000 --- a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized.java +++ /dev/null @@ -1,302 +0,0 @@ -package dumbo.Ordering; - -import dumbo.StringUtils; - -import java.io.*; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; - -public class IterativeOrdering9_WithCounterNormalized implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - private HashMap> frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - private double percentagePunishment; - - private Integer[] temp = null; - private int mask; - private long[] statFrequency; - - public IterativeOrdering9_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeOrdering9_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeOrdering9_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.percentagePunishment = percentagePunishment; - this.mask = (int) Math.pow(4, pivotLength) - 1; - } - - public String getCanon(String line) { - String x = new String(stringUtils.getReversedRead(line.toCharArray())); - for (int i = 0; i < line.length(); i++) { - if (line.charAt(i) < x.charAt(i)) - return line; - else if (line.charAt(i) > x.charAt(i)) - return x; - } - return x; - } - - - public void initFrequency() throws IOException { - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - statFrequency = new long[(int) Math.pow(4, pivotLength)]; -// HashSet[] pmerFrequency; -// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; - HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue, minValueNormalized; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - String line = new String(lineCharArray); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = Math.min(minValue, getReversed(minValue)); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(0, k))); // += 1; - - if (roundNumber == rounds) statFrequency[minValueNormalized]++; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = Math.min(minValue, getReversed(minValue)); - - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; - if (roundNumber == rounds) statFrequency[minValueNormalized]++; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - minValueNormalized = Math.min(minValue, getReversed(minValue)); - - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; - if (roundNumber == rounds) statFrequency[minValueNormalized]++; - } - } - - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; - if (roundNumber == rounds) statFrequency[minValueNormalized]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); -// if(roundNumber % 100 == 0) { -// percentagePunishment *= 0.996; -// normalize(); -// } - pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - } - - - private void adaptOrdering(HashMap> pmerFrequency) { - int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; - for (Integer i : pmerFrequency.keySet()) { - frequencies[i] = pmerFrequency.get(i).size(); - } - for (int i = 0; i < elementsToPush; i++) { - long biggest = -1; - int biggestIndex = -1; - for (int k = 0; k < frequencies.length; k++) { - if (frequencies[k] > biggest) { - biggest = frequencies[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - frequencies[biggestIndex] = 0; - frequencies[getReversed(biggestIndex)] = 0; - } - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - if (temp == null) { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for (int i = 0; i < temp.length; i++) { - currentOrdering[i] = temp[i]; - } - } - - - public void exportOrderingForCpp() { - File file = new File("ranks.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < currentOrdering.length; i++) { - bf.write(Long.toString(currentOrdering[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } - - public void exportBinningForCpp() { - File file = new File("freq.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < statFrequency.length; i++) { - bf.write(Long.toString(statFrequency[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } -} diff --git a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java index dfa2cc5..94ccb7b 100644 --- a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java +++ b/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java @@ -30,19 +30,20 @@ public class IterativeOrdering9_WithCounterNormalized_AndSignature implements IO private long[] statFrequency; private int numMmers; + private boolean useSignature; public IterativeOrdering9_WithCounterNormalized_AndSignature( int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, - int elementsToPush, int statisticsSamples, double percentagePunishment) - { + int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature) { this.roundSamples = roundSamples; this.rounds = rounds; this.elementsToPush = elementsToPush; this.statisticsSamples = statisticsSamples; this.percentagePunishment = percentagePunishment; numMmers = (int) Math.pow(4, pivotLength); - this.mask = numMmers - 1; + this.useSignature = useSignature; + this.mask = numMmers - 1; this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; @@ -55,11 +56,10 @@ public IterativeOrdering9_WithCounterNormalized_AndSignature( public IterativeOrdering9_WithCounterNormalized_AndSignature( int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, - int elementsToPush, int statisticsSamples, double percentagePunishment, long[] initialOrdering) - { - this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment); + int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, long[] initialOrdering) { + this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); currentOrdering = initialOrdering.clone(); - if(currentOrdering.length != numMmers) + if (currentOrdering.length != numMmers) throw new IllegalArgumentException("initialOrdering is not of correct size"); } @@ -72,10 +72,12 @@ public void initFrequency() throws IOException { currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] = canonical; } - for (int i = 0; i < numMmers; i++) { - if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { - currentOrdering[i] += numMmers; - currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; + if (useSignature) { + for (int i = 0; i < numMmers; i++) { + if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { + currentOrdering[i] += numMmers; + currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; + } } } @@ -171,8 +173,6 @@ private void updateStatistics(int roundNumber, HashMap> } - - private void adaptOrdering(HashMap> pmerFrequency) { int[] frequencies = new int[numMmers]; for (Integer i : pmerFrequency.keySet()) { @@ -203,8 +203,7 @@ public int findSmallest(char[] a, int from, int to) throws IOException { int currentValue = minValue; for (int i = from + 1; i <= to - pivotLength; i++) { currentValue = ((currentValue << 2) + StringUtils.valTable[a[i + pivotLength - 1] - 'A']) & mask; - if (strcmp(minValue, currentValue) > 0) - { + if (strcmp(minValue, currentValue) > 0) { min_pos = i; minValue = currentValue; } diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 83da21a..0c9fc2b 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -69,6 +69,14 @@ else if (args[i].equals("-punishPercentage")) } } + System.out.println("Optimizing an ordering:"); + System.out.print("Input File: " + kmerSetFile + "\n" + + "Kmer Length: " + k + "\n" + + "Pivot Length: " + pivot_len + "\n" + + "R/W Buffer Size: " + bufferSize + "\n" + + "Read length" + readLen + "\n" + + "Ordering: " + orderingName + "\n"); + orderingName = "iterativeOrdering"; @@ -78,18 +86,18 @@ else if (args[i].equals("-punishPercentage")) switch (version) { case "9-normalized": // good version - IterativeOrdering9_WithCounterNormalized ordering9_withCounterNormalized = new IterativeOrdering9_WithCounterNormalized(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); - ordering9_withCounterNormalized.initFrequency(); - ordering9_withCounterNormalized.exportOrderingForCpp(); - ordering9_withCounterNormalized.exportBinningForCpp(); -// ordering = ordering9_withCounterNormalized; + IterativeOrdering9_WithCounterNormalized_AndSignature iterative = new IterativeOrdering9_WithCounterNormalized_AndSignature(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, false); + iterative.initFrequency(); +// ordering9_withCounterNormalized.exportOrderingForCpp(); +// ordering9_withCounterNormalized.exportBinningForCpp(); + ordering = iterative; break; case "9-normalized-signature": // - IterativeOrdering9_WithCounterNormalized_AndSignature ordering9_withCounterNormalized_andSignature = new IterativeOrdering9_WithCounterNormalized_AndSignature(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); - ordering9_withCounterNormalized_andSignature.initFrequency(); + IterativeOrdering9_WithCounterNormalized_AndSignature iterativeSignature = new IterativeOrdering9_WithCounterNormalized_AndSignature(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, true); + iterativeSignature.initFrequency(); // ordering9_withCounterNormalized_andSignature.exportOrderingForCpp(); // ordering9_withCounterNormalized_andSignature.exportBinningForCpp(); - ordering = ordering9_withCounterNormalized_andSignature; + ordering = iterativeSignature; System.out.println("lolz asdasd"); break; case "10": @@ -129,8 +137,8 @@ else if (args[i].equals("-punishPercentage")) if (kmerSetFile != null) { try { ExportUtils exportUtils = new ExportUtils(); - System.out.println("Program Configuration:"); - System.out.print("Input File: " + infile + "\n" + + System.out.println("Counting minimizer appearances:"); + System.out.print("Input File: " + kmerSetFile + "\n" + "Kmer Length: " + k + "\n" + "Pivot Length: " + pivot_len + "\n" + "R/W Buffer Size: " + bufferSize + "\n" + @@ -150,5 +158,4 @@ else if (args[i].equals("-punishPercentage")) } - } From 079b2ef3d1975045e662a601ea0983073b3c1fa1 Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 31 Mar 2021 18:15:19 +0300 Subject: [PATCH 27/44] change name to IterativeOrdering - this is the main product --- ...lized_AndSignature.java => IterativeOrdering.java} | 6 +++--- src/dumbo/OrderingOptimizer.java | 11 ++++------- 2 files changed, 7 insertions(+), 10 deletions(-) rename src/dumbo/Ordering/{IterativeOrdering9_WithCounterNormalized_AndSignature.java => IterativeOrdering.java} (97%) diff --git a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java b/src/dumbo/Ordering/IterativeOrdering.java similarity index 97% rename from src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java rename to src/dumbo/Ordering/IterativeOrdering.java index 94ccb7b..d44684a 100644 --- a/src/dumbo/Ordering/IterativeOrdering9_WithCounterNormalized_AndSignature.java +++ b/src/dumbo/Ordering/IterativeOrdering.java @@ -8,7 +8,7 @@ import java.util.HashMap; import java.util.HashSet; -public class IterativeOrdering9_WithCounterNormalized_AndSignature implements IOrderingPP { +public class IterativeOrdering implements IOrderingPP { private String inputFile; private int readLen; private int bufSize; @@ -33,7 +33,7 @@ public class IterativeOrdering9_WithCounterNormalized_AndSignature implements IO private boolean useSignature; - public IterativeOrdering9_WithCounterNormalized_AndSignature( + public IterativeOrdering( int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature) { this.roundSamples = roundSamples; @@ -54,7 +54,7 @@ public IterativeOrdering9_WithCounterNormalized_AndSignature( currentOrdering = new long[(int) Math.pow(4, pivotLength)]; } - public IterativeOrdering9_WithCounterNormalized_AndSignature( + public IterativeOrdering( int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, long[] initialOrdering) { this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 0c9fc2b..aaaffb3 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -3,13 +3,8 @@ import dumbo.Ordering.*; import dumbo.Ordering.UHS.UHSFrequencySignatureOrdering; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; import java.io.IOException; -import java.util.AbstractMap; import java.util.Arrays; -import java.util.HashMap; public class OrderingOptimizer { @@ -86,14 +81,16 @@ else if (args[i].equals("-punishPercentage")) switch (version) { case "9-normalized": // good version - IterativeOrdering9_WithCounterNormalized_AndSignature iterative = new IterativeOrdering9_WithCounterNormalized_AndSignature(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, false); + IterativeOrdering iterative = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, + samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, false); iterative.initFrequency(); // ordering9_withCounterNormalized.exportOrderingForCpp(); // ordering9_withCounterNormalized.exportBinningForCpp(); ordering = iterative; break; case "9-normalized-signature": // - IterativeOrdering9_WithCounterNormalized_AndSignature iterativeSignature = new IterativeOrdering9_WithCounterNormalized_AndSignature(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, true); + IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, + samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, true); iterativeSignature.initFrequency(); // ordering9_withCounterNormalized_andSignature.exportOrderingForCpp(); // ordering9_withCounterNormalized_andSignature.exportBinningForCpp(); From 67b8f53f62b30bb1cfbbcad8dd9005e6c282aa14 Mon Sep 17 00:00:00 2001 From: danflomin Date: Fri, 2 Apr 2021 18:14:06 +0300 Subject: [PATCH 28/44] checkpoint before transforming IOrderingPP to abstract class --- src/dumbo/Ordering/FrequencyOrdering.java | 139 ++++-------------- src/dumbo/Ordering/LexicographicOrdering.java | 39 ++--- .../LexicographicSignatureOrdering.java | 21 +-- src/dumbo/OrderingOptimizer.java | 8 +- 4 files changed, 63 insertions(+), 144 deletions(-) diff --git a/src/dumbo/Ordering/FrequencyOrdering.java b/src/dumbo/Ordering/FrequencyOrdering.java index 2046a99..7525db2 100644 --- a/src/dumbo/Ordering/FrequencyOrdering.java +++ b/src/dumbo/Ordering/FrequencyOrdering.java @@ -4,13 +4,15 @@ import java.io.*; import java.util.Arrays; +import java.util.Comparator; -public class FrequencyOrdering implements IOrdering { +public class FrequencyOrdering implements IOrderingPP { private int pivotLength; private String inputFile; private int readLen; private int bufSize; - private long[] pmerFrequency; + private int[] pmerFrequency; + private int[] currentOrdering; private long[] statsFrequency; private int numSamples; private int numStats; @@ -23,12 +25,13 @@ public FrequencyOrdering(int pivotLen, String infile, int readLen, int bufSize, this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; - pmerFrequency = new long[(int)Math.pow(4, pivotLen)]; + pmerFrequency = new int[(int) Math.pow(4, pivotLen)]; + currentOrdering = new int[(int) Math.pow(4, pivotLen)]; this.numSamples = numSamples; this.numStats = numStats; this.k = k; stringUtils = new StringUtils(); - mask = (int)Math.pow(4, pivotLen) - 1; + mask = (int) Math.pow(4, pivotLen) - 1; } public void initFrequency() throws IOException { @@ -48,18 +51,14 @@ public void initFrequency() throws IOException { bfrG.read(); if (stringUtils.isReadLegal(lineCharArray)) { - char[] revCharArray = stringUtils.getReversedRead(lineCharArray); - for (int i = 0; i < lineCharArray.length-pivotLength; i++) { + for (int i = 0; i < lineCharArray.length - pivotLength; i++) { - int lineValue = stringUtils.getDecimal(lineCharArray, i, i+pivotLength); - pmerFrequency[lineValue] += 1; - - int revValue = stringUtils.getDecimal(revCharArray, i, i+pivotLength); - pmerFrequency[revValue] += 1; + int value = stringUtils.getNormalizedValue(stringUtils.getDecimal(lineCharArray, i, i + pivotLength), pivotLength); + pmerFrequency[value] += 1; counter++; } - if(counter > numSamples){ + if (counter > numSamples) { break; } } @@ -73,6 +72,7 @@ public void initFrequency() throws IOException { } private void initStats(BufferedReader bfrG) throws IOException { +// TODO: FIX int numSampled = 0; boolean keepSample = true; @@ -124,128 +124,51 @@ private void initStats(BufferedReader bfrG) throws IOException { } statsFrequency[minValue]++; } - if(numSampled > numStats) keepSample = false; + if (numSampled > numStats) keepSample = false; } } } - public long[] getRawOrdering() - { - return pmerFrequency.clone(); - } - private void normalize() { Integer[] temp = new Integer[pmerFrequency.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - - Arrays.sort(temp, this::strcmp); - for(int i = 0 ; i pmerFrequency[a])); + for (int i = 0; i < temp.length; i++) { + currentOrdering[temp[i]] = i; } } - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - @Override public int findSmallest(char[] a, int from, int to) throws IOException { int min_pos = from; + int minValue = stringUtils.getDecimal(a, min_pos, min_pos + pivotLength); + int currentValue = minValue; for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) + currentValue = ((currentValue << 2) + StringUtils.valTable[a[i + pivotLength - 1] - 'A']) & mask; + if (strcmp(minValue, currentValue) > 0) { min_pos = i; + minValue = currentValue; + } } return min_pos; } - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - return strcmp(x,y); - } + public int strcmp(int x, int y) { + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + if (a == b) return 0; - public int strcmp(int x, int y) - { - if (x == y) return 0; - if (pmerFrequency[x] == pmerFrequency[y]) { - if(x 0) + int minValue = stringUtils.getDecimal(a, min_pos, min_pos + pivotLength); + int currentValue = minValue; + for (int i = from + 1; i <= to - pivotLength; i++) { + currentValue = ((currentValue << 2) + StringUtils.valTable[a[i + pivotLength - 1] - 'A']) & mask; + if (strcmp(minValue, currentValue) > 0) { min_pos = i; + minValue = currentValue; + } } return min_pos; } - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - for (int i = 0; i < len; i++) { - if (a[froma + i] < b[fromb + i]) - return -1; - else if (a[froma + i] > b[fromb + i]) - return 1; - } - return 0; - } } diff --git a/src/dumbo/Ordering/LexicographicSignatureOrdering.java b/src/dumbo/Ordering/LexicographicSignatureOrdering.java index 25b6a5e..8d89116 100644 --- a/src/dumbo/Ordering/LexicographicSignatureOrdering.java +++ b/src/dumbo/Ordering/LexicographicSignatureOrdering.java @@ -6,21 +6,16 @@ public class LexicographicSignatureOrdering extends LexicographicOrdering { private SignatureUtils signatureUtils; - private StringUtils stringUtils; + public LexicographicSignatureOrdering(int pivotLen) throws IOException { super(pivotLen); signatureUtils = new SignatureUtils(pivotLen); - stringUtils = new StringUtils(); } @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { -// boolean aAllowed = signatureUtils.isAllowed(a, froma, froma + len); -// boolean bAllowed = signatureUtils.isAllowed(b, fromb, fromb + len); - int x = stringUtils.getDecimal(a, froma, froma + pivotLen); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen); - boolean aAllowed = signatureUtils.isAllowed(a, froma, x); - boolean bAllowed = signatureUtils.isAllowed(b, fromb, y); + public int strcmp(int x, int y) { + boolean aAllowed = signatureUtils.isAllowed(x); + boolean bAllowed = signatureUtils.isAllowed(y); if (!aAllowed && bAllowed) { return 1; @@ -28,12 +23,6 @@ public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { return -1; } - for (int i = 0; i < len; i++) { - if (a[froma + i] < b[fromb + i]) - return -1; - else if (a[froma + i] > b[fromb + i]) - return 1; - } - return 0; + return Integer.compare(stringUtils.getNormalizedValue(x, pivotLength), stringUtils.getNormalizedValue(y, pivotLength)); } } diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index aaaffb3..806dbc7 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -123,11 +123,15 @@ else if (args[i].equals("-punishPercentage")) case "frequency": // FREQUENCY SUCKS FrequencyOrdering frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound, statSamples, k); frequencyOrdering.initFrequency(); -// ordering = frequencyOrdering; + ordering = frequencyOrdering; break; case "signature": LexicographicSignatureOrdering signatureOrdering = new LexicographicSignatureOrdering(pivot_len); -// ordering = signatureOrdering; + ordering = signatureOrdering; + break; + case "lexicographic": + LexicographicOrdering lexicographicOrdering = new LexicographicOrdering(pivot_len); + ordering = lexicographicOrdering; break; } From 71c63fa9d7b8d172a15208bdf7d07f24f5b9a5de Mon Sep 17 00:00:00 2001 From: danflomin Date: Fri, 2 Apr 2021 23:27:10 +0300 Subject: [PATCH 29/44] remove some refactor orderings to OrderingBase move some to folder --- src/dumbo/Map.java | 185 ----------- src/dumbo/MinimizerCounter.java | 6 +- src/dumbo/Ordering/FrequencyOrdering.java | 31 +- src/dumbo/Ordering/IOrdering.java | 9 - src/dumbo/Ordering/IOrderingPP.java | 8 - src/dumbo/Ordering/IterativeOrdering.java | 90 ++--- ...ativeOrdering10_WithCounterNormalized.txt} | 0 src/dumbo/Ordering/IterativeUHSOrdering8.java | 313 ------------------ src/dumbo/Ordering/IterativeUHSOrdering9.java | 309 ----------------- src/dumbo/Ordering/LexicographicOrdering.java | 39 --- src/dumbo/Ordering/OrderingBase.java | 55 +++ src/dumbo/Ordering/RandomOrdering.java | 38 --- .../Standard/LexicographicOrdering.java | 19 ++ .../LexicographicSignatureOrdering.java | 8 +- .../Ordering/Standard/RandomOrdering.java | 21 ++ .../{ => Standard}/SignatureUtils.java | 2 +- .../UHS/UHSFrequencySignatureOrdering.java | 111 ++----- src/dumbo/Ordering/UHS/UHSOrderingBase.java | 106 ++---- .../Ordering/UHS/UHSSignatureOrdering.java | 80 ++--- src/dumbo/Ordering/UHS/YaelUHSOrdering.java | 142 -------- src/dumbo/OrderingOptimizer.java | 33 +- src/dumbo/Partition.java | 180 ---------- 22 files changed, 242 insertions(+), 1543 deletions(-) delete mode 100644 src/dumbo/Map.java delete mode 100644 src/dumbo/Ordering/IOrdering.java delete mode 100644 src/dumbo/Ordering/IOrderingPP.java rename src/dumbo/Ordering/{IterativeOrdering10_WithCounterNormalized.java => IterativeOrdering10_WithCounterNormalized.txt} (100%) delete mode 100644 src/dumbo/Ordering/IterativeUHSOrdering8.java delete mode 100644 src/dumbo/Ordering/IterativeUHSOrdering9.java delete mode 100644 src/dumbo/Ordering/LexicographicOrdering.java create mode 100644 src/dumbo/Ordering/OrderingBase.java delete mode 100644 src/dumbo/Ordering/RandomOrdering.java create mode 100644 src/dumbo/Ordering/Standard/LexicographicOrdering.java rename src/dumbo/Ordering/{ => Standard}/LexicographicSignatureOrdering.java (83%) create mode 100644 src/dumbo/Ordering/Standard/RandomOrdering.java rename src/dumbo/Ordering/{ => Standard}/SignatureUtils.java (98%) delete mode 100644 src/dumbo/Ordering/UHS/YaelUHSOrdering.java delete mode 100644 src/dumbo/Partition.java diff --git a/src/dumbo/Map.java b/src/dumbo/Map.java deleted file mode 100644 index 3cc12a1..0000000 --- a/src/dumbo/Map.java +++ /dev/null @@ -1,185 +0,0 @@ -package dumbo; - -import java.io.*; -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.CountDownLatch; - - -public class Map { - - private int k; - private int numOfBlocks; - private int bufSize; - - private Object lock_blocks = new Object(); - - private int capacity; - - private int blockID; - - private StringUtils stringUtils; - - private static int[] valTable = StringUtils.valTable; - - public Map(int kk, int numberOfBlocks, int bufferSize, int HScapacity) { - this.k = kk; - this.numOfBlocks = numberOfBlocks; - this.bufSize = bufferSize; - this.capacity = HScapacity; - this.blockID = 0; - stringUtils = new StringUtils(); - } - - public class MyThread extends Thread { - private CountDownLatch threadsSignal; - private HashSet fileNames; - private ConcurrentHashMap distinctKmersPerPartition; - - public MyThread(CountDownLatch threadsSignal, HashSet fileNames, ConcurrentHashMap distinctKmersPerPartition) { - super(); - this.threadsSignal = threadsSignal; - this.fileNames = fileNames; - this.distinctKmersPerPartition = distinctKmersPerPartition; - } - - @Override - public void run() { - System.out.println(Thread.currentThread().getName() + "Start..."); - - FileReader fr; - BufferedReader bfr; - - String line; - - int p, j; - - try { - File dir = new File("Maps"); - if (!dir.exists()) - dir.mkdir(); - - while (blockID < numOfBlocks) { - - synchronized (lock_blocks) { - p = blockID; - blockID++; - } - - String filename = "nodes" + p; - if (!fileNames.contains(filename)) { - continue; - } - - - fr = new FileReader("Nodes/nodes" + p); - bfr = new BufferedReader(fr, bufSize); - - - HashSet nodes = new HashSet(capacity); - - while ((line = bfr.readLine()) != null) { - - int bound = line.length() - k + 1; - - for (j = 0; j < bound; j++) { - String reg = line.substring(j, j + k); - String rev = new String(stringUtils.getReversedRead(reg.toCharArray())); - if (reg.equals(rev)) { - nodes.add(rev); - } else { - boolean didAdd = false; - for (int i = 0; i < k; i++) { - if (rev.charAt(i) < reg.charAt(i)) { - nodes.add(rev); - didAdd = true; - break; - } else if (reg.charAt(i) < rev.charAt(i)) { - nodes.add(rev); - didAdd = true; - break; - } - } - if (!didAdd) - nodes.add(reg); - } - } - - } - - if (p % 100 == 0) System.out.println(p); - - distinctKmersPerPartition.put((long) p, (long) nodes.size()); - - for (String s: nodes) { - - } - - nodes.clear(); - nodes = null; - - - bfr.close(); - fr.close(); - bfr = null; - fr = null; - - File myObj = new File("Nodes/nodes" + p); - if (!myObj.delete()) - System.out.println("Failed to delete the file." + p); - } - - } catch (Exception E) { - System.out.println("Exception caught!"); - E.printStackTrace(); - } - - threadsSignal.countDown(); - System.out.println(Thread.currentThread().getName() + "End. Remaining" + threadsSignal.getCount() + " threads"); - - } - } - - - private AbstractMap BuildMap(int threadNum, HashSet fileNames) throws Exception { - CountDownLatch threadSignal = new CountDownLatch(threadNum); - - ConcurrentHashMap distinctKmersPerPartition = new ConcurrentHashMap<>(); - - for (int i = 0; i < threadNum; i++) { - Thread t = new MyThread(threadSignal, fileNames, distinctKmersPerPartition); - t.start(); - } - threadSignal.await(); - System.out.println(Thread.currentThread().getName() + "End."); - return distinctKmersPerPartition; - } - - public AbstractMap Run(int numThreads) throws Exception { - long time1 = 0; - - HashSet fileNames = getNodesFileNames(); - - long t1 = System.currentTimeMillis(); - System.out.println("Build Maps Begin!"); - AbstractMap distinctKmersPerPartition = BuildMap(numThreads, fileNames); - long t2 = System.currentTimeMillis(); - time1 = (t2 - t1) / 1000; - System.out.println("Time used for building maps: " + time1 + " seconds!"); - - return distinctKmersPerPartition; - - } - - private HashSet getNodesFileNames() { - File[] files = (new File("./Nodes")).listFiles(); - List fileNames = new LinkedList<>(); - for (File file : files) { - if (file.isFile()) { - fileNames.add(file.getName()); - } - } - return new HashSet<>(fileNames); - } - -} \ No newline at end of file diff --git a/src/dumbo/MinimizerCounter.java b/src/dumbo/MinimizerCounter.java index 2b73c7b..3c5f1c8 100644 --- a/src/dumbo/MinimizerCounter.java +++ b/src/dumbo/MinimizerCounter.java @@ -1,6 +1,6 @@ package dumbo; -import dumbo.Ordering.IOrderingPP; +import dumbo.Ordering.OrderingBase; import java.io.*; @@ -14,14 +14,14 @@ public class MinimizerCounter { private FileReader frG; private BufferedReader bfrG; - private IOrderingPP ordering; + private OrderingBase ordering; private StringUtils stringUtils; private long[] minimizerCounters; - public MinimizerCounter(int kk, String kmerSetFile, int pivotLength, int bufferSize, IOrderingPP ordering) { + public MinimizerCounter(int kk, String kmerSetFile, int pivotLength, int bufferSize, OrderingBase ordering) { this.k = kk; this.kmerSetFile = kmerSetFile; this.pivotLen = pivotLength; diff --git a/src/dumbo/Ordering/FrequencyOrdering.java b/src/dumbo/Ordering/FrequencyOrdering.java index 7525db2..50677fe 100644 --- a/src/dumbo/Ordering/FrequencyOrdering.java +++ b/src/dumbo/Ordering/FrequencyOrdering.java @@ -6,8 +6,7 @@ import java.util.Arrays; import java.util.Comparator; -public class FrequencyOrdering implements IOrderingPP { - private int pivotLength; +public class FrequencyOrdering extends OrderingBase { private String inputFile; private int readLen; private int bufSize; @@ -17,11 +16,9 @@ public class FrequencyOrdering implements IOrderingPP { private int numSamples; private int numStats; private int k; - private StringUtils stringUtils; - private int mask; public FrequencyOrdering(int pivotLen, String infile, int readLen, int bufSize, int numSamples, int numStats, int k) { - pivotLength = pivotLen; + super(pivotLen); this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; @@ -30,8 +27,6 @@ public FrequencyOrdering(int pivotLen, String infile, int readLen, int bufSize, this.numSamples = numSamples; this.numStats = numStats; this.k = k; - stringUtils = new StringUtils(); - mask = (int) Math.pow(4, pivotLen) - 1; } public void initFrequency() throws IOException { @@ -115,7 +110,7 @@ private void initStats(BufferedReader bfrG) throws IOException { statsFrequency[minValue]++; } else { int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { + if (compareMmer(currentValue, minValue) < 0) { min_pos = lastIndexInWindow; minValue = currentValue; @@ -142,29 +137,17 @@ private void normalize() { } - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - int minValue = stringUtils.getDecimal(a, min_pos, min_pos + pivotLength); - int currentValue = minValue; - for (int i = from + 1; i <= to - pivotLength; i++) { - currentValue = ((currentValue << 2) + StringUtils.valTable[a[i + pivotLength - 1] - 'A']) & mask; - if (strcmp(minValue, currentValue) > 0) { - min_pos = i; - minValue = currentValue; - } - } - return min_pos; + public int[] getRanks() { + return currentOrdering.clone(); } - - public int strcmp(int x, int y) { + public int compareMmer(int x, int y) { int a = stringUtils.getNormalizedValue(x, pivotLength); int b = stringUtils.getNormalizedValue(y, pivotLength); if (a == b) return 0; - if (pmerFrequency[x] < pmerFrequency[y]) + if (pmerFrequency[a] < pmerFrequency[b]) return -1; else return 1; diff --git a/src/dumbo/Ordering/IOrdering.java b/src/dumbo/Ordering/IOrdering.java deleted file mode 100644 index 1f29130..0000000 --- a/src/dumbo/Ordering/IOrdering.java +++ /dev/null @@ -1,9 +0,0 @@ -package dumbo.Ordering; - -import java.io.IOException; - -public interface IOrdering { - - int findSmallest(char[] a, int from, int to) throws IOException; - int strcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException; -} diff --git a/src/dumbo/Ordering/IOrderingPP.java b/src/dumbo/Ordering/IOrderingPP.java deleted file mode 100644 index b5e1cfe..0000000 --- a/src/dumbo/Ordering/IOrderingPP.java +++ /dev/null @@ -1,8 +0,0 @@ -package dumbo.Ordering; - -import java.io.IOException; - -public interface IOrderingPP { - int strcmp(int x, int y); - int findSmallest(char[] a, int from, int to) throws IOException; -} diff --git a/src/dumbo/Ordering/IterativeOrdering.java b/src/dumbo/Ordering/IterativeOrdering.java index d44684a..d0fe2b0 100644 --- a/src/dumbo/Ordering/IterativeOrdering.java +++ b/src/dumbo/Ordering/IterativeOrdering.java @@ -1,5 +1,6 @@ package dumbo.Ordering; +import dumbo.Ordering.Standard.SignatureUtils; import dumbo.StringUtils; import java.io.*; @@ -8,14 +9,12 @@ import java.util.HashMap; import java.util.HashSet; -public class IterativeOrdering implements IOrderingPP { +public class IterativeOrdering extends OrderingBase { private String inputFile; private int readLen; private int bufSize; - private int pivotLength; private int k; - private long[] currentOrdering; - private StringUtils stringUtils; + private int[] currentOrdering; private SignatureUtils signatureUtils; private HashMap> frequency; @@ -26,61 +25,77 @@ public class IterativeOrdering implements IOrderingPP { private double percentagePunishment; - private int mask; private long[] statFrequency; - private int numMmers; private boolean useSignature; + private boolean initialized; + public IterativeOrdering( int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature) { + super(pivotLength); this.roundSamples = roundSamples; this.rounds = rounds; this.elementsToPush = elementsToPush; this.statisticsSamples = statisticsSamples; this.percentagePunishment = percentagePunishment; - numMmers = (int) Math.pow(4, pivotLength); this.useSignature = useSignature; - this.mask = numMmers - 1; this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; - this.pivotLength = pivotLength; this.k = k; - stringUtils = new StringUtils(); signatureUtils = new SignatureUtils(pivotLength); - currentOrdering = new long[(int) Math.pow(4, pivotLength)]; + currentOrdering = new int[(int) Math.pow(4, pivotLength)]; + initialized = false; } public IterativeOrdering( int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, - int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, long[] initialOrdering) { + int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, int[] initialOrdering) { this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); currentOrdering = initialOrdering.clone(); + initialized = true; + badArgumentsThrow(); + } + + public IterativeOrdering( + int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, + int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, OrderingBase initialOrdering) throws IOException { + this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); + currentOrdering = initialOrdering.getRanks().clone(); + initialized = true; + badArgumentsThrow(); + } + + private void badArgumentsThrow() { if (currentOrdering.length != numMmers) throw new IllegalArgumentException("initialOrdering is not of correct size"); + if (useSignature) + throw new IllegalArgumentException("Can't initialize ordering from outside with useSignature as true"); } public void initFrequency() throws IOException { - for (int i = 0; i < numMmers; i++) { - int canonical = Math.min(i, stringUtils.getReversedMmer(i, pivotLength)); - currentOrdering[i] = canonical; - currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] = canonical; - } - - if (useSignature) { + if (!initialized) { for (int i = 0; i < numMmers; i++) { - if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { - currentOrdering[i] += numMmers; - currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; + int canonical = Math.min(i, stringUtils.getReversedMmer(i, pivotLength)); + currentOrdering[i] = canonical; + currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] = canonical; + } + if (useSignature) { + for (int i = 0; i < numMmers; i++) { + if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { + currentOrdering[i] += numMmers; + currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; + } } } } + boolean keepSample = true; int numSampled = 0; int roundNumber = 0; @@ -128,7 +143,7 @@ public void initFrequency() throws IOException { updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, i); } else { int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { + if (compareMmer(currentValue, minValue) < 0) { min_pos = lastIndexInWindow; minValue = currentValue; minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); @@ -187,7 +202,8 @@ private void adaptOrdering(HashMap> pmerFrequency) { biggestIndex = k; } } - long newRank = currentOrdering[biggestIndex] + (int) (numMmers * percentagePunishment); +// TODO: might not be necessary to change both. + int newRank = currentOrdering[biggestIndex] + (int) (numMmers * percentagePunishment); currentOrdering[biggestIndex] = newRank; currentOrdering[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = newRank; frequencies[biggestIndex] = 0; @@ -197,27 +213,17 @@ private void adaptOrdering(HashMap> pmerFrequency) { @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - int minValue = stringUtils.getDecimal(a, min_pos, min_pos + pivotLength); - int currentValue = minValue; - for (int i = from + 1; i <= to - pivotLength; i++) { - currentValue = ((currentValue << 2) + StringUtils.valTable[a[i + pivotLength - 1] - 'A']) & mask; - if (strcmp(minValue, currentValue) > 0) { - min_pos = i; - minValue = currentValue; - } - } - - return min_pos; + public int compareMmer(int x, int y) { + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + if (a == b) return 0; + if (currentOrdering[a] < currentOrdering[b]) return -1; + return 1; } - @Override - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; + public int[] getRanks() { + return currentOrdering.clone(); } private void normalize() { diff --git a/src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.java b/src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.txt similarity index 100% rename from src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.java rename to src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.txt diff --git a/src/dumbo/Ordering/IterativeUHSOrdering8.java b/src/dumbo/Ordering/IterativeUHSOrdering8.java deleted file mode 100644 index feb5702..0000000 --- a/src/dumbo/Ordering/IterativeUHSOrdering8.java +++ /dev/null @@ -1,313 +0,0 @@ -package dumbo.Ordering; - -import dumbo.StringUtils; - -import java.io.*; -import java.util.Arrays; -import java.util.Comparator; - -public class IterativeUHSOrdering8 implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - private long[] frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - private double maskRatio; - private double percentagePunishment; - - Integer[] temp = null; - - byte[] UHSElements; - private int sizeOfUHS; - - private int mask; - - public IterativeUHSOrdering8(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeUHSOrdering8(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeUHSOrdering8(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double maskRatio, double percentagePunishment) throws IOException { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.maskRatio = maskRatio; - this.percentagePunishment = percentagePunishment; - this.UHSElements = uhsBitSet(); - this.mask = (int)Math.pow(4, pivotLength) - 1; - } - - - public void initFrequency() throws IOException { - int rank = 1; - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - if(UHSElements[i] == 1 && currentOrdering[i] == 0) - { - currentOrdering[i] = rank; - currentOrdering[getReversed(i)] = rank; - rank++; - } - else - { - currentOrdering[i] = Long.MAX_VALUE-i; - } - } - sizeOfUHS = rank; - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - long[] pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - ; - pmerFrequency[minValue] += 1; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - pmerFrequency[minValue] += 1; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - pmerFrequency[minValue] += 1; - } - } - - pmerFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); - if(roundNumber % 100 == 0) - percentagePunishment *= 0.996; - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - for(int i = 0 ; i biggest) { - biggest = pmerFrequency[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) (sizeOfUHS * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - pmerFrequency[biggestIndex] = 0; - pmerFrequency[getReversed(biggestIndex)] = 0; - } - - //normalize(); - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - if(temp == null) - { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for(int i = 0 ; i min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - pmerFrequency[minValue] += 1; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - pmerFrequency[minValue] += 1; - } - } - - pmerFrequency[minValue]++; - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); - if(roundNumber % 100 == 0) - percentagePunishment *= 0.996; - pmerFrequency = new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - for(int i = 0 ; i biggest) { - biggest = pmerFrequency[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) (sizeOfUHS * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - pmerFrequency[biggestIndex] = 0; - pmerFrequency[getReversed(biggestIndex)] = 0; - } - - //normalize(); - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - if(temp == null) - { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for(int i = 0 ; i 0) { - min_pos = i; - minValue = currentValue; - } - } - - return min_pos; - } - -} diff --git a/src/dumbo/Ordering/OrderingBase.java b/src/dumbo/Ordering/OrderingBase.java new file mode 100644 index 0000000..502c235 --- /dev/null +++ b/src/dumbo/Ordering/OrderingBase.java @@ -0,0 +1,55 @@ +package dumbo.Ordering; + +import dumbo.StringUtils; + +import java.io.IOException; +import java.util.Arrays; + +public abstract class OrderingBase { + + protected int pivotLength; + protected int numMmers; + protected int mask; + + protected StringUtils stringUtils; + + public OrderingBase(int pivotLength) { + this.pivotLength = pivotLength; + this.numMmers = (int) Math.pow(4, pivotLength); + this.mask = numMmers - 1; + this.stringUtils = new StringUtils(); + } + + + public abstract int compareMmer(int x, int y); + + public int[] getRanks() { + Integer[] ranks = new Integer[numMmers]; + for (int i = 0; i < ranks.length; i++) { + ranks[i] = i; + } + + Arrays.sort(ranks, this::compareMmer); + + int[] primitveRanks = new int[numMmers]; + for (int i = 0; i < ranks.length; i++) { + primitveRanks[i] = ranks[i]; + } + return primitveRanks; + } + + public int findSmallest(char[] a, int from, int to) throws IOException { + int min_pos = from; + int minValue = stringUtils.getDecimal(a, min_pos, min_pos + pivotLength); + int currentValue = minValue; + for (int i = from + 1; i <= to - pivotLength; i++) { + currentValue = ((currentValue << 2) + StringUtils.valTable[a[i + pivotLength - 1] - 'A']) & mask; + if (compareMmer(minValue, currentValue) > 0) { + min_pos = i; + minValue = currentValue; + } + } + + return min_pos; + } +} diff --git a/src/dumbo/Ordering/RandomOrdering.java b/src/dumbo/Ordering/RandomOrdering.java deleted file mode 100644 index b11d5c3..0000000 --- a/src/dumbo/Ordering/RandomOrdering.java +++ /dev/null @@ -1,38 +0,0 @@ -package dumbo.Ordering; - -import dumbo.StringUtils; - -import java.io.IOException; - -public class RandomOrdering implements IOrdering { - - protected StringUtils stringUtils; - private int pivotLen; - - public RandomOrdering(int pivotLen) { - this.pivotLen = pivotLen; - stringUtils = new StringUtils(); - } - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLen; i++) { - if (strcmp(a, a, min_pos, i, pivotLen) > 0) - min_pos = i; - } - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException { - int x = stringUtils.getDecimal(a, froma, froma + pivotLen); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen); - int t = 11101101; - if ((x ^ t) < (y ^ t)) - return -1; - else if ((x ^ t) > (y ^ t)) - return 1; - return 0; - } -} \ No newline at end of file diff --git a/src/dumbo/Ordering/Standard/LexicographicOrdering.java b/src/dumbo/Ordering/Standard/LexicographicOrdering.java new file mode 100644 index 0000000..adcb074 --- /dev/null +++ b/src/dumbo/Ordering/Standard/LexicographicOrdering.java @@ -0,0 +1,19 @@ +package dumbo.Ordering.Standard; + + +import dumbo.Ordering.OrderingBase; + +public class LexicographicOrdering extends OrderingBase { + + public LexicographicOrdering(int pivotLength) { + super(pivotLength); + } + + + @Override + public int compareMmer(int x, int y) { + return Integer.compare(stringUtils.getNormalizedValue(x, pivotLength), stringUtils.getNormalizedValue(y, pivotLength)); + } + + +} diff --git a/src/dumbo/Ordering/LexicographicSignatureOrdering.java b/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java similarity index 83% rename from src/dumbo/Ordering/LexicographicSignatureOrdering.java rename to src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java index 8d89116..f3babfc 100644 --- a/src/dumbo/Ordering/LexicographicSignatureOrdering.java +++ b/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java @@ -1,11 +1,9 @@ -package dumbo.Ordering; - -import dumbo.StringUtils; +package dumbo.Ordering.Standard; import java.io.IOException; public class LexicographicSignatureOrdering extends LexicographicOrdering { - private SignatureUtils signatureUtils; + protected SignatureUtils signatureUtils; public LexicographicSignatureOrdering(int pivotLen) throws IOException { super(pivotLen); @@ -13,7 +11,7 @@ public LexicographicSignatureOrdering(int pivotLen) throws IOException { } @Override - public int strcmp(int x, int y) { + public int compareMmer(int x, int y) { boolean aAllowed = signatureUtils.isAllowed(x); boolean bAllowed = signatureUtils.isAllowed(y); diff --git a/src/dumbo/Ordering/Standard/RandomOrdering.java b/src/dumbo/Ordering/Standard/RandomOrdering.java new file mode 100644 index 0000000..448bfac --- /dev/null +++ b/src/dumbo/Ordering/Standard/RandomOrdering.java @@ -0,0 +1,21 @@ +package dumbo.Ordering.Standard; + +import dumbo.Ordering.OrderingBase; + +public class RandomOrdering extends OrderingBase { + private int xor; + + public RandomOrdering(int pivotLen, int xor) { + super(pivotLen); + this.xor = xor; + } + + @Override + public int compareMmer(int x, int y) { + if ((x ^ xor) < (y ^ xor)) + return -1; + else if ((x ^ xor) > (y ^ xor)) + return 1; + return 0; + } +} \ No newline at end of file diff --git a/src/dumbo/Ordering/SignatureUtils.java b/src/dumbo/Ordering/Standard/SignatureUtils.java similarity index 98% rename from src/dumbo/Ordering/SignatureUtils.java rename to src/dumbo/Ordering/Standard/SignatureUtils.java index 879e99f..86898ed 100644 --- a/src/dumbo/Ordering/SignatureUtils.java +++ b/src/dumbo/Ordering/Standard/SignatureUtils.java @@ -1,4 +1,4 @@ -package dumbo.Ordering; +package dumbo.Ordering.Standard; public class SignatureUtils { diff --git a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java index 613c6cf..e4bc8b4 100644 --- a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java @@ -14,18 +14,16 @@ public class UHSFrequencySignatureOrdering extends UHSSignatureOrdering { private boolean isInit; private long[] statsFrequency; - private int mask; public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature, int k, int numStats) throws IOException { super(0, pivotLen, useSignature); this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; - pmerFrequency = new long[(int)Math.pow(4, pivotLen)]; + pmerFrequency = new long[numMmers]; this.k = k; this.numStats = numStats; isInit = false; - mask = (int)Math.pow(4, pivotLen) - 1; } @Override @@ -35,15 +33,15 @@ public void initRank() throws IOException { isRankInit = true; } - protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) throws IOException { - int baseCompareValue = strcmpBase(x, y); + protected int rawCompare(int xNormalized, int yNormalized, boolean xAllowed, boolean yAllowed) { + int baseCompareValue = compareMmerBase(xNormalized, yNormalized); if (baseCompareValue != BOTH_IN_UHS && baseCompareValue != BOTH_NOT_IN_UHS) { return baseCompareValue; } // from down here - both in UHS - if(useSignature){ + if (useSignature) { if (!xAllowed && yAllowed) { return 1; } else if (!yAllowed && xAllowed) { @@ -52,13 +50,12 @@ protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) } // both allowed or both not allowed - if(pmerFrequency[x] == pmerFrequency[y]){ - if(x 1000000){ + if (counter > 1000000) { break; } } } - initStats(bfrG); +// initStats(bfrG); bfrG.close(); frG.close(); } @@ -108,7 +97,7 @@ private void initStats(BufferedReader bfrG) throws IOException { int numSampled = 0; boolean keepSample = true; - statsFrequency = new long[(int) Math.pow(4, pivotLen)]; + statsFrequency = new long[numMmers]; String describeline; char[] lineCharArray = new char[readLen]; @@ -128,8 +117,8 @@ private void initStats(BufferedReader bfrG) throws IOException { if (stringUtils.isReadLegal(lineCharArray)) { min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLen); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLen, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); statsFrequency[minValue]++; @@ -140,13 +129,13 @@ private void initStats(BufferedReader bfrG) throws IOException { if (i > min_pos) { min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLen); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); statsFrequency[minValue]++; } else { - int lastIndexInWindow = k + i - pivotLen; - if (strcmp(currentValue, minValue) < 0) { + int lastIndexInWindow = k + i - pivotLength; + if (compareMmer(currentValue, minValue) < 0) { min_pos = lastIndexInWindow; minValue = currentValue; @@ -155,70 +144,8 @@ private void initStats(BufferedReader bfrG) throws IOException { } statsFrequency[minValue]++; } - if(numSampled > numStats) keepSample = false; + if (numSampled > numStats) keepSample = false; } } } - -// public int[] getNormalizedForm() -// { -// int[] ret = rankOfPmer.clone(); -// return ret; -// } - - public void exportOrderingForCpp() { - File file = new File("ranks.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < rankOfPmer.length; i++) { - bf.write(Long.toString(rankOfPmer[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } - - public void exportBinningForCpp() { - File file = new File("freq.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < statsFrequency.length; i++) { - bf.write(Long.toString(statsFrequency[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } - - - - } diff --git a/src/dumbo/Ordering/UHS/UHSOrderingBase.java b/src/dumbo/Ordering/UHS/UHSOrderingBase.java index d7d1422..3c66ed1 100644 --- a/src/dumbo/Ordering/UHS/UHSOrderingBase.java +++ b/src/dumbo/Ordering/UHS/UHSOrderingBase.java @@ -1,40 +1,35 @@ package dumbo.Ordering.UHS; -import dumbo.Ordering.IOrdering; -import dumbo.StringUtils; +import dumbo.Ordering.OrderingBase; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.util.Arrays; -import java.util.HashMap; +import java.util.Comparator; import java.util.HashSet; -public abstract class UHSOrderingBase implements IOrdering { +public abstract class UHSOrderingBase extends OrderingBase { protected byte[] uhsBits; - protected StringUtils stringUtils; protected static final int BOTH_IN_UHS = 824; protected static final int BOTH_NOT_IN_UHS = 1001; - protected int pivotLen; protected int[] rankOfPmer; protected boolean isRankInit; public UHSOrderingBase(int pivotLen) throws IOException { - this.pivotLen = pivotLen; - stringUtils = new StringUtils(); + super(pivotLen); uhsBits = uhsBitSet(pivotLen); rankOfPmer = new int[(int) Math.pow(4, pivotLen)]; Arrays.fill(rankOfPmer, Integer.MAX_VALUE); isRankInit = false; } - protected abstract int calculateStrcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException; - protected abstract int calculateStrcmp(int x, int y) throws IOException; + abstract int rawCompare(int x, int y); public boolean isInUHS(int pmerDecimal) { @@ -46,28 +41,24 @@ public boolean isInUHS(int pmerDecimal) { return false; } - public boolean isInUHS(char[] a, int from, int to) { - return isInUHS(stringUtils.getDecimal(a, from, to)); - } - protected int strcmpBase(int x, int y) { - if (x == y) + protected int compareMmerBase(int x, int y) { + if (x == y || y == stringUtils.getReversedMmer(x, pivotLength)) return 0; boolean xInUHS = isInUHS(x); boolean yInUHS = isInUHS(y); - if(xInUHS) - { - if(!yInUHS) return -1; + if (xInUHS) { + if (!yInUHS) return -1; return BOTH_IN_UHS; } - if(yInUHS) + if (yInUHS) return 1; return BOTH_NOT_IN_UHS; } private byte[] uhsBitSet(int pivotLen) throws IOException { - int n = (int) Math.pow(4, pivotLen) / 8; + int n = numMmers / 8; int i = 0; byte[] bits = new byte[n]; @@ -80,7 +71,7 @@ private byte[] uhsBitSet(int pivotLen) throws IOException { reader = new BufferedReader(frG); String line; while ((line = reader.readLine()) != null) { - i = stringUtils.getDecimal(line.toCharArray(), 0, pivotLen); + i = stringUtils.getNormalizedValue(stringUtils.getDecimal(line.toCharArray(), 0, pivotLen), pivotLength); bits[i / 8] |= 1 << (i % 8); count++; } @@ -97,77 +88,28 @@ private byte[] uhsBitSet(int pivotLen) throws IOException { public void initRank() throws IOException { System.out.println("start init rank"); HashSet pmers = new HashSet<>(); - for(int i = 0; i <(int)Math.pow(4, pivotLen); i++) {if(isInUHS(i)) pmers.add(i);}; + for (int i = 0; i < numMmers; i++) { + if (isInUHS(i)) pmers.add(i); + } Integer[] pmersArr = new Integer[pmers.size()]; pmers.toArray(pmersArr); - Arrays.sort(pmersArr, (o1, o2) -> { - try { - return calculateStrcmp(o1, o2); - } catch (IOException e) { - e.printStackTrace(); - } - return 0; - }); + Arrays.sort(pmersArr, this::rawCompare); for (int i = 0; i < pmersArr.length; i++) { rankOfPmer[pmersArr[i]] = i; } + normalize(); System.out.println("finish init rank"); } -// public void initRank() throws IOException { -// System.out.println("start init rank"); -// HashSet pmers = getPmersInUHS(); -// char[][] pmersArr = new char[pmers.size()][pivotLen]; -// pmers.toArray(pmersArr); -// Arrays.sort(pmersArr, (o1, o2) -> { -// try { -// return calculateStrcmp(o1, o2, 0, 0, pivotLen); -// } catch (IOException e) { -// e.printStackTrace(); -// } -// return 0; -// }); -// for (int i = 0; i < pmersArr.length; i++) { -// rankOfPmer[stringUtils.getDecimal(pmersArr[i], 0, pivotLen)] = i; -// } -// System.out.println("finish init rank"); -// } - - private HashSet getPmersInUHS() { - HashSet pmers = new HashSet<>(); - StringBuilder sb = new StringBuilder(pivotLen); - for (int i = 0; i < pivotLen; i++) sb.append('A'); - generate(pmers, sb, 0); - return pmers; - - } + protected void normalize() { + Integer[] temp = new Integer[rankOfPmer.length]; + for (int i = 0; i < temp.length; i++) + temp[i] = i; - private void generate(HashSet pmers, StringBuilder sb, int n) { - char[] alphabet = {'A', 'C', 'G', 'T'}; - if (n == sb.capacity()) { - char[] pmer = sb.toString().toCharArray(); - if (isInUHS(pmer, 0, pivotLen)) { - pmers.add(pmer); - } - return; - } - for (char letter : alphabet) { - sb.setCharAt(n, letter); - generate(pmers, sb, n + 1); + Arrays.sort(temp, Comparator.comparingLong(a -> rankOfPmer[a])); + for (int i = 0; i < temp.length; i++) { + rankOfPmer[temp[i]] = i; } } - - protected static HashMap pivotLengthToHexRepresentation = new HashMap() { - { - put(5, 0x3ff); - put(6, 0xfff); - put(7, 0x3fff); - put(8, 0xffff); - put(10, 0xfffff); - put(12, 0xffffff); - } - - }; - } diff --git a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java index 83b6aaf..c906ff2 100644 --- a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java @@ -1,7 +1,6 @@ package dumbo.Ordering.UHS; -import dumbo.Ordering.SignatureUtils; -import dumbo.StringUtils; +import dumbo.Ordering.Standard.SignatureUtils; import java.io.IOException; @@ -29,86 +28,47 @@ public void initRank() throws IOException { } - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException { - if(!isRankInit) throw new IOException("rank not initialized yet"); - - int x = stringUtils.getDecimal(a, froma, froma + pivotLen); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen); - - if (x == y) return 0; + public int compareMmer(int x, int y) { - // isRankInit = true here - if (rankOfPmer[x] < rankOfPmer[y]) { + if (!isRankInit) + { + System.out.println("problema - rank not initialized"); return -1; } - return 1; - } - public int strcmp(int x, int y) throws IOException { - if(!isRankInit) throw new IOException("rank not initialized yet"); - if (x == y) return 0; + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + + if (a == b) return 0; // isRankInit = true here - if (rankOfPmer[x] < rankOfPmer[y]) { + if (rankOfPmer[a] < rankOfPmer[b]) { return -1; } return 1; } - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - int j = stringUtils.getDecimal(a, min_pos, min_pos + pivotLen); - int prev = j; - int hexRepresentation = pivotLengthToHexRepresentation.get(pivotLen); - for (int i = from + 1; i <= to - pivotLen; i++) { - j = ((j * 4) ^ (StringUtils.valTable[a[i + pivotLen - 1] - 'A'])) & hexRepresentation; - - if (isInUHS(j)) { - if(rankOfPmer[j] < rankOfPmer[prev]){ - min_pos = i; - prev = j; - } - - } - } - return min_pos; - } - - protected int calculateStrcmp(char[] a, char[] b, int froma, int fromb, int len) throws IOException { - int x = stringUtils.getDecimal(a, froma, froma + pivotLen); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLen); - - if (x == y) return 0; - - boolean aAllowed = true, bAllowed = true; - if (useSignature) { - aAllowed = signatureUtils.isAllowed(a, froma, x); - bAllowed = signatureUtils.isAllowed(b, fromb, y); - } - return strcmpSignature(x, y, aAllowed, bAllowed); - } + protected int rawCompare(int x, int y) { + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); - protected int calculateStrcmp(int x, int y) throws IOException { - if (x == y) return 0; + if (a == b) return 0; boolean aAllowed = true, bAllowed = true; if (useSignature) { - aAllowed = signatureUtils.isAllowed(x); - bAllowed = signatureUtils.isAllowed(y); + aAllowed = signatureUtils.isAllowed(a); + bAllowed = signatureUtils.isAllowed(b); } - return strcmpSignature(x, y, aAllowed, bAllowed); + return rawCompare(a, b, aAllowed, bAllowed); } - - protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) throws IOException { - int baseCompareValue = strcmpBase(x, y); + protected int rawCompare(int xNormalized, int yNormalized, boolean xAllowed, boolean yAllowed) { + int baseCompareValue = compareMmerBase(xNormalized, yNormalized); if (baseCompareValue != BOTH_IN_UHS && baseCompareValue != BOTH_NOT_IN_UHS) { return baseCompareValue; } @@ -121,7 +81,7 @@ protected int strcmpSignature(int x, int y, boolean xAllowed, boolean yAllowed) } } // both allowed or both not allowed - if ((x ^ xor) < (y ^ xor)) + if ((xNormalized ^ xor) < (yNormalized ^ xor)) return -1; else return 1; diff --git a/src/dumbo/Ordering/UHS/YaelUHSOrdering.java b/src/dumbo/Ordering/UHS/YaelUHSOrdering.java deleted file mode 100644 index 643b4ab..0000000 --- a/src/dumbo/Ordering/UHS/YaelUHSOrdering.java +++ /dev/null @@ -1,142 +0,0 @@ -package dumbo.Ordering.UHS; - -import dumbo.Ordering.IOrdering; -import dumbo.StringUtils; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; - -public class YaelUHSOrdering implements IOrdering { - - private final StringUtils stringUtils; - private int pivotLength; - private byte[] uhs_bits; - private int xor; - private int mask; - - public YaelUHSOrdering(int pivotLength, int xor) throws IOException { - this.xor = xor; - this.stringUtils = new StringUtils(); - this.pivotLength = pivotLength; - uhs_bits = uhsBitSet(pivotLength); - mask = pivotLengthToHexRepresentation.get(pivotLength); - System.out.println("YAEL UHS"); - } - - @Override - public int findSmallest(char[] a, int from, int to){ - int min_pos = from; - int j = stringUtils.getDecimal(a, min_pos, min_pos+pivotLength); - int prev = j; - for(int i=from+1; i<=to-pivotLength; i++){ - j = ((j * 4) ^ (StringUtils.valTable[a[i+pivotLength-1] - 'A'])) & mask; - if(((this.uhs_bits[j >> 3] >> (j & 0b111)) & 1) == 1) { - if(strcmp(prev, j)>0) { - min_pos = i; - prev = j; - } - } - } - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len){ - - int x = stringUtils.getDecimal(a, froma, froma+pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb+pivotLength); - int xdiv8 = x >> 3; int xmod8 = x & 0b111; - int ydiv8 = y >> 3; int ymod8 = y & 0b111; - if ((((this.uhs_bits[xdiv8] >> (xmod8)) & 1) ^ ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) == 0) { - if((x ^ xor) < (y ^ xor)) - return -1; - else //if((x ^ 11101101) > (y ^ 11101101)) - return 1; - } - - if (((this.uhs_bits[xdiv8] >> (xmod8)) & 1) > ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) - return -1; - if (((this.uhs_bits[xdiv8] >> (xmod8)) & 1) < ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) - return 1; - - return 0; - } - - private int strcmp(int x, int y){ - int xdiv8 = x >> 3; int xmod8 = x & 0b111; - int ydiv8 = y >> 3; int ymod8 = y & 0b111; - if ((((this.uhs_bits[xdiv8] >> (xmod8)) & 1) ^ ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) == 0) { - if((x ^ xor) < (y ^ xor)) - return -1; - else if((x ^ xor) > (y ^ xor)) - return 1; - } - - if (((this.uhs_bits[xdiv8] >> (xmod8)) & 1) > ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) - return -1; - if (((this.uhs_bits[xdiv8] >> (xmod8)) & 1) < ((this.uhs_bits[ydiv8] >> (ymod8)) & 1)) - return 1; - - return 0; - } - - - private byte[] uhsBitSet(int pivotLen) throws IOException { - int n = (int) Math.pow(4, pivotLen) / 8; - int i = 0; - byte[] bits = new byte[n]; - - String DocksFile = "res_" + pivotLen + ".txt"; - FileReader frG = new FileReader(DocksFile); - int count = 0; - - BufferedReader reader; - try { - reader = new BufferedReader(frG); - String line; - while ((line = reader.readLine()) != null) { - i = stringUtils.getDecimal(line.toCharArray(), 0, pivotLen); - bits[i / 8] |= 1 << (i % 8); - count++; - } - reader.close(); - } catch (IOException e) { - e.printStackTrace(); - } - System.out.println(count); - frG.close(); - - return bits; - } - - protected static HashMap pivotLengthToHexRepresentation = new HashMap() { - { - put(5, 0x3ff); - put(6, 0xfff); - put(7, 0x3fff); - put(8, 0xffff); - put(10, 0xfffff); - put(11, 0x3fffff); - put(12, 0xffffff); - put(13, 0x3ffffff); - put(14, 0xfffffff); - } - - }; - - public boolean isInUHS(int pmerDecimal) { - int pmerDecimalDiv8 = pmerDecimal >> 3; - int pmerDecimalMod8 = pmerDecimal & 0b111; - if (((this.uhs_bits[pmerDecimalDiv8] >> (pmerDecimalMod8)) & 1) == 1) { - return true; - } - return false; - } - - public boolean isInUHS(char[] a, int from, int to) { - return isInUHS(stringUtils.getDecimal(a, from, to)); - } - -} diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 806dbc7..499bb7a 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -1,6 +1,8 @@ package dumbo; import dumbo.Ordering.*; +import dumbo.Ordering.Standard.LexicographicOrdering; +import dumbo.Ordering.Standard.LexicographicSignatureOrdering; import dumbo.Ordering.UHS.UHSFrequencySignatureOrdering; import java.io.IOException; @@ -14,7 +16,7 @@ public static void main(String[] args) throws IOException { int k = 60, pivot_len = 8, bufferSize = 81920; int readLen = 124; - String orderingName = "uhs_sig_freq"; + String orderingName = "iterativeOrdering"; int numRounds = 0, elementsToPush = 0, samplesPerRound = 0, statSamples = 0; double punishPercentage = 1; String version = "10"; @@ -73,10 +75,8 @@ else if (args[i].equals("-punishPercentage")) "Ordering: " + orderingName + "\n"); - orderingName = "iterativeOrdering"; - - IOrderingPP ordering = null; + OrderingBase ordering = null; System.out.println(version); switch (version) { @@ -88,7 +88,18 @@ else if (args[i].equals("-punishPercentage")) // ordering9_withCounterNormalized.exportBinningForCpp(); ordering = iterative; break; - case "9-normalized-signature": // + case "9-frequency": + FrequencyOrdering _frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound, statSamples, k); + _frequencyOrdering.initFrequency(); + + IterativeOrdering iterativeFrequency = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, + samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, false, _frequencyOrdering); + iterativeFrequency.initFrequency(); +// ordering9_withCounterNormalized.exportOrderingForCpp(); +// ordering9_withCounterNormalized.exportBinningForCpp(); + ordering = iterativeFrequency; + break; + case "9-normalized-signature": IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, true); iterativeSignature.initFrequency(); @@ -108,17 +119,17 @@ else if (args[i].equals("-punishPercentage")) UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, k, statSamples); ; universalFrequencySignature.initRank(); - universalFrequencySignature.exportOrderingForCpp(); - universalFrequencySignature.exportBinningForCpp(); -// ordering = universalFrequencySignature; +// universalFrequencySignature.exportOrderingForCpp(); +// universalFrequencySignature.exportBinningForCpp(); + ordering = universalFrequencySignature; break; case "universal-frequency": UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, k, statSamples); ; universalFrequency.initRank(); - universalFrequency.exportOrderingForCpp(); - universalFrequency.exportBinningForCpp(); -// ordering = universalFrequency; +// universalFrequency.exportOrderingForCpp(); +// universalFrequency.exportBinningForCpp(); + ordering = universalFrequency; break; case "frequency": // FREQUENCY SUCKS FrequencyOrdering frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound, statSamples, k); diff --git a/src/dumbo/Partition.java b/src/dumbo/Partition.java deleted file mode 100644 index 24e0930..0000000 --- a/src/dumbo/Partition.java +++ /dev/null @@ -1,180 +0,0 @@ -package dumbo; - -import dumbo.Ordering.IOrderingPP; - -import java.io.*; -import java.util.HashSet; - -public class Partition { - - private int k; - private String inputfile; - private int numOfBlocks; - private int pivotLen; - private int bufSize; - - private FileReader frG; - private BufferedReader bfrG; - private FileWriter[] fwG; - private BufferedWriter[] bfwG; - - private int readLen; - private IOrderingPP ordering; - - private StringUtils stringUtils; - - - private HashSet currentMinimizers; - private byte[] finishedMinimizers; - private int maxMinimizersPerPass; - private boolean keepPassing; - - private final int mask; - - - public Partition(int kk, String infile, int numberOfBlocks, int pivotLength, int bufferSize, int readLen, IOrderingPP ordering) { - this.k = kk; - this.inputfile = infile; - this.numOfBlocks = numberOfBlocks; - this.pivotLen = pivotLength; - this.bufSize = bufferSize; - this.readLen = readLen; - this.ordering = ordering; - this.stringUtils = new StringUtils(); - this.mask = (int) Math.pow(4, pivotLength) - 1; - this.finishedMinimizers = new byte[numOfBlocks]; - this.currentMinimizers = new HashSet<>(); - this.maxMinimizersPerPass = 1000; - this.keepPassing = true; - } - - - private long DistributeNodes() throws IOException { - frG = new FileReader(inputfile); - bfrG = new BufferedReader(frG, bufSize); - fwG = new FileWriter[numOfBlocks]; - bfwG = new BufferedWriter[numOfBlocks]; - - currentMinimizers.clear(); - - String describeline; - - int numSuperKmers = 0; - - int minPos = -1; - - char[] lineCharArray = new char[readLen]; - int len = readLen; - - - long cnt = 0, outcnt = 0; - - File dir = new File("Nodes"); - if (!dir.exists()) - dir.mkdir(); - - int minValue, minValueNormalized, currentValue, start; - while ((describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - - minPos = ordering.findSmallest(lineCharArray, 0, k); - start = 0; - minValue = stringUtils.getDecimal(lineCharArray, minPos, minPos + pivotLen); - minValueNormalized = getNormalizedValue(minValue); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLen, k); - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; - - if (i > minPos) { - writeToFile(minValueNormalized, start, minPos + k, lineCharArray); - - minPos = ordering.findSmallest(lineCharArray, i, i + k); - start = i; - minValue = stringUtils.getDecimal(lineCharArray, minPos, minPos + pivotLen); - minValueNormalized = getNormalizedValue(minValue); - - - } else { - int lastIndexInWindow = k + i - pivotLen; - if (ordering.strcmp(currentValue, minValue) < 0) { - writeToFile(minValueNormalized, start, lastIndexInWindow + pivotLen - 1, lineCharArray); - - start = lastIndexInWindow + pivotLen - k; - minPos = lastIndexInWindow; - minValue = currentValue; - minValueNormalized = getNormalizedValue(minValue); - } - } - } - writeToFile(minValueNormalized, start, len, lineCharArray); - } - } - - System.out.println("Num superkmers is = " + numSuperKmers); - - for (int i = 0; i < bfwG.length; i++) { - if (bfwG[i] != null) { - bfwG[i].close(); - fwG[i].close(); - } - } - for (Integer i : currentMinimizers) { - finishedMinimizers[i] = 1; - } - if(currentMinimizers.size() < maxMinimizersPerPass) - keepPassing = false; - currentMinimizers.clear(); - - bfrG.close(); - frG.close(); - - return cnt; - } - - private int getNormalizedValue(int minValue) { - return stringUtils.getNormalizedValue(minValue, pivotLen) % numOfBlocks; - } - - private void tryCreateWriterForPmer(int prepos) throws IOException { - if (bfwG[prepos] == null) { - fwG[prepos] = new FileWriter("Nodes/nodes" + prepos, true); - bfwG[prepos] = new BufferedWriter(fwG[prepos], bufSize); - } - } - - private void writeToFile(int prepos, int substart, int subend, char[] lineCharArray) throws IOException { - if(finishedMinimizers[prepos] == 0 && currentMinimizers.size() < maxMinimizersPerPass) - { - currentMinimizers.add(prepos); - } - - if (currentMinimizers.contains(prepos)) { - tryCreateWriterForPmer(prepos); - - BufferedWriter writer = bfwG[prepos]; - - writer.write(lineCharArray, substart, subend - substart); - writer.newLine(); - } - } - - public void Run() throws Exception { - long time1 = 0; - long t1 = System.currentTimeMillis(); - System.out.println("Distribute Nodes Begin!"); - while (keepPassing){ - System.out.println("hi"); - DistributeNodes(); - } - long t2 = System.currentTimeMillis(); - time1 = (t2 - t1) / 1000; - System.out.println("Time used for distributing nodes: " + time1 + " seconds!"); - } - -} \ No newline at end of file From 88c3f8b5ca27cbc5fd4090fa673d12249c6c2a8a Mon Sep 17 00:00:00 2001 From: danflomin Date: Sat, 3 Apr 2021 16:27:27 +0300 Subject: [PATCH 30/44] refactor OrderingBase and subclasses --- src/dumbo/Ordering/OrderingBase.java | 46 ++++++++++++++----- .../{ => Standard}/FrequencyOrdering.java | 7 +-- src/dumbo/Ordering/UHS/UHSOrderingBase.java | 15 +----- .../Ordering/UHS/UHSSignatureOrdering.java | 21 --------- src/dumbo/OrderingOptimizer.java | 1 + 5 files changed, 41 insertions(+), 49 deletions(-) rename src/dumbo/Ordering/{ => Standard}/FrequencyOrdering.java (96%) diff --git a/src/dumbo/Ordering/OrderingBase.java b/src/dumbo/Ordering/OrderingBase.java index 502c235..c6a0d03 100644 --- a/src/dumbo/Ordering/OrderingBase.java +++ b/src/dumbo/Ordering/OrderingBase.java @@ -4,6 +4,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Comparator; public abstract class OrderingBase { @@ -13,29 +14,39 @@ public abstract class OrderingBase { protected StringUtils stringUtils; + protected int[] mmerRanks; + protected boolean isRankInitialized; + public OrderingBase(int pivotLength) { this.pivotLength = pivotLength; this.numMmers = (int) Math.pow(4, pivotLength); this.mask = numMmers - 1; this.stringUtils = new StringUtils(); + this.mmerRanks = new int[numMmers]; } - public abstract int compareMmer(int x, int y); + public abstract void initializeRanks(); - public int[] getRanks() { - Integer[] ranks = new Integer[numMmers]; - for (int i = 0; i < ranks.length; i++) { - ranks[i] = i; - } + public int compareMmer(int x, int y) throws Exception { - Arrays.sort(ranks, this::compareMmer); + if (!isRankInitialized) + throw new Exception("problema - rank not initialized"); - int[] primitveRanks = new int[numMmers]; - for (int i = 0; i < ranks.length; i++) { - primitveRanks[i] = ranks[i]; - } - return primitveRanks; + + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + + if (a == b) return 0; + + if (mmerRanks[a] < mmerRanks[b]) + return -1; + else + return 1; + } + + public int[] getRanks() { + return mmerRanks.clone(); } public int findSmallest(char[] a, int from, int to) throws IOException { @@ -52,4 +63,15 @@ public int findSmallest(char[] a, int from, int to) throws IOException { return min_pos; } + + protected void normalize() { + Integer[] temp = new Integer[mmerRanks.length]; + for (int i = 0; i < temp.length; i++) + temp[i] = i; + + Arrays.sort(temp, Comparator.comparingLong(a -> mmerRanks[a])); + for (int i = 0; i < temp.length; i++) { + mmerRanks[temp[i]] = i; + } + } } diff --git a/src/dumbo/Ordering/FrequencyOrdering.java b/src/dumbo/Ordering/Standard/FrequencyOrdering.java similarity index 96% rename from src/dumbo/Ordering/FrequencyOrdering.java rename to src/dumbo/Ordering/Standard/FrequencyOrdering.java index 50677fe..9140ae0 100644 --- a/src/dumbo/Ordering/FrequencyOrdering.java +++ b/src/dumbo/Ordering/Standard/FrequencyOrdering.java @@ -1,5 +1,6 @@ -package dumbo.Ordering; +package dumbo.Ordering.Standard; +import dumbo.Ordering.OrderingBase; import dumbo.StringUtils; import java.io.*; @@ -22,8 +23,8 @@ public FrequencyOrdering(int pivotLen, String infile, int readLen, int bufSize, this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; - pmerFrequency = new int[(int) Math.pow(4, pivotLen)]; - currentOrdering = new int[(int) Math.pow(4, pivotLen)]; + pmerFrequency = new int[numMmers]; + currentOrdering = new int[numMmers]; this.numSamples = numSamples; this.numStats = numStats; this.k = k; diff --git a/src/dumbo/Ordering/UHS/UHSOrderingBase.java b/src/dumbo/Ordering/UHS/UHSOrderingBase.java index 3c66ed1..164be73 100644 --- a/src/dumbo/Ordering/UHS/UHSOrderingBase.java +++ b/src/dumbo/Ordering/UHS/UHSOrderingBase.java @@ -17,15 +17,13 @@ public abstract class UHSOrderingBase extends OrderingBase { protected static final int BOTH_NOT_IN_UHS = 1001; - protected int[] rankOfPmer; protected boolean isRankInit; public UHSOrderingBase(int pivotLen) throws IOException { super(pivotLen); uhsBits = uhsBitSet(pivotLen); - rankOfPmer = new int[(int) Math.pow(4, pivotLen)]; - Arrays.fill(rankOfPmer, Integer.MAX_VALUE); + Arrays.fill(mmerRanks, Integer.MAX_VALUE); isRankInit = false; } @@ -96,20 +94,11 @@ public void initRank() throws IOException { pmers.toArray(pmersArr); Arrays.sort(pmersArr, this::rawCompare); for (int i = 0; i < pmersArr.length; i++) { - rankOfPmer[pmersArr[i]] = i; + mmerRanks[pmersArr[i]] = i; } normalize(); System.out.println("finish init rank"); } - protected void normalize() { - Integer[] temp = new Integer[rankOfPmer.length]; - for (int i = 0; i < temp.length; i++) - temp[i] = i; - Arrays.sort(temp, Comparator.comparingLong(a -> rankOfPmer[a])); - for (int i = 0; i < temp.length; i++) { - rankOfPmer[temp[i]] = i; - } - } } diff --git a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java index c906ff2..664c3af 100644 --- a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java @@ -28,27 +28,6 @@ public void initRank() throws IOException { } - @Override - public int compareMmer(int x, int y) { - - if (!isRankInit) - { - System.out.println("problema - rank not initialized"); - return -1; - } - - - int a = stringUtils.getNormalizedValue(x, pivotLength); - int b = stringUtils.getNormalizedValue(y, pivotLength); - - if (a == b) return 0; - - // isRankInit = true here - if (rankOfPmer[a] < rankOfPmer[b]) { - return -1; - } - return 1; - } protected int rawCompare(int x, int y) { diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 499bb7a..95c822d 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -1,6 +1,7 @@ package dumbo; import dumbo.Ordering.*; +import dumbo.Ordering.Standard.FrequencyOrdering; import dumbo.Ordering.Standard.LexicographicOrdering; import dumbo.Ordering.Standard.LexicographicSignatureOrdering; import dumbo.Ordering.UHS.UHSFrequencySignatureOrdering; From 3136929f41156322bd0aea67b952d607e1177420 Mon Sep 17 00:00:00 2001 From: danflomin Date: Sat, 3 Apr 2021 16:36:53 +0300 Subject: [PATCH 31/44] remove initStats from UHSFrequencySignatureOrdering --- src/dumbo/Ordering/IterativeOrdering.java | 2 +- src/dumbo/Ordering/OrderingBase.java | 5 +- .../Ordering/Standard/FrequencyOrdering.java | 2 +- .../UHS/UHSFrequencySignatureOrdering.java | 65 ++----------------- src/dumbo/Ordering/UHS/UHSOrderingBase.java | 27 ++++---- .../Ordering/UHS/UHSSignatureOrdering.java | 8 +-- 6 files changed, 25 insertions(+), 84 deletions(-) diff --git a/src/dumbo/Ordering/IterativeOrdering.java b/src/dumbo/Ordering/IterativeOrdering.java index d0fe2b0..f2b203f 100644 --- a/src/dumbo/Ordering/IterativeOrdering.java +++ b/src/dumbo/Ordering/IterativeOrdering.java @@ -226,7 +226,7 @@ public int[] getRanks() { return currentOrdering.clone(); } - private void normalize() { + protected void normalize() { Integer[] temp = new Integer[currentOrdering.length]; for (int i = 0; i < temp.length; i++) temp[i] = i; diff --git a/src/dumbo/Ordering/OrderingBase.java b/src/dumbo/Ordering/OrderingBase.java index c6a0d03..71a2b65 100644 --- a/src/dumbo/Ordering/OrderingBase.java +++ b/src/dumbo/Ordering/OrderingBase.java @@ -23,10 +23,11 @@ public OrderingBase(int pivotLength) { this.mask = numMmers - 1; this.stringUtils = new StringUtils(); this.mmerRanks = new int[numMmers]; + this.isRankInitialized = false; } - public abstract void initializeRanks(); + public abstract void initializeRanks() throws IOException; public int compareMmer(int x, int y) throws Exception { @@ -49,7 +50,7 @@ public int[] getRanks() { return mmerRanks.clone(); } - public int findSmallest(char[] a, int from, int to) throws IOException { + public int findSmallest(char[] a, int from, int to) throws Exception { int min_pos = from; int minValue = stringUtils.getDecimal(a, min_pos, min_pos + pivotLength); int currentValue = minValue; diff --git a/src/dumbo/Ordering/Standard/FrequencyOrdering.java b/src/dumbo/Ordering/Standard/FrequencyOrdering.java index 9140ae0..cb81b3a 100644 --- a/src/dumbo/Ordering/Standard/FrequencyOrdering.java +++ b/src/dumbo/Ordering/Standard/FrequencyOrdering.java @@ -126,7 +126,7 @@ private void initStats(BufferedReader bfrG) throws IOException { } - private void normalize() { + protected void normalize() { Integer[] temp = new Integer[pmerFrequency.length]; for (int i = 0; i < temp.length; i++) temp[i] = i; diff --git a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java index e4bc8b4..4ef1e83 100644 --- a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java @@ -27,10 +27,10 @@ public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, i } @Override - public void initRank() throws IOException { - initFrequency(); - super.initRank(); - isRankInit = true; + public void initializeRanks() throws IOException { + countFrequency(); + super.initializeRanks(); + isRankInitialized = true; } protected int rawCompare(int xNormalized, int yNormalized, boolean xAllowed, boolean yAllowed) { @@ -62,7 +62,7 @@ protected int rawCompare(int xNormalized, int yNormalized, boolean xAllowed, boo } - private void initFrequency() throws IOException { + private void countFrequency() throws IOException { FileReader frG = new FileReader(inputFile); BufferedReader bfrG = new BufferedReader(frG, bufSize); @@ -92,60 +92,5 @@ private void initFrequency() throws IOException { frG.close(); } - private void initStats(BufferedReader bfrG) throws IOException { - int numSampled = 0; - boolean keepSample = true; - - statsFrequency = new long[numMmers]; - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - String line = new String(lineCharArray); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - - statsFrequency[minValue]++; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - - - statsFrequency[minValue]++; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (compareMmer(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - - statsFrequency[minValue]++; - } - } - statsFrequency[minValue]++; - } - if (numSampled > numStats) keepSample = false; - } - } - } } diff --git a/src/dumbo/Ordering/UHS/UHSOrderingBase.java b/src/dumbo/Ordering/UHS/UHSOrderingBase.java index 164be73..5e40a30 100644 --- a/src/dumbo/Ordering/UHS/UHSOrderingBase.java +++ b/src/dumbo/Ordering/UHS/UHSOrderingBase.java @@ -17,14 +17,12 @@ public abstract class UHSOrderingBase extends OrderingBase { protected static final int BOTH_NOT_IN_UHS = 1001; - protected boolean isRankInit; - public UHSOrderingBase(int pivotLen) throws IOException { super(pivotLen); uhsBits = uhsBitSet(pivotLen); Arrays.fill(mmerRanks, Integer.MAX_VALUE); - isRankInit = false; + } abstract int rawCompare(int x, int y); @@ -40,12 +38,12 @@ public boolean isInUHS(int pmerDecimal) { } - protected int compareMmerBase(int x, int y) { - if (x == y || y == stringUtils.getReversedMmer(x, pivotLength)) + protected int compareMmerBase(int xNormalized, int yNormalized) { + if (xNormalized == yNormalized) return 0; - boolean xInUHS = isInUHS(x); - boolean yInUHS = isInUHS(y); + boolean xInUHS = isInUHS(xNormalized); + boolean yInUHS = isInUHS(yNormalized); if (xInUHS) { if (!yInUHS) return -1; return BOTH_IN_UHS; @@ -56,9 +54,8 @@ protected int compareMmerBase(int x, int y) { } private byte[] uhsBitSet(int pivotLen) throws IOException { - int n = numMmers / 8; int i = 0; - byte[] bits = new byte[n]; + byte[] bits = new byte[numMmers]; String DocksFile = "res_" + pivotLen + ".txt"; FileReader frG = new FileReader(DocksFile); @@ -70,7 +67,7 @@ private byte[] uhsBitSet(int pivotLen) throws IOException { String line; while ((line = reader.readLine()) != null) { i = stringUtils.getNormalizedValue(stringUtils.getDecimal(line.toCharArray(), 0, pivotLen), pivotLength); - bits[i / 8] |= 1 << (i % 8); + bits[i] = 1; count++; } reader.close(); @@ -83,15 +80,15 @@ private byte[] uhsBitSet(int pivotLen) throws IOException { return bits; } - public void initRank() throws IOException { + public void initializeRanks() throws IOException { System.out.println("start init rank"); - HashSet pmers = new HashSet<>(); + HashSet normalizedMmersUHS = new HashSet<>(); for (int i = 0; i < numMmers; i++) { - if (isInUHS(i)) pmers.add(i); + if (isInUHS(i)) normalizedMmersUHS.add(i); } - Integer[] pmersArr = new Integer[pmers.size()]; - pmers.toArray(pmersArr); + Integer[] pmersArr = new Integer[normalizedMmersUHS.size()]; + normalizedMmersUHS.toArray(pmersArr); Arrays.sort(pmersArr, this::rawCompare); for (int i = 0; i < pmersArr.length; i++) { mmerRanks[pmersArr[i]] = i; diff --git a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java index 664c3af..63178ea 100644 --- a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java @@ -22,14 +22,12 @@ public UHSSignatureOrdering(int pivotLen, boolean useSignature) throws IOExcepti } @Override - public void initRank() throws IOException { - super.initRank(); - isRankInit = true; + public void initializeRanks() throws IOException { + super.initializeRanks(); + isRankInitialized = true; } - - protected int rawCompare(int x, int y) { int a = stringUtils.getNormalizedValue(x, pivotLength); int b = stringUtils.getNormalizedValue(y, pivotLength); From 8c416b637b01dd6bef1c2ba16a4e76329102dfa6 Mon Sep 17 00:00:00 2001 From: danflomin Date: Sat, 3 Apr 2021 17:50:50 +0300 Subject: [PATCH 32/44] lots and lots --- src/dumbo/MinimizerCounter.java | 2 +- src/dumbo/Ordering/IterativeOrdering.java | 52 +++------ src/dumbo/Ordering/OrderingBase.java | 12 +- .../Ordering/Standard/FrequencyOrdering.java | 103 +++--------------- .../Standard/LexicographicOrdering.java | 21 +++- .../LexicographicSignatureOrdering.java | 36 +++++- .../Ordering/Standard/RandomOrdering.java | 26 ++++- .../UHS/UHSFrequencySignatureOrdering.java | 56 ++++++---- src/dumbo/Ordering/UHS/UHSOrderingBase.java | 33 ++++-- .../Ordering/UHS/UHSSignatureOrdering.java | 41 +++---- src/dumbo/OrderingOptimizer.java | 47 ++++---- 11 files changed, 213 insertions(+), 216 deletions(-) diff --git a/src/dumbo/MinimizerCounter.java b/src/dumbo/MinimizerCounter.java index 3c5f1c8..d91d98a 100644 --- a/src/dumbo/MinimizerCounter.java +++ b/src/dumbo/MinimizerCounter.java @@ -32,7 +32,7 @@ public MinimizerCounter(int kk, String kmerSetFile, int pivotLength, int bufferS } - private long[] getMinimizersCounters() throws IOException { + private long[] getMinimizersCounters() throws Exception { frG = new FileReader(kmerSetFile); bfrG = new BufferedReader(frG, bufSize); diff --git a/src/dumbo/Ordering/IterativeOrdering.java b/src/dumbo/Ordering/IterativeOrdering.java index f2b203f..48aec04 100644 --- a/src/dumbo/Ordering/IterativeOrdering.java +++ b/src/dumbo/Ordering/IterativeOrdering.java @@ -14,7 +14,6 @@ public class IterativeOrdering extends OrderingBase { private int readLen; private int bufSize; private int k; - private int[] currentOrdering; private SignatureUtils signatureUtils; private HashMap> frequency; @@ -47,7 +46,6 @@ public IterativeOrdering( this.bufSize = bufSize; this.k = k; signatureUtils = new SignatureUtils(pivotLength); - currentOrdering = new int[(int) Math.pow(4, pivotLength)]; initialized = false; } @@ -55,7 +53,7 @@ public IterativeOrdering( int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, int[] initialOrdering) { this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); - currentOrdering = initialOrdering.clone(); + mmerRanks = initialOrdering.clone(); initialized = true; badArgumentsThrow(); } @@ -64,32 +62,32 @@ public IterativeOrdering( int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, OrderingBase initialOrdering) throws IOException { this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); - currentOrdering = initialOrdering.getRanks().clone(); + mmerRanks = initialOrdering.getRanks().clone(); initialized = true; badArgumentsThrow(); } private void badArgumentsThrow() { - if (currentOrdering.length != numMmers) + if (mmerRanks.length != numMmers) throw new IllegalArgumentException("initialOrdering is not of correct size"); if (useSignature) throw new IllegalArgumentException("Can't initialize ordering from outside with useSignature as true"); } - public void initFrequency() throws IOException { + protected void initFrequency() throws Exception { if (!initialized) { for (int i = 0; i < numMmers; i++) { int canonical = Math.min(i, stringUtils.getReversedMmer(i, pivotLength)); - currentOrdering[i] = canonical; - currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] = canonical; + mmerRanks[i] = canonical; + mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] = canonical; } if (useSignature) { for (int i = 0; i < numMmers; i++) { if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { - currentOrdering[i] += numMmers; - currentOrdering[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; + mmerRanks[i] += numMmers; + mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; } } } @@ -104,7 +102,7 @@ public void initFrequency() throws IOException { BufferedReader bfrG = new BufferedReader(frG, bufSize); statFrequency = new long[numMmers]; - HashMap> pmerFrequency = new HashMap<>(numMmers); + HashMap> pmerFrequency = new HashMap<>(roundSamples * 2); String describeline; char[] lineCharArray = new char[readLen]; @@ -203,9 +201,9 @@ private void adaptOrdering(HashMap> pmerFrequency) { } } // TODO: might not be necessary to change both. - int newRank = currentOrdering[biggestIndex] + (int) (numMmers * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = newRank; + int newRank = mmerRanks[biggestIndex] + (int) (numMmers * percentagePunishment); + mmerRanks[biggestIndex] = newRank; + mmerRanks[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = newRank; frequencies[biggestIndex] = 0; frequencies[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = 0; } @@ -213,29 +211,13 @@ private void adaptOrdering(HashMap> pmerFrequency) { @Override - public int compareMmer(int x, int y) { - int a = stringUtils.getNormalizedValue(x, pivotLength); - int b = stringUtils.getNormalizedValue(y, pivotLength); - if (a == b) return 0; - if (currentOrdering[a] < currentOrdering[b]) return -1; - return 1; + public void initializeRanks() throws Exception { + isRankInitialized = true; + initFrequency(); } @Override - public int[] getRanks() { - return currentOrdering.clone(); + protected int rawCompareMmer(int x, int y) throws Exception { + return compareMmer(x, y); } - - protected void normalize() { - Integer[] temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; i++) - temp[i] = i; - - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for (int i = 0; i < temp.length; i++) { - currentOrdering[temp[i]] = i; - } - } - - } diff --git a/src/dumbo/Ordering/OrderingBase.java b/src/dumbo/Ordering/OrderingBase.java index 71a2b65..5967a60 100644 --- a/src/dumbo/Ordering/OrderingBase.java +++ b/src/dumbo/Ordering/OrderingBase.java @@ -27,23 +27,19 @@ public OrderingBase(int pivotLength) { } - public abstract void initializeRanks() throws IOException; + public abstract void initializeRanks() throws Exception; + protected abstract int rawCompareMmer(int x, int y) throws Exception; public int compareMmer(int x, int y) throws Exception { - if (!isRankInitialized) throw new Exception("problema - rank not initialized"); - int a = stringUtils.getNormalizedValue(x, pivotLength); int b = stringUtils.getNormalizedValue(y, pivotLength); if (a == b) return 0; - - if (mmerRanks[a] < mmerRanks[b]) - return -1; - else - return 1; + if (mmerRanks[a] < mmerRanks[b]) return -1; + return 1; } public int[] getRanks() { diff --git a/src/dumbo/Ordering/Standard/FrequencyOrdering.java b/src/dumbo/Ordering/Standard/FrequencyOrdering.java index cb81b3a..8d9ef21 100644 --- a/src/dumbo/Ordering/Standard/FrequencyOrdering.java +++ b/src/dumbo/Ordering/Standard/FrequencyOrdering.java @@ -1,7 +1,6 @@ package dumbo.Ordering.Standard; import dumbo.Ordering.OrderingBase; -import dumbo.StringUtils; import java.io.*; import java.util.Arrays; @@ -12,25 +11,18 @@ public class FrequencyOrdering extends OrderingBase { private int readLen; private int bufSize; private int[] pmerFrequency; - private int[] currentOrdering; - private long[] statsFrequency; private int numSamples; - private int numStats; - private int k; - public FrequencyOrdering(int pivotLen, String infile, int readLen, int bufSize, int numSamples, int numStats, int k) { + public FrequencyOrdering(int pivotLen, String infile, int readLen, int bufSize, int numSamples) { super(pivotLen); this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; - pmerFrequency = new int[numMmers]; - currentOrdering = new int[numMmers]; + this.pmerFrequency = new int[numMmers]; this.numSamples = numSamples; - this.numStats = numStats; - this.k = k; } - public void initFrequency() throws IOException { + protected void initFrequency() throws IOException { FileReader frG = new FileReader(inputFile); BufferedReader bfrG = new BufferedReader(frG, bufSize); @@ -59,95 +51,36 @@ public void initFrequency() throws IOException { } } } - - normalize(); - initStats(bfrG); - bfrG.close(); frG.close(); } - private void initStats(BufferedReader bfrG) throws IOException { -// TODO: FIX - - int numSampled = 0; - boolean keepSample = true; - - statsFrequency = new long[(int) Math.pow(4, pivotLength)]; - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - String line = new String(lineCharArray); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - statsFrequency[minValue]++; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - - - statsFrequency[minValue]++; - } else { - int lastIndexInWindow = k + i - pivotLength; - if (compareMmer(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - - statsFrequency[minValue]++; - } - } - statsFrequency[minValue]++; - } - if (numSampled > numStats) keepSample = false; - } + @Override + public void initializeRanks() throws Exception { + initFrequency(); + Integer[] mmers = new Integer[numMmers]; + for (int i = 0; i < mmers.length; i++) { + mmers[i] = i; } - } - - protected void normalize() { - Integer[] temp = new Integer[pmerFrequency.length]; - for (int i = 0; i < temp.length; i++) - temp[i] = i; - - Arrays.sort(temp, Comparator.comparingInt(a -> pmerFrequency[a])); - for (int i = 0; i < temp.length; i++) { - currentOrdering[temp[i]] = i; + Arrays.sort(mmers, this::rawCompareMmer); + for (int i = 0; i < mmers.length; i++) { + mmerRanks[mmers[i]] = i; } - + isRankInitialized = true; } @Override - public int[] getRanks() { - return currentOrdering.clone(); - } - - public int compareMmer(int x, int y) { + public int rawCompareMmer(int x, int y) { int a = stringUtils.getNormalizedValue(x, pivotLength); int b = stringUtils.getNormalizedValue(y, pivotLength); if (a == b) return 0; + if (pmerFrequency[a] == pmerFrequency[b]) { + if (a < b) return -1; + return 1; + } if (pmerFrequency[a] < pmerFrequency[b]) return -1; else diff --git a/src/dumbo/Ordering/Standard/LexicographicOrdering.java b/src/dumbo/Ordering/Standard/LexicographicOrdering.java index adcb074..4a318e0 100644 --- a/src/dumbo/Ordering/Standard/LexicographicOrdering.java +++ b/src/dumbo/Ordering/Standard/LexicographicOrdering.java @@ -3,15 +3,32 @@ import dumbo.Ordering.OrderingBase; +import java.io.IOException; +import java.util.Arrays; + public class LexicographicOrdering extends OrderingBase { public LexicographicOrdering(int pivotLength) { super(pivotLength); } - @Override - public int compareMmer(int x, int y) { + public void initializeRanks() throws IOException { + Integer[] mmers = new Integer[numMmers]; + for (int i = 0; i < mmers.length; i++) { + mmers[i] = i; + } + + Arrays.sort(mmers, this::rawCompareMmer); + for (int i = 0; i < mmers.length; i++) { + mmerRanks[mmers[i]] = i; + } + System.out.println("finish init rank"); + isRankInitialized = true; + } + + + protected int rawCompareMmer(int x, int y) { return Integer.compare(stringUtils.getNormalizedValue(x, pivotLength), stringUtils.getNormalizedValue(y, pivotLength)); } diff --git a/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java b/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java index f3babfc..f5e93aa 100644 --- a/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java +++ b/src/dumbo/Ordering/Standard/LexicographicSignatureOrdering.java @@ -1,6 +1,8 @@ package dumbo.Ordering.Standard; import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; public class LexicographicSignatureOrdering extends LexicographicOrdering { protected SignatureUtils signatureUtils; @@ -11,9 +13,35 @@ public LexicographicSignatureOrdering(int pivotLen) throws IOException { } @Override - public int compareMmer(int x, int y) { - boolean aAllowed = signatureUtils.isAllowed(x); - boolean bAllowed = signatureUtils.isAllowed(y); + public void initializeRanks() throws IOException { + Arrays.fill(mmerRanks, Integer.MAX_VALUE); + + HashSet normalizedAllowedMmers = new HashSet<>(); + for (int i = 0; i < numMmers; i++) { + if (signatureUtils.isAllowed(stringUtils.getNormalizedValue(i, pivotLength))) + normalizedAllowedMmers.add(stringUtils.getNormalizedValue(i, pivotLength)); + } + + Integer[] mmers = new Integer[normalizedAllowedMmers.size()]; + normalizedAllowedMmers.toArray(mmers); + + Arrays.sort(mmers); + + for (int i = 0; i < mmers.length; i++) { + mmerRanks[mmers[i]] = i; + } + normalize(); + System.out.println("finish init rank"); + isRankInitialized = true; + } + + @Override + protected int rawCompareMmer(int x, int y) { + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + + boolean aAllowed = signatureUtils.isAllowed(a); + boolean bAllowed = signatureUtils.isAllowed(b); if (!aAllowed && bAllowed) { return 1; @@ -21,6 +49,6 @@ public int compareMmer(int x, int y) { return -1; } - return Integer.compare(stringUtils.getNormalizedValue(x, pivotLength), stringUtils.getNormalizedValue(y, pivotLength)); + return Integer.compare(a, b); } } diff --git a/src/dumbo/Ordering/Standard/RandomOrdering.java b/src/dumbo/Ordering/Standard/RandomOrdering.java index 448bfac..bb0a0c5 100644 --- a/src/dumbo/Ordering/Standard/RandomOrdering.java +++ b/src/dumbo/Ordering/Standard/RandomOrdering.java @@ -2,6 +2,9 @@ import dumbo.Ordering.OrderingBase; +import java.io.IOException; +import java.util.Arrays; + public class RandomOrdering extends OrderingBase { private int xor; @@ -11,10 +14,27 @@ public RandomOrdering(int pivotLen, int xor) { } @Override - public int compareMmer(int x, int y) { - if ((x ^ xor) < (y ^ xor)) + public void initializeRanks() throws IOException { + Integer[] mmers = new Integer[numMmers]; + for (int i = 0; i < mmers.length; i++) { + mmers[i] = i; + } + + Arrays.sort(mmers, this::rawCompareMmer); + for (int i = 0; i < mmers.length; i++) { + mmerRanks[mmers[i]] = i; + } + System.out.println("finish init rank"); + isRankInitialized = true; + } + + protected int rawCompareMmer(int x, int y) { + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + + if ((a ^ xor) < (b ^ xor)) return -1; - else if ((x ^ xor) > (y ^ xor)) + else if ((a ^ xor) > (b ^ xor)) return 1; return 0; } diff --git a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java index 4ef1e83..be37cd4 100644 --- a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java @@ -1,39 +1,56 @@ package dumbo.Ordering.UHS; -import dumbo.StringUtils; +import dumbo.Ordering.Standard.SignatureUtils; import java.io.*; -public class UHSFrequencySignatureOrdering extends UHSSignatureOrdering { +public class UHSFrequencySignatureOrdering extends UHSOrderingBase { private String inputFile; private int readLen; private int bufSize; - private long[] pmerFrequency; - private int k; - private int numStats; - private boolean isInit; + private long[] mmerFrequency; + private int numMmersToCount; - private long[] statsFrequency; + private SignatureUtils signatureUtils; + protected boolean useSignature; - public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature, int k, int numStats) throws IOException { - super(0, pivotLen, useSignature); + + public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature, int numMmersToCount) throws IOException { + super(pivotLen); this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; - pmerFrequency = new long[numMmers]; - this.k = k; - this.numStats = numStats; - isInit = false; + this.mmerFrequency = new long[numMmers]; + this.numMmersToCount = numMmersToCount; + + this.useSignature = useSignature; + this.signatureUtils = new SignatureUtils(pivotLen); } @Override - public void initializeRanks() throws IOException { + public void initializeRanks() throws Exception { countFrequency(); super.initializeRanks(); isRankInitialized = true; } - protected int rawCompare(int xNormalized, int yNormalized, boolean xAllowed, boolean yAllowed) { + @Override + protected int rawCompareMmer(int x, int y) { + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + + if (a == b) return 0; + + boolean aAllowed = true, bAllowed = true; + if (useSignature) { + aAllowed = signatureUtils.isAllowed(a); + bAllowed = signatureUtils.isAllowed(b); + } + + return rawCompareMmer(a, b, aAllowed, bAllowed); + } + + protected int rawCompareMmer(int xNormalized, int yNormalized, boolean xAllowed, boolean yAllowed) { int baseCompareValue = compareMmerBase(xNormalized, yNormalized); if (baseCompareValue != BOTH_IN_UHS && baseCompareValue != BOTH_NOT_IN_UHS) { return baseCompareValue; @@ -50,12 +67,12 @@ protected int rawCompare(int xNormalized, int yNormalized, boolean xAllowed, boo } // both allowed or both not allowed - if (pmerFrequency[xNormalized] == pmerFrequency[yNormalized]) { + if (mmerFrequency[xNormalized] == mmerFrequency[yNormalized]) { if (xNormalized < yNormalized) return -1; else return 1; - } else if (pmerFrequency[xNormalized] < pmerFrequency[yNormalized]) + } else if (mmerFrequency[xNormalized] < mmerFrequency[yNormalized]) return -1; else return 1; @@ -79,15 +96,14 @@ private void countFrequency() throws IOException { for (int i = 0; i <= lineCharArray.length - pivotLength; i++) { int value = stringUtils.getNormalizedValue(stringUtils.getDecimal(lineCharArray, i, i + pivotLength), pivotLength); - pmerFrequency[value] += 1; + mmerFrequency[value] += 1; counter++; } - if (counter > 1000000) { + if (counter > numMmersToCount) { break; } } } -// initStats(bfrG); bfrG.close(); frG.close(); } diff --git a/src/dumbo/Ordering/UHS/UHSOrderingBase.java b/src/dumbo/Ordering/UHS/UHSOrderingBase.java index 5e40a30..23154f8 100644 --- a/src/dumbo/Ordering/UHS/UHSOrderingBase.java +++ b/src/dumbo/Ordering/UHS/UHSOrderingBase.java @@ -17,7 +17,6 @@ public abstract class UHSOrderingBase extends OrderingBase { protected static final int BOTH_NOT_IN_UHS = 1001; - public UHSOrderingBase(int pivotLen) throws IOException { super(pivotLen); uhsBits = uhsBitSet(pivotLen); @@ -25,8 +24,6 @@ public UHSOrderingBase(int pivotLen) throws IOException { } - abstract int rawCompare(int x, int y); - public boolean isInUHS(int pmerDecimal) { int pmerDecimalDiv8 = pmerDecimal >> 3; @@ -53,6 +50,17 @@ protected int compareMmerBase(int xNormalized, int yNormalized) { return BOTH_NOT_IN_UHS; } + @Override + protected int rawCompareMmer(int x, int y) { + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + + int result = compareMmerBase(a, b); + if (result == -1 || result == 1) + return result; + return Integer.compare(a, b); + } + private byte[] uhsBitSet(int pivotLen) throws IOException { int i = 0; byte[] bits = new byte[numMmers]; @@ -80,21 +88,28 @@ private byte[] uhsBitSet(int pivotLen) throws IOException { return bits; } - public void initializeRanks() throws IOException { + @Override + public void initializeRanks() throws Exception { System.out.println("start init rank"); HashSet normalizedMmersUHS = new HashSet<>(); for (int i = 0; i < numMmers; i++) { if (isInUHS(i)) normalizedMmersUHS.add(i); } - Integer[] pmersArr = new Integer[normalizedMmersUHS.size()]; - normalizedMmersUHS.toArray(pmersArr); - Arrays.sort(pmersArr, this::rawCompare); - for (int i = 0; i < pmersArr.length; i++) { - mmerRanks[pmersArr[i]] = i; + Integer[] mmers = new Integer[normalizedMmersUHS.size()]; + normalizedMmersUHS.toArray(mmers); + try { + Arrays.sort(mmers, this::rawCompareMmer); + } catch (Exception e) { + throw e; + } + + for (int i = 0; i < mmers.length; i++) { + mmerRanks[mmers[i]] = i; } normalize(); System.out.println("finish init rank"); + isRankInitialized = true; } diff --git a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java index 63178ea..3861f4c 100644 --- a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java @@ -21,30 +21,8 @@ public UHSSignatureOrdering(int pivotLen, boolean useSignature) throws IOExcepti this(0, pivotLen, useSignature); } - @Override - public void initializeRanks() throws IOException { - super.initializeRanks(); - isRankInitialized = true; - } - - - protected int rawCompare(int x, int y) { - int a = stringUtils.getNormalizedValue(x, pivotLength); - int b = stringUtils.getNormalizedValue(y, pivotLength); - - if (a == b) return 0; - - boolean aAllowed = true, bAllowed = true; - if (useSignature) { - aAllowed = signatureUtils.isAllowed(a); - bAllowed = signatureUtils.isAllowed(b); - } - - return rawCompare(a, b, aAllowed, bAllowed); - } - - protected int rawCompare(int xNormalized, int yNormalized, boolean xAllowed, boolean yAllowed) { + protected int rawCompareMmer(int xNormalized, int yNormalized, boolean xAllowed, boolean yAllowed) { int baseCompareValue = compareMmerBase(xNormalized, yNormalized); if (baseCompareValue != BOTH_IN_UHS && baseCompareValue != BOTH_NOT_IN_UHS) { return baseCompareValue; @@ -64,4 +42,21 @@ protected int rawCompare(int xNormalized, int yNormalized, boolean xAllowed, boo return 1; } + + @Override + protected int rawCompareMmer(int x, int y) { + int a = stringUtils.getNormalizedValue(x, pivotLength); + int b = stringUtils.getNormalizedValue(y, pivotLength); + + if (a == b) return 0; + + boolean aAllowed = true, bAllowed = true; + if (useSignature) { + aAllowed = signatureUtils.isAllowed(a); + bAllowed = signatureUtils.isAllowed(b); + } + + return rawCompareMmer(a, b, aAllowed, bAllowed); + } + } diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 95c822d..92e23d3 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -11,7 +11,7 @@ public class OrderingOptimizer { - public static void main(String[] args) throws IOException { + public static void main(String[] args) throws Exception { String infile = null; @@ -84,18 +84,18 @@ else if (args[i].equals("-punishPercentage")) case "9-normalized": // good version IterativeOrdering iterative = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, false); - iterative.initFrequency(); + iterative.initializeRanks(); // ordering9_withCounterNormalized.exportOrderingForCpp(); // ordering9_withCounterNormalized.exportBinningForCpp(); ordering = iterative; break; case "9-frequency": - FrequencyOrdering _frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound, statSamples, k); - _frequencyOrdering.initFrequency(); + FrequencyOrdering _frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound); + _frequencyOrdering.initializeRanks(); IterativeOrdering iterativeFrequency = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, false, _frequencyOrdering); - iterativeFrequency.initFrequency(); + iterativeFrequency.initializeRanks(); // ordering9_withCounterNormalized.exportOrderingForCpp(); // ordering9_withCounterNormalized.exportBinningForCpp(); ordering = iterativeFrequency; @@ -103,47 +103,42 @@ else if (args[i].equals("-punishPercentage")) case "9-normalized-signature": IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, true); - iterativeSignature.initFrequency(); + iterativeSignature.initializeRanks(); // ordering9_withCounterNormalized_andSignature.exportOrderingForCpp(); // ordering9_withCounterNormalized_andSignature.exportBinningForCpp(); ordering = iterativeSignature; System.out.println("lolz asdasd"); break; - case "10": - IterativeOrdering10_WithCounterNormalized ordering10 = new IterativeOrdering10_WithCounterNormalized(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); - ordering10.initFrequency(); - ordering10.exportOrderingForCpp(); - ordering10.exportBinningForCpp(); -// ordering = ordering10; - break; +// case "10": +// IterativeOrdering10_WithCounterNormalized ordering10 = new IterativeOrdering10_WithCounterNormalized(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); +// ordering10.initFrequency(); +// ordering10.exportOrderingForCpp(); +// ordering10.exportBinningForCpp(); +//// ordering = ordering10; +// break; case "universal-frequency-signature": UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, k, statSamples); - ; - universalFrequencySignature.initRank(); -// universalFrequencySignature.exportOrderingForCpp(); -// universalFrequencySignature.exportBinningForCpp(); + universalFrequencySignature.initializeRanks(); ordering = universalFrequencySignature; break; case "universal-frequency": UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, k, statSamples); ; - universalFrequency.initRank(); -// universalFrequency.exportOrderingForCpp(); -// universalFrequency.exportBinningForCpp(); + universalFrequency.initializeRanks(); ordering = universalFrequency; break; case "frequency": // FREQUENCY SUCKS - FrequencyOrdering frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound, statSamples, k); - frequencyOrdering.initFrequency(); + FrequencyOrdering frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound); + frequencyOrdering.initializeRanks(); ordering = frequencyOrdering; break; case "signature": - LexicographicSignatureOrdering signatureOrdering = new LexicographicSignatureOrdering(pivot_len); - ordering = signatureOrdering; + ordering = new LexicographicSignatureOrdering(pivot_len); + ordering.initializeRanks(); break; case "lexicographic": - LexicographicOrdering lexicographicOrdering = new LexicographicOrdering(pivot_len); - ordering = lexicographicOrdering; + ordering = new LexicographicOrdering(pivot_len); + ordering.initializeRanks(); break; } From 561da21003a46d21f60de4f162f25b7b446870c5 Mon Sep 17 00:00:00 2001 From: danflomin Date: Sat, 3 Apr 2021 17:51:13 +0300 Subject: [PATCH 33/44] remove UHSSignatureOrdering.java --- .../Ordering/UHS/UHSSignatureOrdering.java | 62 ------------------- 1 file changed, 62 deletions(-) delete mode 100644 src/dumbo/Ordering/UHS/UHSSignatureOrdering.java diff --git a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java deleted file mode 100644 index 3861f4c..0000000 --- a/src/dumbo/Ordering/UHS/UHSSignatureOrdering.java +++ /dev/null @@ -1,62 +0,0 @@ -package dumbo.Ordering.UHS; - -import dumbo.Ordering.Standard.SignatureUtils; - -import java.io.IOException; - -public class UHSSignatureOrdering extends UHSOrderingBase { - private SignatureUtils signatureUtils; - protected boolean useSignature; - protected int xor; - - - public UHSSignatureOrdering(int xor, int pivotLen, boolean useSignature) throws IOException { - super(pivotLen); - this.xor = xor; - this.useSignature = useSignature; - signatureUtils = new SignatureUtils(pivotLen); - } - - public UHSSignatureOrdering(int pivotLen, boolean useSignature) throws IOException { - this(0, pivotLen, useSignature); - } - - - protected int rawCompareMmer(int xNormalized, int yNormalized, boolean xAllowed, boolean yAllowed) { - int baseCompareValue = compareMmerBase(xNormalized, yNormalized); - if (baseCompareValue != BOTH_IN_UHS && baseCompareValue != BOTH_NOT_IN_UHS) { - return baseCompareValue; - } - // from down here - both in UHS - if (useSignature) { - if (!xAllowed && yAllowed) { - return 1; - } else if (!yAllowed && xAllowed) { - return -1; - } - } - // both allowed or both not allowed - if ((xNormalized ^ xor) < (yNormalized ^ xor)) - return -1; - else - return 1; - - } - - @Override - protected int rawCompareMmer(int x, int y) { - int a = stringUtils.getNormalizedValue(x, pivotLength); - int b = stringUtils.getNormalizedValue(y, pivotLength); - - if (a == b) return 0; - - boolean aAllowed = true, bAllowed = true; - if (useSignature) { - aAllowed = signatureUtils.isAllowed(a); - bAllowed = signatureUtils.isAllowed(b); - } - - return rawCompareMmer(a, b, aAllowed, bAllowed); - } - -} From 7469ac48c40d2d4599826cfc1a824e6a89f70260 Mon Sep 17 00:00:00 2001 From: danflomin Date: Sat, 3 Apr 2021 18:14:09 +0300 Subject: [PATCH 34/44] checked that orderings run succesfully, didn;t check products --- src/dumbo/Ordering/IterativeOrdering.java | 4 - src/dumbo/Ordering/OrderingBase.java | 1 - .../Ordering/Standard/FrequencyOrdering.java | 1 - .../UHS/UHSFrequencySignatureOrdering.java | 74 +++++++++---------- src/dumbo/Ordering/UHS/UHSOrderingBase.java | 51 +------------ src/dumbo/OrderingOptimizer.java | 4 +- 6 files changed, 39 insertions(+), 96 deletions(-) diff --git a/src/dumbo/Ordering/IterativeOrdering.java b/src/dumbo/Ordering/IterativeOrdering.java index 48aec04..c82520c 100644 --- a/src/dumbo/Ordering/IterativeOrdering.java +++ b/src/dumbo/Ordering/IterativeOrdering.java @@ -216,8 +216,4 @@ public void initializeRanks() throws Exception { initFrequency(); } - @Override - protected int rawCompareMmer(int x, int y) throws Exception { - return compareMmer(x, y); - } } diff --git a/src/dumbo/Ordering/OrderingBase.java b/src/dumbo/Ordering/OrderingBase.java index 5967a60..ecf76db 100644 --- a/src/dumbo/Ordering/OrderingBase.java +++ b/src/dumbo/Ordering/OrderingBase.java @@ -28,7 +28,6 @@ public OrderingBase(int pivotLength) { public abstract void initializeRanks() throws Exception; - protected abstract int rawCompareMmer(int x, int y) throws Exception; public int compareMmer(int x, int y) throws Exception { if (!isRankInitialized) diff --git a/src/dumbo/Ordering/Standard/FrequencyOrdering.java b/src/dumbo/Ordering/Standard/FrequencyOrdering.java index 8d9ef21..df8b634 100644 --- a/src/dumbo/Ordering/Standard/FrequencyOrdering.java +++ b/src/dumbo/Ordering/Standard/FrequencyOrdering.java @@ -71,7 +71,6 @@ public void initializeRanks() throws Exception { isRankInitialized = true; } - @Override public int rawCompareMmer(int x, int y) { int a = stringUtils.getNormalizedValue(x, pivotLength); int b = stringUtils.getNormalizedValue(y, pivotLength); diff --git a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java index be37cd4..3c4b875 100644 --- a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java +++ b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java @@ -3,12 +3,15 @@ import dumbo.Ordering.Standard.SignatureUtils; import java.io.*; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashSet; public class UHSFrequencySignatureOrdering extends UHSOrderingBase { private String inputFile; private int readLen; private int bufSize; - private long[] mmerFrequency; + private int[] mmerFrequency; private int numMmersToCount; private SignatureUtils signatureUtils; @@ -20,7 +23,7 @@ public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, i this.inputFile = infile; this.readLen = readLen; this.bufSize = bufSize; - this.mmerFrequency = new long[numMmers]; + this.mmerFrequency = new int[numMmers]; this.numMmersToCount = numMmersToCount; this.useSignature = useSignature; @@ -29,56 +32,47 @@ public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, i @Override public void initializeRanks() throws Exception { - countFrequency(); - super.initializeRanks(); - isRankInitialized = true; - } + System.out.println("start init rank"); - @Override - protected int rawCompareMmer(int x, int y) { - int a = stringUtils.getNormalizedValue(x, pivotLength); - int b = stringUtils.getNormalizedValue(y, pivotLength); + countFrequency(); + Arrays.fill(mmerRanks, Integer.MAX_VALUE); - if (a == b) return 0; + int idx = 0; - boolean aAllowed = true, bAllowed = true; - if (useSignature) { - aAllowed = signatureUtils.isAllowed(a); - bAllowed = signatureUtils.isAllowed(b); + HashSet normalizedAllowedMmersUHS = new HashSet<>(); + for (int i = 0; i < numMmers; i++) { + if (isInUHS(i) && (!useSignature || signatureUtils.isAllowed(i))) + normalizedAllowedMmersUHS.add(i); } - - return rawCompareMmer(a, b, aAllowed, bAllowed); - } - - protected int rawCompareMmer(int xNormalized, int yNormalized, boolean xAllowed, boolean yAllowed) { - int baseCompareValue = compareMmerBase(xNormalized, yNormalized); - if (baseCompareValue != BOTH_IN_UHS && baseCompareValue != BOTH_NOT_IN_UHS) { - return baseCompareValue; + Integer[] allowedMmers = new Integer[normalizedAllowedMmersUHS.size()]; + normalizedAllowedMmersUHS.toArray(allowedMmers); + Arrays.sort(allowedMmers, Comparator.comparingInt(a -> mmerFrequency[a])); + for (int i = 0; i < allowedMmers.length; i++) { + mmerRanks[allowedMmers[i]] = idx; + idx++; } - // from down here - both in UHS - if (useSignature) { - if (!xAllowed && yAllowed) { - return 1; - } else if (!yAllowed && xAllowed) { - return -1; + HashSet normalizedNotAllowedMmersUHS = new HashSet<>(); + for (int i = 0; i < numMmers; i++) { + if (isInUHS(i) && (!signatureUtils.isAllowed(i))) + normalizedNotAllowedMmersUHS.add(i); + } + Integer[] notAllowedMmers = new Integer[normalizedNotAllowedMmersUHS.size()]; + normalizedNotAllowedMmersUHS.toArray(notAllowedMmers); + Arrays.sort(notAllowedMmers, Comparator.comparingInt(a -> mmerFrequency[a])); + for (int i = 0; i < notAllowedMmers.length; i++) { + mmerRanks[notAllowedMmers[i]] = idx; + idx++; } } - // both allowed or both not allowed - if (mmerFrequency[xNormalized] == mmerFrequency[yNormalized]) { - if (xNormalized < yNormalized) - return -1; - else - return 1; - } else if (mmerFrequency[xNormalized] < mmerFrequency[yNormalized]) - return -1; - else - return 1; - + normalize(); + System.out.println("finish init rank"); + isRankInitialized = true; } + private void countFrequency() throws IOException { FileReader frG = new FileReader(inputFile); BufferedReader bfrG = new BufferedReader(frG, bufSize); diff --git a/src/dumbo/Ordering/UHS/UHSOrderingBase.java b/src/dumbo/Ordering/UHS/UHSOrderingBase.java index 23154f8..7163188 100644 --- a/src/dumbo/Ordering/UHS/UHSOrderingBase.java +++ b/src/dumbo/Ordering/UHS/UHSOrderingBase.java @@ -5,13 +5,10 @@ import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashSet; public abstract class UHSOrderingBase extends OrderingBase { - protected byte[] uhsBits; + protected byte[] normalizedUHS; protected static final int BOTH_IN_UHS = 824; protected static final int BOTH_NOT_IN_UHS = 1001; @@ -19,19 +16,12 @@ public abstract class UHSOrderingBase extends OrderingBase { public UHSOrderingBase(int pivotLen) throws IOException { super(pivotLen); - uhsBits = uhsBitSet(pivotLen); - Arrays.fill(mmerRanks, Integer.MAX_VALUE); + normalizedUHS = uhsBitSet(pivotLen); } - public boolean isInUHS(int pmerDecimal) { - int pmerDecimalDiv8 = pmerDecimal >> 3; - int pmerDecimalMod8 = pmerDecimal & 0b111; - if (((this.uhsBits[pmerDecimalDiv8] >> (pmerDecimalMod8)) & 1) == 1) { - return true; - } - return false; + return normalizedUHS[pmerDecimal] == 1; } @@ -50,17 +40,6 @@ protected int compareMmerBase(int xNormalized, int yNormalized) { return BOTH_NOT_IN_UHS; } - @Override - protected int rawCompareMmer(int x, int y) { - int a = stringUtils.getNormalizedValue(x, pivotLength); - int b = stringUtils.getNormalizedValue(y, pivotLength); - - int result = compareMmerBase(a, b); - if (result == -1 || result == 1) - return result; - return Integer.compare(a, b); - } - private byte[] uhsBitSet(int pivotLen) throws IOException { int i = 0; byte[] bits = new byte[numMmers]; @@ -88,29 +67,5 @@ private byte[] uhsBitSet(int pivotLen) throws IOException { return bits; } - @Override - public void initializeRanks() throws Exception { - System.out.println("start init rank"); - HashSet normalizedMmersUHS = new HashSet<>(); - for (int i = 0; i < numMmers; i++) { - if (isInUHS(i)) normalizedMmersUHS.add(i); - } - - Integer[] mmers = new Integer[normalizedMmersUHS.size()]; - normalizedMmersUHS.toArray(mmers); - try { - Arrays.sort(mmers, this::rawCompareMmer); - } catch (Exception e) { - throw e; - } - - for (int i = 0; i < mmers.length; i++) { - mmerRanks[mmers[i]] = i; - } - normalize(); - System.out.println("finish init rank"); - isRankInitialized = true; - } - } diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 92e23d3..d3b05b8 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -117,12 +117,12 @@ else if (args[i].equals("-punishPercentage")) //// ordering = ordering10; // break; case "universal-frequency-signature": - UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, k, statSamples); + UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, statSamples); universalFrequencySignature.initializeRanks(); ordering = universalFrequencySignature; break; case "universal-frequency": - UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, k, statSamples); + UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, statSamples); ; universalFrequency.initializeRanks(); ordering = universalFrequency; From 58c9576eebbdc4eaef793410e425449bc3976739 Mon Sep 17 00:00:00 2001 From: danflomin Date: Sun, 4 Apr 2021 11:37:45 +0300 Subject: [PATCH 35/44] lalala --- src/dumbo/OrderingOptimizer.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index d3b05b8..fd77b41 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -90,7 +90,7 @@ else if (args[i].equals("-punishPercentage")) ordering = iterative; break; case "9-frequency": - FrequencyOrdering _frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound); + FrequencyOrdering _frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, 100000000); _frequencyOrdering.initializeRanks(); IterativeOrdering iterativeFrequency = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, @@ -117,12 +117,12 @@ else if (args[i].equals("-punishPercentage")) //// ordering = ordering10; // break; case "universal-frequency-signature": - UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, statSamples); + UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, 100000000); universalFrequencySignature.initializeRanks(); ordering = universalFrequencySignature; break; case "universal-frequency": - UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, statSamples); + UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, 100000000); ; universalFrequency.initializeRanks(); ordering = universalFrequency; From a9dfd5e627406a0de2af2d07549520bc6ff194bb Mon Sep 17 00:00:00 2001 From: danflomin Date: Mon, 12 Apr 2021 21:55:58 +0300 Subject: [PATCH 36/44] checkpoint this stuff and move to new dumbo idea --- src/dumbo/BinSizeCounter.java | 127 +++++++++++ src/dumbo/LoadCounter.java | 210 ++++++++++++++++++ .../Ordering/Standard/FrequencyOrdering.java | 1 + src/dumbo/OrderingOptimizer.java | 35 ++- 4 files changed, 365 insertions(+), 8 deletions(-) create mode 100644 src/dumbo/BinSizeCounter.java create mode 100644 src/dumbo/LoadCounter.java diff --git a/src/dumbo/BinSizeCounter.java b/src/dumbo/BinSizeCounter.java new file mode 100644 index 0000000..78ec113 --- /dev/null +++ b/src/dumbo/BinSizeCounter.java @@ -0,0 +1,127 @@ +package dumbo; + +import dumbo.Ordering.OrderingBase; +import net.agkn.hll.HLL; +import net.agkn.hll.HLLType; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.util.HashMap; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; + +public class BinSizeCounter { + private String inputFile; + private int readLen; + private int bufSize; + private int k; + + private int[] frequencies; + private OrderingBase ordering; + + private StringUtils stringUtils; + + private int pivotLength; + private long statisticsSamples; + + private int mask; + private int numMmers; + + + public BinSizeCounter( + int pivotLength, String infile, int readLen, int bufSize, int k, long statisticsSamples, OrderingBase ordering) { + this.pivotLength = pivotLength; + this.statisticsSamples = statisticsSamples; + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.k = k; + numMmers = (int) Math.pow(4, pivotLength); + frequencies = new int[numMmers]; + this.ordering = ordering; + stringUtils = new StringUtils(); + mask = numMmers - 1; + } + + + private void concurrentCounter(char[] lineCharArray) throws Exception { + int min_pos, minValue, minValueNormalized, currentValue; + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = ordering.findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + + + frequencies[minValueNormalized] += k; + + int bound = readLen - k + 1; + for (int i = 1; i < bound; i++) { + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; + + if (i > min_pos) { + min_pos = ordering.findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + } else { + int lastIndexInWindow = k + i - pivotLength; + if (ordering.compareMmer(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + } + } + frequencies[minValueNormalized]++; + } + } + } + + + protected void initFrequency() throws Exception { + + + boolean keepSample = true; + long numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + + String describeline; + char[] lineCharArray = new char[readLen]; + + ThreadPoolExecutor executor = + (ThreadPoolExecutor) Executors.newFixedThreadPool(1); + + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + + concurrentCounter(lineCharArray); + numSampled += readLen - k; + if (numSampled > statisticsSamples) + keepSample = false; + + } + + executor.shutdown(); + bfrG.close(); + frG.close(); + } + + + public long[] getStatistics() { + long[] stats = new long[numMmers]; + for (int i = 0; i < numMmers; i++) { + stats[i] = frequencies[i]; + } + return stats; + } + + +} diff --git a/src/dumbo/LoadCounter.java b/src/dumbo/LoadCounter.java new file mode 100644 index 0000000..eb2f979 --- /dev/null +++ b/src/dumbo/LoadCounter.java @@ -0,0 +1,210 @@ +package dumbo; + +import dumbo.Ordering.OrderingBase; +import dumbo.Ordering.Standard.SignatureUtils; +import dumbo.StringUtils; +import net.agkn.hll.HLL; +import net.agkn.hll.HLLType; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; + +import java.security.MessageDigest; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; + +public class LoadCounter { + private String inputFile; + private int readLen; + private int bufSize; + private int k; + private HashMap frequency; + private Object[] frequencyLocks; + private int[] frequencies; + private OrderingBase ordering; + + private StringUtils stringUtils; + + private int pivotLength; + private long statisticsSamples; + + private int mask; + private int numMmers; + + + + + public LoadCounter( + int pivotLength, String infile, int readLen, int bufSize, int k, long statisticsSamples, OrderingBase ordering) { + this.pivotLength = pivotLength; + this.statisticsSamples = statisticsSamples; + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.k = k; + numMmers = (int) Math.pow(4, pivotLength); + frequency = new HashMap<>(numMmers); + frequencies = new int[numMmers]; + this.ordering = ordering; + stringUtils = new StringUtils(); + mask = numMmers - 1; + frequencyLocks = new Object[numMmers + 1]; + for (int i = 0; i < frequencyLocks.length - 1; i++) { + frequencyLocks[i] = new Object(); + } + } + + + private void concurrentCounter(char[] lineCharArray) throws Exception { + int min_pos, minValue, minValueNormalized, currentValue, numSampled = 0; + + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + min_pos = ordering.findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + + updateStatistics(minValueNormalized, line, 0); + + int bound = readLen - k + 1; + for (int i = 1; i < bound; i++) { + numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; + + if (i > min_pos) { + min_pos = ordering.findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + } else { + int lastIndexInWindow = k + i - pivotLength; + if (ordering.compareMmer(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + } + } + updateStatistics(minValueNormalized, line, i); + } + } + } + + + protected void initFrequency() throws Exception { + + + boolean keepSample = true; + long numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + + String describeline; + char[] lineCharArray = new char[readLen]; + + ThreadPoolExecutor executor = + (ThreadPoolExecutor) Executors.newFixedThreadPool(1); + + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + +// char[] localLineCharArray = lineCharArray.clone(); +// executor.submit(() -> { +// concurrentCounter(localLineCharArray); +// return null; +// }); + + concurrentCounter(lineCharArray); + numSampled += readLen - k; + if (numSampled > statisticsSamples) + keepSample = false; + + } + + executor.shutdown(); + bfrG.close(); + frG.close(); + } + + private void updateStatistics(int minValueNormalized, String line, int startPosition) { +// synchronized (frequencyLocks[minValueNormalized]) +// { + if (!frequency.containsKey(minValueNormalized)) +// frequency.put(minValueNormalized, new HLL(11, 5)); /// about 3gb of ram before going to sparse + frequency.put(minValueNormalized, new HLL(11, 5, 0, true, HLLType.FULL)); + frequency.get(minValueNormalized).addRaw(hashString(stringUtils.getCanonical(line.substring(startPosition, k + startPosition)))); +// } + //synchronized (frequencyLocks[numMmers]) + frequencies[minValueNormalized]++; + + + } + + private long hashString(String s) { + long h = 1125899906842597L; // prime + int len = s.length(); + for (int i = 0; i < len; i++) { + h = 31 * h + s.charAt(i); + } + return h; + } + + + public long[] getStatistics() { + long[] stats = new long[numMmers]; + for (int i = 0; i < numMmers; i++) { + if (frequency.containsKey(i)) { + stats[i] = frequency.get(i).cardinality(); + } +// if (i < stringUtils.getReversedMmer(i, pivotLength)) { +// stats[i] += 1000; +// } + } + + + + // pure counters +// System.out.println("x = ["); +// for (int i = 0; i < stats.length; i++) { +// System.out.print(stats[i]+ ", "); +// } +// System.out.println("]"); + + + // all ratios + System.out.println("x = ["); + for (int j = 0; j < stats.length; j++) { + if(frequencies[j] != 0) + System.out.print((float) stats[j] / frequencies[j] + ", "); + else + System.out.print("0, "); + } + System.out.println("]"); +// ConcurrentLinkedQueuex = new ConcurrentLinkedQueue<>(); +// x.remove() + +// long max = Arrays.stream(stats).max().getAsLong(); +// for (int i = 0; i < numMmers; i++) { +// if (stats[i] > 0 && stats[i] * 1.1 > max) { +// stats[i] *= 1.1; +// } +// } + return stats; + } + + +} diff --git a/src/dumbo/Ordering/Standard/FrequencyOrdering.java b/src/dumbo/Ordering/Standard/FrequencyOrdering.java index df8b634..cbcad4f 100644 --- a/src/dumbo/Ordering/Standard/FrequencyOrdering.java +++ b/src/dumbo/Ordering/Standard/FrequencyOrdering.java @@ -65,6 +65,7 @@ public void initializeRanks() throws Exception { } Arrays.sort(mmers, this::rawCompareMmer); +// Arrays.sort(mmers, Comparator.comparingInt(a -> pmerFrequency[a])); for (int i = 0; i < mmers.length; i++) { mmerRanks[mmers[i]] = i; } diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index fd77b41..17fd437 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -18,7 +18,8 @@ public static void main(String[] args) throws Exception { int k = 60, pivot_len = 8, bufferSize = 81920; int readLen = 124; String orderingName = "iterativeOrdering"; - int numRounds = 0, elementsToPush = 0, samplesPerRound = 0, statSamples = 0; + int numRounds = 0, elementsToPush = 0, samplesPerRound = 0; + long statSamples = 0; double punishPercentage = 1; String version = "10"; String kmerSetFile = null; @@ -57,7 +58,7 @@ else if (args[i].equals("-samples")) else if (args[i].equals("-elementsToPush")) elementsToPush = new Integer(args[i + 1]); else if (args[i].equals("-statSamples")) - statSamples = new Integer(args[i + 1]); + statSamples = new Long(args[i + 1]); else if (args[i].equals("-punishPercentage")) punishPercentage = new Double(args[i + 1]); else { @@ -83,7 +84,7 @@ else if (args[i].equals("-punishPercentage")) case "9-normalized": // good version IterativeOrdering iterative = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, - samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, false); + samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false); iterative.initializeRanks(); // ordering9_withCounterNormalized.exportOrderingForCpp(); // ordering9_withCounterNormalized.exportBinningForCpp(); @@ -94,7 +95,7 @@ else if (args[i].equals("-punishPercentage")) _frequencyOrdering.initializeRanks(); IterativeOrdering iterativeFrequency = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, - samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, false, _frequencyOrdering); + samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false, _frequencyOrdering); iterativeFrequency.initializeRanks(); // ordering9_withCounterNormalized.exportOrderingForCpp(); // ordering9_withCounterNormalized.exportBinningForCpp(); @@ -102,7 +103,7 @@ else if (args[i].equals("-punishPercentage")) break; case "9-normalized-signature": IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, - samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage, true); + samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, true); iterativeSignature.initializeRanks(); // ordering9_withCounterNormalized_andSignature.exportOrderingForCpp(); // ordering9_withCounterNormalized_andSignature.exportBinningForCpp(); @@ -142,9 +143,17 @@ else if (args[i].equals("-punishPercentage")) break; } + ExportUtils exportUtils = new ExportUtils(); + + int[] ranks = ordering.getRanks(); + long[] longRanks = new long[ranks.length]; + for (int i = 0; i < longRanks.length; longRanks[i]=ranks[i], i++) ; + + exportUtils.exportOrderingForCpp(longRanks); + if (kmerSetFile != null) { try { - ExportUtils exportUtils = new ExportUtils(); + System.out.println("Counting minimizer appearances:"); System.out.print("Input File: " + kmerSetFile + "\n" + "Kmer Length: " + k + "\n" + @@ -152,10 +161,20 @@ else if (args[i].equals("-punishPercentage")) "R/W Buffer Size: " + bufferSize + "\n" + "Ordering: " + orderingName + "\n"); - MinimizerCounter minimizerCounter = new MinimizerCounter(k, kmerSetFile, pivot_len, bufferSize, ordering); - long[] counters = minimizerCounter.Run(); +// MinimizerCounter minimizerCounter = new MinimizerCounter(k, kmerSetFile, pivot_len, bufferSize, ordering); +// long[] counters = minimizerCounter.Run(); + +// LoadCounter counter = new LoadCounter(pivot_len, infile, readLen, bufferSize, k, statSamples, ordering); +// counter.initFrequency(); + + BinSizeCounter counter = new BinSizeCounter(pivot_len, infile, readLen, bufferSize, k, statSamples, ordering); + counter.initFrequency(); + + long[] counters = counter.getStatistics(); + exportUtils.writeToFile(counters, orderingName + pivot_len + "_" + "kmers"); System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + Arrays.stream(counters).sum()); + exportUtils.exportBinningForCpp(counters); } catch (Exception E) { From 3e0569c6da38bf7f236d5da349bbe1db1e4c812f Mon Sep 17 00:00:00 2001 From: danflomin Date: Sun, 25 Apr 2021 20:47:52 +0300 Subject: [PATCH 37/44] fix bin size counter add split-frequency (doesn't work well) stable version for tests --- src/dumbo/BinSizeCounter.java | 16 +- src/dumbo/LoadCounter.java | 16 +- src/dumbo/Ordering/IterativeOrderingV2.java | 244 ++++++++++++++++++++ src/dumbo/OrderingOptimizer.java | 23 +- 4 files changed, 270 insertions(+), 29 deletions(-) create mode 100644 src/dumbo/Ordering/IterativeOrderingV2.java diff --git a/src/dumbo/BinSizeCounter.java b/src/dumbo/BinSizeCounter.java index 78ec113..7eacfd4 100644 --- a/src/dumbo/BinSizeCounter.java +++ b/src/dumbo/BinSizeCounter.java @@ -59,21 +59,23 @@ private void concurrentCounter(char[] lineCharArray) throws Exception { int bound = readLen - k + 1; for (int i = 1; i < bound; i++) { + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; if (i > min_pos) { min_pos = ordering.findSmallest(lineCharArray, i, i + k); minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); - } else { + frequencies[minValueNormalized] += k; + } else if (ordering.compareMmer(currentValue, minValue) < 0) { int lastIndexInWindow = k + i - pivotLength; - if (ordering.compareMmer(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); - } + min_pos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + frequencies[minValueNormalized] += k; } - frequencies[minValueNormalized]++; + else + frequencies[minValueNormalized]++; } } } diff --git a/src/dumbo/LoadCounter.java b/src/dumbo/LoadCounter.java index eb2f979..a1334da 100644 --- a/src/dumbo/LoadCounter.java +++ b/src/dumbo/LoadCounter.java @@ -186,14 +186,14 @@ public long[] getStatistics() { // all ratios - System.out.println("x = ["); - for (int j = 0; j < stats.length; j++) { - if(frequencies[j] != 0) - System.out.print((float) stats[j] / frequencies[j] + ", "); - else - System.out.print("0, "); - } - System.out.println("]"); +// System.out.println("x = ["); +// for (int j = 0; j < stats.length; j++) { +// if(frequencies[j] != 0) +// System.out.print((float) stats[j] / frequencies[j] + ", "); +// else +// System.out.print("0, "); +// } +// System.out.println("]"); // ConcurrentLinkedQueuex = new ConcurrentLinkedQueue<>(); // x.remove() diff --git a/src/dumbo/Ordering/IterativeOrderingV2.java b/src/dumbo/Ordering/IterativeOrderingV2.java new file mode 100644 index 0000000..d6708d6 --- /dev/null +++ b/src/dumbo/Ordering/IterativeOrderingV2.java @@ -0,0 +1,244 @@ +package dumbo.Ordering; + +import dumbo.Ordering.Standard.SignatureUtils; +import dumbo.StringUtils; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; + +public class IterativeOrderingV2 extends OrderingBase { + private String inputFile; + private int readLen; + private int bufSize; + private int k; + private SignatureUtils signatureUtils; + + private int roundSamples; + private int rounds; + private int elementsToPush; + + private boolean useSignature; + + private boolean initialized; + + private boolean samplingMinimizerFrequency; + private int[] statFrequency; + private int frequencySampledMinimizer; + + + public IterativeOrderingV2( + int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, + int elementsToPush, boolean useSignature) { + super(pivotLength); + this.roundSamples = roundSamples; + this.rounds = rounds; + this.elementsToPush = elementsToPush; + this.useSignature = useSignature; + this.inputFile = infile; + this.readLen = readLen; + this.bufSize = bufSize; + this.k = k; + signatureUtils = new SignatureUtils(pivotLength); + initialized = false; + samplingMinimizerFrequency = false; + } + + public IterativeOrderingV2( + int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, + int elementsToPush, boolean useSignature, int[] initialOrdering) { + this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, useSignature); + mmerRanks = initialOrdering.clone(); + initialized = true; + badArgumentsThrow(); + } + + public IterativeOrderingV2( + int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, + int elementsToPush, boolean useSignature, OrderingBase initialOrdering) throws IOException { + this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, useSignature); + mmerRanks = initialOrdering.getRanks().clone(); + initialized = true; + badArgumentsThrow(); + } + + private void badArgumentsThrow() { + if (mmerRanks.length != numMmers) + throw new IllegalArgumentException("initialOrdering is not of correct size"); + if (useSignature) + throw new IllegalArgumentException("Can't initialize ordering from outside with useSignature as true"); + } + + + protected void initFrequency() throws Exception { + + if (!initialized) { + for (int i = 0; i < numMmers; i++) { + int canonical = Math.min(i, stringUtils.getReversedMmer(i, pivotLength)); + mmerRanks[i] = canonical; + mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] = canonical; + } + if (useSignature) { + for (int i = 0; i < numMmers; i++) { + if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { + mmerRanks[i] += numMmers; + mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; + } + } + } + } + + + boolean keepSample = true; + int numSampled = 0; + int roundNumber = 0; + + FileReader frG = new FileReader(inputFile); + BufferedReader bfrG = new BufferedReader(frG, bufSize); + + statFrequency = new int[numMmers]; + HashMap> pmerFrequency = new HashMap<>(roundSamples * 2); + + String describeline; + char[] lineCharArray = new char[readLen]; + + int len = readLen; + + + int min_pos = -1; + int minValue, currentValue, minValueNormalized; + + while (keepSample && (describeline = bfrG.readLine()) != null) { + + bfrG.read(lineCharArray, 0, readLen); + bfrG.read(); + String line = new String(lineCharArray); + + if (stringUtils.isReadLegal(lineCharArray)) { + + boolean sampledWantedMinimizer = false; + + min_pos = findSmallest(lineCharArray, 0, k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); + + sampledWantedMinimizer = updateStatistics(pmerFrequency, minValueNormalized, line, 0); + + if(sampledWantedMinimizer) + continue; + + int bound = len - k + 1; + for (int i = 1; i < bound; i++) { + if(!samplingMinimizerFrequency) numSampled++; + currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; + + if (i > min_pos) { + min_pos = findSmallest(lineCharArray, i, i + k); + minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + + } else { + int lastIndexInWindow = k + i - pivotLength; + if (compareMmer(currentValue, minValue) < 0) { + min_pos = lastIndexInWindow; + minValue = currentValue; + minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); + } + } + sampledWantedMinimizer = updateStatistics(pmerFrequency, minValueNormalized, line, i); + if(sampledWantedMinimizer) + { + numSampled++; + break; + } + } + } + + if (numSampled >= roundSamples) { + numSampled = 0; + if (samplingMinimizerFrequency) { // 2 iterations is 1 round + adaptOrdering(pmerFrequency); + pmerFrequency.clear(); + samplingMinimizerFrequency = false; + roundNumber++; + roundSamples *= 100; + if (roundNumber == rounds) // TODO: SHOULD THIS BE < and not <= + keepSample = false; + } else { + // find biggest and put it as freq minimizer + HashMap x = new HashMap<>(); + for (Integer i : pmerFrequency.keySet()) { + x.put(i, pmerFrequency.get(i).size()); + } + int biggest = -1, idx = -1; + for (Integer i : x.keySet()) { + if (x.get(i) > biggest) { + biggest = x.get(i); + idx = i; + } + } + roundSamples /= 100; + frequencySampledMinimizer = idx; + samplingMinimizerFrequency = true; + } + } + + } + normalize(); + bfrG.close(); + frG.close(); + } + + private boolean updateStatistics(HashMap> pmerFrequency, int minValueNormalized, String line, int startPosition) { + String canonical = stringUtils.getCanonical(line.substring(startPosition, k + startPosition)); + if (samplingMinimizerFrequency) { + if (minValueNormalized == frequencySampledMinimizer) { + for (int i = 0; i <= canonical.length() - pivotLength; i++) { + int value = stringUtils.getNormalizedValue(stringUtils.getDecimal(canonical.toCharArray(), i, i + pivotLength), pivotLength); + statFrequency[value] += 1; + } + return true; + } + } else { + if (!pmerFrequency.containsKey(minValueNormalized)) + pmerFrequency.put(minValueNormalized, new HashSet<>()); + pmerFrequency.get(minValueNormalized).add(canonical); + } + return false; + } + + + + + private void adaptOrdering(HashMap> pmerFrequency) { + for (int i = 0; i < elementsToPush; i++) { + int biggest = 0; + int biggestIndex = -1; + for (int k = 0; k < statFrequency.length; k++) { + if (k != frequencySampledMinimizer && statFrequency[k] > biggest && mmerRanks[k] < mmerRanks[frequencySampledMinimizer] + 1000) { // TODO: add k is normalized + if ((!pmerFrequency.containsKey(k)) || pmerFrequency.get(k).size() < 0.1 * pmerFrequency.get(frequencySampledMinimizer).size()) { + biggest = statFrequency[k]; + biggestIndex = k; + } + } + } +// TODO: might not be necessary to change both. + int newRank = mmerRanks[frequencySampledMinimizer] - 1; + mmerRanks[biggestIndex] = newRank; + mmerRanks[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = newRank; + statFrequency[biggestIndex] = 0; + statFrequency[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = 0; + } + } + + + @Override + public void initializeRanks() throws Exception { + isRankInitialized = true; + initFrequency(); + } + +} diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 17fd437..173a6c4 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -86,8 +86,6 @@ else if (args[i].equals("-punishPercentage")) IterativeOrdering iterative = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false); iterative.initializeRanks(); -// ordering9_withCounterNormalized.exportOrderingForCpp(); -// ordering9_withCounterNormalized.exportBinningForCpp(); ordering = iterative; break; case "9-frequency": @@ -97,26 +95,24 @@ else if (args[i].equals("-punishPercentage")) IterativeOrdering iterativeFrequency = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false, _frequencyOrdering); iterativeFrequency.initializeRanks(); -// ordering9_withCounterNormalized.exportOrderingForCpp(); -// ordering9_withCounterNormalized.exportBinningForCpp(); ordering = iterativeFrequency; break; + case "split-frequency": + FrequencyOrdering _frequencyOrdering2 = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, 1000000000); + _frequencyOrdering2.initializeRanks(); + + IterativeOrderingV2 iterative2Frequency = new IterativeOrderingV2(pivot_len, infile, readLen, bufferSize, k, + samplesPerRound, numRounds, elementsToPush, false, _frequencyOrdering2); + iterative2Frequency.initializeRanks(); + ordering = iterative2Frequency; + break; case "9-normalized-signature": IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, true); iterativeSignature.initializeRanks(); -// ordering9_withCounterNormalized_andSignature.exportOrderingForCpp(); -// ordering9_withCounterNormalized_andSignature.exportBinningForCpp(); ordering = iterativeSignature; System.out.println("lolz asdasd"); break; -// case "10": -// IterativeOrdering10_WithCounterNormalized ordering10 = new IterativeOrdering10_WithCounterNormalized(pivot_len, infile, readLen, bufferSize, k, samplesPerRound, numRounds, elementsToPush, statSamples, punishPercentage); -// ordering10.initFrequency(); -// ordering10.exportOrderingForCpp(); -// ordering10.exportBinningForCpp(); -//// ordering = ordering10; -// break; case "universal-frequency-signature": UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, 100000000); universalFrequencySignature.initializeRanks(); @@ -124,7 +120,6 @@ else if (args[i].equals("-punishPercentage")) break; case "universal-frequency": UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, 100000000); - ; universalFrequency.initializeRanks(); ordering = universalFrequency; break; From cc70f5d5f131d5888c976c6914d296f255e98c72 Mon Sep 17 00:00:00 2001 From: danflomin Date: Tue, 27 Apr 2021 16:42:44 +0300 Subject: [PATCH 38/44] remove read len requirement from IterativeOrdering and BinSizeCounter add option to choose from cli whether to count load or to collect bin statistics --- src/dumbo/BinSizeCounter.java | 28 +++++++--------- src/dumbo/Ordering/IterativeOrdering.java | 38 ++++++++++++--------- src/dumbo/OrderingOptimizer.java | 40 ++++++++++++----------- 3 files changed, 54 insertions(+), 52 deletions(-) diff --git a/src/dumbo/BinSizeCounter.java b/src/dumbo/BinSizeCounter.java index 7eacfd4..dee16c4 100644 --- a/src/dumbo/BinSizeCounter.java +++ b/src/dumbo/BinSizeCounter.java @@ -12,7 +12,6 @@ public class BinSizeCounter { private String inputFile; - private int readLen; private int bufSize; private int k; @@ -29,11 +28,10 @@ public class BinSizeCounter { public BinSizeCounter( - int pivotLength, String infile, int readLen, int bufSize, int k, long statisticsSamples, OrderingBase ordering) { + int pivotLength, String infile, int bufSize, int k, long statisticsSamples, OrderingBase ordering) { this.pivotLength = pivotLength; this.statisticsSamples = statisticsSamples; this.inputFile = infile; - this.readLen = readLen; this.bufSize = bufSize; this.k = k; numMmers = (int) Math.pow(4, pivotLength); @@ -57,7 +55,7 @@ private void concurrentCounter(char[] lineCharArray) throws Exception { frequencies[minValueNormalized] += k; - int bound = readLen - k + 1; + int bound = lineCharArray.length - k + 1; for (int i = 1; i < bound; i++) { currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; @@ -82,36 +80,32 @@ private void concurrentCounter(char[] lineCharArray) throws Exception { protected void initFrequency() throws Exception { - - boolean keepSample = true; long numSampled = 0; - int roundNumber = 0; FileReader frG = new FileReader(inputFile); BufferedReader bfrG = new BufferedReader(frG, bufSize); - String describeline; - char[] lineCharArray = new char[readLen]; - - ThreadPoolExecutor executor = - (ThreadPoolExecutor) Executors.newFixedThreadPool(1); + String describeline, line; + char[] lineCharArray; + int readLen; while (keepSample && (describeline = bfrG.readLine()) != null) { - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); + line = bfrG.readLine(); + readLen = line.length(); + lineCharArray = line.toCharArray(); + + if(readLen < k) + continue; concurrentCounter(lineCharArray); numSampled += readLen - k; if (numSampled > statisticsSamples) keepSample = false; - } - - executor.shutdown(); bfrG.close(); frG.close(); } diff --git a/src/dumbo/Ordering/IterativeOrdering.java b/src/dumbo/Ordering/IterativeOrdering.java index c82520c..7725809 100644 --- a/src/dumbo/Ordering/IterativeOrdering.java +++ b/src/dumbo/Ordering/IterativeOrdering.java @@ -11,7 +11,7 @@ public class IterativeOrdering extends OrderingBase { private String inputFile; - private int readLen; + private int bufSize; private int k; private SignatureUtils signatureUtils; @@ -32,7 +32,7 @@ public class IterativeOrdering extends OrderingBase { public IterativeOrdering( - int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, + int pivotLength, String infile, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature) { super(pivotLength); this.roundSamples = roundSamples; @@ -42,7 +42,7 @@ public IterativeOrdering( this.percentagePunishment = percentagePunishment; this.useSignature = useSignature; this.inputFile = infile; - this.readLen = readLen; + this.bufSize = bufSize; this.k = k; signatureUtils = new SignatureUtils(pivotLength); @@ -50,18 +50,18 @@ public IterativeOrdering( } public IterativeOrdering( - int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, + int pivotLength, String infile, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, int[] initialOrdering) { - this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); + this(pivotLength, infile, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); mmerRanks = initialOrdering.clone(); initialized = true; badArgumentsThrow(); } public IterativeOrdering( - int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, + int pivotLength, String infile, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment, boolean useSignature, OrderingBase initialOrdering) throws IOException { - this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); + this(pivotLength, infile, bufSize, k, roundSamples, rounds, elementsToPush, statisticsSamples, percentagePunishment, useSignature); mmerRanks = initialOrdering.getRanks().clone(); initialized = true; badArgumentsThrow(); @@ -104,20 +104,26 @@ protected void initFrequency() throws Exception { statFrequency = new long[numMmers]; HashMap> pmerFrequency = new HashMap<>(roundSamples * 2); - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; + String skippedDescribeLine, line; + char[] lineCharArray;// = new char[readLen]; + int readLen; int min_pos = -1; int minValue, currentValue, minValueNormalized; - while (keepSample && (describeline = bfrG.readLine()) != null) { + while (keepSample && (skippedDescribeLine = bfrG.readLine()) != null) { + + line = bfrG.readLine(); + readLen = line.length(); + lineCharArray = line.toCharArray(); + + if(readLen < k) + continue; - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - String line = new String(lineCharArray); +// bfrG.read(lineCharArray, 0, readLen); +// bfrG.read(); +// String line = new String(lineCharArray); if (stringUtils.isReadLegal(lineCharArray)) { @@ -128,7 +134,7 @@ protected void initFrequency() throws Exception { updateStatistics(roundNumber, pmerFrequency, minValueNormalized, line, 0); - int bound = len - k + 1; + int bound = readLen - k + 1; for (int i = 1; i < bound; i++) { numSampled++; currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 173a6c4..ac13976 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -42,6 +42,7 @@ else if (args[i].equals("-k")) k = new Integer(args[i + 1]); else if (args[i].equals("-kmers-file")) kmerSetFile = args[i + 1]; + // else // if(args[i].equals("-o")) // orderingName = args[i+1]; @@ -69,7 +70,7 @@ else if (args[i].equals("-punishPercentage")) } System.out.println("Optimizing an ordering:"); - System.out.print("Input File: " + kmerSetFile + "\n" + + System.out.print("Input File: " + infile + "\n" + "Kmer Length: " + k + "\n" + "Pivot Length: " + pivot_len + "\n" + "R/W Buffer Size: " + bufferSize + "\n" + @@ -83,7 +84,7 @@ else if (args[i].equals("-punishPercentage")) switch (version) { case "9-normalized": // good version - IterativeOrdering iterative = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, + IterativeOrdering iterative = new IterativeOrdering(pivot_len, infile, bufferSize, k, samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false); iterative.initializeRanks(); ordering = iterative; @@ -92,7 +93,7 @@ else if (args[i].equals("-punishPercentage")) FrequencyOrdering _frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, 100000000); _frequencyOrdering.initializeRanks(); - IterativeOrdering iterativeFrequency = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, + IterativeOrdering iterativeFrequency = new IterativeOrdering(pivot_len, infile, bufferSize, k, samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false, _frequencyOrdering); iterativeFrequency.initializeRanks(); ordering = iterativeFrequency; @@ -107,7 +108,7 @@ else if (args[i].equals("-punishPercentage")) ordering = iterative2Frequency; break; case "9-normalized-signature": - IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, readLen, bufferSize, k, + IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, bufferSize, k, samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, true); iterativeSignature.initializeRanks(); ordering = iterativeSignature; @@ -146,6 +147,8 @@ else if (args[i].equals("-punishPercentage")) exportUtils.exportOrderingForCpp(longRanks); + + long[] counters; if (kmerSetFile != null) { try { @@ -153,30 +156,29 @@ else if (args[i].equals("-punishPercentage")) System.out.print("Input File: " + kmerSetFile + "\n" + "Kmer Length: " + k + "\n" + "Pivot Length: " + pivot_len + "\n" + - "R/W Buffer Size: " + bufferSize + "\n" + "Ordering: " + orderingName + "\n"); -// MinimizerCounter minimizerCounter = new MinimizerCounter(k, kmerSetFile, pivot_len, bufferSize, ordering); -// long[] counters = minimizerCounter.Run(); - -// LoadCounter counter = new LoadCounter(pivot_len, infile, readLen, bufferSize, k, statSamples, ordering); -// counter.initFrequency(); - - BinSizeCounter counter = new BinSizeCounter(pivot_len, infile, readLen, bufferSize, k, statSamples, ordering); - counter.initFrequency(); - - long[] counters = counter.getStatistics(); + MinimizerCounter minimizerCounter = new MinimizerCounter(k, kmerSetFile, pivot_len, bufferSize, ordering); + counters = minimizerCounter.Run(); exportUtils.writeToFile(counters, orderingName + pivot_len + "_" + "kmers"); - System.out.println("TOTAL NUMBER OF DISTINCT KMERS = " + Arrays.stream(counters).sum()); - exportUtils.exportBinningForCpp(counters); - - } catch (Exception E) { System.out.println("Exception caught!"); E.printStackTrace(); } } + if (statSamples > 0) { + System.out.println("Collecting stats for binning"); +// LoadCounter counter = new LoadCounter(pivot_len, infile, readLen, bufferSize, k, statSamples, ordering); +// counter.initFrequency(); + + BinSizeCounter counter = new BinSizeCounter(pivot_len, infile, bufferSize, k, statSamples, ordering); + counter.initFrequency(); + + counters = counter.getStatistics(); + exportUtils.exportBinningForCpp(counters); + + } } From 2d94e461a863544e1fd61f39b11ea90af8045cdf Mon Sep 17 00:00:00 2001 From: danflomin Date: Tue, 27 Apr 2021 16:52:46 +0300 Subject: [PATCH 39/44] remove iterative ordering 2 which was meant for frequency optimization --- src/dumbo/Ordering/IterativeOrderingV2.java | 244 -------------------- 1 file changed, 244 deletions(-) delete mode 100644 src/dumbo/Ordering/IterativeOrderingV2.java diff --git a/src/dumbo/Ordering/IterativeOrderingV2.java b/src/dumbo/Ordering/IterativeOrderingV2.java deleted file mode 100644 index d6708d6..0000000 --- a/src/dumbo/Ordering/IterativeOrderingV2.java +++ /dev/null @@ -1,244 +0,0 @@ -package dumbo.Ordering; - -import dumbo.Ordering.Standard.SignatureUtils; -import dumbo.StringUtils; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; -import java.util.HashSet; - -public class IterativeOrderingV2 extends OrderingBase { - private String inputFile; - private int readLen; - private int bufSize; - private int k; - private SignatureUtils signatureUtils; - - private int roundSamples; - private int rounds; - private int elementsToPush; - - private boolean useSignature; - - private boolean initialized; - - private boolean samplingMinimizerFrequency; - private int[] statFrequency; - private int frequencySampledMinimizer; - - - public IterativeOrderingV2( - int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, - int elementsToPush, boolean useSignature) { - super(pivotLength); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.useSignature = useSignature; - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.k = k; - signatureUtils = new SignatureUtils(pivotLength); - initialized = false; - samplingMinimizerFrequency = false; - } - - public IterativeOrderingV2( - int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, - int elementsToPush, boolean useSignature, int[] initialOrdering) { - this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, useSignature); - mmerRanks = initialOrdering.clone(); - initialized = true; - badArgumentsThrow(); - } - - public IterativeOrderingV2( - int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, - int elementsToPush, boolean useSignature, OrderingBase initialOrdering) throws IOException { - this(pivotLength, infile, readLen, bufSize, k, roundSamples, rounds, elementsToPush, useSignature); - mmerRanks = initialOrdering.getRanks().clone(); - initialized = true; - badArgumentsThrow(); - } - - private void badArgumentsThrow() { - if (mmerRanks.length != numMmers) - throw new IllegalArgumentException("initialOrdering is not of correct size"); - if (useSignature) - throw new IllegalArgumentException("Can't initialize ordering from outside with useSignature as true"); - } - - - protected void initFrequency() throws Exception { - - if (!initialized) { - for (int i = 0; i < numMmers; i++) { - int canonical = Math.min(i, stringUtils.getReversedMmer(i, pivotLength)); - mmerRanks[i] = canonical; - mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] = canonical; - } - if (useSignature) { - for (int i = 0; i < numMmers; i++) { - if (!signatureUtils.isAllowed(i) && i < stringUtils.getReversedMmer(i, pivotLength)) { - mmerRanks[i] += numMmers; - mmerRanks[stringUtils.getReversedMmer(i, pivotLength)] += numMmers; - } - } - } - } - - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - statFrequency = new int[numMmers]; - HashMap> pmerFrequency = new HashMap<>(roundSamples * 2); - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue, minValueNormalized; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - String line = new String(lineCharArray); - - if (stringUtils.isReadLegal(lineCharArray)) { - - boolean sampledWantedMinimizer = false; - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - - sampledWantedMinimizer = updateStatistics(pmerFrequency, minValueNormalized, line, 0); - - if(sampledWantedMinimizer) - continue; - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - if(!samplingMinimizerFrequency) numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); - - } else { - int lastIndexInWindow = k + i - pivotLength; - if (compareMmer(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - minValueNormalized = stringUtils.getNormalizedValue(minValue, pivotLength); - } - } - sampledWantedMinimizer = updateStatistics(pmerFrequency, minValueNormalized, line, i); - if(sampledWantedMinimizer) - { - numSampled++; - break; - } - } - } - - if (numSampled >= roundSamples) { - numSampled = 0; - if (samplingMinimizerFrequency) { // 2 iterations is 1 round - adaptOrdering(pmerFrequency); - pmerFrequency.clear(); - samplingMinimizerFrequency = false; - roundNumber++; - roundSamples *= 100; - if (roundNumber == rounds) // TODO: SHOULD THIS BE < and not <= - keepSample = false; - } else { - // find biggest and put it as freq minimizer - HashMap x = new HashMap<>(); - for (Integer i : pmerFrequency.keySet()) { - x.put(i, pmerFrequency.get(i).size()); - } - int biggest = -1, idx = -1; - for (Integer i : x.keySet()) { - if (x.get(i) > biggest) { - biggest = x.get(i); - idx = i; - } - } - roundSamples /= 100; - frequencySampledMinimizer = idx; - samplingMinimizerFrequency = true; - } - } - - } - normalize(); - bfrG.close(); - frG.close(); - } - - private boolean updateStatistics(HashMap> pmerFrequency, int minValueNormalized, String line, int startPosition) { - String canonical = stringUtils.getCanonical(line.substring(startPosition, k + startPosition)); - if (samplingMinimizerFrequency) { - if (minValueNormalized == frequencySampledMinimizer) { - for (int i = 0; i <= canonical.length() - pivotLength; i++) { - int value = stringUtils.getNormalizedValue(stringUtils.getDecimal(canonical.toCharArray(), i, i + pivotLength), pivotLength); - statFrequency[value] += 1; - } - return true; - } - } else { - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(canonical); - } - return false; - } - - - - - private void adaptOrdering(HashMap> pmerFrequency) { - for (int i = 0; i < elementsToPush; i++) { - int biggest = 0; - int biggestIndex = -1; - for (int k = 0; k < statFrequency.length; k++) { - if (k != frequencySampledMinimizer && statFrequency[k] > biggest && mmerRanks[k] < mmerRanks[frequencySampledMinimizer] + 1000) { // TODO: add k is normalized - if ((!pmerFrequency.containsKey(k)) || pmerFrequency.get(k).size() < 0.1 * pmerFrequency.get(frequencySampledMinimizer).size()) { - biggest = statFrequency[k]; - biggestIndex = k; - } - } - } -// TODO: might not be necessary to change both. - int newRank = mmerRanks[frequencySampledMinimizer] - 1; - mmerRanks[biggestIndex] = newRank; - mmerRanks[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = newRank; - statFrequency[biggestIndex] = 0; - statFrequency[stringUtils.getReversedMmer(biggestIndex, pivotLength)] = 0; - } - } - - - @Override - public void initializeRanks() throws Exception { - isRankInitialized = true; - initFrequency(); - } - -} From 5b846161132e1b513183fabf79c1e0fb08ee40f6 Mon Sep 17 00:00:00 2001 From: danflomin Date: Tue, 27 Apr 2021 17:33:58 +0300 Subject: [PATCH 40/44] la lal al a --- .gitignore | 2 ++ src/META-INF/MANIFEST.MF | 4 ---- src/dumbo/BinSizeCounter.java | 2 -- src/dumbo/OrderingOptimizer.java | 9 --------- 4 files changed, 2 insertions(+), 15 deletions(-) delete mode 100644 src/META-INF/MANIFEST.MF diff --git a/.gitignore b/.gitignore index a1c2a23..bd8eb75 100644 --- a/.gitignore +++ b/.gitignore @@ -19,5 +19,7 @@ *.tar.gz *.rar +.idea* + # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* diff --git a/src/META-INF/MANIFEST.MF b/src/META-INF/MANIFEST.MF deleted file mode 100644 index 76e33bd..0000000 --- a/src/META-INF/MANIFEST.MF +++ /dev/null @@ -1,4 +0,0 @@ -Manifest-Version: 1.0 -Main-Class: dumbo.OrderingOptimizer - - diff --git a/src/dumbo/BinSizeCounter.java b/src/dumbo/BinSizeCounter.java index dee16c4..7a84c4a 100644 --- a/src/dumbo/BinSizeCounter.java +++ b/src/dumbo/BinSizeCounter.java @@ -1,8 +1,6 @@ package dumbo; import dumbo.Ordering.OrderingBase; -import net.agkn.hll.HLL; -import net.agkn.hll.HLLType; import java.io.BufferedReader; import java.io.FileReader; diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index ac13976..c2f3897 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -98,15 +98,6 @@ else if (args[i].equals("-punishPercentage")) iterativeFrequency.initializeRanks(); ordering = iterativeFrequency; break; - case "split-frequency": - FrequencyOrdering _frequencyOrdering2 = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, 1000000000); - _frequencyOrdering2.initializeRanks(); - - IterativeOrderingV2 iterative2Frequency = new IterativeOrderingV2(pivot_len, infile, readLen, bufferSize, k, - samplesPerRound, numRounds, elementsToPush, false, _frequencyOrdering2); - iterative2Frequency.initializeRanks(); - ordering = iterative2Frequency; - break; case "9-normalized-signature": IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, bufferSize, k, samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, true); From 7463a6148104a75d211ed9a3e653edd92811fdd7 Mon Sep 17 00:00:00 2001 From: danflomin Date: Tue, 27 Apr 2021 17:59:51 +0300 Subject: [PATCH 41/44] cloned to new directory because of intellij problems add 2 intellij files --- msp.iml | 12 ++++++++++++ src/META-INF/MANIFEST.MF | 3 +++ 2 files changed, 15 insertions(+) create mode 100644 msp.iml create mode 100644 src/META-INF/MANIFEST.MF diff --git a/msp.iml b/msp.iml new file mode 100644 index 0000000..724d33c --- /dev/null +++ b/msp.iml @@ -0,0 +1,12 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/src/META-INF/MANIFEST.MF b/src/META-INF/MANIFEST.MF new file mode 100644 index 0000000..3eb3083 --- /dev/null +++ b/src/META-INF/MANIFEST.MF @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +Main-Class: dumbo.OrderingOptimizer + From dffa81435f8083a2f289cc808db60ffdaa4cf896 Mon Sep 17 00:00:00 2001 From: danflomin Date: Wed, 9 Jun 2021 14:47:56 +0300 Subject: [PATCH 42/44] good version --- msp.iml | 4 ++-- src/dumbo/MinimizerCounter.java | 18 ++++++++++++------ src/dumbo/OrderingOptimizer.java | 31 +++++++++---------------------- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/msp.iml b/msp.iml index 724d33c..b1c5c22 100644 --- a/msp.iml +++ b/msp.iml @@ -1,11 +1,11 @@ - + - + diff --git a/src/dumbo/MinimizerCounter.java b/src/dumbo/MinimizerCounter.java index d91d98a..fbfb63f 100644 --- a/src/dumbo/MinimizerCounter.java +++ b/src/dumbo/MinimizerCounter.java @@ -3,6 +3,7 @@ import dumbo.Ordering.OrderingBase; import java.io.*; +import java.util.*; public class MinimizerCounter { @@ -36,17 +37,24 @@ private long[] getMinimizersCounters() throws Exception { frG = new FileReader(kmerSetFile); bfrG = new BufferedReader(frG, bufSize); - String describeline; + + String describeline, line; int minPos; - char[] lineCharArray = new char[k]; + char[] lineCharArray; int minValue, minValueNormalized, currentValue, start; while ((describeline = bfrG.readLine()) != null) { - bfrG.read(lineCharArray, 0, k); - bfrG.read(); +// bfrG.read(lineCharArray, 0, k); +// bfrG.read(); + + line = bfrG.readLine(); + int readLen = line.length(); + if(readLen != k) + throw new Exception("Input row is not of length k"); + lineCharArray = line.toCharArray(); if (stringUtils.isReadLegal(lineCharArray)) { minPos = ordering.findSmallest(lineCharArray, 0, k); @@ -58,7 +66,6 @@ private long[] getMinimizersCounters() throws Exception { bfrG.close(); frG.close(); - return minimizerCounters.clone(); } @@ -66,7 +73,6 @@ public long[] Run() throws Exception { long time1 = 0; long t1 = System.currentTimeMillis(); System.out.println("Minimizers counting Begin!"); - System.out.println("hi"); long[] counters = getMinimizersCounters(); long t2 = System.currentTimeMillis(); diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index c2f3897..4c1424d 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -3,11 +3,13 @@ import dumbo.Ordering.*; import dumbo.Ordering.Standard.FrequencyOrdering; import dumbo.Ordering.Standard.LexicographicOrdering; +import dumbo.Ordering.Standard.RandomOrdering; import dumbo.Ordering.Standard.LexicographicSignatureOrdering; import dumbo.Ordering.UHS.UHSFrequencySignatureOrdering; import java.io.IOException; import java.util.Arrays; +import java.util.Random; public class OrderingOptimizer { @@ -83,39 +85,19 @@ else if (args[i].equals("-punishPercentage")) System.out.println(version); switch (version) { - case "9-normalized": // good version + case "9-normalized": IterativeOrdering iterative = new IterativeOrdering(pivot_len, infile, bufferSize, k, samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false); iterative.initializeRanks(); ordering = iterative; break; - case "9-frequency": - FrequencyOrdering _frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, 100000000); - _frequencyOrdering.initializeRanks(); - - IterativeOrdering iterativeFrequency = new IterativeOrdering(pivot_len, infile, bufferSize, k, - samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, false, _frequencyOrdering); - iterativeFrequency.initializeRanks(); - ordering = iterativeFrequency; - break; case "9-normalized-signature": IterativeOrdering iterativeSignature = new IterativeOrdering(pivot_len, infile, bufferSize, k, samplesPerRound, numRounds, elementsToPush, 0, punishPercentage, true); iterativeSignature.initializeRanks(); ordering = iterativeSignature; - System.out.println("lolz asdasd"); - break; - case "universal-frequency-signature": - UHSFrequencySignatureOrdering universalFrequencySignature = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, true, 100000000); - universalFrequencySignature.initializeRanks(); - ordering = universalFrequencySignature; - break; - case "universal-frequency": - UHSFrequencySignatureOrdering universalFrequency = new UHSFrequencySignatureOrdering(pivot_len, infile, readLen, bufferSize, false, 100000000); - universalFrequency.initializeRanks(); - ordering = universalFrequency; break; - case "frequency": // FREQUENCY SUCKS + case "frequency": FrequencyOrdering frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound); frequencyOrdering.initializeRanks(); ordering = frequencyOrdering; @@ -128,6 +110,11 @@ else if (args[i].equals("-punishPercentage")) ordering = new LexicographicOrdering(pivot_len); ordering.initializeRanks(); break; + case "random": + Random r = new Random(); + ordering = new RandomOrdering(pivot_len, r.nextInt()); + ordering.initializeRanks(); + break; } ExportUtils exportUtils = new ExportUtils(); From a3e641ba0fdecc7aec8346adcbc4bc19d8d45ba7 Mon Sep 17 00:00:00 2001 From: danflomin Date: Thu, 15 Jul 2021 22:11:48 +0300 Subject: [PATCH 43/44] removed several unnecessary files --- ...rativeOrdering10_WithCounterNormalized.txt | 327 ------------------ .../Ordering/Standard/FrequencyOrdering.java | 91 ----- .../UHS/UHSFrequencySignatureOrdering.java | 106 ------ src/dumbo/Ordering/UHS/UHSOrderingBase.java | 71 ---- src/dumbo/OrderingOptimizer.java | 1 - 5 files changed, 596 deletions(-) delete mode 100644 src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.txt delete mode 100644 src/dumbo/Ordering/Standard/FrequencyOrdering.java delete mode 100644 src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java delete mode 100644 src/dumbo/Ordering/UHS/UHSOrderingBase.java diff --git a/src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.txt b/src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.txt deleted file mode 100644 index 48fc108..0000000 --- a/src/dumbo/Ordering/IterativeOrdering10_WithCounterNormalized.txt +++ /dev/null @@ -1,327 +0,0 @@ -package dumbo.Ordering; - -import dumbo.StringUtils; -import net.agkn.hll.HLL; - -import java.io.*; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; - -public class IterativeOrdering10_WithCounterNormalized implements IOrdering { - private String inputFile; - private int readLen; - private int bufSize; - private int pivotLength; - private int k; - private long[] currentOrdering; - private StringUtils stringUtils; - private HashMap> frequency; - - private int statisticsSamples; - private int roundSamples; - private int rounds; - private int elementsToPush; - - private double percentagePunishment; - - private Integer[] temp = null; - private int mask; - private HashMap statFrequency; - - public IterativeOrdering10_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k, long[] initialOrdering) { - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pivotLength = pivotLength; - this.k = k; - this.currentOrdering = initialOrdering.clone(); - stringUtils = new StringUtils(); - } - - public IterativeOrdering10_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k) { - this(pivotLength, infile, readLen, bufSize, k, new long[(int) Math.pow(4, pivotLength)]); - for (int i = 0; i < (int) Math.pow(4, pivotLength); i++) { - int canonical = Math.min(i, getReversed(i)); - currentOrdering[i] = canonical; - currentOrdering[getReversed(i)] = canonical; - } - roundSamples = 100000; - rounds = 10000; - elementsToPush = 1; - } - - public IterativeOrdering10_WithCounterNormalized(int pivotLength, String infile, int readLen, int bufSize, int k, int roundSamples, int rounds, int elementsToPush, int statisticsSamples, double percentagePunishment) { - this(pivotLength, infile, readLen, bufSize, k); - this.roundSamples = roundSamples; - this.rounds = rounds; - this.elementsToPush = elementsToPush; - this.statisticsSamples = statisticsSamples; - this.percentagePunishment = percentagePunishment; - this.mask = (int) Math.pow(4, pivotLength) - 1; - } - - public String getCanon(String line) { - String x = new String(stringUtils.getReversedRead(line.toCharArray())); - for (int i = 0; i < line.length(); i++) { - if (line.charAt(i) < x.charAt(i)) - return line; - else if (line.charAt(i) > x.charAt(i)) - return x; - } - return x; - } - - private void addToHll(char[] lineCharArray, int minValue) { -// TODO: Add with reversecompliment - if (!statFrequency.containsKey(minValue)) - statFrequency.put(minValue, new HLL(8, 5)); - statFrequency.get(minValue).addRaw(stringUtils.getLDecimal(lineCharArray, 0, k)); - } - - - public void initFrequency() throws IOException { - - boolean keepSample = true; - int numSampled = 0; - int roundNumber = 0; - - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - statFrequency = new HashMap<>(); -// HashSet[] pmerFrequency; -// pmerFrequency = new HashSet()[(int) Math.pow(4, pivotLength)]; - HashMap> pmerFrequency = new HashMap<>((int) Math.pow(4, pivotLength)); - - String describeline; - char[] lineCharArray = new char[readLen]; - - int len = readLen; - - - int min_pos = -1; - int minValue, currentValue, minValueNormalized; - - while (keepSample && (describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - String line = new String(lineCharArray); - - if (stringUtils.isReadLegal(lineCharArray)) { - - min_pos = findSmallest(lineCharArray, 0, k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = Math.min(minValue, getReversed(minValue)); - currentValue = stringUtils.getDecimal(lineCharArray, k - pivotLength, k); - - if (roundNumber == rounds) { - addToHll(lineCharArray, minValueNormalized); - } else { - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(0, k))); // += 1; - } - - int bound = len - k + 1; - for (int i = 1; i < bound; i++) { - numSampled++; - currentValue = ((currentValue << 2) + StringUtils.valTable[lineCharArray[i + k - 1] - 'A']) & mask;//0xffff; - - if (i > min_pos) { - min_pos = findSmallest(lineCharArray, i, i + k); - minValue = stringUtils.getDecimal(lineCharArray, min_pos, min_pos + pivotLength); - minValueNormalized = Math.min(minValue, getReversed(minValue)); - - if (roundNumber == rounds) { - addToHll(lineCharArray, minValueNormalized); - } else { - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; - } - } else { - int lastIndexInWindow = k + i - pivotLength; - if (strcmp(currentValue, minValue) < 0) { - min_pos = lastIndexInWindow; - minValue = currentValue; - minValueNormalized = Math.min(minValue, getReversed(minValue)); - - if (roundNumber == rounds) { - addToHll(lineCharArray, minValueNormalized); - } else { - if (!pmerFrequency.containsKey(minValueNormalized)) - pmerFrequency.put(minValueNormalized, new HashSet<>()); - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; - } - } - } - - if (roundNumber == rounds) { - addToHll(lineCharArray, minValueNormalized); - } else { - pmerFrequency.get(minValueNormalized).add(getCanon(line.substring(i, k + i))); // += 1; - } - } - } - - if (numSampled >= roundSamples) { - roundNumber++; - if (roundNumber <= rounds) { - numSampled = 0; - adaptOrdering(pmerFrequency); -// if(roundNumber % 100 == 0) { -// percentagePunishment *= 0.996; -// normalize(); -// } - pmerFrequency.clear();//new long[(int) Math.pow(4, pivotLength)]; // zero out elements - if (roundNumber == rounds) { - System.out.println("Sampling for binning round"); - roundSamples = statisticsSamples; - } - } else { - keepSample = false; - } - } - frequency = pmerFrequency; - - } - bfrG.close(); - frG.close(); - } - - - private void adaptOrdering(HashMap> pmerFrequency) { - int[] frequencies = new int[(int) Math.pow(4, pivotLength)]; - for (Integer i : pmerFrequency.keySet()) { - frequencies[i] = pmerFrequency.get(i).size(); - } - for (int i = 0; i < elementsToPush; i++) { - long biggest = -1; - int biggestIndex = -1; - for (int k = 0; k < frequencies.length; k++) { - if (frequencies[k] > biggest) { - biggest = frequencies[k]; - biggestIndex = k; - } - } - long newRank = currentOrdering[biggestIndex] + (int) ((int) Math.pow(4, pivotLength) * percentagePunishment); - currentOrdering[biggestIndex] = newRank; - currentOrdering[getReversed(biggestIndex)] = newRank; - frequencies[biggestIndex] = 0; - frequencies[getReversed(biggestIndex)] = 0; - } - } - - private int getReversed(int x) { - int rev = 0; - int immer = ~x; - for (int i = 0; i < pivotLength; ++i) { - rev <<= 2; - rev |= immer & 0x3; - immer >>= 2; - } - return rev; - } - - - @Override - public int findSmallest(char[] a, int from, int to) throws IOException { - int min_pos = from; - for (int i = from + 1; i <= to - pivotLength; i++) { - if (strcmp(a, a, min_pos, i, pivotLength) > 0) - min_pos = i; - } - - return min_pos; - } - - @Override - public int strcmp(char[] a, char[] b, int froma, int fromb, int len) { - int x = stringUtils.getDecimal(a, froma, froma + pivotLength); - int y = stringUtils.getDecimal(b, fromb, fromb + pivotLength); - - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - public int strcmp(int x, int y) { - if (x == y) return 0; - if (currentOrdering[x] < currentOrdering[y]) return -1; - return 1; - } - - private void normalize() { -// currentOrdering - if (temp == null) { - temp = new Integer[currentOrdering.length]; - for (int i = 0; i < temp.length; temp[i] = i, i++) ; - } - Arrays.sort(temp, Comparator.comparingLong(a -> currentOrdering[a])); - for (int i = 0; i < temp.length; i++) { - currentOrdering[i] = temp[i]; - } - } - - - public void exportOrderingForCpp() { - File file = new File("ranks.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - - for (int i = 0; i < currentOrdering.length; i++) { - bf.write(Long.toString(currentOrdering[i])); - bf.newLine(); - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } - - public void exportBinningForCpp() { - File file = new File("freq.txt"); - - BufferedWriter bf = null; - - try { - bf = new BufferedWriter(new FileWriter(file)); - for (int i = 0; i < (int)Math.pow(4, pivotLength); i++) { - if(statFrequency.containsKey(i)) { - bf.write(Long.toString(statFrequency.get(i).cardinality())); - bf.newLine(); - } - else - { - bf.write("0"); - bf.newLine(); - } - } - bf.flush(); - - } catch (IOException e) { - e.printStackTrace(); - } finally { - - try { - //always close the writer - bf.close(); - } catch (Exception e) { - } - } - } -} diff --git a/src/dumbo/Ordering/Standard/FrequencyOrdering.java b/src/dumbo/Ordering/Standard/FrequencyOrdering.java deleted file mode 100644 index cbcad4f..0000000 --- a/src/dumbo/Ordering/Standard/FrequencyOrdering.java +++ /dev/null @@ -1,91 +0,0 @@ -package dumbo.Ordering.Standard; - -import dumbo.Ordering.OrderingBase; - -import java.io.*; -import java.util.Arrays; -import java.util.Comparator; - -public class FrequencyOrdering extends OrderingBase { - private String inputFile; - private int readLen; - private int bufSize; - private int[] pmerFrequency; - private int numSamples; - - public FrequencyOrdering(int pivotLen, String infile, int readLen, int bufSize, int numSamples) { - super(pivotLen); - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.pmerFrequency = new int[numMmers]; - this.numSamples = numSamples; - } - - protected void initFrequency() throws IOException { - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - int counter = 0; - - String describeline; - - char[] lineCharArray = new char[readLen]; - - - while ((describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - for (int i = 0; i < lineCharArray.length - pivotLength; i++) { - - int value = stringUtils.getNormalizedValue(stringUtils.getDecimal(lineCharArray, i, i + pivotLength), pivotLength); - pmerFrequency[value] += 1; - - counter++; - } - if (counter > numSamples) { - break; - } - } - } - bfrG.close(); - frG.close(); - } - - - @Override - public void initializeRanks() throws Exception { - initFrequency(); - Integer[] mmers = new Integer[numMmers]; - for (int i = 0; i < mmers.length; i++) { - mmers[i] = i; - } - - Arrays.sort(mmers, this::rawCompareMmer); -// Arrays.sort(mmers, Comparator.comparingInt(a -> pmerFrequency[a])); - for (int i = 0; i < mmers.length; i++) { - mmerRanks[mmers[i]] = i; - } - isRankInitialized = true; - } - - public int rawCompareMmer(int x, int y) { - int a = stringUtils.getNormalizedValue(x, pivotLength); - int b = stringUtils.getNormalizedValue(y, pivotLength); - if (a == b) return 0; - - if (pmerFrequency[a] == pmerFrequency[b]) { - if (a < b) return -1; - return 1; - } - if (pmerFrequency[a] < pmerFrequency[b]) - return -1; - else - return 1; - } - - -} diff --git a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java b/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java deleted file mode 100644 index 3c4b875..0000000 --- a/src/dumbo/Ordering/UHS/UHSFrequencySignatureOrdering.java +++ /dev/null @@ -1,106 +0,0 @@ -package dumbo.Ordering.UHS; - -import dumbo.Ordering.Standard.SignatureUtils; - -import java.io.*; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashSet; - -public class UHSFrequencySignatureOrdering extends UHSOrderingBase { - private String inputFile; - private int readLen; - private int bufSize; - private int[] mmerFrequency; - private int numMmersToCount; - - private SignatureUtils signatureUtils; - protected boolean useSignature; - - - public UHSFrequencySignatureOrdering(int pivotLen, String infile, int readLen, int bufSize, boolean useSignature, int numMmersToCount) throws IOException { - super(pivotLen); - this.inputFile = infile; - this.readLen = readLen; - this.bufSize = bufSize; - this.mmerFrequency = new int[numMmers]; - this.numMmersToCount = numMmersToCount; - - this.useSignature = useSignature; - this.signatureUtils = new SignatureUtils(pivotLen); - } - - @Override - public void initializeRanks() throws Exception { - System.out.println("start init rank"); - - countFrequency(); - Arrays.fill(mmerRanks, Integer.MAX_VALUE); - - int idx = 0; - - HashSet normalizedAllowedMmersUHS = new HashSet<>(); - for (int i = 0; i < numMmers; i++) { - if (isInUHS(i) && (!useSignature || signatureUtils.isAllowed(i))) - normalizedAllowedMmersUHS.add(i); - } - Integer[] allowedMmers = new Integer[normalizedAllowedMmersUHS.size()]; - normalizedAllowedMmersUHS.toArray(allowedMmers); - Arrays.sort(allowedMmers, Comparator.comparingInt(a -> mmerFrequency[a])); - for (int i = 0; i < allowedMmers.length; i++) { - mmerRanks[allowedMmers[i]] = idx; - idx++; - } - - if (useSignature) { - HashSet normalizedNotAllowedMmersUHS = new HashSet<>(); - for (int i = 0; i < numMmers; i++) { - if (isInUHS(i) && (!signatureUtils.isAllowed(i))) - normalizedNotAllowedMmersUHS.add(i); - } - Integer[] notAllowedMmers = new Integer[normalizedNotAllowedMmersUHS.size()]; - normalizedNotAllowedMmersUHS.toArray(notAllowedMmers); - Arrays.sort(notAllowedMmers, Comparator.comparingInt(a -> mmerFrequency[a])); - for (int i = 0; i < notAllowedMmers.length; i++) { - mmerRanks[notAllowedMmers[i]] = idx; - idx++; - } - } - - normalize(); - System.out.println("finish init rank"); - isRankInitialized = true; - } - - - private void countFrequency() throws IOException { - FileReader frG = new FileReader(inputFile); - BufferedReader bfrG = new BufferedReader(frG, bufSize); - - int counter = 0; - String describeline; - char[] lineCharArray = new char[readLen]; - - while ((describeline = bfrG.readLine()) != null) { - - bfrG.read(lineCharArray, 0, readLen); - bfrG.read(); - - if (stringUtils.isReadLegal(lineCharArray)) { - for (int i = 0; i <= lineCharArray.length - pivotLength; i++) { - - int value = stringUtils.getNormalizedValue(stringUtils.getDecimal(lineCharArray, i, i + pivotLength), pivotLength); - mmerFrequency[value] += 1; - counter++; - } - if (counter > numMmersToCount) { - break; - } - } - } - bfrG.close(); - frG.close(); - } - - -} diff --git a/src/dumbo/Ordering/UHS/UHSOrderingBase.java b/src/dumbo/Ordering/UHS/UHSOrderingBase.java deleted file mode 100644 index 7163188..0000000 --- a/src/dumbo/Ordering/UHS/UHSOrderingBase.java +++ /dev/null @@ -1,71 +0,0 @@ -package dumbo.Ordering.UHS; - -import dumbo.Ordering.OrderingBase; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; - -public abstract class UHSOrderingBase extends OrderingBase { - - protected byte[] normalizedUHS; - - protected static final int BOTH_IN_UHS = 824; - protected static final int BOTH_NOT_IN_UHS = 1001; - - - public UHSOrderingBase(int pivotLen) throws IOException { - super(pivotLen); - normalizedUHS = uhsBitSet(pivotLen); - - } - - public boolean isInUHS(int pmerDecimal) { - return normalizedUHS[pmerDecimal] == 1; - } - - - protected int compareMmerBase(int xNormalized, int yNormalized) { - if (xNormalized == yNormalized) - return 0; - - boolean xInUHS = isInUHS(xNormalized); - boolean yInUHS = isInUHS(yNormalized); - if (xInUHS) { - if (!yInUHS) return -1; - return BOTH_IN_UHS; - } - if (yInUHS) - return 1; - return BOTH_NOT_IN_UHS; - } - - private byte[] uhsBitSet(int pivotLen) throws IOException { - int i = 0; - byte[] bits = new byte[numMmers]; - - String DocksFile = "res_" + pivotLen + ".txt"; - FileReader frG = new FileReader(DocksFile); - int count = 0; - - BufferedReader reader; - try { - reader = new BufferedReader(frG); - String line; - while ((line = reader.readLine()) != null) { - i = stringUtils.getNormalizedValue(stringUtils.getDecimal(line.toCharArray(), 0, pivotLen), pivotLength); - bits[i] = 1; - count++; - } - reader.close(); - } catch (IOException e) { - e.printStackTrace(); - } - System.out.println(count); - frG.close(); - - return bits; - } - - -} diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 4c1424d..41b8cba 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -5,7 +5,6 @@ import dumbo.Ordering.Standard.LexicographicOrdering; import dumbo.Ordering.Standard.RandomOrdering; import dumbo.Ordering.Standard.LexicographicSignatureOrdering; -import dumbo.Ordering.UHS.UHSFrequencySignatureOrdering; import java.io.IOException; import java.util.Arrays; From aeef1aae4a2998b9f40c6000d31176161dbd939c Mon Sep 17 00:00:00 2001 From: danflomin Date: Thu, 15 Jul 2021 22:12:19 +0300 Subject: [PATCH 44/44] fixed removal of frequency --- src/dumbo/OrderingOptimizer.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/dumbo/OrderingOptimizer.java b/src/dumbo/OrderingOptimizer.java index 41b8cba..2adbb9e 100644 --- a/src/dumbo/OrderingOptimizer.java +++ b/src/dumbo/OrderingOptimizer.java @@ -96,11 +96,6 @@ else if (args[i].equals("-punishPercentage")) iterativeSignature.initializeRanks(); ordering = iterativeSignature; break; - case "frequency": - FrequencyOrdering frequencyOrdering = new FrequencyOrdering(pivot_len, infile, readLen, bufferSize, numRounds * samplesPerRound); - frequencyOrdering.initializeRanks(); - ordering = frequencyOrdering; - break; case "signature": ordering = new LexicographicSignatureOrdering(pivot_len); ordering.initializeRanks();