-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathOffTarget.h
More file actions
109 lines (92 loc) · 4.68 KB
/
OffTarget.h
File metadata and controls
109 lines (92 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#pragma once
#include "FileOperations.h"
#include "Score.h"
#include <thread>
using namespace std;
/*
OffTarget is the primary object of the off-target algorithm implementation.
It stores the parsed input arguments, the data read from the CASPERinfo, CSPR and repeats (DB)
files, the query sequences, and the helper objects used for file operations and scoring.
*/
class OffTarget
{
public:
    /* Parse the command line arguments (argc/argv as received by main). */
    void parseInputArguments(int argc, char *argv[]);

    /* Parse the data needed by the algorithm. */
    void getAlgorithmData();

    /* Run the OffTarget algorithm. */
    void run();

private:
    /*
    Input argument variables:
        avgOutput          => average output format: only the off-target score is shown for each query sequence in the output file
        detailedOutput     => detailed output format: provides additional information on the targets found by the algorithm
        maxMismatches      => maximum number of mismatched letters allowed between two sequences
        threshold          => score threshold: scores below this value are not reported
        endo               => endonuclease used
        queryFilePath      => path to the input file holding the sequences to score against the reference organism data
        csprFilePath       => path to the GZIP cspr file of the organism
        sqlFilePath        => path to the SQL repeats file of the organism
        outputFilePath     => path to the output file
        casperInfoFilePath => path to CASPERinfo
        hsuMatrixName      => name of the HSU matrix to parse from CASPERinfo
        three_prime        => true = 3 prime, false = 5 prime
    */
    bool avgOutput = false;
    bool detailedOutput = false;
    int maxMismatches = 0;
    double threshold = 0;
    string endo;
    string queryFilePath;
    string csprFilePath;
    string sqlFilePath;
    string outputFilePath;
    string casperInfoFilePath;
    string hsuMatrixName;
    bool three_prime = true;

    /*
    CASPERinfo variables:
        endoData  => {pam length, 3' length, seed length, 5' length, sequence length}
        hsuMatrix => see CASPERinfo for the HSU matrix structure
        hsuKeys   => fixed list of two-letter keys (presumably the keys of hsuMatrix - confirm in the implementation)
    */
    vector<int> endoData;
    map<string, vector<double>> hsuMatrix;
    vector<string> hsuKeys = { "GT", "AC", "GG", "TG", "TT", "CA", "CT", "GA", "AA", "AG", "TC", "CC" };

    /*
    CSPR file variables:
        uniqueSeqs      => concatenated string of all unique sequences in the CSPR file
        uniqueScores    => score of each unique sequence (one uint8_t per sequence)
        uniqueLocations => location of each sequence from the CSPR file
        uniqueChroms    => chromosome of each sequence from the CSPR file
    */
    string uniqueSeqs;
    vector<uint8_t> uniqueScores;
    vector<long long> uniqueLocations;
    vector<int> uniqueChroms;

    /*
    DB file variables:
        repeatSeqs      => concatenated string of all repeat sequences in the DB file
        repeatScores    => score of each repeat sequence (one uint8_t per sequence)
        repeatLocations => location of each repeat sequence in the DB file
        repeatChroms    => chromosome of each sequence in the DB file
    */
    string repeatSeqs;
    vector<uint8_t> repeatScores;
    vector<long long> repeatLocations;
    vector<int> repeatChroms;

    /*
    Query file variables:
        querySeqs   => concatenated string of all sequences in the query file
        queryScores => score of each query sequence (one uint8_t per sequence)
    */
    string querySeqs;
    vector<uint8_t> queryScores;

    /* FileOperations object - used for all file parsing/writing operations */
    FileOperations FileOp;

    /* Score object used to run the scoring algorithms */
    Score score;

    /* OffTarget scores for the query sequences */
    vector<double> queryOffTargetScores;

    /*
    Off-target analysis for one query sequence: finds similar sequences in the reference organism,
    scores the findings, and writes out the results.
    findSimilars is a wrapper that calls findSimilarsUnique and findSimilarsRepeat for each query sequence.
    currentQuerySeq and currentQueryScore are taken by value; targetScores and targetIndexes are
    passed by reference (presumably filled in by the call - confirm in the implementation).
    */
    void findSimilars(string currentQuerySeq, int currentQueryScore, vector<vector<double> > &targetScores, vector<vector<unsigned long> > &targetIndexes);

    /*
    Off-target analysis of a query sequence against the unique organism data from the CSPR file.
    All parameters are non-const references; the declaration must stay in sync with the
    out-of-line definition.
    */
    void findSimilarsUnique(string &currentQuerySeq, int &currentQueryScore, int &seqLength, vector<double> &runningScores, vector<unsigned long> &targetIndexes);

    /*
    Off-target analysis of a query sequence against the repeat organism data from the DB file.
    Takes the same parameter list as findSimilarsUnique.
    */
    void findSimilarsRepeat(string &currentQuerySeq, int &currentQueryScore, int &seqLength, vector<double> &runningScores, vector<unsigned long> &targetIndexes);

    /*
    Character-wise comparison of two sequences.
    Note: the hsuKeys parameter has the same name as the hsuKeys member and shadows it inside the function.
    The meaning of the bool result is not visible from this header - see the implementation.
    */
    bool getMismatches(string &refSeq, string &currentQuerySeq, vector<int> &mismatchLocations, vector<string> &hsuKeys, int &seqLength);

    /* Reverse complement a single character. */
    char reverseComp(char &c);
};