From 20fc928ad1e510c509c5c64d75ad97eda59cf2f5 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 09:02:16 -0400 Subject: [PATCH 01/27] Initial cleanup --- .gitignore | 2 + PHESANT.Rproj | 13 + WAS/fixOddFieldsToCatMul.r | 37 +- WAS/loadConfounders.r | 193 ++-- WAS/loadData.r | 109 +- WAS/loadIndicatorFields.r | 252 ++--- WAS/loadPhenotypes.r | 168 ++- WAS/loadTraitOfInterest.r | 37 +- WAS/phenomeScan.r | 1 + testWAS/results/forest-binary.pdf | Bin 5117 -> 0 bytes testWAS/results/forest-continuous.pdf | Bin 4309 -> 0 bytes testWAS/results/forest-ordered-logistic.pdf | Bin 4748 -> 0 bytes testWAS/results/modelfit-log-1-3.txt | 12 - testWAS/results/modelfit-log-2-3.txt | 43 - testWAS/results/modelfit-log-3-3.txt | 24 - testWAS/results/modelfit-log-all.txt | 6 - testWAS/results/qqplot.pdf | Bin 4916 -> 0 bytes testWAS/results/results-combined.txt | 26 - testWAS/results/results-linear-1-2.txt | 1 - testWAS/results/results-linear-1-3.txt | 3 - testWAS/results/results-linear-2-3.txt | 1 - testWAS/results/results-linear-3-3.txt | 1 - testWAS/results/results-log-1-3.txt | 13 - testWAS/results/results-log-2-3.txt | 11 - testWAS/results/results-log-3-3.txt | 5 - testWAS/results/results-log-all.txt | 2 +- .../results/results-logistic-binary-1-2.txt | 1 - .../results/results-logistic-binary-1-3.txt | 4 - .../results/results-logistic-binary-2-3.txt | 3 - .../results/results-logistic-binary-3-3.txt | 13 - .../results/results-logistic-binary-all.txt | 31 +- .../results-multinomial-logistic-1-2.txt | 1 - .../results-multinomial-logistic-1-3.txt | 1 - .../results-multinomial-logistic-2-3.txt | 10 - .../results-multinomial-logistic-3-3.txt | 1 - .../results-multinomial-logistic-all.txt | 16 +- .../results/results-ordered-logistic-1-2.txt | 1 - .../results/results-ordered-logistic-1-3.txt | 2 - .../results/results-ordered-logistic-2-3.txt | 4 - .../results/results-ordered-logistic-3-3.txt | 1 - .../results/results-ordered-logistic-all.txt | 8 +- testWAS/results/variable-flow-counts-1-3.txt | 22 - testWAS/results/variable-flow-counts-2-3.txt | 16 - testWAS/results/variable-flow-counts-3-3.txt | 9 - testWAS/results/variable-flow-counts-all.txt | 10 +- .../results/variable-flow-counts-combined.txt | 38 - testWAS/testdata.csv | 1001 ----------------- variable-info/update-outcome-info/toTabs.sh | 0 48 files changed, 414 insertions(+), 1739 deletions(-) create mode 100644 .gitignore create mode 100644 PHESANT.Rproj delete mode 100644 testWAS/results/forest-binary.pdf delete mode 100644 testWAS/results/forest-continuous.pdf delete mode 100644 testWAS/results/forest-ordered-logistic.pdf delete mode 100644 testWAS/results/modelfit-log-1-3.txt delete mode 100644 testWAS/results/modelfit-log-2-3.txt delete mode 100644 testWAS/results/modelfit-log-3-3.txt delete mode 100644 testWAS/results/qqplot.pdf delete mode 100644 testWAS/results/results-combined.txt delete mode 100644 testWAS/results/results-linear-1-2.txt delete mode 100644 testWAS/results/results-linear-1-3.txt delete mode 100644 testWAS/results/results-linear-2-3.txt delete mode 100644 testWAS/results/results-linear-3-3.txt delete mode 100644 testWAS/results/results-log-1-3.txt delete mode 100644 testWAS/results/results-log-2-3.txt delete mode 100644 testWAS/results/results-log-3-3.txt delete mode 100644 testWAS/results/results-logistic-binary-1-2.txt delete mode 100644 testWAS/results/results-logistic-binary-1-3.txt delete mode 100644 testWAS/results/results-logistic-binary-2-3.txt delete mode 100644 testWAS/results/results-logistic-binary-3-3.txt delete mode 100644 testWAS/results/results-multinomial-logistic-1-2.txt delete mode 100644 testWAS/results/results-multinomial-logistic-1-3.txt delete mode 100644 testWAS/results/results-multinomial-logistic-2-3.txt delete mode 100644 testWAS/results/results-multinomial-logistic-3-3.txt delete mode 100644 testWAS/results/results-ordered-logistic-1-2.txt delete mode 100644 testWAS/results/results-ordered-logistic-1-3.txt delete mode 100644 testWAS/results/results-ordered-logistic-2-3.txt delete mode 100644 testWAS/results/results-ordered-logistic-3-3.txt delete mode 100644 testWAS/results/variable-flow-counts-1-3.txt delete mode 100644 testWAS/results/variable-flow-counts-2-3.txt delete mode 100644 testWAS/results/variable-flow-counts-3-3.txt delete mode 100644 testWAS/results/variable-flow-counts-combined.txt delete mode 100644 testWAS/testdata.csv mode change 100644 => 100755 variable-info/update-outcome-info/toTabs.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9bea433 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ + +.DS_Store diff --git a/PHESANT.Rproj b/PHESANT.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/PHESANT.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/WAS/fixOddFieldsToCatMul.r b/WAS/fixOddFieldsToCatMul.r index 84eff53..2fb25c4 100644 --- a/WAS/fixOddFieldsToCatMul.r +++ b/WAS/fixOddFieldsToCatMul.r @@ -28,27 +28,22 @@ # This function changes the format of these variable names from VARID_0_0, VARID_1_0, VARID_2_0 etc (which # is instance format), to VARID_0_0, VARID_0_1, VARID_0_2 etc. (array format) so they can be treated as categorical (multiple) fields. fixOddFieldsToCatMul <- function(data) { - - # examples are variables: 40006, 40011, 40012, 40013 - - # get all variables that need their instances changing to arrays - dataPheno = vl$phenoInfo[which(vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT=="YES-INSTANCES"),]; - - for (i in 1:nrow(dataPheno)) { - varID = dataPheno[i,]$FieldID; - varidString = paste("x",varID,"_", sep=""); - - # get all columns in data dataframe for this variable - colIdxs = which(grepl(varidString,names(data))); - - # change format from xvarid_0_0, xvarid_1_0, xvarid_2_0, to xvarid_0_0, xvarid_0_1, xvarid_0_2 - count = 0; - for (j in colIdxs) { - colnames(data)[j] <- paste(varidString, "0_", count, sep="") - count = count + 1; - } + # examples are variables: 40006, 40011, 40012, 40013 + # get all variables that need their instances changing to arrays + dataPheno = vl$phenoInfo[which(vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT=="YES-INSTANCES"),]; + for (i in 1:nrow(dataPheno)) { + varID = dataPheno[i,]$FieldID; + varidString = paste("x",varID,"_", sep=""); + + # get all columns in data dataframe for this variable + colIdxs = which(grepl(varidString,names(data))); + + # change format from xvarid_0_0, xvarid_1_0, xvarid_2_0, to xvarid_0_0, xvarid_0_1, xvarid_0_2 + count = 0; + for (j in colIdxs) { + colnames(data)[j] <- paste(varidString, "0_", count, sep="") + count = count + 1; + } } - return(data) - } diff --git a/WAS/loadConfounders.r b/WAS/loadConfounders.r index 357dbab..72c5f28 100644 --- a/WAS/loadConfounders.r +++ b/WAS/loadConfounders.r @@ -20,117 +20,102 @@ ## loads confounder variables from phenotype file loadConfounders <- function(phenotypes) { - -if (opt$save==TRUE) { - - # saving not running tests so we add a fake confounder - numRows = nrow(phenotypes) + if (opt$save==TRUE) { + # saving not running tests so we add a fake confounder + numRows = nrow(phenotypes) data = cbind.data.frame(phenotypes$userID, rep(-1, numRows)) colnames(data)[1] <- "userID" colnames(data)[2] <- "conf1" return(data) - -} else { - - if (!is.null(opt$confounderfile)) { - print("Loading confounders from confounder file ...") - - confs = fread(opt$confounderfile, sep=',', header=TRUE, data.table=FALSE) - confs = lapply(confs,function(x) type.convert(as.character(x))) - confs = as.data.frame(confs) - - ## find userID column and change name to userID - idx = which(colnames(confs) == opt$userId) - confs = confs[,c(opt$userID,setdiff(colnames(confs),opt$userID))] - colnames(confs)[1] <- "userID" - - } else { - print("Loading confounders from phenotypes file ...") - confNames = getConfounderNames() - - ##### - ##### extract confounders from data file - confs = fread(opt$phenofile, select=confNames, sep=',', header=TRUE, data.table=FALSE) - confs = lapply(confs,function(x) type.convert(as.character(x))) - confs = as.data.frame(confs) - - ##### - ##### process genetic batch to create genetic chip variable - if (opt$genetic == TRUE) { - genoBatch = confs[,"x22000_0_0"] - - # chip comes from batch field 22000 - genoChip = rep.int(NA,nrow(confs)); - idxForVar = which(genoBatch<0); - genoChip[idxForVar] = 0; - idxForVar = which(genoBatch>=0 & genoBatch<2000); - genoChip[idxForVar] = 1; - - # remove geno batch from and add geno chip to confounders - confs = confs[,-which(names(confs) == "x22000_0_0")] - confs = cbind.data.frame(confs, genoChip) - } - - - ##### - ##### Convert assessment centre to an indicator variable - if (opt$sensitivity==TRUE) { - confs$x54_0_0 = as.factor(confs$x54_0_0) - assCentre = model.matrix(~confs$x54_0_0) - assCentre = assCentre[,2:ncol(assCentre)] - confs = cbind(confs, assCentre) - confs$x54_0_0 = NULL - } - - colnames(confs)[1] <- "userID" - - } - - # remove any rows with no values - print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) - confsComp = complete.cases(confs) - print(paste("Number of INCOMPLETE rows removed from confounder data: ", length(which(confsComp==FALSE)),sep="")) - confs = confs[confsComp==TRUE,] - print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) - - print("Confounder columns:") - print(names(confs)) - - return(confs) -} + } else { + if (!is.null(opt$confounderfile)) { + print("Loading confounders from confounder file ...") + confs = fread(opt$confounderfile, sep=',', header=TRUE, data.table=FALSE) + confs = lapply(confs,function(x) type.convert(as.character(x))) + confs = as.data.frame(confs) + + ## find userID column and change name to userID + idx = which(colnames(confs) == opt$userId) + confs = confs[,c(opt$userID,setdiff(colnames(confs),opt$userID))] + colnames(confs)[1] <- "userID" + } else { + print("Loading confounders from phenotypes file ...") + confNames = getConfounderNames() + ##### + ##### extract confounders from data file + confs = fread(opt$phenofile, select=confNames, sep=',', header=TRUE, data.table=FALSE) + confs = lapply(confs,function(x) type.convert(as.character(x))) + confs = as.data.frame(confs) + + ##### + ##### process genetic batch to create genetic chip variable + if (opt$genetic == TRUE) { + genoBatch = confs[,"x22000_0_0"] + # chip comes from batch field 22000 + genoChip = rep.int(NA,nrow(confs)); + idxForVar = which(genoBatch<0); + genoChip[idxForVar] = 0; + idxForVar = which(genoBatch>=0 & genoBatch<2000); + genoChip[idxForVar] = 1; + + # remove geno batch from and add geno chip to confounders + confs = confs[,-which(names(confs) == "x22000_0_0")] + confs = cbind.data.frame(confs, genoChip) + } + + ##### + ##### Convert assessment centre to an indicator variable + if (opt$sensitivity==TRUE) { + confs$x54_0_0 = as.factor(confs$x54_0_0) + assCentre = model.matrix(~confs$x54_0_0) + assCentre = assCentre[,2:ncol(assCentre)] + confs = cbind(confs, assCentre) + confs$x54_0_0 = NULL + } + colnames(confs)[1] <- "userID" + + } + + # remove any rows with no values + print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) + confsComp = complete.cases(confs) + print(paste("Number of INCOMPLETE rows removed from confounder data: ", length(which(confsComp==FALSE)),sep="")) + confs = confs[confsComp==TRUE,] + print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) + + print("Confounder columns:") + print(names(confs)) + + return(confs) + } } getConfounderNames <- function() { - - ##### - ##### first get vector of confounder names - - # age and sex - confNames = c(opt$userId, "x21022_0_0", "x31_0_0") - - # if genetic trait of interest then adjust for genotype chip - # and also let user choose sensitivity analysis that also adjusts for top 10 genetic principal components and assessment centre - if (opt$genetic == TRUE) { - - confNames = append(confNames, "x22000_0_0") - - if (opt$sensitivity==TRUE) { - confNames = append(confNames, c("x22009_0_1", "x22009_0_2", "x22009_0_3", "x22009_0_4", "x22009_0_5", "x22009_0_6", "x22009_0_7", "x22009_0_8", "x22009_0_9", "x22009_0_10", "x54_0_0")) - print("Adjusting for age, sex, genotype chip, top 10 genetic principal components and assessment centre") - } else { - print("Adjusting for age, sex and genotype chip") - } - } else { - # non genetic trait of interest, then sensitivity adjusts for assessment center - if (opt$sensitivity==TRUE) { - confNames = append(confNames, "x54_0_0") - print("Adjusting for age, sex and assessment centre") - } else { - print("Adjusting for age and sex") - } - } - - return(confNames) + ##### + ##### first get vector of confounder names + # age and sex + confNames = c(opt$userId, "x21022_0_0", "x31_0_0") + + # if genetic trait of interest then adjust for genotype chip + # and also let user choose sensitivity analysis that also adjusts for top 10 genetic principal components and assessment centre + if (opt$genetic == TRUE) { + confNames = append(confNames, "x22000_0_0") + if (opt$sensitivity==TRUE) { + confNames = append(confNames, c("x22009_0_1", "x22009_0_2", "x22009_0_3", "x22009_0_4", "x22009_0_5", "x22009_0_6", "x22009_0_7", "x22009_0_8", "x22009_0_9", "x22009_0_10", "x54_0_0")) + print("Adjusting for age, sex, genotype chip, top 10 genetic principal components and assessment centre") + } else { + print("Adjusting for age, sex and genotype chip") + } + } else { + # non genetic trait of interest, then sensitivity adjusts for assessment center + if (opt$sensitivity==TRUE) { + confNames = append(confNames, "x54_0_0") + print("Adjusting for age, sex and assessment centre") + } else { + print("Adjusting for age and sex") + } + } + return(confNames) } diff --git a/WAS/loadData.r b/WAS/loadData.r index 127ea39..68d4eed 100644 --- a/WAS/loadData.r +++ b/WAS/loadData.r @@ -22,64 +22,57 @@ # creates confounder data frame # returns an object holding these two data frames loadData <- function() { - - library(data.table) - - ##### - ##### validating data - - ## check phenotype file headers - validatePhenotypeInput() - - ## check trait of interest file headers - validateTraitInput() - - ##### - ##### load data - - ## load phenotype - print("Loading phenotypes ...") - phenotype = loadPhenotypes() - - ## load trait of interest - toi <- loadTraitOfInterest(phenotype) - - ## load confounders - conf <- loadConfounders(phenotype) - - ## add trait of interest to phenotype data frame and remove rows with no trait of interest - ## merge in toi with phenotype - keep id list from phenotypes file - phenotype = merge(toi, phenotype, by="userID", all.y=TRUE, all.x=FALSE) - - ## remove any rows with no trait of interest - idxNotEmpty = which(!is.na(phenotype[,"geno"])) - - if (opt$save == TRUE) { - print(paste("Phenotype file has ", nrow(phenotype), " rows.", sep="")) - } else { - print(paste("Phenotype file has ", nrow(phenotype), " rows with ", length(idxNotEmpty), " not NA for trait of interest (",opt$traitofinterest,").", sep="")) - } - - phenotype = phenotype[idxNotEmpty,] - - # match ids from not empty phenotypes list - confsIdx = which(conf$userID %in% phenotype$userID) - conf = conf[confsIdx,] - - if (nrow(phenotype)==0) { - stop("No examples with row in both trait of interest and phenotype files", call.=FALSE) - } else { - print(paste("Phenotype and trait of interest data files merged, with", nrow(phenotype),"examples")) - } - - # some fields are fixed that have a field type as cat single but we want to treat them like cat mult - phenotype = fixOddFieldsToCatMul(phenotype) - - indFields = loadIndicatorFields(colnames(phenotype)) - - d = list(datax=phenotype, confounders=conf, inds=indFields) - return(d) - + library(data.table) + + + ##### validating data + ## check phenotype file headers + validatePhenotypeInput() + + ## check trait of interest file headers + validateTraitInput() + + ##### load data + ## load phenotype + print("Loading phenotypes ...") + phenotype = loadPhenotypes() + + ## load trait of interest + toi <- loadTraitOfInterest(phenotype) + + ## load confounders + conf <- loadConfounders(phenotype) + + ## add trait of interest to phenotype data frame and remove rows with no trait of interest + ## merge in toi with phenotype - keep id list from phenotypes file + phenotype = merge(toi, phenotype, by="userID", all.y=TRUE, all.x=FALSE) + + ## remove any rows with no trait of interest + idxNotEmpty = which(!is.na(phenotype[,"geno"])) + + if (opt$save == TRUE) { + print(paste("Phenotype file has ", nrow(phenotype), " rows.", sep="")) + } else { + print(paste("Phenotype file has ", nrow(phenotype), " rows with ", length(idxNotEmpty), " not NA for trait of interest (",opt$traitofinterest,").", sep="")) + } + + phenotype = phenotype[idxNotEmpty,] + + # match ids from not empty phenotypes list + confsIdx = which(conf$userID %in% phenotype$userID) + conf = conf[confsIdx,] + + if (nrow(phenotype)==0) { + stop("No examples with row in both trait of interest and phenotype files", call.=FALSE) + } else { + print(paste("Phenotype and trait of interest data files merged, with", nrow(phenotype),"examples")) + } + + # some fields are fixed that have a field type as cat single but we want to treat them like cat mult + phenotype = fixOddFieldsToCatMul(phenotype) + indFields = loadIndicatorFields(colnames(phenotype)) + d = list(datax=phenotype, confounders=conf, inds=indFields) + return(d) } diff --git a/WAS/loadIndicatorFields.r b/WAS/loadIndicatorFields.r index 31c94e0..e069c8e 100644 --- a/WAS/loadIndicatorFields.r +++ b/WAS/loadIndicatorFields.r @@ -19,144 +19,134 @@ ## ## load data used for data code default value related field, and categorical multiple indicator field loadIndicatorFields <- function(phenosToTest) { - - print("Loading indicator fields from phenotypes file ...") - - # read pheno file column names - phenoVarsAll = colnames(read.table(opt$phenofile, header=1, nrows=1, sep=',')) - phenoVarsAll = phenoVarsAll[which(phenoVarsAll!=opt$userId)] - - indVars = c(opt$userId) - - ## add indicator variables to pheno data - indVars = addIndicatorVariables(indVars, phenosToTest, phenoVarsAll) - - if (length(indVars)>1) { - # not just user id column - print("Loading required related variable(s):") - print(indVars[2:length(indVars)]) - } - else { - print("No required related variables.") - } - - ## read in the right table columns - data = fread(opt$phenofile, select=indVars, sep=',', header=TRUE, data.table=FALSE) - data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) - colnames(data)[1] <- "userID" - return(data) + print("Loading indicator fields from phenotypes file ...") + + # read pheno file column names + phenoVarsAll = colnames(read.table(opt$phenofile, header=1, nrows=1, sep=',')) + phenoVarsAll = phenoVarsAll[which(phenoVarsAll!=opt$userId)] + indVars = c(opt$userId) + + ## add indicator variables to pheno data + indVars = addIndicatorVariables(indVars, phenosToTest, phenoVarsAll) + if (length(indVars)>1) { + # not just user id column + print("Loading required related variable(s):") + print(indVars[2:length(indVars)]) + } else { + print("No required related variables.") + } + + ## read in the right table columns + data = fread(opt$phenofile, select=indVars, sep=',', header=TRUE, data.table=FALSE) + data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) + colnames(data)[1] <- "userID" + return(data) } addIndicatorVariables <- function(indVars, phenosToTest, phenoVarsAll) { - ##### - ##### default value related fields for data codes - # get list of all indicator variables from outcome info file - - # get datacodes with an indicator variable - dataCodeIdx = which(!is.na(vl$dataCodeInfo$default_related_field) & vl$dataCodeInfo$default_related_field!="") - - # whether there are any related fields that are not in the phenotype data file when they should be - hasIssue=FALSE - - if (length(dataCodeIdx)>0) { - dataCodeWithRF = vl$dataCodeInfo[dataCodeIdx,] - - defaultFields = c() - - # check there is a field in the phenotypes data for each data code and if so then add this data codes related field to the phenosToTest list - if (nrow(dataCodeWithRF)>0) { - for (i in 1:nrow(dataCodeWithRF)) { - dc = dataCodeWithRF$dataCode[i] - - # get all fields with this datacode - fieldsIdx = which(vl$phenoInfo$DATA_CODING == dc) - fieldIDs = vl$phenoInfo$FieldID[fieldsIdx] - fieldIDs = paste("x",fieldIDs,"_0_0", sep="") - - # datacode related field - rf = dataCodeWithRF$default_related_field[i] - rf = paste("x",rf,"_0_0", sep="") - - # if one of these field IDs are in phenotypeColumns then data code related field is needed - if (length(intersect(fieldIDs, phenosToTest))>0) { - defaultFields = append(defaultFields, rf) - } - } - } - - defaultFields = unique(defaultFields) - indVars = append(indVars, defaultFields) - - ##### - ##### check these required variables exist in phenotype file + ##### default value related fields for data codes + # get list of all indicator variables from outcome info file + + # get datacodes with an indicator variable + dataCodeIdx = which(!is.na(vl$dataCodeInfo$default_related_field) & vl$dataCodeInfo$default_related_field!="") + + # whether there are any related fields that are not in the phenotype data file when they should be + hasIssue=FALSE + + if (length(dataCodeIdx)>0) { + dataCodeWithRF = vl$dataCodeInfo[dataCodeIdx,] + defaultFields = c() + + # check there is a field in the phenotypes data for each data code and if so then add this data codes related field to the phenosToTest list + if (nrow(dataCodeWithRF)>0) { + for (i in 1:nrow(dataCodeWithRF)) { + dc = dataCodeWithRF$dataCode[i] + + # get all fields with this datacode + fieldsIdx = which(vl$phenoInfo$DATA_CODING == dc) + fieldIDs = vl$phenoInfo$FieldID[fieldsIdx] + fieldIDs = paste("x",fieldIDs,"_0_0", sep="") + + # datacode related field + rf = dataCodeWithRF$default_related_field[i] + rf = paste("x",rf,"_0_0", sep="") + + # if one of these field IDs are in phenotypeColumns then data code related field is needed + if (length(intersect(fieldIDs, phenosToTest))>0) { + defaultFields = append(defaultFields, rf) + } + } + } + + defaultFields = unique(defaultFields) + indVars = append(indVars, defaultFields) + + ##### check these required variables exist in phenotype file if(length(defaultFields)>0) { - for (i in 1:length(defaultFields)) { - if (!(defaultFields[i] %in% phenoVarsAll)) { - print(paste("Required variable: Field ",defaultFields[i],"is a data code related field (default_related_field column in data code information file) but was not found in phenotype data")) - hasIssue=TRUE - } - } + for (i in 1:length(defaultFields)) { + if (!(defaultFields[i] %in% phenoVarsAll)) { + print(paste("Required variable: Field ",defaultFields[i],"is a data code related field (default_related_field column in data code information file) but was not found in phenotype data")) + hasIssue=TRUE + } + } } - } - - ##### - ##### categorical multiple indicator fields - - # get field info, for fields with cat mult indicator fields - fieldsIdx = which(!is.na(vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS)) - - if (length(fieldsIdx)>0) { - fieldsWithCMIF = vl$phenoInfo[fieldsIdx,] - fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "NO_NAN"),] - fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "ALL"),] - fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == ""),] - - if (nrow(fieldsWithCMIF)>0) { - - # turn into variable format not field ID - fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS = paste("x",fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS,"_0_0", sep="") - fieldsWithCMIF$FieldID = paste("x",fieldsWithCMIF$FieldID,"_", sep="") - phenosToTestIds = sub("_.*", "_", phenosToTest) - - # remove rows where the field isn't in the phenotypes list - idxIn = which(fieldsWithCMIF$FieldID %in% phenosToTestIds) - fieldsWithCMIF = fieldsWithCMIF[idxIn,] - - defaultFields = fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS - defaultFields = unique(defaultFields) - - ##### - ##### remove those that already exist in phenotypes 'part' - idxExists = which(defaultFields %in% indVars) - if (length(idxExists>0)) { - defaultFields = defaultFields[-idxExists] - } - - indVars = unique(append(indVars, defaultFields)) - - ##### - ##### check these required variables exist in phenotype file - if(length(defaultFields)>0) { - for (i in 1:length(defaultFields)) { - if (!(defaultFields[i] %in% phenoVarsAll)) { - print(paste("Required variable: Field ",defaultFields[i],"is a categorical multiple indicator field (CAT_MULT_INDICATOR_FIELDS column in variable information file) but was not found in phenotype data")) - hasIssue=TRUE - } - } - } - - } - } - - # stop script if there are missing variables - if (hasIssue==TRUE) { - print("!!! PHESANT has stopped - add required variables to phenotype file or remove relevant phenotypes (so that required variables are not needed).") - quit() - } - - return(indVars) + } + + ##### categorical multiple indicator fields + # get field info, for fields with cat mult indicator fields + fieldsIdx = which(!is.na(vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS)) + + if (length(fieldsIdx)>0) { + fieldsWithCMIF = vl$phenoInfo[fieldsIdx,] + fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "NO_NAN"),] + fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "ALL"),] + fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == ""),] + + if (nrow(fieldsWithCMIF)>0) { + # turn into variable format not field ID + fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS = paste("x",fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS,"_0_0", sep="") + fieldsWithCMIF$FieldID = paste("x",fieldsWithCMIF$FieldID,"_", sep="") + phenosToTestIds = sub("_.*", "_", phenosToTest) + + # remove rows where the field isn't in the phenotypes list + idxIn = which(fieldsWithCMIF$FieldID %in% phenosToTestIds) + fieldsWithCMIF = fieldsWithCMIF[idxIn,] + + defaultFields = fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS + defaultFields = unique(defaultFields) + + ##### + ##### remove those that already exist in phenotypes 'part' + idxExists = which(defaultFields %in% indVars) + if (length(idxExists>0)) { + defaultFields = defaultFields[-idxExists] + } + + indVars = unique(append(indVars, defaultFields)) + + ##### + ##### check these required variables exist in phenotype file + if(length(defaultFields)>0) { + for (i in 1:length(defaultFields)) { + if (!(defaultFields[i] %in% phenoVarsAll)) { + print(paste("Required variable: Field ",defaultFields[i],"is a categorical multiple indicator field (CAT_MULT_INDICATOR_FIELDS column in variable information file) but was not found in phenotype data")) + hasIssue=TRUE + } + } + } + + } + } + + # stop script if there are missing variables + if (hasIssue==TRUE) { + print("!!! PHESANT has stopped - add required variables to phenotype file or remove relevant phenotypes (so that required variables are not needed).") + quit() + } + + return(indVars) } diff --git a/WAS/loadPhenotypes.r b/WAS/loadPhenotypes.r index c689ebe..4276242 100644 --- a/WAS/loadPhenotypes.r +++ b/WAS/loadPhenotypes.r @@ -20,106 +20,90 @@ ## load phenotypes from phenotype file loadPhenotypes <- function() { - - - ## is not running 'all' then we determine the start and end idxs of phenotypes that we test, so that we can parallelise into multiple jobs - if (opt$varTypeArg!="all") { - - # read pheno file column names + ## is not running 'all' then we determine the start and end idxs of phenotypes that we test, so that we can parallelise into multiple jobs + if (opt$varTypeArg!="all") { + # read pheno file column names phenoVars = read.table(opt$phenofile, header=0, nrows=1, sep=',') phenoVars = phenoVars[which(phenoVars!=opt$userId)] - - ##### - ##### calculate part start and end - - partSize = ceiling(length(phenoVars)/opt$numParts); - partStart = (opt$partIdx-1)*partSize + 1; - - if (opt$partIdx == opt$numParts) { - partEnd = length(phenoVars); - } else { - partEnd = partStart + partSize - 1; - } - - print(paste(partStart, '-', partEnd)); - - - ##### - ##### find range of columns to read in - - ## This is more complicated than just reading in a column range, because we need to determine cut points - ## such that all columns of a particular field are loaded. - ## A field is included in a 'part' if its last column is within the part range. - ## e.g. for part 2 of 5 parts and for 100 columns, then fields having their last column at position 21 - 40 (i.e. its column index) are included in this part. - - ## user ID always included - phenosToTest = c(opt$userId) - - currentVar="" - currentVarLong="" - currentVarShort="" - first=TRUE - phenoIdx=0 - - # all columns for a particular field - thisPhenoToTest = c() - - for (var in phenoVars) { - - varx = gsub("^x", "", var); - varx = gsub("_[0-9]+$", "", varx); - varxShort = gsub("^x", "", var); - varxShort = gsub("_[0-9]+_[0-9]+$", "", varxShort); - currentVarLong = var - - if (currentVar == varx) { # same variable same timepoint - # add current var to pheno list - thisPhenoToTest = append(thisPhenoToTest, as.character(currentVarLong)) - } - else if (currentVarShort == varxShort) { # save var, diff timepoint - ## different time point of this var so skip in testing but add here because some are fixed to cat mult - thisPhenoToTest = append(thisPhenoToTest, as.character(currentVarLong)) - } else { - ## new variable so run test for previous (we have collected all the columns now) - - if (first==FALSE) { - if (phenoIdx>=partStart && phenoIdx<=partEnd) { # only start new variable processing if last column of it is within the idx range for this part - phenosToTest = append(phenosToTest, thisPhenoToTest) - } - } - - first=FALSE; - - ## new variable so set values - currentVar = varx - currentVarShort = varxShort - thisPhenoToTest = c(as.character(currentVarLong)) + ##### + ##### calculate part start and end + partSize = ceiling(length(phenoVars)/opt$numParts); + partStart = (opt$partIdx-1)*partSize + 1; + if (opt$partIdx == opt$numParts) { + partEnd = length(phenoVars); + } else { + partEnd = partStart + partSize - 1; + } + print(paste(partStart, '-', partEnd)); + + + ##### + ##### find range of columns to read in + + ## This is more complicated than just reading in a column range, because we need to determine cut points + ## such that all columns of a particular field are loaded. + ## A field is included in a 'part' if its last column is within the part range. + ## e.g. for part 2 of 5 parts and for 100 columns, then fields having their last column at position 21 - 40 (i.e. its column index) are included in this part. + + ## user ID always included + phenosToTest = c(opt$userId) + + currentVar="" + currentVarLong="" + currentVarShort="" + first=TRUE + phenoIdx=0 + + # all columns for a particular field + thisPhenoToTest = c() + for (var in phenoVars) { + varx = gsub("^x", "", var); + varx = gsub("_[0-9]+$", "", varx); + varxShort = gsub("^x", "", var); + varxShort = gsub("_[0-9]+_[0-9]+$", "", varxShort); + currentVarLong = var + + if (currentVar == varx) { # same variable same timepoint + # add current var to pheno list + thisPhenoToTest = append(thisPhenoToTest, as.character(currentVarLong)) + } else if (currentVarShort == varxShort) { # save var, diff timepoint + ## different time point of this var so skip in testing but add here because some are fixed to cat mult + thisPhenoToTest = append(thisPhenoToTest, as.character(currentVarLong)) + } else { + ## new variable so run test for previous (we have collected all the columns now) + if (first==FALSE) { + if (phenoIdx>=partStart && phenoIdx<=partEnd) { # only start new variable processing if last column of it is within the idx range for this part + phenosToTest = append(phenosToTest, thisPhenoToTest) } - - - phenoIdx = phenoIdx + 1 - } - - # last variable so test association - if (phenoIdx>=partStart && phenoIdx<=partEnd) { - #phenosToTest = append(phenosToTest, as.character(currentVarLong)) - phenosToTest = append(phenosToTest, as.character(thisPhenoToTest)) - } - - ## read in the right table columns - a subset of the data file - data = fread(opt$phenofile, select=phenosToTest, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) - - } else { - # reading all data at once - data = fread(opt$phenofile, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) - } + } + first=FALSE; + + ## new variable so set values + currentVar = varx + currentVarShort = varxShort + thisPhenoToTest = c(as.character(currentVarLong)) + } + phenoIdx = phenoIdx + 1 + } + + # last variable so test association + if (phenoIdx>=partStart && phenoIdx<=partEnd) { + #phenosToTest = append(phenosToTest, as.character(currentVarLong)) + phenosToTest = append(phenosToTest, as.character(thisPhenoToTest)) + } + + ## read in the right table columns - a subset of the data file + data = fread(opt$phenofile, select=phenosToTest, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) + + } else { + # reading all data at once + data = fread(opt$phenofile, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) + } ## this is type conversion as used in the read.table function (that we used to use ((this was changed because read.table cannot read column subsets)) data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) - colnames(data)[1] <- "userID" - return(data) } diff --git a/WAS/loadTraitOfInterest.r b/WAS/loadTraitOfInterest.r index 712362c..52cf5b5 100644 --- a/WAS/loadTraitOfInterest.r +++ b/WAS/loadTraitOfInterest.r @@ -21,34 +21,25 @@ ## load trait of interest, either from separate trait of interest file, or from phenotype file loadTraitOfInterest <- function(phenotypes) { - - -if (opt$save==TRUE) { - # saving not running tests so we don't have a trait of interest - + if (opt$save==TRUE) { + # saving not running tests so we don't have a trait of interest # add pretend trait of interest so other code doesn't break numRows = nrow(phenotypes) - data = cbind.data.frame(phenotypes$userID, rep(-1, numRows)) - -} else { - - # load the trait of interest specified by the user - + data = cbind.data.frame(phenotypes$userID, rep(-1, numRows)) + } else { + # load the trait of interest specified by the user if (is.null(opt$traitofinterestfile)) { - print("Extracting trait of interest from pheno file ...") - data = fread(opt$phenofile, select=c(opt$userId, opt$traitofinterest), sep=',', header=TRUE, data.table=FALSE) - + print("Extracting trait of interest from pheno file ...") + data = fread(opt$phenofile, select=c(opt$userId, opt$traitofinterest), sep=',', header=TRUE, data.table=FALSE) } else { - print("Loading trait of interest file ...") - data = fread(opt$traitofinterestfile, select=c(opt$userId, opt$traitofinterest), sep=',', header=TRUE, data.table=FALSE) + print("Loading trait of interest file ...") + data = fread(opt$traitofinterestfile, select=c(opt$userId, opt$traitofinterest), sep=',', header=TRUE, data.table=FALSE) } + data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) + } - data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) -} - -colnames(data)[1] <- "userID" -colnames(data)[2] <- "geno" - -return(data) + colnames(data)[1] <- "userID" + colnames(data)[2] <- "geno" + return(data) } diff --git a/WAS/phenomeScan.r b/WAS/phenomeScan.r index 7d31fdd..2be21ca 100644 --- a/WAS/phenomeScan.r +++ b/WAS/phenomeScan.r @@ -42,6 +42,7 @@ option_list = list( opt_parser = OptionParser(option_list=option_list); opt = parse_args(opt_parser); +save.image(file = "opt.RData") source("processArgs.r") processArgs(); diff --git a/testWAS/results/forest-binary.pdf b/testWAS/results/forest-binary.pdf deleted file mode 100644 index 7baa6c453ed2e427c6a85b101f5743dd4fa97524..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5117 zcmZ`-c|4SB`xl|6B6}hp4k7WFF~(k&u|#&+$utI&F~cmf?-|)nXtz^HN3vy$5|S)q zi%9k;LPgnmXF8|e^PcnjJ)h^F`+Hx@eP4fk@9X+Xnj`h)pz?5tWW;2|R0J)e&xr_u z0tx`(!bOON1_ab4Vo+o(0gptHF@Q8u4W-M|v~uY>Zx92j;CYa#|u{>vQHCE&^QB?(aZ;SO42 zNCXNIO^*lvM?8kX&vKl_;62Em02HdM0)YBh9GQLtP#;Gx4ibYVxM3LQCXwki^M?4R z*WWTG2rD0KU0RkQSAYq~xqz2Brm4C5qMr3t5>#>1h$ET&iJNO1l1ja?Q*5ywb!knf zcH%{;xya;RHIjFFjbhl@(_iN26f}d@*3)E{7g1%d#Y!976YmIp<+d-kCTypsl@&+% z7c{Efq*b+G)S)L0{@)hbmm0o%jyeTbhje9yoj-bTX7c9d)6vtraidNy4NBDckMAe$ z?u4-UJ>L;nou>aoS*jx$!^ha3!ADR|BPfmE)#JnUYP@soyQ=Ly4V)Nzzt_5A_lEJ-x+J0~UgSwo6m}u z^R3fKVZJ+)F&Q4C8ynj*Eu33kkeRHH-yG4M3)3A+PZ#rBmRD1a_ddZMlR6J5UNTXA zrC*nYKgE@zan4<~1|_g=od2+8kqcF;&slRTsA;cysSB40C$rovQW!ZH_v|24HW#b6 zSETZ&!C>4tzxLhOd_IfA7tE1;QGkJYze=smURK}%9c+AmJXidV4btcbh?KG1W$#%5 zG#vI}4m-<6DwY=dl8a-j+m7~CikDHdZ63~R?-iP!HzF7BI~#uSJw1%_ARYD6Yi71j z%=6OYWj8H$8jOnN5G%Ys@4%Jz$s_lp_9q$c9gF0`VZEVVC#boNu|vv&iDBlp;FNV$ zIcx?rQ#LRjPeqKOKM^~7Q0ac_--DmOXnWG7EH#@u>l3RcQz8AqdBgpsm2EWm*h?zq3@_*y^ia)m(3r;?$vk`1 zLdocYU+%DU?%Z{A3cK!jqwmGG^>Amyck4?f#$i&i$)dWh@e`8!+uydX&dlwyCpEV5 z@`Sn3qK=CK#SU!s+`)*HWxc@$ExXv)`CQJNlZDTcImN`5(ZwCJZ7c>35aS#2&8RQ1 zHeatErO*0TQzxOE@zUGn+@0={8;UlQ)z50$m&L3uiqPohRUG290{HsLsN>fXhslQ? z@YoM8X~5e8t#jX!bLX$C$otqXJ#te`elEydBE+MpQwsC!qS@8DFcXmPhL^i1FNoEe zNY%PrnPN7$cJGA~r-l?~`hjFx@Q6Ioe_DunR)t3BQ~E={d3a(Ll|21M7FqSNc_ZPv z$_by39qB4Zh;sQ-98M=5-P{b0p7hd7B7VWv4t_G?9jadL7P%mGr_%^t&uw5s;XJYq z)t*^ii7+@LaqWTGA5o@6_KUUDb62FBZc+Vuas0VMe<1s0eci^DCTm}B*A&&t+PqL( zW{oTHdsA67_wci(%q(m)4L z`f2`O(q}-7e2(B5U0od%3F8JJ7*@_dpAp9D=N+{fyp3lh0g)I#EE>ZI3YyZN2JkO} zhbgHd{!fBuoaqZr@ix7 z3K^(ny1Es``uwcj{_ul2cDxg_EwPF59q*Dp@3%3Ws>`rw4Z(I&*Cgii^oZ0{qAGVoLb9u%!0f)Y8d$?|bQJkajMfXn zbg5vlDS}BlWki04WaWmFL#If?BITw6n8pp;&VTL&?K)^Y?VK%L9S-u`5(Tk>$DOQV z5>t8NCulU2+4z+opE9f=qE`4HS+LUy<5i z7t7|!lOD1UMn;^B_hfm=fFG7NvN*zY&j-^i8{RFwNBJ+Os-%#ix-QCi_rQE)^UJPD zpY?+?S=2RA_8p1{Ki8H-Lo9er580l77C3vO((xrvEbDEZ7_GIeJ8PUxJ9md_Zd@5& zz(mEw{K-^(js?4Y^%@iyyTY%OF|nU zVb$CgU7zl3IWWTkCOs{dIW3_NEIiNlK7Phk&BQs*!mKSFyVuthIv&Y;bq^8{;0>?L zK5{Lx*j1{Kjgrj;k3#qyAxEIc;VSI%5$&#fNA_4MaAbzde1>#r9ge>I@-Uwzl?>>< z6fV1LcU5Sbg;kf!T=?2PNnIU7W@HsC;DAv+F)A!G{;4&6;kGbJ;;oM-*^{ALdfa zjaT5}f3B~S>@TsOOYFf_?z_xbt>LR8cSURB43i69I`XpV=|RoUUOVtgl<)Fn?u{Iq z97JA`1-p4h;i>*}{n8g9-?&Wly>dHyx;`+GPpS!K#LMg3W~X%@`%wOYYei*)k1Z+( zod8Wb5_VSjIA1w)xnQ|aUy|x&!50ar{8GC=)CPp-;|{eDA6>vfA47r;2XP0nswag+ z{WeMVd8~5YNy9W&w)xz+gO8Mttq)JN;1llA!%GRAEz6(VzhKt%gE>R@oleRUljcd- z|1jE6=LAYFC#73o#kktoUt5CvTdZlbPi}a1jNKUxq=uQWWP_xIp}Pb^CPG5YTWmop zL;Rh1J&@X_@5V{gus=JhgFl`nHYWAdBKw5hiM`UFd2M9-rO79pEd29hZQ<4{)=q!i zDzVG)NBSCmxzFv=W_YEz*}~zTbspLJV@z)a?A(yPsGb2(kJ+hWa_Vb7ton=8nHcyp}C0c|B7YQ26DwQUCC3{r=JWiv1z|ef?GU zFWn#O)56}uMteiGF+D|=x7w&$ z#lRPV7J-)o7XnvT_HUfrme@YFQNLlurp6}5#>!UAR(0r0%*iI+Cg+&#n3BYV#0v?C zBNRbRn2jQbLYFdLsliJJ;j3)#-s?*2&x#~>y{7A9+<>GF;;}f~^EABHwJq`nYl(Qhc)R!#@;35JZeSi?9#3vs-uYJl)@!YX=;8~uFI?jK zVtJY=t>~$|skF%gfsKT57GL=n~`NMusc&XtWuYcE32&;J0vPNTs8GOUq^-kOC=R<~5Dlm^a za&R=84x5RA`L`og$E0oLX`tv^k+OOcJKCTv>8AS4PfyHIwiZ?KkBrNV9R!_*oH6E= zm2)8!qi=?ZwkAQv?hE9Z*~XX~(vFL7P!41BuL#g-Qn7ji2^byEbP zulzl%bECHAOy2#9k4>Qyp&uh+SwcFKIy>d=$YF|o9^9pTb$#C0KIR|R_wEJm0imGk zrU)}tf+`Z0wzlxCNt+t<`=g|a5RWxyZ}!T(yF7DFqpGj3_ew4#&5N8m>R(aSdaqzA zsC{Wwv`^E9Gg;}Uu2bedew^+@*Bb{erC-q`6@yO*pK;wo%d^3> zjE}3IP6v#B+bTWKC6MIe=KQI4+UGv$S!nhY`=+QG+c7qEsxY+{RCit!%v27ZjN824 zXIo#AS;z0Q(ED;Ld1BsTkT7akEmq%FxBkIqqj;gOWa*K4O@nyy-`xPX|Bh@~4d<2Iq$%W6>xG z__HnmqoRM8fBcCUcZdRAqJb#<`T&HIlA;pe4*bGkbdihk0r+1S6hWVZ{uhHP!RhkT zKQIK0K5YC0Q&yzUNdLr?>0{0RVhS+mzv3ytRR4v+;fR0bfGflPW=AHXus95nuF(OY w6*dqi%=97u0TT1?5&!@I diff --git a/testWAS/results/forest-continuous.pdf b/testWAS/results/forest-continuous.pdf deleted file mode 100644 index 301fd687901b79ef8ea88b4e83208c43fa93ca26..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4309 zcmZ`-c|4SB8@7Z@WtW^RFChxA84QZDOvsWoJDJ8iOwFQMgp56ee6)zNuPw)skS$8K z%D$vXDJ?{HAxnMlIMwOb`M&qR=X&n#x!>#g{qF0Lx5gN&AXPOG@?j%kZ^Jsndfli9 zB%lURj-Np2>LNfRDuksIC}a$l4grdoeJC}gmKstWiBwZZYicPWKue0(|J}1E(21~@ zEnq^$`uY%XG{A~M_hrywH)|@zi-CivtX*|gbyXycZRDb`ASJ|s0|@?p!F{YmL1EQ3X$pG%|RmynGP>$fcAzvXbaIO z3@Q$er}0}nmcfs9SU_ZNx(}ePg#{@iWkHxn?{FQMnd?fUADEf zdZ;f@J6)WzOG3gAU-M$Je#>U#9EHJGp+-hFGS<5U)>o&t-&OI6k;0=XbFbPuc`^1hINdQ{X-fy z^>*B;#9#HRNi1qqyY)o!#5P;UR_az#CZCkMM*obwf_kF$=J1lz%i1D=0*ylM$894M z?%yc6bRFjuw{V3osGpYJkK7}9`&l{?@Wg8%`V$fP%FTDNn}*bml#+qhQL~xOCf=og zsU;aS9c`GJZAive9$11(MmFUi5X}8r<970F5~20u^zkDi3fzAhCVTAvn74XQd5*;1 zl(WUmaFmuhePvp{ZFp{u(VcF9<};!BX%}c})7)m?Hy}lD5>>MVi|3M|Hi#m`b7}o{ z&$2;vj=L#xJjPv{j2~#`mb>;3GF4?4D@av?lbzSURbu0M90)0I_E04y*ZtVntJen` zwebIGiiHXDz4_m%!(ztYf9?l${$>nlPVk}ut^kWEwtzd#C<+5+2K-MgAuj^fkm3)x zs=<3`4Q&9ag>wIeTr6_@AOp-{oE=05TtWC2!5JWbx;p|ihKU8}>Vn270LconK~3J$9a45 zuy37V<0P`Z_AM-c1L5rynzs@`=Qdhi{)-eLjyc^p6 z_PmZzU3`+KxY+3CEmNFJ)l!tr*JEOQ%9SNs*qe^jJSfEUDWxD!f|gpJ1s}!n%dO(w9fAuK|W6vcb0fV zWy5!_Ig134#P7)Nf3B^gtwvHI)hYIcxUXHiu;iEQ5g9Tij*(t&Sab|d-Ntsi04g6? zi6#+5+1w}Ct!vh!+dVL3r>#t~)jy|;#j+|1cCPbz_9(<_;LAx6i7 zI&JG-PQS2r?j?;PV4ez588 z1Hno*{@0uw28uD8PI)3QgWrQltAlmc=BF{b&5Veh_14fS|?woZ>-1&}^%Sg~#;?ib$BSTXTQT>T5 z+v^fHqsFuGW3FS0^gpWEwi#7q+4u;phz9cPQsXVCceyP{vM*CWB+owMT2@z{bA zr$6`gOH9OWYogvhPDI{CoDn!9bcSnx0yARQ2;KLt_AxizBQbj$kG^*C-RD_kjl zPw17vbUc64{7maCv}_#2AH4ZsLZ*zOa6Io{k*0=wuqxRXx{S5WE6x24SOd7k2{ZRZ!=Oa z;Oep^s|o_u%WQ@8f5Svnsnp;clk z?SfA9=um&8X1m5N4M(4;UWu->{+|B1-X}dbd-mmiw!ik*WOn+^TBFs-#XP>FEtV!( zrADK99k?h&!^NbI>s|}U0=gz+ogrWSxjLd+J+L{@Ch%Q?~3xe+`8mS?TQ`u zK5iLqF76`kifyw|$_*k7?osPe#pmPCACE`xV2JCX9Mt*LI<&}|btetcr?i~$J)YEd z5!@4$rTR-!;J{XNsg^nN7IG&lx?Qr3>qAX@v`W|ZBijw*xZ`$O2=!h}Z6HDkEhotj)prQ;PVJ|s#y`K1rDCF;1akDX61ZvIwLs>-vw&5MI>b8f>`w?_Rs znPhKrOT`P^>E4VJ(#P-D*FF0h5s@45D7zio)BOP1UD7|x9oLM&k?`rK4fhA=6#4D@ zu+TqH@BAs~`4=&<0`sfpajm%l&GXCie~g~58n2?dO}OWiw^m>(YRR^z-A^Z+iCE?2 zXYyy5Tx^Lfyy1Ow)S%Pbqu)cen6LO5(R$au9&7P$xzu8cD?0Aj>=D}=BizVn!M)9Wdu^mpY(;sf%2&7kK(_Xq z71_6k0^(Ga`<>G2mY0Q^JVSj$zku8=lG8dKf6l(Q2uOy|M zvNshh;9W(PtIwmNOWR6gOY>UOu1vb27QKD4taVbC#{U=a}>NJ;Ge--Md}0TyNj?r00)5SvlHuGyhhg<3zEli(Xwb zr9G@XvYc=-M6pa&84RjY#k*T2IdosmDft{c;Tk6zd#FhZT}izDB4$wWxppOCs^F%j zw7I{$n0b-;UbA7!2Xfu=<#xx*$C!f`%AnhA*~CtVHitL;6EB7;Bo!T1J3;9eQg>=; zJUElb30L-C`E<`3>u6IEcia4qxr?~lfIDPeUOvubn0+=q?`V0Z2tQdc_^c+NruftB z^InI-qmC|10Uhtn%jH-0p5q_a)dLFe8@~Y zaG5dw@#DK*T&;QFblN#RS`m1UGDGYdm>&)5O#is_>0rRC&ub-HI>Zt@yxc!kzw^C8 zdk~!UmS#*gT`WGOd)U zeO&Wp%weTyw1P(N9CD_KF)Q9>^wh?#UR~N5tg&9=J`U&|7}wi3<5?fEA2(F8`yljf zA^6^y%ol7^WCvz`*9(&t1?Fz>pvSWR+Ts3-sfA4k_kMF*ZF@Ix`o8}fN=e^!nJvH_V!_yHh53;~aaU}?tE zUao+KCP*TX88ko#4W439=nxUY(-BAw4bY1Mt4SJ-0D#_9tRFn%VsQ-CsEZ>|aSRfk z2>Amboj~-0ArclxrH}#eI2B?U!k$9Ee+=S(>(mV;0NOE*(>G)* z3xp-c5iAKV304OiefitQ6e9XTbOH{G0DtraV0H9YS?f=Q@CY>kiA1RV`~bA3rn)A8 z2YzBGSiG}7fcz6fqG28M8>XcW3z^?AZ6vIRe#3N7a2da2Xjqc}hM|!E1Jn5z#(FdO z-B(=)`AaQyDwaTmsPNkZ0NN1(A^5xic$kc*!0L?E573rEp#!WpfsNkK=vXR!qca+6 NaA)M@4_O%_{tLjSJIG_TMPn?A5=)gcU6_2G8$RrGwjt3MlS_l=mx(Zws4p&h{si`Z%Knt?>|9fUn zpcA26wtx{8OYtM%Xn+-iPGQiYYt~e9`!f_g0HU#UERpOBfYw-FJPlBV3~YhFE~x+O0?UqNO~sSwKg>ZinM8*^X@JJJ z>!2;3MrKfPkU!)<{ILvvmctxR@}>I$8gLZ=G$at|&;vk2B2*g;9!K`Zv(l!~p^^o_ zf^urJEUYBqyW$p?4^hXO>rHZ)XwbSWX`?W!d0Tu^fE@qx3gWddTcg?DQzJ5kg_LKJ zfh!u86DE=pjFObb<9FkjE83&Gs@6Owq{`>F3iSOeU(v$1a6ycdiIX9Yag*g=_SBy8 zx{ByW%bzTDC|$IixZ7Hw?5O_y=^o{${u83k#Ua_SX zf%hdiOoB#hfW9ap=xoRD1rW$1BeVS_VKC|u?;lGwZ!EWKEt{}>`}n3nac>qc)JHek z)xf9Y2I54-qy|lQ|Gs6c(i6}0;Jc00M>hO+CuPlyYV<+%(QLhJ;4pqM5kJq_sT3eS zaaty%b4=--sJiO7a%bAgf$jj$jJ?hANE)vgo++@>t>u<6j-Bc`bbmbKt+LOO29IaP z_;afEW8FI0rqX$Z4(IiZxBB}p9d2urnb|9{qtdn^Nnzu}Dxqoo`lecHnS*Ee?4h3$EG{0pD`>KuHoCf_@#vWG(4|B_oOw){ z=kVFfIjUJr;wN|2hiz@%rtgQJUp8y2FE-`2mijn3)c+NfOeYzS78Taq##Obgh7aky zNlX~ic-wgTMF+_*sr*6lfhiN7zz~r?5$2DXEtD@|A7rzq&MClqHj*c7L4Z3 zi-9i7tzIX-n-e^94x$31CMchR)?)44jYA4AoX&5W8ub+(Z*-L}!>cu}^~lMvmzbu& zbIWk%4+cd^)N)Ovc@3XZ$eP3#cJ9T;R#2b+=$RiK&uZxJ4fnI-zc)H)M)0Nqt^ljS zZ2@yYh(bZ{WSlJ zgIEwNo-1gAMjK#hcy9p3vU2~)YOJrHZ`5b;2#FO0#Nba8aCnwe&=O)Q@CV;OuR;Bv zgu}||7vbb)1=`z<1}`TNf*rb>2 zKq$P3C_<;-lu3%+W^;3XyEta(6h&Af%f!1?t5Mqc34 z=E1?EDXnZ@^UpUp%fQ*JFQv?USRZi{7wgo}bcrty|>#%Sx~k#L0sO=E-p1wDETFm zX)%-B-mSu`p1OYu{w!PES@I>7&4>BfS@=wV_$?XjXBwIsDgjCXEeb>7?w_w@d_0}y z5fi>3j*(t(TXPJ{-pzKS1YbL{5g$MhVRN6~X5F+E-{S?cCEaRa@P?j}Y7FtPeK2*{ z_t5pRF*mbq`TmOF?BYf~Z#ddBglXF{zW66LXgO0OgAPY~sFQqxOE3=}^-fVf?RuRb zzb?)Fg&`#>@L9Gc0X%;U(^X;)m}k|xJ`zgc%r=PAThG6|&e!&(c%&gKadcgvFUyJj z+P>fP!lc*~4hjG}&w3x~TMXrSXvFcUC%HfSQtl=*>Px6&z4@0%qO##>I3o_~v^m>$QRp2RGN(7o-q?+Mhi@x9<+*H=i`e##5}B1-t&HVhyp98p`a=X_M9 z*MTxFMu7k_8b#SdkHoz|YH%w@c6n_d+h(i6n-`(D1nbu0kBNQ6FJc={2hfisZ^b&D zlX%6!i59SyJikK@ZD7nUqBmb)TP9f%H&^I0>pH7I|E-a2H@dFC#!qlVB$#`@%FdEj zmm2~^&T20!jU8z*Vy&G^2n}iFsJeR>C>$aD3o%x07hecC!H!0j3PkV=XkATK5fHs^ zXpkNxyHjA_^>c#7>;%2hb5h094N1o7rH@>NxsDx!TbrNfc`PjwJ9Ray(4i1jRAIwy zom+Nf==hMrN!Y4@g`xk|p8nohHu_;L@!Vu(L&t(EeS2rCX9ZR?Hbl6h3vns%v_0YG zlJX+e?A7Ac5`$@)vEmO?uq8L0{?HnhoKM=_PQ7u02)_+G%YRnzET?u_X!QOmI_0*8 zr<;yt!l4JpU${^XP#h^j_2PE~C-@gr_}Z73y58g04MX_Cb{tL1-=`pyvhzPN#s&wm zN`)DHh8kw|Wf@YDbTB$rl5#C_HpV`(D8)$GeF6L49mtg#muUtv zI}N@0s5;K(69y#t{C$%L?%EU_bUL_QVM*BG(2xTCpqoulNrEHNe#PGHk4sffg+Z88 z#_!7oJvxmOD<9anT(d8t+rNn$s6iYbG1RozM4zdqCV$}k(@6Mmd%_g@rA?B;rh=~m z&OWBX3UAji#27N1bDq=KjaDt;?6W0Yt0DVE*C>i4CFPpdnC6+LK0YO)xVKQEpJVpT z3StAh!Pw9~u5{e$c*~sp6V#JK6;)4Omz^nl|HO1?^oilnM7iov=+NL$U3p;n`c>~hABo}frG~;K+hGW!vko%F2ehY(=eL2Ge!yg784^#|j6|LG| z`Omw;+=^!OX3SbK@9_=`qk?MmOmQzR4%U{L-dpCq0xzMfF}4_TjjN5(jjF*9gKdIi zgWm;zT-mvCcuRI`??&^69hVl@J}yqKO0K%y@8b@)3Aee&ZN*ikrlg)oLG59P>mnRf zc~yGVNopyL;8T>k20ukS{x0~~IQ%}57pQxp% zK{8jegt_2gW_BA9pXM^46s^Rg)O0xCS=_1JHK2N~Zo@BtoJ$(_3+;$IA>?Q0H;7xl zUYV`*^!CHGQMV6nlMUBrPBTMEzNC)2=eWSZypz%=?zXl({SqBr6#b~M2RqPz58i)s z_&rzhLl`c=CpXYQ`;2DgX|2~Q!y~QEZ$qAa5)&&iyKI)+RdnXz^7`^`GpP-84OF*z z_iLnGb(p$ll5L=SVA|Q}O%A?LzC5GMj+oMN--;Q1rmx4ahfEc3)l;JHn(3M%`N4B$ z(}S~p@l!&7+P$MqIx^Kq>YfBXW=@b}Cl$kM!q0t0eU;eevTZElbws&d$Gr=Yq58!I zocd4n`J#HFjqON?iF2}|6OQ-CO1xj{qB5GBa5%JaC)k)RGT{#!uxuN7>f@*m5Nb;?w zcH{_iDB=O;mIvbUbe3J_vb?(1N8>zdo(kEkVdR5F*MSeA%Xd8@U0U7yT?<@q-1eeh zn|Zu(yszTg^In-a6&BKFL2_bdm1c)bC&{l#E$bJ094~l=j%L*0Z*&(DnGW3! zFNf!!kJar}a8zc3($A%CHPg6pMxALFwJ*NCV~usRsY||LcFW8~+-<}iZ(UnE7s@bQ zHBNQ3I9uuSu4MFS)0w8Kx9^_~I+PxFboofC@>=W(zPr)ORTP&LdMQ+MQwn9Mz82oI z(b!;8R9^F@E$n63o5%!?(4MrO9;M4l_)5z4V#Wur`>kD*LE(er4~f^wrF9pj*yCm6 zrNXbQzgun7k3akSjkKCj-*xu@?%JYap*gn+%_kQIeIzQ*g1;*T)H?Ib9&1S)iIf=ToO7ISOiP$%?JQKpqee?0{ z(K8dPpKtQ?ilupYyT5IGMJcD<3oDrB-jvqj+RLRKFB#tmYP&Cp=c$KGC2d|9bZoB5 zYZ9fr8+i0N{pGyPQ}Tpy{l4bTrcbjD8nnI;xm_bP9oIcf9qM~N`x)=l?0 zK=;U;uGW%QYqU0Q?B;=^_^&HrcV_o}!nVisVwU$mH|mfNJpdl{SP%MaGMt%R+J5xV zSGUdXS0jOUgFYivM6}huxUGC1th(Y^cK6xWfd%{`{n^Tvqj;5%$HhA~;j|Y@^8sI* zE(HpyHZ0y;JaqI@*zBi0EAJOdQ>^mCI6lTKZ~kl4*8DYw!hVSBV(g9d1#z*7t|9Z$sj&|z>S67(iRO8^>; z0D!(!>}hDm#o`#OQ5Q#`;urxwM0^kc(g{Rw2nxXBsALiVo}l7chR{tc4hM;J80bxa zLeU5`08}QC>E3uB0Hl6P5(iEGL?ZTYxF3V$i={FGh*$<40Li{&65bz*hW+N5#+uyS z|F&KDHtYW!#J{f>z9|9Fj&XwiZQH;CA&Fs$4S-UD)WNs9e77;i6Hnvm1RNFy{;UhY zs_5Uh5kXYE4@?Ds!(l4FegH~MO;ru>0e)c!Xy?NE0i<6T90lp1|Ha^N^?zdsHAqYR z!%h_iEiL|mAyxhz0|AG&EB|Y!g3$aIhE##%$3N_ls%n4Bg-*p1h4VC)i;HOZ2lj4_yunQ0bVc0x#@M0P@vkS#=HOJeLL zDyghliV&*bJ34jF_nhDNkMDb3bIpA2_u1ZOuJ^v4`w>5l(vySBD?-FWCqmza(nALv z$PhRH1Bfo?AQ~DFs4f|crs9YM6qeoX$aJW==xvpS>mX8 z(90aqC!I5=(HWdH^a&1OU~;;i=#QK=troZ%|ka(G|-mn?eO!<^}Oh z&$l)q@~iG@q?az0YmT?aFVk3?jjnr(SC>|@kB1l?FS6s8D*iBqF_lR2D=wlw&KdZ$ zxo#*lTCTZsN}I%Ww($89pU!c~s~RyOSDjs!F2P^{^ssI!le8 z)5CR7atia-u?9c(-C8GiPHp&TCP+7BB}GKv9` zCFYIn0}&tHxO8kk6eXCYlM&{y%O_UDgxt796x?RLeKlcUON34GM*0-%gta7O$L5y| zdqG4?Rrbzk-+O{SZSq4#og5(@x9D>tt8gl6<|7}Dx?5x@BZ|1U%r%hmrOiU;Qq$d> zXwrj=Nof`;4)mgdu9nj2bPmHGX02D*rkm4bQub?d^)@J|v3|LhuFmBQp0A`b6uQzL;Sh zjW{2P)nDNkx+tsRX(J-e%b6%t?#HgKaQ~P~*T9Hu&&~E^VV^SN`Kg26L29FO5jnX2 zk;)EFB=1GvR;LClcgcqQ9*M}Km(HXkqBakmOi3FwGw&cdy*)8D{<0=)&X-TUhsVp+ zb6u3MWPW$L6g)4qIAk1bj21o{Kd{t2Q+;oAeQu=vvQCPqD^a2wec{Rc6>`bh4mEtl zj%tD&&8By?BCSUY*@biV>B%=y(7;w(^Of02HOcypmuF=-x_pd!kVB&x z=VIAA_E0UIH{;4wtK}PItzbkg|00jrhRh)5`Op`wAUFK^p8d%Z3=;VHlKx_epX2~F z#<@}edw@X-=71we9z+^Q0&oCof_25Ab%?%zJq*031XE;u9RJ263?BIP-h*_7vA|LR zdniblUK133iH&;Tg_4!{{v%)y@s0L~!LZAOBg@TKZoP#N3_nt|j1rpFV> z79=!g8-Y^H^>qNHU*>;t7z1MDwuc((>gu2=SXThauyXvxij32*=RMBgAp#=_6ovK1 zVX%y#&{H7OLI30{gpw-q|0OI&O}_~%BiYB&VuU}6-k{uYI&p8zwO&sTb7ljs27?6o zqi~MBMom|lM3E><63Q&0;17hf{>ptZaJY`?>Q*@GqnBdSFI9qPAfOt0c;2S^I$1RO}wrWkiIAU>Bul0=x8nO!D1jRi4JU1ZP2ksXx|h z5sean*F+L8)9!Wd+!(k&P%X93PY8+>X=r-=eym zp-eIN8)>@(&mXD_a58V5kad?S9vgEo-jU@g4V_-x$l4vOdp3Y>-uP*o5NmhZPlYLhR$T<7jhwqedcd_X(s5R=qVZh+&0t4!}=AZNQi%h&GMzSNYAO?ue( zKyUl0RsC%s~T^To?wZhp{<&XFXf#%L7go=oKSWVM)5=+3|g$UD@ z8{^zIkYk~R+#!3p)pBED-29LAbP{~UIJgfNU*);QjMIL3_0TPm+8Bd`!YB6o*iM{) zpFVkw^Qj2m#fjYH9IG5;UXdC5>5SW2gJ%Y%&OyF#o9KDwcK7v6Gf`#L1T$jg^{lhg zdWEMerny&CHu%`Wb1-r6`2E2rg(UbYm@5P;4i3btUKDJPLl>0W+*KPEnv3CTBbT}0 z;nk1}doS=@U{#L~3_mhKB~`1ObRtkvL^995PZn~(Nncl)%B|& z$9`u0pubPHEqX%ttyzrJrj)xB#xkPF6l+mCNE_6fwVhSr(p4y6?KQ{cR}wwKE2a5j zVls>>jWUfcKRwSUEu3?(k7ash1+jtNplzt1kvn60rg2tc2stEER5~U zLwbX+?p>^HG*!W#`67Hn#4huK)!O4w?+K1Is5 z>}zq1{1#byIqtGc9CANRP!nOLup8E+Oi*g{)IpwCwsq@wCU@_{JVlr&yz=rN*@di7 zHij3&4#Z8zI6EP@TRtS%9|cYa3^%sjADESGS7*) zJZf%y{yjWAFZ@YPH@d&C3Eo#e{E02L9fI+4%ka@r_fswLR-0TI9%;7y81Uk&fIxxq zP2A`0)g7fl_f zyE_d#iI(mzeU5iuHCmM>wvN*4TQBrRP4IrOn5Vq4rYn!s4Ea2zzb0OMBOP2BeDw$N z$H5(TJH|pLL+)sIG+hr3Jbo*i_4v^7Jz?EpCz>@6lMI}ntX{2O=9=Q1^0^q+8`3+U zv-UWyCHZ2YNIb_L&Qgxx{b*jZy?u{D>YO{nl37bd8$`QA7g4Dw!(9J7zC7OCw7jz& zz8%*(3@{}w*6mI)1JS&#v<}R>ymx66g#sJ0W3?oWX005Rw;Q7GPWZ*j%MRP5H?FVq zw27yS>+|$of?txmw5Xbs&epTiACg~XT=)RLAiuzy$xdV^CN4gOPMM0;N?e!JH1ali zovD#Jk!har*|Le>k~6b#qi{IC@f4@YXx6HH3 zn2K9n={M#bkm3*?C9Lj9?EU&S#S!{oNGs~T6XNE(WQ(L_31zi4gG_QJjObiD^4X+I z@^j$wBd1WiX2(AJZ2PimXKMb`(~UE|Mft`4)^nxucAAat#O~0lh$@_CuvDeIEHt22 z-p$c8!K&|OZuys>Ir~_?m=kTv$Orh_qtP#=UZ_04Eff@)h#32d3mBIe%NV^OP7)f| zuXkHtKO6Wmu@YPMEC)}wdS>-@cy4s8Mp(*Po(>fmJ#@c;!j94JjK86NfnwJN8WxH8x>3^(djJ zI;gtif?%d{z(maE^#SXK(#(2((tQ7ut%SF8X3vSQ4IUhB=&b)bZM9J{RYM`r$80GA zfi>?l`x|05Z>}8-Qv6o#I16ZM&1$MGIX8!^W5&uQk70kT1U;NS{1x36(Suq(GOFJp z5hw{g=Ctm+WjLIaTG)0>=7+=Pv-cxDk9@ZfFg|sq?+z~s`V=2th1$jPMVVtMLDOINuY6i4j5Ey&Vp)q=-u&0tt@?Wm zh5RYbPY^&Mj3a>RoH4fkb+X10juR-jUjPJ(!nwI&K}F7x-1dN?64VPvpiux-B=kIu zNX6o@Zd3?dQ4#7&1eXXD3J!p}lhNMbjElz57^5x*N5;^+-0)am07}K-T|vkTjUf{W z0MvzyWf+2o`Mzu_JDL!=NG1U%dtx|I*(#B)(*< z8w3Wx5fIq#3qUF{xhDUA{^9t|G<=Be|rup8I8kZ$>3@WfLh@Ev0x7X ps3F0P2r9yD`DRWeQUNg8&wf#;Xfk!XTZ&4mDiCq;6Q+8Q{{#HiYfb20% IN ONE CATEGORY || cat N: 208, 458, 334 || CAT-ORD || order: 0|1|2 || num categories: 3 || SUCCESS results-ordered-logistic -4_0|| CONTINUOUS MAIN || CONTINUOUS || >20% IN ONE CATEGORY || cat N: 16, 109, 75 || CAT-ORD || order: 0|1|2 || CATORD-SKIP-500 (200) || -5_0|| CONTINUOUS MAIN || CONTINUOUS || >20% IN ONE CATEGORY || Inc(>=10): 0(120) || Inc(>=10): 1(380) || sample 120/380(500) || SUCCESS results-logistic-binary -6_0|| CONTINUOUS MAIN || CONTINUOUS || >20% IN ONE CATEGORY || Removed 0: 5<10 examples || Inc(>=10): 1(195) || SKIP (number of levels: 1) -21_0|| INTEGER || CONTINUOUS || IRNT || SUCCESS results-linear -22_0|| INTEGER || Inc(>=10): 0(891) || Removed 1: 9<10 examples || SKIP (number of levels: 1) -23_0|| INTEGER || Inc(>=10): 0(800) || Inc(>=10): 1(100) || sample 800/100(900) || SUCCESS results-logistic-binary -24_0|| INTEGER || Inc(>=10): 0(760) || Inc(>=10): 2(131) || Removed 1: 9<10 examples || sample 760/131(891) || SUCCESS results-logistic-binary diff --git a/testWAS/results/results-log-2-3.txt b/testWAS/results/results-log-2-3.txt deleted file mode 100644 index 461413b..0000000 --- a/testWAS/results/results-log-2-3.txt +++ /dev/null @@ -1,11 +0,0 @@ -25_0|| INTEGER || Inc(>=10): 0(500) || Inc(>=10): 1(200) || Inc(>=10): 2(200) || 3-20 values || CAT-ORD || order: 0|1|2 || num categories: 3 || SUCCESS results-ordered-logistic -26_0|| INTEGER || Inc(>=10): 0(600) || Inc(>=10): 1(291) || Removed 2: 9<10 examples || sample 600/291(891) || SUCCESS results-logistic-binary -27_0|| INTEGER || Inc(>=10): 11(106) || Inc(>=10): 12(106) || Inc(>=10): 13(72) || Inc(>=10): 14(87) || Inc(>=10): 15(84) || Inc(>=10): 16(89) || Inc(>=10): 17(74) || Inc(>=10): 18(70) || Inc(>=10): 19(56) || Inc(>=10): 20(46) || Inc(>=10): 21(23) || Inc(>=10): 22(31) || Inc(>=10): 23(27) || Inc(>=10): 24(10) || Inc(>=10): 25(10) || Removed 26: 4<10 examples || Removed 27: 2<10 examples || Removed 28: 1<10 examples || Removed 29: 2<10 examples || 3-20 values || CAT-ORD || order: 11|12|13|14|15|16|17|18|19|20|21|22|23|24|25 || num categories: 15 || SUCCESS results-ordered-logistic -32_0|| CAT-SINGLE || Inc(>=10): 0(891) || Removed 1: 9<10 examples || SKIP (only one value) || -33_0|| CAT-SINGLE || Inc(>=10): 0(300) || Inc(>=10): 1(199) || CAT-SINGLE-BINARY || BINARY-LOGISTIC-SKIP-500 (499) || -34_0|| CAT-SINGLE || Inc(>=10): 0(300) || Inc(>=10): 1(600) || CAT-SINGLE-BINARY || sample 300/600(900) || SUCCESS results-logistic-binary -35_0|| CAT-SINGLE || Inc(>=10): 0(300) || Inc(>=10): 1(100) || Inc(>=10): 2(99) || ordered || CAT-ORD || order: 0|1|2 || CATORD-SKIP-500 (499) || -36_0|| CAT-SINGLE || Inc(>=10): 6(39) || Inc(>=10): 8(43) || Inc(>=10): 7(52) || Inc(>=10): 5(24) || Inc(>=10): 10(55) || Inc(>=10): 9(42) || Inc(>=10): 14(78) || Inc(>=10): 11(69) || Inc(>=10): 12(94) || Inc(>=10): 13(73) || Inc(>=10): 15(62) || Inc(>=10): 20(30) || Inc(>=10): 16(49) || Inc(>=10): 17(44) || Inc(>=10): 19(41) || Inc(>=10): 18(44) || Inc(>=10): 21(16) || Inc(>=10): 23(11) || Inc(>=10): 22(13) || Inc(>=10): 24(10) || Removed 25: 7<10 examples || Removed 27: 1<10 examples || Removed 29: 1<10 examples || Removed 26: 1<10 examples || Removed 28: 1<10 examples || ordered || CAT-ORD || order: 5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24 || num categories: 20 || SUCCESS results-ordered-logistic -37_0|| CAT-SINGLE || Inc(>=10): 0(300) || Inc(>=10): 1(100) || Inc(>=10): 2(99) || CAT-SINGLE-UNORDERED || CATUNORD-SKIP-500 (499) || -38_0|| CAT-SINGLE || Inc(>=10): 0(50) || Inc(>=10): 9(200) || Inc(>=10): 1(200) || Inc(>=10): 4(300) || CAT-SINGLE-UNORDERED || reference: 4=300 || SUCCESS results-notordered-logistic -39_0|| CAT-SINGLE || Default related field: x99_0_0 || default value 100 set, N= 250 || Inc(>=10): 100(250) || Inc(>=10): 0(50) || Inc(>=10): 9(200) || Inc(>=10): 1(200) || Inc(>=10): 4(300) || CAT-SINGLE-UNORDERED || reference: 4=300 || SUCCESS results-notordered-logistic diff --git a/testWAS/results/results-log-3-3.txt b/testWAS/results/results-log-3-3.txt deleted file mode 100644 index 1255194..0000000 --- a/testWAS/results/results-log-3-3.txt +++ /dev/null @@ -1,5 +0,0 @@ -41_0|| CAT-MULTIPLE || CAT-MUL-BINARY-VAR A1 || NO_NAN Remove NA participants 470 || Not numeric || sample 516/14(530) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A2 || NO_NAN Remove NA participants 470 || Not numeric || IS_CM_EXPOSURE || sample 495/35(530) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A3 || NO_NAN Remove NA participants 470 || Not numeric || sample 25/505(530) || SUCCESS results-logistic-binary -42_0|| CAT-MULTIPLE || CAT-MUL-BINARY-VAR A1 || ALL || Not numeric || sample 986/14(1000) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A2 || ALL || Not numeric || sample 965/35(1000) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A3 || ALL || Not numeric || sample 495/505(1000) || SUCCESS results-logistic-binary -43_0|| CAT-MULTIPLE || CAT-MUL-BINARY-VAR A1 || Indicator name x99_0_0 || Remove indicator var NAs: 465 || Not numeric || sample 521/14(535) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A2 || Indicator name x99_0_0 || Remove indicator var NAs: 465 || Not numeric || sample 500/35(535) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A3 || Indicator name x99_0_0 || Remove indicator var NAs: 465 || Not numeric || sample 30/505(535) || SUCCESS results-logistic-binary -44_0|| cat-single to cat-multiple || CAT-MULTIPLE || CAT-MUL-BINARY-VAR A1 || NO_NAN Remove NA participants 470 || Not numeric || sample 516/14(530) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A2 || NO_NAN Remove NA participants 470 || Not numeric || sample 495/35(530) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A3 || NO_NAN Remove NA participants 470 || Not numeric || sample 25/505(530) || SUCCESS results-logistic-binary -99_0|| Excluded continuous: YES || diff --git a/testWAS/results/results-log-all.txt b/testWAS/results/results-log-all.txt index a0d0940..5d26780 100644 --- a/testWAS/results/results-log-all.txt +++ b/testWAS/results/results-log-all.txt @@ -22,7 +22,7 @@ 37_0|| CAT-SINGLE || Inc(>=10): 0(300) || Inc(>=10): 1(100) || Inc(>=10): 2(99) || CAT-SINGLE-UNORDERED || CATUNORD-SKIP-500 (499) || 38_0|| CAT-SINGLE || Inc(>=10): 0(50) || Inc(>=10): 9(200) || Inc(>=10): 1(200) || Inc(>=10): 4(300) || CAT-SINGLE-UNORDERED || reference: 4=300 || SUCCESS results-notordered-logistic 39_0|| CAT-SINGLE || Default related field: x99_0_0 || default value 100 set, N= 250 || Inc(>=10): 100(250) || Inc(>=10): 0(50) || Inc(>=10): 9(200) || Inc(>=10): 1(200) || Inc(>=10): 4(300) || CAT-SINGLE-UNORDERED || reference: 4=300 || SUCCESS results-notordered-logistic -41_0|| CAT-MULTIPLE || CAT-MUL-BINARY-VAR A1 || NO_NAN Remove NA participants 470 || Not numeric || sample 516/14(530) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A2 || NO_NAN Remove NA participants 470 || Not numeric || IS_CM_EXPOSURE || sample 495/35(530) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A3 || NO_NAN Remove NA participants 470 || Not numeric || sample 25/505(530) || SUCCESS results-logistic-binary +41_0|| cat-single to cat-multiple || CAT-MULTIPLE || CAT-MUL-BINARY-VAR A1 || [1] "ERROR: 41_0 Error: Categorical multiples variables need a value for CAT_MULT_INDICATOR_FIELDS\n" 42_0|| CAT-MULTIPLE || CAT-MUL-BINARY-VAR A1 || ALL || Not numeric || sample 986/14(1000) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A2 || ALL || Not numeric || sample 965/35(1000) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A3 || ALL || Not numeric || sample 495/505(1000) || SUCCESS results-logistic-binary 43_0|| CAT-MULTIPLE || CAT-MUL-BINARY-VAR A1 || Indicator name x99_0_0 || Remove indicator var NAs: 465 || Not numeric || sample 521/14(535) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A2 || Indicator name x99_0_0 || Remove indicator var NAs: 465 || Not numeric || sample 500/35(535) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A3 || Indicator name x99_0_0 || Remove indicator var NAs: 465 || Not numeric || sample 30/505(535) || SUCCESS results-logistic-binary 44_0|| cat-single to cat-multiple || CAT-MULTIPLE || CAT-MUL-BINARY-VAR A1 || NO_NAN Remove NA participants 470 || Not numeric || sample 516/14(530) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A2 || NO_NAN Remove NA participants 470 || Not numeric || sample 495/35(530) || SUCCESS results-logistic-binary CAT-MUL-BINARY-VAR A3 || NO_NAN Remove NA participants 470 || Not numeric || sample 25/505(530) || SUCCESS results-logistic-binary diff --git a/testWAS/results/results-logistic-binary-1-2.txt b/testWAS/results/results-logistic-binary-1-2.txt deleted file mode 100644 index 5682a47..0000000 --- a/testWAS/results/results-logistic-binary-1-2.txt +++ /dev/null @@ -1 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue diff --git a/testWAS/results/results-logistic-binary-1-3.txt b/testWAS/results/results-logistic-binary-1-3.txt deleted file mode 100644 index 9fe7ab7..0000000 --- a/testWAS/results/results-logistic-binary-1-3.txt +++ /dev/null @@ -1,4 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue -5,CONTINUOUS,120/380(500),3.18943911658698,2.46928754995082,3.99516617340677,2.32981040891531e-16 -23,INTEGER,800/100(900),1.93633598767705,1.60360866435611,2.29882481880602,7.31196027630102e-28 -24,INTEGER,760/131(891),1.75433797759218,1.46468192507283,2.06597680366808,2.17988718368106e-30 diff --git a/testWAS/results/results-logistic-binary-2-3.txt b/testWAS/results/results-logistic-binary-2-3.txt deleted file mode 100644 index 3b68a8d..0000000 --- a/testWAS/results/results-logistic-binary-2-3.txt +++ /dev/null @@ -1,3 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue -26,INTEGER,600/291(891),2.21647829260412,1.92186361168347,2.53418345127812,8.11560488921865e-46 -34,CAT-SIN,300/600(900),2.67342038082319,2.31959546662729,3.05528828132202,3.8648784862921e-46 diff --git a/testWAS/results/results-logistic-binary-3-3.txt b/testWAS/results/results-logistic-binary-3-3.txt deleted file mode 100644 index 2613fbf..0000000 --- a/testWAS/results/results-logistic-binary-3-3.txt +++ /dev/null @@ -1,13 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue -41#A1,CAT-MUL,516/14(530),-0.0923557834369921,-0.604318268636898,0.415860123576197,0.731283609278491 -41#A2,CAT-MUL,495/35(530),-1.24250185838579,-1.62960042688341,-0.887102760424099,4.02529697948092e-11 -41#A3,CAT-MUL,25/505(530),2.60299104913144,1.92190675044312,3.49344841921067,4.01993700982098e-11 -42#A1,CAT-MUL,986/14(1000),-0.742168551681468,-1.27958997966834,-0.21074132857573,0.00625348360881744 -42#A2,CAT-MUL,965/35(1000),-1.9048315094404,-2.41596033097747,-1.45191305152567,6.6036582021544e-15 -42#A3,CAT-MUL,495/505(1000),-2.53879885591662,-2.86719437561936,-2.23511475752686,5.60824611427478e-56 -43#A1,CAT-MUL,521/14(535),-0.118660670448722,-0.660567582659025,0.404639992765588,0.672574033804116 -43#A2,CAT-MUL,500/35(535),-1.34326623535089,-1.76166165073619,-0.959249118076131,3.96615578379219e-11 -43#A3,CAT-MUL,30/505(535),1.08228383424423,0.702061178293818,1.48159863585603,4.2383532774057e-08 -44#A1,CAT-MUL,516/14(530),-0.0923557834369921,-0.604318268636898,0.415860123576197,0.731283609278491 -44#A2,CAT-MUL,495/35(530),-1.24250185838579,-1.62960042688341,-0.887102760424099,4.02529697948092e-11 -44#A3,CAT-MUL,25/505(530),2.60299104913144,1.92190675044312,3.49344841921067,4.01993700982098e-11 diff --git a/testWAS/results/results-logistic-binary-all.txt b/testWAS/results/results-logistic-binary-all.txt index ec88660..31b11a5 100644 --- a/testWAS/results/results-logistic-binary-all.txt +++ b/testWAS/results/results-logistic-binary-all.txt @@ -1,18 +1,15 @@ varName,varType,n,beta,lower,upper,pvalue -5,CONTINUOUS,120/380(500),3.18943911658698,2.46928754995082,3.99516617340677,2.32981040891531e-16 -23,INTEGER,800/100(900),1.93633598767705,1.60360866435611,2.29882481880602,7.31196027630102e-28 -24,INTEGER,760/131(891),1.75433797759218,1.46468192507283,2.06597680366808,2.17988718368106e-30 -26,INTEGER,600/291(891),2.21647829260412,1.92186361168347,2.53418345127812,8.11560488921865e-46 -34,CAT-SIN,300/600(900),2.67342038082319,2.31959546662729,3.05528828132202,3.8648784862921e-46 -41#A1,CAT-MUL,516/14(530),-0.0923557834369921,-0.604318268636898,0.415860123576197,0.731283609278491 -41#A2,CAT-MUL,495/35(530),-1.24250185838579,-1.62960042688341,-0.887102760424099,4.02529697948092e-11 -41#A3,CAT-MUL,25/505(530),2.60299104913144,1.92190675044312,3.49344841921067,4.01993700982098e-11 -42#A1,CAT-MUL,986/14(1000),-0.742168551681468,-1.27958997966834,-0.21074132857573,0.00625348360881744 -42#A2,CAT-MUL,965/35(1000),-1.9048315094404,-2.41596033097747,-1.45191305152567,6.6036582021544e-15 -42#A3,CAT-MUL,495/505(1000),-2.53879885591662,-2.86719437561936,-2.23511475752686,5.60824611427478e-56 -43#A1,CAT-MUL,521/14(535),-0.118660670448722,-0.660567582659025,0.404639992765588,0.672574033804116 -43#A2,CAT-MUL,500/35(535),-1.34326623535089,-1.76166165073619,-0.959249118076131,3.96615578379219e-11 -43#A3,CAT-MUL,30/505(535),1.08228383424423,0.702061178293818,1.48159863585603,4.2383532774057e-08 -44#A1,CAT-MUL,516/14(530),-0.0923557834369921,-0.604318268636898,0.415860123576197,0.731283609278491 -44#A2,CAT-MUL,495/35(530),-1.24250185838579,-1.62960042688341,-0.887102760424099,4.02529697948092e-11 -44#A3,CAT-MUL,25/505(530),2.60299104913144,1.92190675044312,3.49344841921067,4.01993700982098e-11 +5,CONTINUOUS,120/380(500),3.18943911658699,2.46928754995082,3.99516617340677,2.3298104089149e-16 +23,INTEGER,800/100(900),1.93633598767705,1.60360866435611,2.29882481880602,7.31196027630188e-28 +24,INTEGER,760/131(891),1.75433797759218,1.46468192507283,2.06597680366808,2.17988718368084e-30 +26,INTEGER,600/291(891),2.21647829260412,1.92186361168347,2.53418345127812,8.11560488921906e-46 +34,CAT-SIN,300/600(900),2.67342038082318,2.31959546662729,3.05528828132202,3.8648784862923e-46 +42#A1,CAT-MUL,986/14(1000),-0.74216855168147,-1.27958997966834,-0.21074132857573,0.00625348360881725 +42#A2,CAT-MUL,965/35(1000),-1.9048315094404,-2.41596033097747,-1.45191305152567,6.60365820215384e-15 +42#A3,CAT-MUL,495/505(1000),-2.53879885591661,-2.86719437561935,-2.23511475752686,5.60824611426784e-56 +43#A1,CAT-MUL,521/14(535),-0.118660670448726,-0.660567582659025,0.404639992765588,0.672574033804106 +43#A2,CAT-MUL,500/35(535),-1.34326623535089,-1.76166165073619,-0.95924911807613,3.96615578379178e-11 +43#A3,CAT-MUL,30/505(535),1.08228383424425,0.702061178293763,1.48159863585598,4.23835327740328e-08 +44#A1,CAT-MUL,516/14(530),-0.0923557834369891,-0.6043182686369,0.415860123576197,0.7312836092785 +44#A2,CAT-MUL,495/35(530),-1.24250185838579,-1.62960042688341,-0.887102760424098,4.02529697948063e-11 +44#A3,CAT-MUL,25/505(530),2.60299104913143,1.92190675044312,3.49344841921067,4.01993700982103e-11 diff --git a/testWAS/results/results-multinomial-logistic-1-2.txt b/testWAS/results/results-multinomial-logistic-1-2.txt deleted file mode 100644 index 5682a47..0000000 --- a/testWAS/results/results-multinomial-logistic-1-2.txt +++ /dev/null @@ -1 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue diff --git a/testWAS/results/results-multinomial-logistic-1-3.txt b/testWAS/results/results-multinomial-logistic-1-3.txt deleted file mode 100644 index 5682a47..0000000 --- a/testWAS/results/results-multinomial-logistic-1-3.txt +++ /dev/null @@ -1 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue diff --git a/testWAS/results/results-multinomial-logistic-2-3.txt b/testWAS/results/results-multinomial-logistic-2-3.txt deleted file mode 100644 index fd4d32b..0000000 --- a/testWAS/results/results-multinomial-logistic-2-3.txt +++ /dev/null @@ -1,10 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue -38-4,CAT-SIN,300/750,-999,-999,-999,2.36929803366859e-124 -38-4#0,CAT-SIN,300#50,-2.40849608862529,-3.24568232787998,-1.5713098493706,1.71326515197023e-08 -38-4#9,CAT-SIN,300#200,3.84847729362016,3.17580034120106,4.52115424603927,0 -38-4#1,CAT-SIN,300#200,-2.04799198103754,-2.53421642573651,-1.56176753633858,2.22044604925031e-16 -39-4,CAT-SIN,300/1000,-999,-999,-999,0 -39-4#100,CAT-SIN,300#250,-3091.67248735407,-3091.68083398189,-3091.66414072624,0 -39-4#0,CAT-SIN,300#50,-2.40847366034011,-3.24564980598633,-1.57129751469388,1.71311120844564e-08 -39-4#9,CAT-SIN,300#200,3.84809435137089,3.17549284227653,4.52069586046524,0 -39-4#1,CAT-SIN,300#200,-2.04793321181161,-2.53415006998987,-1.56171635363335,2.22044604925031e-16 diff --git a/testWAS/results/results-multinomial-logistic-3-3.txt b/testWAS/results/results-multinomial-logistic-3-3.txt deleted file mode 100644 index 5682a47..0000000 --- a/testWAS/results/results-multinomial-logistic-3-3.txt +++ /dev/null @@ -1 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue diff --git a/testWAS/results/results-multinomial-logistic-all.txt b/testWAS/results/results-multinomial-logistic-all.txt index fd4d32b..01ad9c0 100644 --- a/testWAS/results/results-multinomial-logistic-all.txt +++ b/testWAS/results/results-multinomial-logistic-all.txt @@ -1,10 +1,10 @@ varName,varType,n,beta,lower,upper,pvalue -38-4,CAT-SIN,300/750,-999,-999,-999,2.36929803366859e-124 -38-4#0,CAT-SIN,300#50,-2.40849608862529,-3.24568232787998,-1.5713098493706,1.71326515197023e-08 -38-4#9,CAT-SIN,300#200,3.84847729362016,3.17580034120106,4.52115424603927,0 -38-4#1,CAT-SIN,300#200,-2.04799198103754,-2.53421642573651,-1.56176753633858,2.22044604925031e-16 +38-4,CAT-SIN,300/750,-999,-999,-999,2.36929803366805e-124 +38-4#0,CAT-SIN,300#50,-2.40849608862529,-3.24566694438637,-1.57132523286421,1.71326515197023e-08 +38-4#9,CAT-SIN,300#200,3.84847729362016,3.17581270179792,4.52114188544241,0 +38-4#1,CAT-SIN,300#200,-2.04799198103754,-2.53420749124824,-1.56177647082684,2.22044604925031e-16 39-4,CAT-SIN,300/1000,-999,-999,-999,0 -39-4#100,CAT-SIN,300#250,-3091.67248735407,-3091.68083398189,-3091.66414072624,0 -39-4#0,CAT-SIN,300#50,-2.40847366034011,-3.24564980598633,-1.57129751469388,1.71311120844564e-08 -39-4#9,CAT-SIN,300#200,3.84809435137089,3.17549284227653,4.52069586046524,0 -39-4#1,CAT-SIN,300#200,-2.04793321181161,-2.53415006998987,-1.56171635363335,2.22044604925031e-16 +39-4#100,CAT-SIN,300#250,-3091.67248733479,-3091.68083380925,-3091.66414086034,0 +39-4#0,CAT-SIN,300#50,-2.40847366034002,-3.24563442267806,-1.57131289800198,1.71311120844564e-08 +39-4#9,CAT-SIN,300#200,3.84809435137093,3.17550520148713,4.52068350125473,0 +39-4#1,CAT-SIN,300#200,-2.04793321181159,-2.534141135641,-1.56172528798218,2.22044604925031e-16 diff --git a/testWAS/results/results-ordered-logistic-1-2.txt b/testWAS/results/results-ordered-logistic-1-2.txt deleted file mode 100644 index 5682a47..0000000 --- a/testWAS/results/results-ordered-logistic-1-2.txt +++ /dev/null @@ -1 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue diff --git a/testWAS/results/results-ordered-logistic-1-3.txt b/testWAS/results/results-ordered-logistic-1-3.txt deleted file mode 100644 index 093e33c..0000000 --- a/testWAS/results/results-ordered-logistic-1-3.txt +++ /dev/null @@ -1,2 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue -3,CONTINUOUS,1000,0.661807286044053,0.534467956678656,0.789146615409451,3.01434968530619e-23 diff --git a/testWAS/results/results-ordered-logistic-2-3.txt b/testWAS/results/results-ordered-logistic-2-3.txt deleted file mode 100644 index eeea18b..0000000 --- a/testWAS/results/results-ordered-logistic-2-3.txt +++ /dev/null @@ -1,4 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue -25,INTEGER,900,4.47054036959362,3.98675684857394,4.95432389061329,1.07328655178677e-62 -27,INTEGER,891,6.00635627030678,5.57811072838858,6.43460181222497,2.40837709920153e-120 -36,CAT-SIN,889,4.26406070415127,3.97401193807006,4.55410947023248,1.45251953559895e-128 diff --git a/testWAS/results/results-ordered-logistic-3-3.txt b/testWAS/results/results-ordered-logistic-3-3.txt deleted file mode 100644 index 5682a47..0000000 --- a/testWAS/results/results-ordered-logistic-3-3.txt +++ /dev/null @@ -1 +0,0 @@ -varName,varType,n,beta,lower,upper,pvalue diff --git a/testWAS/results/results-ordered-logistic-all.txt b/testWAS/results/results-ordered-logistic-all.txt index a7b9803..0c3e84e 100644 --- a/testWAS/results/results-ordered-logistic-all.txt +++ b/testWAS/results/results-ordered-logistic-all.txt @@ -1,5 +1,5 @@ varName,varType,n,beta,lower,upper,pvalue -3,CONTINUOUS,1000,0.661807286044053,0.534467956678656,0.789146615409451,3.01434968530619e-23 -25,INTEGER,900,4.47054036959362,3.98675684857394,4.95432389061329,1.07328655178677e-62 -27,INTEGER,891,6.00635627030678,5.57811072838858,6.43460181222497,2.40837709920153e-120 -36,CAT-SIN,889,4.26406070415127,3.97401193807006,4.55410947023248,1.45251953559895e-128 +3,CONTINUOUS,1000,0.661807286044053,0.534467956678638,0.789146615409469,3.01434968534588e-23 +25,INTEGER,900,4.47054036959369,3.98675684857102,4.95432389061637,1.07328655338457e-62 +27,INTEGER,891,6.00635627030785,5.57811072839006,6.43460181222563,2.40837709810009e-120 +36,CAT-SIN,889,4.26406070415127,3.97401193807007,4.55410947023247,1.45251953558349e-128 diff --git a/testWAS/results/variable-flow-counts-1-3.txt b/testWAS/results/variable-flow-counts-1-3.txt deleted file mode 100644 index b09ba23..0000000 --- a/testWAS/results/variable-flow-counts-1-3.txt +++ /dev/null @@ -1,22 +0,0 @@ -name,countValue -cont.binary,1 -cont.main,3 -cont.main.500,1 -cont.onevalue,1 -cont.ordcattry,2 -cont.ordcattry.ordcat,2 -excluded.catMul,1 -excluded.catSin,1 -excluded.int,1 -int.binary,2 -int.continuous,1 -int.onevalue,1 -ordCat,2 -ordCat.500,1 -start.cont,6 -start.exposure.cont,1 -start.int,4 -success.binary,3 -success.continuous,2 -success.exposure.continuous,1 -success.ordCat,1 diff --git a/testWAS/results/variable-flow-counts-2-3.txt b/testWAS/results/variable-flow-counts-2-3.txt deleted file mode 100644 index e3620ad..0000000 --- a/testWAS/results/variable-flow-counts-2-3.txt +++ /dev/null @@ -1,16 +0,0 @@ -name,countValue -binary.500,1 -catSin.case1,2 -catSin.case2,3 -catSin.case3,2 -catSin.onevalue,1 -int.binary,1 -int.catord,2 -ordCat,4 -ordCat.500,1 -start.catSin,8 -start.int,3 -success.binary,2 -success.ordCat,3 -success.unordCat,2 -unordCat.500,1 diff --git a/testWAS/results/variable-flow-counts-3-3.txt b/testWAS/results/variable-flow-counts-3-3.txt deleted file mode 100644 index 1c778f3..0000000 --- a/testWAS/results/variable-flow-counts-3-3.txt +++ /dev/null @@ -1,9 +0,0 @@ -name,countValue -catMul.binary,12 -catMul.over10,12 -catSinToCatMul,1 -excluded.cont,1 -start.catMul,4 -start.exposure.catMulvalues,1 -success.binary,12 -success.exposure.binary,1 diff --git a/testWAS/results/variable-flow-counts-all.txt b/testWAS/results/variable-flow-counts-all.txt index 4dadad2..78c716d 100644 --- a/testWAS/results/variable-flow-counts-all.txt +++ b/testWAS/results/variable-flow-counts-all.txt @@ -1,12 +1,12 @@ name,countValue binary.500,1 -catMul.binary,12 -catMul.over10,12 +catMul.binary,10 +catMul.over10,9 catSin.case1,2 catSin.case2,3 catSin.case3,2 catSin.onevalue,1 -catSinToCatMul,1 +catSinToCatMul,2 cont.binary,1 cont.main,3 cont.main.500,1 @@ -26,12 +26,10 @@ ordCat.500,2 start.catMul,4 start.catSin,8 start.cont,6 -start.exposure.catMulvalues,1 start.exposure.cont,1 start.int,7 -success.binary,17 +success.binary,14 success.continuous,2 -success.exposure.binary,1 success.exposure.continuous,1 success.ordCat,4 success.unordCat,2 diff --git a/testWAS/results/variable-flow-counts-combined.txt b/testWAS/results/variable-flow-counts-combined.txt deleted file mode 100644 index 4dadad2..0000000 --- a/testWAS/results/variable-flow-counts-combined.txt +++ /dev/null @@ -1,38 +0,0 @@ -name,countValue -binary.500,1 -catMul.binary,12 -catMul.over10,12 -catSin.case1,2 -catSin.case2,3 -catSin.case3,2 -catSin.onevalue,1 -catSinToCatMul,1 -cont.binary,1 -cont.main,3 -cont.main.500,1 -cont.onevalue,1 -cont.ordcattry,2 -cont.ordcattry.ordcat,2 -excluded.catMul,1 -excluded.catSin,1 -excluded.cont,1 -excluded.int,1 -int.binary,3 -int.catord,2 -int.continuous,1 -int.onevalue,1 -ordCat,6 -ordCat.500,2 -start.catMul,4 -start.catSin,8 -start.cont,6 -start.exposure.catMulvalues,1 -start.exposure.cont,1 -start.int,7 -success.binary,17 -success.continuous,2 -success.exposure.binary,1 -success.exposure.continuous,1 -success.ordCat,4 -success.unordCat,2 -unordCat.500,1 diff --git a/testWAS/testdata.csv b/testWAS/testdata.csv deleted file mode 100644 index 513de9c..0000000 --- a/testWAS/testdata.csv +++ /dev/null @@ -1,1001 +0,0 @@ -"geno","x21022_0_0","x31_0_0","x22000_0_0","cont1","cont2","cont3","cont4" -"1",-1.20706574938542,47.9466658037648,0,1,1.2438155595732,NA,0,1 -"2",0.27742924211066,63.0146673976471,1,1,0.116104203964638,NA,0,0 -"3",1.08444117668306,44.6085477640437,0,0,2.22954950810941,NA,0,0 -"4",-2.34569770262935,66.3537072349362,1,0,0.361772739183953,NA,0,0 -"5",0.42912468881105,67.0295177475515,0,0,-0.63109008650694,NA,0,1 -"6",0.506055892157574,40.9411713538266,1,0,1.6467781891876,NA,0,0 -"7",-0.574739960134649,69.3892144955624,1,1,0.373786490804675,NA,0,1 -"8",-0.546631855784187,57.7550789450421,1,1,-1.54388992456,NA,0,1 -"9",-0.564451999093283,53.2618322461205,0,1,-0.510904308698956,NA,0,1 -"10",-0.890037829044104,64.4578744480567,1,0,1.14870093595012,NA,0,0 -"11",-0.477192699753547,72.8061710689861,0,1,-0.718869720667227,NA,0,1 -"12",-0.998386444859704,75.6513013425123,0,0,1.7378226082064,NA,0,1 -"13",-0.77625389463799,47.994539638105,0,1,0.267475750107545,NA,0,0 -"14",0.0644588172762693,55.6230503362601,0,1,-0.550524203305469,NA,0,0 -"15",0.959494058970771,61.4638148182447,1,0,0.391629853781272,NA,0,1 -"16",-0.110285494390774,60.6601907555389,1,1,1.32923582506416,NA,0,1 -"17",-0.511009505806642,55.5496398470349,0,1,-0.80160294770982,NA,0,0 -"18",-0.911195416629811,36.578876135585,0,0,0.667314523667813,NA,0,1 -"19",-0.83717168026894,57.2161763249215,0,1,0.591688789537777,NA,0,0 -"20",2.41583517848934,48.1819636765515,1,0,0.123586982982799,NA,0,1 -"21",0.134088220152031,60.33125272564,0,0,-0.462237562002588,NA,0,0 -"22",-0.490685896690943,51.6228298815946,1,0,-0.688925532781386,NA,0,0 -"23",-0.440547872353227,58.8238891975371,0,0,-0.0535212677736979,NA,0,1 -"24",0.459589441005854,52.584774288708,1,0,-0.572706502414098,NA,0,0 -"25",-0.693720246937475,55.9165303081683,1,0,0.21375633645251,NA,0,1 -"26",-1.44820491038647,56.2222923904128,1,0,0.359803188599339,NA,0,0 -"27",0.574755720900728,73.2836671199718,1,0,2.27695732166317,NA,0,0 -"28",-1.02365572296388,67.1643630684198,1,1,-0.440291411753061,NA,0,1 -"29",-0.0151383003641817,57.6771081174904,0,1,-0.318971349212263,NA,0,0 -"30",-0.935948601168394,47.7377309975733,0,1,-0.15261217570315,NA,0,1 -"31",1.10229754620026,70.4753809093163,1,1,-0.131889299757686,NA,0,0 -"32",-0.475593078869057,45.6359281317617,0,0,0.446197218612174,NA,0,0 -"33",-0.709440037512506,48.175310716134,1,0,2.24976919884358,NA,0,0 -"34",-0.501258060594761,52.2379516418684,0,0,0.181818520118387,NA,0,0 -"35",-1.6290934690787,50.2768649052425,1,0,-0.270863415704329,NA,0,0 -"36",-1.16761926209419,50.9634243880893,0,0,-0.037073218575801,NA,0,0 -"37",-2.18003964894867,72.8720394604443,1,1,1.40637562902228,NA,0,0 -"38",-1.34099319210003,58.9381239901269,1,0,-0.266690213501999,NA,0,0 -"39",-0.294293858763713,52.8412904634293,1,1,-1.29619552537285,NA,0,1 -"40",-0.46589754040611,66.7172219647766,1,1,-0.249005464442341,NA,0,1 -"41",1.44949626528713,61.9763362591688,1,0,-1.18874052663649,NA,0,0 -"42",-1.06864272367458,47.7990327466637,0,1,-0.309260543638855,NA,0,0 -"43",-0.855364633737507,54.3903487954904,0,0,0.589199536628837,NA,0,1 -"44",-0.280623001957454,65.3373263441171,0,1,-1.68359211475623,NA,0,1 -"45",-0.994340076295665,79.5031546468388,1,0,-0.530653818657169,NA,0,0 -"46",-0.968514317872049,75.8194322960051,1,1,-0.0166840920859275,NA,0,0 -"47",-1.10731819264177,68.3778752223611,1,0,0.189736857901506,NA,0,1 -"48",-1.25198588563894,59.8693033731552,0,1,1.34757691898234,NA,0,1 -"49",-0.523828118987356,42.7420403798901,0,1,1.19532024763734,NA,0,1 -"50",-0.4968499572667,52.6914642631802,0,0,-0.620171719988641,NA,0,1 -"51",-1.80603125680195,57.8458613465304,1,0,-0.175189000933116,NA,0,1 -"52",-0.582075924689333,64.9020666855371,1,0,-1.64714780166645,NA,0,0 -"53",-1.10888962442678,60.5939342375615,0,0,-1.3320729000756,NA,0,0 -"54",-1.01496200949201,60.5831205292457,0,1,-1.00968055555978,NA,0,0 -"55",-0.162309523556819,78.3836310657496,1,1,0.227512840423866,NA,0,1 -"56",0.563055818994517,63.8765089941351,1,0,-0.789891569760385,NA,0,1 -"57",1.64781747281506,57.1944927534534,1,1,0.943648066445016,NA,0,0 -"58",-0.773353423943695,55.7148226203643,1,0,0.346260971880386,NA,0,0 -"59",1.6059096288573,60.9532493553194,0,1,1.3637663943923,NA,0,1 -"60",-1.15780854769441,51.8666859105858,0,1,1.10482445945345,NA,0,0 -"61",0.656588464332807,73.6819007742547,0,0,0.10188234898138,NA,0,1 -"62",2.54899107071786,57.1246396990977,0,1,-0.301585633083307,NA,0,1 -"63",-0.0347603898038376,54.7144757652248,1,0,-1.51063939131438,NA,0,0 -"64",-0.669633579661957,51.2698707674302,0,1,0.529007075806006,NA,0,0 -"65",-0.00760475571097463,69.3589307240067,0,1,-0.129745906677617,NA,0,1 -"66",1.77708444814343,47.2273267697779,0,1,0.325945377059653,NA,0,0 -"67",-1.13860773675091,55.3126016106077,0,0,-0.215780727869325,NA,0,0 -"68",1.36782717942326,63.1251627333967,1,1,1.51600368457984,NA,0,1 -"69",1.32956479078236,58.9167769835649,1,1,0.691256197053224,NA,0,1 -"70",0.336472797203817,62.5079041733303,0,1,0.656187161645506,NA,0,0 -"71",0.00689283844195796,46.8380660579378,0,1,-0.855226548741805,NA,0,0 -"72",-0.455468738161428,55.6992455083261,1,0,-0.279060276153948,NA,0,0 -"73",-0.366523932873933,42.9589030241387,1,0,-0.742372375227027,NA,0,1 -"74",0.648286567503148,51.5786263478554,1,0,0.777492837430792,NA,0,1 -"75",2.07027086133094,71.4825059530873,1,1,-0.289240352525919,NA,0,1 -"76",-0.15339841246697,62.5283153590922,0,1,0.666245990233836,NA,0,1 -"77",-1.39070094670149,60.8528718612452,1,0,-1.9264631392104,NA,0,0 -"78",-0.723581777110154,50.6637386947879,1,1,0.176749020379374,NA,0,0 -"79",0.258261762167853,54.4318039384037,0,1,-1.09468471677104,NA,0,0 -"80",-0.317059114961199,49.8362968057923,1,0,0.13939769112346,NA,0,1 -"81",-0.177789957664294,60.5352524982115,1,1,0.707865641700464,NA,0,1 -"82",-0.169994076854541,67.6513360614726,1,1,-0.56074604752531,NA,0,1 -"83",-1.37230188587669,62.991948153815,1,0,-0.00518705947728593,NA,0,0 -"84",-0.173787170414033,53.823842517868,1,1,-1.25973317328134,NA,0,0 -"85",0.850232257121884,65.5278731357432,0,0,-0.699440510273066,NA,0,1 -"86",0.697608711668253,63.8790849886898,1,1,-1.87625722455871,NA,0,0 -"87",0.549997351091101,68.4321094225036,1,0,-0.152968849695331,NA,0,1 -"88",-0.402731975447623,55.5937241366341,0,0,-0.334822262418323,NA,0,1 -"89",-0.191593770478477,48.2063462674728,1,0,0.986650708606245,NA,0,1 -"90",-1.19452787960251,59.1543093464118,1,1,0.455944722358052,NA,0,0 -"91",-0.0531588187433239,63.0277769505261,0,0,-0.0192355444580243,NA,0,1 -"92",0.255196000995833,75.5909373112435,0,0,0.529381440271203,NA,0,0 -"93",1.70596400724007,59.3033747269476,0,0,-0.780230182566805,NA,0,1 -"94",1.00151325236345,64.270394144678,0,1,0.171797365839054,NA,0,1 -"95",-0.49558344300146,59.1673680172382,1,0,-1.60905679167636,NA,0,1 -"96",0.355550296999749,81.5639774076182,0,0,-1.6751510908634,NA,0,1 -"97",-1.13460804353385,73.3796805938236,1,1,0.942514927592197,NA,0,0 -"98",0.878203626952291,57.6769555260476,1,0,0.0795745542571885,NA,0,0 -"99",0.972916753376122,58.7259591005261,0,0,1.41140206718487,NA,0,1 -"100",2.12111710537568,59.7831459572825,1,0,-0.170360912244337,NA,0,0 -"101",0.414523534374985,61.1325408438459,1,1,-1.37681034516758,NA,0,0 -"102",-0.47471847352936,54.3490458950753,0,1,-0.899420361203481,NA,0,0 -"103",0.0659934937044614,60.807003872445,1,1,1.54358338788558,NA,0,1 -"104",-0.502477782393866,53.0422859341106,1,1,-1.81914236035552,NA,0,0 -"105",-0.825998587129336,49.0379786704802,1,0,-0.834698956166578,NA,0,1 -"106",0.166989279752984,68.0444055775452,0,1,1.2821658455835,NA,0,1 -"107",-0.896264626244759,66.5664411993471,0,1,-0.155315871583421,NA,0,0 -"108",0.168185387655757,54.3196932265546,1,0,-1.78837946522822,NA,0,1 -"109",0.354968261014854,72.7278584148814,1,0,-0.299097216479544,NA,0,0 -"110",-0.0521051168711336,58.0745608424303,0,0,0.446961511361309,NA,0,0 -"111",-0.195934618716253,57.1633478141118,0,0,1.067308412976,NA,0,0 -"112",-0.64906975160576,50.4632828706075,0,1,-0.986960123284403,NA,0,1 -"113",-1.10976723139873,59.7564935396584,0,0,1.85707765802524,NA,0,0 -"114",0.849274202655058,58.6301403514248,0,1,-0.0138826606861277,NA,0,0 -"115",0.022362526076024,60.6751839112109,1,1,0.565559334823796,NA,0,1 -"116",0.831140617306835,64.4516835983198,0,1,-0.602297404440719,NA,0,1 -"117",-1.24428785134423,69.0660038824601,1,0,1.28512367047536,NA,0,0 -"118",0.169026413766175,63.2741056646057,1,0,1.33601685013132,NA,0,0 -"119",0.673166306947679,69.9256162675209,0,0,-0.556389298983822,NA,0,1 -"120",-0.0262763764485261,70.6540057402709,1,1,-0.716651176935715,NA,0,0 -"121",-0.191392168556024,30.6951794461942,0,0,-0.759021685044223,NA,0,1 -"122",-0.781906646699876,56.3722625701225,0,1,1.56851089568209,NA,0,0 -"123",2.05816198776349,65.6993253809256,0,1,0.535463570505942,NA,0,0 -"124",0.750501453155831,48.8036452769496,0,1,0.209412200921675,NA,0,0 -"125",1.82420830237829,50.3934964139591,1,1,0.992473242702389,NA,0,1 -"126",0.080059641091988,62.7373647191221,0,1,-0.53725012215092,NA,0,1 -"127",-0.631409298966366,55.5857671135984,1,1,-0.268172819727498,NA,0,0 -"128",-1.51328812003373,42.6851218232149,0,0,0.929007556651931,NA,0,1 -"129",-0.636099831085051,69.0585484397637,1,1,-1.55826690472119,NA,0,0 -"130",0.226301532497161,51.5916035849323,0,1,0.64991826547118,NA,0,0 -"131",1.01369034746575,63.8634785492597,0,1,-1.51152614736535,NA,0,0 -"132",0.252750135066342,64.1243011445702,0,0,-0.865319866372719,NA,0,1 -"133",-1.17194831284112,75.2126864304235,1,1,0.347684637661273,NA,0,1 -"134",0.668714328525034,61.9656423546082,1,0,-0.186095036517136,NA,0,1 -"135",-1.65010093484436,66.6561035283678,0,1,0.595346298736087,NA,0,0 -"136",-0.365852248191901,71.4059109224811,1,1,-1.08826919443383,NA,0,0 -"137",-0.31611832883308,55.9361887195241,0,0,-0.379360773245381,NA,0,1 -"138",-1.94824604732137,43.4443864631071,0,0,-0.133805150064513,NA,0,1 -"139",0.920057522390905,61.429300364911,0,0,0.293893744779534,NA,0,0 -"140",-0.622871594878875,73.2591430830882,1,1,-0.801214309674805,NA,0,0 -"141",-0.334036649997458,45.9372233936045,0,0,2.30573365422277,NA,0,1 -"142",1.39514789252962,70.1213161499515,1,1,-1.01508205682307,NA,0,0 -"143",0.63667441103895,59.3986915704217,1,0,-0.942518263570327,NA,0,1 -"144",-0.108431696990383,60.9948142428027,0,1,-0.107134274345954,NA,0,1 -"145",0.513762778404674,56.5077820249413,1,0,-1.13134071861089,NA,0,1 -"146",0.399271807273958,70.3685327959033,1,0,-0.51857369605202,NA,0,1 -"147",1.6628564474304,74.2415647914164,1,1,0.319354057090191,NA,0,0 -"148",0.275893403585483,75.7084879138618,1,1,0.599169972037201,NA,0,0 -"149",0.506272623207687,50.3259926693272,0,0,0.56608886040822,NA,0,0 -"150",0.347551975023811,67.5928584434279,1,0,-0.814755450314007,NA,0,0 -"151",-0.377237647496313,56.2381034554186,1,1,0.393385013408062,NA,0,0 -"152",0.0976194631092374,50.1659058608481,0,0,-0.161707331153351,NA,0,1 -"153",1.63874464512173,63.8989510110322,0,1,-0.870589063755767,NA,0,1 -"154",-0.875592474189569,59.8204928625175,0,1,-1.36678640380301,NA,0,1 -"155",0.121759998641046,67.9172460509004,0,1,0.400568533158509,NA,0,0 -"156",1.36213066084498,77.9239787563701,0,1,-1.36484949210307,NA,0,0 -"157",-0.23462108692271,70.3671688493632,0,1,2.49552373193942,NA,0,0 -"158",-1.05338280794309,53.0431438443961,1,0,-0.839692875155713,NA,0,0 -"159",-0.869783605635912,61.3962266496932,0,0,0.954222182420512,NA,0,1 -"160",-0.390127029647055,74.1324549389255,1,0,1.25069343500921,NA,0,1 -"161",-0.847350073129572,71.7443759629564,0,0,1.06574895295509,NA,0,1 -"162",-0.260639392099399,44.1521841733571,1,0,1.15944236767692,NA,0,1 -"163",-0.414419706341893,69.464299206338,0,0,1.27198726757385,NA,0,1 -"164",-0.183050797719434,56.5948591488133,1,1,0.927341416264284,NA,0,0 -"165",0.407056097880578,53.7463730929629,0,1,0.101609421110908,NA,0,0 -"166",0.624633128202147,48.9879777101729,0,0,0.55776883720937,NA,0,0 -"167",1.67820574370471,56.8820453992648,1,1,0.463414412674698,NA,0,1 -"168",-0.0686936535649907,66.8865650538364,0,1,-0.660694489533208,NA,0,1 -"169",-0.320839912658177,74.3527187786736,1,0,0.398961252222583,NA,0,0 -"170",1.47100571704891,61.3890709820022,1,1,-1.30582701702516,NA,0,0 -"171",1.7043293978271,59.2694761357826,1,0,-0.443201083998582,NA,0,1 -"172",0.0432440375575291,57.976640618882,0,1,-1.21740052305349,NA,0,1 -"173",-0.332657319133519,69.5379933574217,1,1,-0.83982983727178,NA,0,1 -"174",-1.8222354175631,49.1880655255796,1,1,-0.677318403999919,NA,0,1 -"175",1.41126239874678,62.5001870455132,1,0,2.19605464828752,NA,0,1 -"176",-0.837582434076861,70.2162198057182,0,1,-1.15467197745111,NA,0,0 -"177",-1.12376279439532,70.6185151113365,1,1,-0.233399703728512,NA,0,1 -"178",3.04376588609529,64.1707021007276,0,1,-0.389744443583323,NA,0,0 -"179",0.235021308250671,48.3763384863002,0,1,0.0287354454515505,NA,0,0 -"180",-0.0332586101133057,51.7891527018534,0,0,-1.70836970797038,NA,0,0 -"181",-2.7322195229558,52.795117422846,0,0,-0.561405110125317,NA,0,1 -"182",-0.0997905884418961,45.5672803130302,1,1,-1.139342962383,NA,0,0 -"183",0.976031734922396,60.4002033444342,1,0,1.09505700857317,NA,0,1 -"184",0.413868915451097,72.2294668396681,0,1,0.166786940665951,NA,0,1 -"185",0.912322161610113,35.5330608337029,1,1,1.17248789106492,NA,0,0 -"186",1.98373220068438,67.5838789198152,1,0,0.688176022899368,NA,0,1 -"187",1.16910851401363,57.9733834587649,1,0,1.51547424437832,NA,0,1 -"188",-0.50873701541522,52.3086615280499,1,1,-0.636050707196628,NA,0,1 -"189",0.704180178465512,67.7000881141061,1,1,-1.16362571218479,NA,0,0 -"190",-0.198416273822079,66.4838515883954,0,1,-0.427164795592366,NA,0,0 -"191",-0.538070788884863,60.9621737861886,1,1,0.196091333321825,NA,0,1 -"192",-2.85575865501923,50.8035664683063,1,1,-0.997047804364754,NA,0,0 -"193",-0.789646852263761,52.9513822276208,1,1,0.517466397560559,NA,0,1 -"194",0.487814635163685,43.8276593772242,0,0,-0.671756980072126,NA,0,1 -"195",2.16803253951933,78.8089390836698,1,1,0.375318025739388,NA,0,0 -"196",0.500694614280786,58.4440563915239,1,0,-0.415601683558189,NA,0,0 -"197",0.62021020366732,52.4548625121876,1,0,1.15966859915141,NA,0,1 -"198",-0.965903210133575,38.3904755583043,0,1,-0.856041968843027,NA,0,0 -"199",0.162654708118265,55.705383196996,0,0,-0.13875389573912,NA,0,0 -"200",-2.07823754188738,54.508720953604,0,0,-0.617461910983908,NA,0,1 -"201",0.485226820569252,45.9768411711567,0,0,0.264669172152278,NA,0,0 -"202",0.696768778564913,83.8397182312468,1,0,0.812430386935776,NA,0,1 -"203",0.185513915583057,68.7294435097848,1,0,0.573408898080292,NA,0,0 -"204",0.700733515544461,44.6433411282576,0,1,-0.611756195058755,NA,0,1 -"205",0.311681028661359,71.339752513847,0,0,0.927582036106653,NA,0,1 -"206",0.760462361967838,70.4484016061832,0,1,0.459632168468754,NA,0,1 -"207",1.84246362620766,65.8983068874561,1,1,0.0777094202079709,NA,0,0 -"208",1.1123628412626,63.9910373489487,1,1,-0.794012677255858,NA,0,1 -"209",0.0326639575014441,61.4494918465514,0,1,0.755647696808639,NA,0,1 -"210",-1.11444896479736,66.1916121760774,1,0,-1.49004740974421,NA,0,1 -"211",0.418057822385083,49.6738089390188,1,1,-1.95210194506861,NA,0,0 -"212",-0.400235237343163,53.5064386172432,0,0,-0.452398750065752,NA,0,0 -"213",1.49349310261748,49.9799274180761,0,0,-1.36862613526372,NA,0,0 -"214",-1.60708093984972,58.341190400115,1,0,0.295810885487959,NA,0,1 -"215",-0.415751788401515,61.4665629511642,1,0,1.62348352547161,NA,0,0 -"216",0.42200837321742,66.0205652414274,1,0,-0.52968985535574,NA,0,0 -"217",-0.151736536534977,65.2905552549282,1,1,-0.257188260480086,NA,0,0 -"218",-0.60615111526422,74.4569584076136,0,1,0.750955500249836,NA,0,0 -"219",-0.304721068966714,65.9730928742892,0,0,-1.63589691767136,NA,0,1 -"220",0.629536099884472,72.5631692128956,1,0,-1.53165443855064,NA,0,0 -"221",0.895171980275539,70.9744204280521,0,1,1.83455634245432,NA,0,1 -"222",0.660212631820405,54.1153958690102,0,0,-0.527524602007166,NA,0,0 -"223",2.27348352044748,67.5130128150681,0,1,-0.223240229593461,NA,0,1 -"224",1.17349757263239,64.5351434101759,1,1,1.30636084938437,NA,0,1 -"225",0.287709728313787,61.1917823416888,1,1,0.772907132748949,NA,0,1 -"226",-0.659770093821306,50.9661229169478,0,0,-1.85157391464842,NA,0,1 -"227",2.91914013071762,65.879824566943,0,0,1.16872326218352,NA,0,1 -"228",0.677415500438328,60.6372874823125,0,0,-0.471723098985895,NA,0,0 -"229",-0.684320344136007,56.4516414506322,1,0,-0.788011402128258,NA,0,0 -"230",0.186492083080971,69.2249574441879,0,0,-1.67479075787562,NA,0,1 -"231",-0.324393300483657,69.9232536859556,1,1,-1.96975711534648,NA,0,0 -"232",-0.274704218225806,68.7356390179351,1,1,0.0873659274372556,NA,0,1 -"233",-0.933503340589868,33.8895653889269,0,0,0.844907183544593,NA,0,1 -"234",0.116845344986082,51.0268316728995,0,0,-1.71285342452175,NA,0,0 -"235",0.319160238648117,69.3723531860462,0,0,-1.5756685439766,NA,0,0 -"236",-1.07754212275943,52.5466366519901,1,1,0.858715130088106,NA,0,1 -"237",-3.23315213292314,64.0853962974254,0,0,0.327499502368055,NA,0,1 -"238",-0.254874652654534,61.8291267264276,0,0,0.482599276074397,NA,0,1 -"239",0.0295178303214797,71.0740229118283,1,0,-0.861239506548091,NA,0,1 -"240",0.594273774110513,52.8962832962036,1,0,-0.465507726476707,NA,0,1 -"241",0.0591351681787969,59.4219763063927,1,0,-1.38840970257199,NA,0,0 -"242",0.413398894737046,63.4492341671265,0,0,0.494310883699177,NA,0,0 -"243",-1.09777217457042,72.72654447469,1,0,-0.447774737710839,NA,0,0 -"244",0.711175257270441,58.2261916475795,0,0,-0.845243025822087,NA,0,0 -"245",0.718888729854143,62.9040599697828,1,0,0.692127259774552,NA,0,1 -"246",0.251651069028968,46.5715179399608,0,0,-0.554400351447954,NA,0,0 -"247",1.35727443615177,66.3305002993816,0,1,0.805144115271628,NA,0,0 -"248",0.404468471278607,62.0681970608308,0,1,-0.343846725138525,NA,0,0 -"249",0.264364269837939,66.3930734251053,0,1,0.169318041646852,NA,0,0 -"250",0.268043904143697,55.8932943503209,1,0,0.280421965396122,NA,0,0 -"251",0.436930577039483,76.8535662126586,1,1,0.717654589428342,NA,0,1 -"252",1.06012390548736,58.6017389575769,0,1,-0.398660716202873,NA,0,1 -"253",0.452190396528715,61.2113874885007,0,0,1.0280059648229,NA,0,0 -"254",0.663198615687065,58.9432082370055,1,0,0.53352921437778,NA,0,1 -"255",-1.13637355399084,68.4630489128897,0,0,1.98337843440331,NA,0,1 -"256",-0.370497517092117,52.3155097607692,0,0,-1.20073705622921,NA,0,0 -"257",1.47696958979247,62.014624052689,0,0,-0.414474090581805,NA,0,0 -"258",-1.22390375051376,65.7456959846799,1,1,0.451136490702432,NA,0,0 -"259",0.258068387396106,47.5476011201701,1,0,-0.262450560666971,NA,0,1 -"260",0.405002805433516,62.2771880814336,1,0,1.02042672061091,NA,0,0 -"261",0.97580332180945,54.0619793773133,1,0,-0.43403248030227,NA,0,0 -"262",-0.348876736539909,74.2881078576221,0,0,-0.335940938967553,NA,0,1 -"263",0.158625439491262,67.0489806899413,1,1,-0.218092381570754,NA,0,1 -"264",-1.76325506654115,42.7486609059989,1,1,-0.114706806432635,NA,0,1 -"265",0.338596047099905,69.19284781933,1,0,0.090373957608694,NA,0,0 -"266",-0.666565029559273,62.1774092677819,1,0,-1.29027186439312,NA,0,1 -"267",-0.238646624327644,54.386752381008,0,1,-0.445302431780947,NA,0,1 -"268",-1.18776528245719,56.4441437218518,0,0,1.05959611849342,NA,0,0 -"269",0.384935321768578,47.2785783338835,0,1,1.60176387946255,NA,0,0 -"270",0.66657951565448,63.8619685165391,0,1,0.290704963562124,NA,0,0 -"271",-0.304613889457735,43.6147686198574,0,1,-1.5398402644936,NA,0,0 -"272",1.8250110640863,58.5973722272885,0,0,1.43152193223662,NA,0,0 -"273",0.670559370567588,64.5517560783696,0,1,1.1475890314884,NA,0,0 -"274",0.948632573255153,60.8338336199505,1,0,-0.0369320076461112,NA,0,1 -"275",2.04940300167091,68.5917332231044,0,0,0.390688747616043,NA,0,1 -"276",-0.651113609237535,60.5054810455568,0,1,1.11208742842588,NA,0,1 -"277",0.808619272846293,83.0894410422706,1,0,0.852353197942567,NA,0,0 -"278",0.986580613954563,51.4053682526367,0,1,1.14932956130465,NA,0,1 -"279",-0.00617079609070003,75.3911616866609,1,0,-1.37341062252185,NA,0,0 -"280",0.319052357931912,43.8553382265105,0,1,-1.33478426311924,NA,0,0 -"281",-1.01182190274184,49.2268354169066,0,1,-0.324747858050283,NA,0,0 -"282",0.470167547726389,50.986394529512,1,0,-0.922288234919009,NA,0,1 -"283",-0.700970331629097,44.7888763883714,1,1,0.92873970287116,NA,0,1 -"284",0.813682862526491,74.8469384506777,0,1,0.705086309642083,NA,0,0 -"285",-0.811430783992772,70.3370332012997,1,0,-0.0590209227685563,NA,0,0 -"286",0.31939748709946,73.7126649842358,0,1,-0.740775868739709,NA,0,1 -"287",-0.846522653163624,53.9107855854228,1,0,-1.67173099749737,NA,0,0 -"288",-0.245763179128733,60.8559709330906,1,1,-0.612541215827006,NA,0,0 -"289",-1.55285901110546,71.1850749108373,0,1,-1.48286853528974,NA,0,0 -"290",0.128434032925753,70.6052978815381,1,1,-0.71213118391869,NA,0,1 -"291",0.985443389404868,65.6013677357227,0,1,-0.277137445806187,NA,0,1 -"292",0.183247523067191,61.5814754090066,0,1,-0.00722792989139269,NA,0,1 -"293",-1.76622921343046,48.3582677768336,1,0,0.505992335498047,NA,0,0 -"294",-0.620533696615944,63.0968619619025,0,0,1.23163275222356,NA,0,1 -"295",1.65604303699874,58.215061851263,1,0,0.276685312887525,NA,0,0 -"296",1.80980538558388,53.4547542797855,1,0,-2.09970117138004,NA,0,1 -"297",-1.17503676801214,75.7615726400038,1,0,-0.803775430626082,NA,0,1 -"298",-0.366703258827641,52.9859626261871,0,1,0.804082706755595,NA,0,1 -"299",0.353625448927287,62.4287638937281,0,1,-0.177674868331488,NA,0,0 -"300",0.319156220867898,65.1603575646107,0,0,-0.494156769663437,NA,0,0 -"301",-0.579956989019402,70.5115476204457,0,1,0.980121542524162,NA,0,1 -"302",-0.953278701946907,63.6730439621612,1,1,-0.10056813602669,NA,0,1 -"303",-0.179428586945621,79.7321531497352,1,1,0.970848486101545,NA,0,0 -"304",1.0098082149443,64.4513674789454,0,1,-0.310366879864164,NA,0,0 -"305",0.0236266149214088,55.1151983150947,0,1,0.654996309218518,NA,0,0 -"306",-0.649028219583901,61.0251153583565,1,0,-0.376913155880549,NA,0,0 -"307",-0.504374222048191,66.6912924560925,1,1,0.510511530249471,NA,0,0 -"308",1.61439149545836,38.0949119182938,0,1,0.43622840726424,NA,0,0 -"309",-0.446959811255663,59.0443594865297,1,0,-0.113637816241966,NA,0,0 -"310",0.763176761113315,60.4152570851599,1,0,1.33940885629495,NA,0,1 -"311",1.4717186861858,57.6139912610199,0,0,-0.0218513586569336,NA,0,1 -"312",0.443664903687416,60.5136853648673,1,0,0.373164528251115,NA,0,1 -"313",-0.421721870070189,40.0819363979294,1,0,0.975058565890974,NA,0,0 -"314",-0.0400016262060736,63.4623628474444,1,1,0.428690951318362,NA,0,0 -"315",-0.492279968227887,60.0302934280113,1,1,-0.271214166419369,NA,0,0 -"316",1.22771711731275,55.0757149923363,0,0,-1.46472965857938,NA,0,0 -"317",-0.149553565244392,31.3558127740949,1,1,1.12136237253886,NA,0,0 -"318",1.54998338398246,55.8878726310216,1,0,-2.09093337363249,NA,0,0 -"319",-0.561612535381314,61.1820853805227,1,1,0.594813435824133,NA,0,1 -"320",-0.647117250632361,59.2883923223199,0,1,-1.22252855803645,NA,0,1 -"321",0.143132159050427,53.913025710452,1,1,-0.0678768826920807,NA,0,0 -"322",0.0241886479910512,63.2164528804192,1,0,-0.0834526806469843,NA,0,1 -"323",-0.504451524580869,50.895104679573,1,1,-1.77454980173004,NA,0,1 -"324",-1.58139680993034,66.9928104731946,0,1,-0.651151381128995,NA,0,1 -"325",0.030066421255418,68.2573237519313,0,1,-1.27796221073948,NA,0,0 -"326",-0.716576699070701,69.5754341191487,1,0,-1.07304892113685,NA,0,0 -"327",1.08261095547612,67.9884627036529,1,1,-0.20598745128514,NA,0,0 -"328",-0.952685454939293,56.4787278994997,0,0,-0.474454059463991,NA,0,1 -"329",1.12648272900916,59.7505317176422,1,0,0.736065656861654,NA,0,1 -"330",-0.649043017379116,72.4507337169484,1,0,-1.01819225085773,NA,0,0 -"331",0.292470078406726,66.1094195854966,1,1,-0.78662902278494,NA,0,1 -"332",0.898702724402737,63.4688303266061,0,0,0.0376161805801649,NA,0,1 -"333",-0.518742356791874,72.7917971095465,1,1,-1.85714141506033,NA,0,1 -"334",0.554438552345125,53.7315415813036,0,1,-0.168876560809693,NA,0,1 -"335",-0.0879736711840128,60.2526327160146,1,1,-0.247124779081277,NA,0,1 -"336",-1.13521292940752,73.0150900263725,0,0,-0.954434193635686,NA,0,1 -"337",-0.270079610999902,57.5021030543229,0,0,-0.550610577398466,NA,0,1 -"338",1.61978987753494,61.349983447954,0,0,-1.64139744423389,NA,0,0 -"339",-0.21413117369664,58.5332676938911,1,1,-0.781575921650955,NA,0,0 -"340",-0.817782462924957,72.3393059413951,1,0,-0.394591207093351,NA,0,1 -"341",-0.0540229240816071,61.3542598236762,0,1,-0.469383314539063,NA,0,0 -"342",0.330141609857869,60.672402924147,1,0,-0.735999128417405,NA,0,0 -"343",0.955324614657399,53.1877789148867,0,0,-2.1491238970204,NA,0,0 -"344",1.14395987779244,55.815370597461,0,0,0.132892471811892,NA,0,1 -"345",0.100522398375063,44.3078579091802,0,0,-1.22840510433402,NA,0,0 -"346",1.16457524565738,64.0958654077477,1,1,-0.24767481305761,NA,0,1 -"347",-0.764259947799148,68.4780319293457,0,1,0.0465609630034831,NA,0,1 -"348",-2.3445134046903,46.6261569044753,0,1,0.402484703932059,NA,0,0 -"349",-0.471683425529536,64.0135472863779,1,0,-0.112161426828015,NA,0,0 -"350",-0.51585548950739,63.2893589749903,1,1,-0.567274117442933,NA,0,0 -"351",-2.31603615150256,61.9532100004021,0,0,-0.992337662046278,NA,0,1 -"352",0.562471756969533,60.7146261011216,1,1,1.03936805870832,NA,0,1 -"353",-0.783775139422381,76.349332608997,1,1,0.0598427201580219,NA,0,0 -"354",-0.226053986228165,63.330070775793,1,0,-1.14933906139856,NA,0,1 -"355",-1.58710298765404,74.0181262288016,0,0,-1.87055140838007,NA,0,0 -"356",0.547524201868316,64.5101702086016,1,0,0.462124610549455,NA,0,0 -"357",1.89122701735548,64.8732359573994,0,0,0.286521464155645,NA,0,1 -"358",-0.878077110224044,54.6026254488847,0,1,1.08188443726883,NA,0,0 -"359",-0.112558911044787,86.9090741575432,0,1,1.09134840256743,NA,0,1 -"360",1.94871306497599,59.4018392431068,0,1,2.25769840603041,NA,0,1 -"361",0.933816332207727,55.0252413213957,0,0,-1.28702661360351,NA,0,0 -"362",1.91305942169705,63.8241714991964,1,0,-1.22080303572,NA,0,0 -"363",-0.00523405793193957,64.0918017843792,0,1,-1.50047956507908,NA,0,1 -"364",-0.152260048921635,67.2374592278896,0,0,-0.590727059737978,NA,0,1 -"365",-0.509631657179118,50.8507542562753,1,1,0.767959651027252,NA,0,1 -"366",1.43457370342612,58.6281490848044,0,1,0.409779947427045,NA,0,1 -"367",-1.28583853233299,75.7429327787333,1,1,0.973400941237074,NA,0,0 -"368",0.307314215255018,55.1708229695724,0,0,-1.36392238722292,NA,0,1 -"369",-0.046318533277383,56.4380948688943,0,0,-0.0115796151778788,NA,0,0 -"370",2.25184180012688,66.1022227422258,0,0,0.482908374800972,NA,0,0 -"371",-0.608033728609708,53.9396485263296,0,1,0.15500012159551,NA,0,0 -"372",-1.50928816985104,71.3451200339725,0,1,1.21508975014897,NA,0,0 -"373",0.232631769508576,64.2246828892729,0,1,-0.83778584449963,NA,0,1 -"374",-0.0396486960510932,60.8954971140714,1,1,0.621594843184185,NA,0,0 -"375",-0.839125067505345,44.0828600603638,0,1,0.155524900096752,NA,0,0 -"376",0.132291088244156,56.5283298575697,1,0,0.408982790756931,NA,0,0 -"377",-0.275524746705939,80.3429598442535,0,1,-1.57755448926486,NA,0,1 -"378",-0.678758641268032,57.1409484440775,1,1,0.836836094188595,NA,0,0 -"379",0.500835931775825,66.5437878298221,0,0,0.723514559652086,NA,0,1 -"380",-0.331662312006789,50.7525238303052,1,0,0.13147873919411,NA,0,1 -"381",-1.83498025077253,44.0929696074596,1,1,-0.347255182614316,NA,0,0 -"382",-2.65174124837268,67.093984696494,0,0,1.57682166298993,NA,0,0 -"383",-0.580582172084415,74.4932679629708,1,0,-0.196836397057139,NA,0,0 -"384",1.45418687087119,44.5561969061596,0,0,0.410923281958588,NA,0,0 -"385",0.838129379964693,68.460692482214,1,1,-0.553991035221659,NA,0,0 -"386",1.21505358044614,63.3093247067837,0,0,-0.175077428542885,NA,0,0 -"387",0.982505442732795,56.3381268914086,0,1,-2.57507720718692,NA,0,1 -"388",0.315764024018431,77.4928498341616,1,0,-0.986967654557719,NA,0,1 -"389",-1.50706262771632,49.94855038544,1,0,1.18030861235152,NA,0,0 -"390",0.205569787166277,58.8256070115816,0,1,-0.311600897935701,NA,0,0 -"391",1.59722809034381,66.685055465381,1,0,-1.28928390016459,NA,0,0 -"392",-3.39606353457436,49.1752597856717,0,0,-1.55573527655244,NA,0,0 -"393",-0.781352284302082,71.0596780578619,1,0,0.351864193767146,NA,0,0 -"394",1.10246464075191,58.8584661806551,1,1,-1.02361871850118,NA,0,0 -"395",0.528745019445293,72.291483044291,1,0,2.5831692892774,NA,0,0 -"396",0.789394403514854,59.7670318390833,0,0,1.22632376791003,NA,0,1 -"397",0.457099505856081,52.6489453337983,1,1,1.08451182102266,NA,0,0 -"398",0.538833115658123,46.5948516686648,1,1,0.126743197570183,NA,0,0 -"399",0.0146431224111012,46.2693410762816,0,1,-1.12144201654816,NA,0,1 -"400",-0.916489135703093,58.5094290602607,1,0,0.289207990325689,NA,0,1 -"401",-1.22681509076766,39.9372404080284,1,1,0.638154888999721,NA,4,0 -"402",0.0361528681302234,44.4043146258191,0,1,-0.0953672363546544,NA,3,0 -"403",-0.42139311357139,79.6270156915108,1,0,-0.806979203185191,NA,3,0 -"404",-0.899364408462905,58.2168177639437,1,0,1.60819567906429,NA,2,1 -"405",0.417441317486562,73.5762067316143,0,0,0.411553303651001,NA,4,0 -"406",0.15344473686401,76.4899204713817,0,0,-0.402484324529066,NA,2,0 -"407",1.46328304519325,61.1948457059363,1,0,0.110962357974858,NA,0,0 -"408",-1.12150249732554,60.0362061633761,1,1,0.654036068679298,NA,3,0 -"409",-0.517788083055024,61.9644344592465,0,0,1.06906574617112,NA,2,0 -"410",-0.0749470883385233,66.797395067307,0,0,-1.88341014148936,NA,2,0 -"411",-1.40779008143908,68.0112484496439,0,0,-1.4429269621452,NA,1,1 -"412",-0.284705868110477,39.8479942022391,1,1,0.461029846216729,NA,2,0 -"413",-0.708171178213376,60.4739598808669,0,0,0.598279919438516,NA,4,0 -"414",-2.14763900113752,66.1309660685985,1,0,-0.8844762950372,NA,4,0 -"415",-0.283837160611358,58.299815189466,1,1,0.360707882967097,NA,5,0 -"416",-0.534072163976087,62.2575952443179,0,0,-1.69402405553405,NA,3,1 -"417",1.1330101941542,68.9240414112506,0,1,-0.462399627624698,NA,3,0 -"418",-0.604068922847656,58.5238690124354,0,1,1.02525119985162,NA,2,0 -"419",0.557511595592472,75.3718231037047,1,1,0.123430942155291,NA,4,0 -"420",0.142629293255087,64.5261935487076,1,1,0.756816132810634,NA,5,1 -"421",-1.23686021484532,44.0430574742946,0,0,-0.987323492483483,NA,2,1 -"422",0.37414397250319,58.1283470796627,1,1,0.140070935388242,NA,5,1 -"423",-0.106086123466453,65.6763446812895,1,1,-1.53748694519716,NA,4,1 -"424",0.166061986509049,61.626791167636,0,0,0.0257718325228052,NA,0,1 -"425",0.658034848342406,46.2801698707418,1,1,-1.02303046739119,NA,2,0 -"426",0.113962919971386,41.1604748830867,1,1,-1.75905799813544,NA,4,0 -"427",1.58515916403119,59.3195299183527,1,0,-0.888458790043248,NA,1,0 -"428",0.115553440443324,66.8730693385291,1,0,1.96280409188127,NA,3,1 -"429",-1.25399903928352,52.9199927122785,1,1,-0.494058066622838,NA,3,1 -"430",0.74186477930074,63.7767726881616,0,1,-0.489052716862196,NA,2,0 -"431",-1.22290161499985,65.8075773927448,1,1,-0.531672209620644,NA,2,0 -"432",-1.91889413176211,69.7462609239334,1,1,-0.70578379825765,NA,1,0 -"433",-0.809936389920794,65.8745250933105,1,0,-0.787245598526667,NA,5,0 -"434",-0.744677228785278,62.4527877882343,1,0,0.0407062842653212,NA,4,0 -"435",-0.607505922693199,44.339962958283,1,1,-0.160257426739252,NA,3,1 -"436",1.60675956606494,54.9187185190614,1,1,0.707436075130338,NA,3,1 -"437",1.63559944628363,68.5827885392439,0,0,0.67259647060077,NA,1,0 -"438",0.726091096698492,64.518667192148,0,0,0.444869491301001,NA,4,0 -"439",-1.27383492504346,51.0209576314687,1,1,-0.132896094565712,NA,2,1 -"440",0.0231465554333402,63.1901114488487,1,1,0.236582456988663,NA,1,0 -"441",0.487121233826478,59.0603548942023,1,1,1.11388056515839,NA,5,0 -"442",-0.137027503269731,72.6430832492208,1,1,-0.109997400872429,NA,3,0 -"443",0.171745482456009,59.6599063472136,1,1,0.041185051013977,NA,1,1 -"444",-0.944793914899987,56.1183096987207,0,1,-1.79925084231441,NA,1,1 -"445",-1.28762030600621,57.3954516809066,0,1,-1.05031744528165,NA,1,1 -"446",1.40776573875103,51.2256548591289,0,0,0.380665228128833,NA,2,0 -"447",0.26737268905316,72.9275143050665,1,1,-1.49439331682552,NA,3,0 -"448",-0.408397642146012,61.6518785360755,1,0,-0.0653339484432832,NA,3,0 -"449",0.325271219916941,53.967929242885,0,1,-0.364169952975103,NA,4,1 -"450",2.06248097311935,63.9860390080151,1,1,0.845529325639494,NA,1,1 -"451",0.362666955663951,68.5895212545866,1,0,0.435429302544269,NA,0,1 -"452",1.4114046447039,55.7808753143062,0,0,-0.53413789339427,NA,1,1 -"453",1.36754431457543,56.945111105703,0,0,0.943239360539817,NA,3,0 -"454",-0.407176147121539,59.4081198549152,0,1,0.301835461893759,NA,3,1 -"455",0.762541730885853,39.9812781204761,1,0,0.0574454909242895,NA,5,0 -"456",-0.651200823031723,61.5543642135745,0,0,1.17823339900018,NA,5,0 -"457",-1.47365577946832,60.4256222751568,1,0,0.294456918483786,NA,3,0 -"458",-1.20166579640511,57.3796080120584,1,1,1.14526303524807,NA,0,0 -"459",-0.148720478130092,49.788067416777,0,1,-0.154182678560202,NA,4,1 -"460",1.79706242855231,70.6535710343492,1,0,0.636476192810274,NA,2,1 -"461",0.104808717855571,55.4240916212448,0,0,0.910773865410215,NA,3,1 -"462",-0.802837973351476,61.3530391287928,0,0,-0.662806250607168,NA,3,1 -"463",0.230374964163675,81.1278759570127,0,1,-0.290630354786015,NA,1,1 -"464",0.697975193218088,54.081663046136,1,0,-1.46382575920573,NA,2,0 -"465",-1.29520173194588,53.3126210595422,1,0,-0.109033197419866,NA,1,1 -"466",-1.05468021797766,61.5387874093147,0,1,-0.718369922643492,NA,1,0 -"467",-1.94160748967507,57.1460838840075,1,0,-0.298180863282341,NA,1,1 -"468",-1.26956059300372,55.0730356079902,1,0,-0.605295550729906,NA,0,1 -"469",-0.887438810816108,64.097715393881,0,1,-0.377983953309427,NA,0,1 -"470",-0.291915762703194,62.2935645078686,0,0,-0.521744921541124,NA,5,0 -"471",1.41009405297303,49.2537436668267,0,1,-0.61820601713851,NA,2,1 -"472",0.88864143026394,46.1924914556153,0,1,-0.436407984532611,NA,1,1 -"473",0.272974319690261,65.9823324144569,0,1,1.36410917391319,NA,1,1 -"474",-0.566641466026189,51.6053135785225,1,1,-0.948246264578507,NA,0,0 -"475",-0.0402069374365745,50.4457493995265,0,1,0.901429133926489,NA,3,1 -"476",0.547612245301063,72.3621807067256,0,0,-0.800665134230229,NA,5,0 -"477",-0.504033321605536,60.759841521696,1,0,0.307890055852477,NA,1,0 -"478",1.0590701295588,54.6042709236522,1,1,-0.910998770565304,NA,3,1 -"479",1.09745356643579,67.7353811942751,0,0,-1.07592057138663,NA,1,1 -"480",-1.16650462962559,73.0430052382949,0,0,0.445539081785094,NA,5,1 -"481",-0.74691131008606,69.8510997410927,1,1,-0.198342170801574,NA,4,1 -"482",1.20673873333927,49.8827848931382,1,0,-1.50647668921738,NA,3,0 -"483",-1.68721988054335,53.9397097428161,1,1,0.725739408768338,NA,4,1 -"484",0.419709894906327,63.1672605550687,1,1,-0.583429191806288,NA,4,1 -"485",0.233498761638796,46.3530889135471,0,1,0.140482006085695,NA,5,0 -"486",3.19590119785739,56.1803372047234,0,0,0.360450314117149,NA,4,0 -"487",-2.72968034122695,59.8570725884825,0,1,-1.21243708583669,NA,3,0 -"488",-0.840795801424047,59.3458186800103,0,1,-0.675731348315961,NA,0,0 -"489",0.674028668650456,62.2343241886172,1,0,1.5696434732764,NA,0,0 -"490",1.67432650967139,61.8318216667889,0,1,0.619497282943088,NA,3,0 -"491",0.839687083840494,68.9066124222916,0,0,-1.43622916032155,NA,1,1 -"492",-1.42836843555519,59.4692815517991,1,1,-0.314778857388245,NA,3,0 -"493",2.23938816916782,51.7014188448492,0,0,-1.14509720885026,NA,4,0 -"494",-1.75726622410175,58.7897090217489,1,0,1.04451293282252,NA,3,0 -"495",-1.11301608790466,57.9034195656516,0,1,-0.00689052928968512,NA,2,0 -"496",-0.0431403906133336,60.3958956171276,1,1,-0.290987619256496,NA,4,1 -"497",2.2255869089066,74.8099019747618,1,0,1.01085610285753,NA,4,1 -"498",0.50631658733822,65.6666005839204,0,1,1.1116436393317,NA,0,0 -"499",0.730396051435768,77.8958035863141,1,0,-2.29398597219737,NA,4,0 -"500",1.72807759473682,71.1404785115473,1,1,0.65588476565193,NA,1,1 -"501",0.984779968314556,77.9407452864789,1,1,-0.399908439414822,NA,5,1 -"502",-1.22473787596427,46.3545110010355,0,0,0.706900239659698,NA,3,0 -"503",0.709726218065825,52.9256001500028,1,0,-0.841793729346687,NA,3,0 -"504",-0.109219992578064,54.4371565411251,1,0,0.627844674789504,NA,4,0 -"505",1.78260789538833,56.8991890624728,0,0,-0.865794010742884,NA,2,1 -"506",-0.243444679602975,56.2382066427167,1,0,0.315109395167471,NA,2,0 -"507",-1.52710702225461,64.5852600842686,1,1,-1.23613033525829,NA,2,1 -"508",0.491834371466622,47.3885085056887,1,0,0.0770462714583238,NA,1,1 -"509",0.354503660926992,54.7253483679385,1,1,2.08698542160568,NA,3,1 -"510",-0.0176263476303775,54.4318577820146,0,0,1.21353587898004,NA,4,0 -"511",-1.05655032211723,56.2556127378398,1,0,0.888689083732765,NA,3,1 -"512",-0.838914666078269,53.9560001901894,1,0,-1.04095179499479,NA,4,1 -"513",-0.0126257164904259,72.0312707053266,1,1,1.31100543436612,NA,5,1 -"514",1.03811014600433,67.6887323340547,1,0,0.858565664121565,NA,5,1 -"515",-0.364457713905061,62.3835102324136,1,0,-0.549804243968874,NA,4,0 -"516",-0.871449472185395,45.8497185861131,0,1,-0.331829226096368,NA,2,1 -"517",-2.7042030254467,89.3377438660602,1,1,-0.60923508470323,NA,0,0 -"518",0.494249836540647,69.3502575735832,1,1,0.828145323839061,NA,2,0 -"519",0.427340324999113,72.3388452172586,1,1,-0.86453131416218,NA,1,1 -"520",1.30919086010968,41.0861527820195,1,1,-0.342300730886153,NA,1,1 -"521",1.4667226423107,57.7734870688206,1,1,-1.1273804260992,NA,0,0 -"522",-0.143056066294616,67.6812752836092,0,0,0.725663561640484,NA,3,1 -"523",1.29851372621074,63.8795365826285,0,1,-1.50442814718175,NA,4,1 -"524",0.00131354755592194,66.0913303155833,1,1,-1.22740506524656,NA,0,1 -"525",-0.676410075685581,65.9051860827632,1,1,-1.53475342481241,NA,0,0 -"526",-0.848196596885468,55.6485924671455,1,0,-0.631744162117857,NA,1,1 -"527",-0.346941100407436,57.4923010150099,1,0,0.287560601294239,NA,1,0 -"528",0.227479443027071,50.7373061627669,0,1,-1.36246786870891,NA,2,1 -"529",-0.0274014036111113,70.8743580325344,0,1,1.48183223446739,NA,1,1 -"530",0.200877790585033,40.5604078409579,0,0,0.206621169315406,NA,3,0 -"531",0.198942671822796,57.1937956166365,1,1,-0.448302654690569,NA,4,1 -"532",0.71459935608369,68.1207755667137,0,0,0.0588951606055983,NA,1,0 -"533",-0.273181774889849,57.9229631299196,1,0,0.412204811749354,NA,5,1 -"534",1.63534745793173,74.5075731996683,0,1,0.172681707982134,NA,1,1 -"535",0.437693886479643,68.4149317815536,1,1,0.254266315050371,NA,4,0 -"536",-1.19353585074908,51.3426217787062,1,0,0.00467235118544757,NA,1,1 -"537",-0.276649804199181,59.4012219329904,0,0,0.379782069197037,NA,1,0 -"538",0.515883050535119,60.1153115324976,1,1,-0.116966273461431,NA,4,1 -"539",-0.411055332086343,62.6911145139178,0,0,-0.235452686427768,NA,2,1 -"540",0.344995049472574,54.7183949407659,0,1,-1.50127941204146,NA,4,1 -"541",0.434395860831138,53.8292615863329,1,1,1.22310588631937,NA,3,0 -"542",1.30347768300503,57.3762748497832,0,0,-0.590962958826448,NA,2,1 -"543",-0.651695910733076,69.1234717100544,0,0,0.838119396605941,NA,3,1 -"544",-0.0168808623279884,71.2361058688565,0,1,-0.593254569869571,NA,1,1 -"545",-0.567692455293591,53.9648028631747,0,1,1.26517771265799,NA,1,0 -"546",-1.12713286976328,60.5532339363837,0,1,-0.904355807064555,NA,2,1 -"547",-1.84897031117903,49.9934156711519,1,0,-0.505215327863061,NA,2,1 -"548",0.19552905518189,52.4182117086078,0,1,-0.0855138246373087,NA,3,1 -"549",0.601565225195587,53.8856418549474,0,0,0.0398872990007094,NA,4,1 -"550",0.639158159789933,46.8327773679206,1,1,0.691657670066614,NA,2,1 -"551",-1.38999281746208,39.9258498278283,0,0,1.6034764267485,NA,1,1 -"552",-1.12967785339758,67.6800212752914,0,0,0.835300881754122,NA,2,1 -"553",-1.14115419483757,64.0576131350423,0,0,-0.103643379450682,NA,4,1 -"554",-0.688671499600886,51.6828412108396,1,0,-0.0872413331772054,NA,4,1 -"555",0.170197818152129,69.5807577311228,1,1,-1.82243053694214,NA,2,0 -"556",0.59448342725741,78.1071925143842,1,1,0.520602167106094,NA,1,0 -"557",-0.815032650724957,34.499855151722,1,0,0.65845054053139,NA,3,0 -"558",-2.86434683706673,71.5025508071742,0,1,1.53474704885865,NA,3,0 -"559",0.95382348170826,72.2789365442077,0,1,-0.608853092026856,NA,4,1 -"560",0.615284770613444,61.5210316268381,1,1,-1.25002578560839,NA,4,1 -"561",-0.700870700550263,54.7691619081263,1,1,0.050433788343753,NA,3,1 -"562",-1.07366134146979,48.9840663565416,0,0,-0.817017518601282,NA,5,0 -"563",-0.299991196799737,51.2026971858702,1,1,-1.20516090614205,NA,5,0 -"564",0.108106562802571,53.6329974224591,0,1,-0.812208718091528,NA,5,0 -"565",0.723200763587674,38.940685691571,1,1,-0.126940403277922,NA,0,0 -"566",1.39009509607427,53.5374163987757,1,1,0.425982751248675,NA,1,1 -"567",-0.848883398378404,69.8679342631946,0,0,0.526296647949334,NA,3,0 -"568",0.290722844643498,52.4509985406741,0,1,0.840636091160457,NA,2,0 -"569",1.33417603370242,75.6520486289813,0,1,-0.1230836694655,NA,4,0 -"570",-0.901342155259083,59.4995073900847,1,1,-1.1952174160169,NA,5,0 -"571",-0.222763412932851,61.110287061467,1,0,0.297898909938711,NA,2,1 -"572",-1.7714680282034,64.7547104599742,0,1,0.269973033352478,NA,5,0 -"573",0.287422250114445,50.6099609529286,0,1,-0.487325909162453,NA,1,1 -"574",-0.641191762323999,52.2595438035702,1,1,-0.21760252906034,NA,1,1 -"575",-0.924937642238947,55.7644153276736,0,0,-0.175365093537259,NA,5,1 -"576",-0.779977469185323,77.7287949441054,1,0,0.328622386725262,NA,1,1 -"577",0.444503373594371,64.265506583303,1,0,-1.25986632646462,NA,4,0 -"578",-2.16246900261166,59.0552982691128,0,1,0.118448413456417,NA,3,0 -"579",-0.425510899920945,48.8340456618434,0,1,1.28970835771858,NA,4,0 -"580",-1.01371765920455,59.1256485347046,1,0,-1.62452495612896,NA,0,1 -"581",-0.962119028303747,63.9252609017327,1,1,-0.6251284659458,NA,1,1 -"582",0.474634627277364,56.3937783074627,1,0,0.898480565702257,NA,2,1 -"583",0.351754269604537,51.1201563838252,1,1,2.69864035559375,NA,2,1 -"584",-0.107020356246981,50.5358344620414,0,0,-0.476011271828003,NA,4,1 -"585",0.537517256600346,57.8008844392544,0,1,0.768070900586023,NA,0,0 -"586",-0.60210489844813,63.4222011692021,0,0,-1.60136744074432,NA,3,1 -"587",-2.02329764533368,69.6992412941553,0,0,-0.895774214016735,NA,3,0 -"588",1.15442128059814,46.6454685980288,0,1,0.0343952662533795,NA,4,0 -"589",-0.230692997694109,51.9912052459385,0,0,-1.68357887922913,NA,0,0 -"590",-0.365088768214911,39.5320365581593,0,1,0.598280628779714,NA,0,1 -"591",0.157517929375723,61.1005621819368,0,0,1.5794687911285,NA,0,0 -"592",0.765932222984396,67.161003737353,0,0,-1.71301936747456,NA,4,1 -"593",0.426086966202313,62.5412060237561,1,0,1.45877019242422,NA,4,0 -"594",0.927265399965311,72.2876218801499,0,0,-1.13148864482625,NA,1,0 -"595",0.0461345928404875,65.8340400595612,1,1,-1.21309833985091,NA,0,1 -"596",-0.504285643091111,54.1287522627104,0,1,-1.1964310966019,NA,4,1 -"597",-0.300449408355562,74.466653579162,1,0,0.141451563358798,NA,2,0 -"598",0.822915978645871,64.4339403542803,0,0,-0.393832432131072,NA,1,0 -"599",-0.849268796129039,65.7794688571545,0,1,-0.0619582961780062,NA,3,1 -"600",-0.953610095681196,50.7816503612272,0,0,-0.287800864267034,NA,0,1 -"601",0.313478715882576,57.5067329483484,0,0,-0.825144206271538,NA,2,0 -"602",0.612473307433314,47.7810135362433,0,0,0.347168217045257,NA,0,1 -"603",-1.69108322753688,59.2746035395449,1,0,-0.920092945237873,NA,4,0 -"604",0.784698627442325,73.6490124503674,0,1,-0.287336465542101,NA,4,0 -"605",0.0119251352600229,55.4246898015577,1,1,-0.551130291554083,NA,1,0 -"606",-0.181255287869961,60.6520021936009,1,0,0.848645639995586,NA,3,1 -"607",1.10695440248067,67.839803359237,1,0,1.30351300254165,NA,2,0 -"608",1.47939487098826,44.122919268352,1,0,-0.528633316317668,NA,3,0 -"609",-1.14739944099984,46.3063568676183,0,0,0.762950659152549,NA,0,1 -"610",1.011654376514,50.7248019737764,0,0,1.79031043197109,NA,5,1 -"611",-0.632069194974882,63.0084155983022,1,1,0.515653432567394,NA,0,0 -"612",0.13236131951909,60.8046894610394,0,0,0.50995632659946,NA,2,1 -"613",0.480926064725324,72.4674683051722,1,0,0.877534755180437,NA,2,1 -"614",0.133177627584607,47.6555089979498,1,0,-0.305159380450246,NA,4,1 -"615",-1.46286446232819,55.6093070567735,0,1,-0.998621027342654,NA,2,0 -"616",-0.506401780474741,38.984060469444,0,0,0.779717443181239,NA,1,1 -"617",1.53966323733755,76.1875617137326,0,1,-0.202455829498655,NA,4,1 -"618",0.172793618387291,70.8432849074413,0,1,0.0576926549857799,NA,3,1 -"619",-0.835823924079678,53.5919909308699,0,0,-0.784453934886022,NA,1,0 -"620",-0.655928703879608,57.4718017374294,1,0,-2.30477854744121,NA,0,0 -"621",1.05756608653532,58.6517072677753,0,0,0.705845185436105,NA,1,0 -"622",0.346083489865367,77.9535599171383,1,1,0.285270288864788,NA,1,0 -"623",-0.897186541993694,60.3839671736902,1,0,-0.185219169053537,NA,5,1 -"624",0.00626521125563042,64.1299828889006,1,1,-1.53285677380564,NA,1,0 -"625",0.59667050907573,66.9957212296699,0,1,0.0330032767412004,NA,3,1 -"626",1.72255180794401,66.8979094457615,1,1,-0.689865477443077,NA,4,0 -"627",-0.881749483409345,43.2894053470812,1,0,-0.660371007482841,NA,3,1 -"628",-0.379945955166868,67.4004830752263,0,0,0.88886063882028,NA,1,0 -"629",0.996110015873762,50.7587510088624,1,1,0.878608946597332,NA,1,1 -"630",-0.459120892943841,59.6831142668903,1,1,0.366984772412494,NA,2,1 -"631",1.46026574621257,40.7220701847977,1,0,-0.305016749827467,NA,5,0 -"632",1.04324617695035,62.1430302967833,0,1,0.0843869852438986,NA,2,0 -"633",-1.65904097034683,49.9496113449304,1,1,-0.317787011202578,NA,4,1 -"634",0.423957776227396,38.2104650771021,1,1,-0.882748434256429,NA,3,1 -"635",-1.41916292618524,43.2745086944509,1,1,-0.420131846914995,NA,5,0 -"636",-1.46469245667068,45.7927786444898,1,0,0.211772323155758,NA,2,0 -"637",0.0704209721478317,65.5053163877836,1,1,0.178742568384748,NA,4,0 -"638",0.0566982219802716,63.1043317691679,1,0,0.159911059881684,NA,1,1 -"639",-0.0327081772655874,66.8507391296196,0,1,2.34022278036937,NA,2,1 -"640",1.26428122406637,54.6808950918317,0,1,-2.08364389013229,NA,2,1 -"641",0.294987885730239,61.8178545986328,0,1,-1.65874663082723,NA,1,0 -"642",-0.673086820504489,64.1159617779636,0,0,0.586514202862168,NA,5,0 -"643",0.119588291490956,59.1494095104168,1,1,-0.00838215866917112,NA,1,0 -"644",-0.514396508991294,48.9329379646391,1,1,0.263408012095898,NA,5,1 -"645",-0.0681115738665369,71.2913359573999,1,1,-0.861975614928186,NA,3,0 -"646",-1.14801191183674,76.6801562437414,0,0,-1.82646160006687,NA,4,1 -"647",-0.115244480129169,80.1910292593947,0,1,0.157684692593684,NA,1,0 -"648",0.102936391849009,68.9254803557245,0,0,0.457828090493501,NA,3,1 -"649",-0.701743487103139,58.1279531679218,0,1,-1.81147770998711,NA,3,1 -"650",0.041230624270294,58.723025843893,0,1,1.16992415129467,NA,3,0 -"651",-0.648768350598479,56.9466986034193,0,0,1.36303655425287,NA,3,1 -"652",-0.644007859466443,57.3713316214369,1,0,-0.248942773977312,NA,2,0 -"653",-0.204438825066372,59.4853135557367,1,1,1.09936446212006,NA,3,1 -"654",0.490840917661593,40.2733664026596,0,0,-0.107551492563844,NA,5,0 -"655",-0.20533698727053,60.1122825038515,0,0,0.0894339240327522,NA,2,1 -"656",0.190691323925256,62.3534485729383,1,0,0.606327319614907,NA,3,1 -"657",0.970024192661124,50.4963014647889,1,1,-0.0804754643912515,NA,3,0 -"658",-0.285261077847069,42.4285202480467,0,0,0.810892128943754,NA,1,1 -"659",-0.103619268541766,56.4003934908483,0,0,-0.520685306740175,NA,1,0 -"660",0.511254541203706,28.7841044157547,0,1,-1.311726827066,NA,1,0 -"661",-1.25816245001572,76.7254711224307,1,0,0.13926896706412,NA,1,0 -"662",0.543825968956194,66.9054981644897,1,0,2.05028132773367,NA,1,0 -"663",-0.534963586582989,50.2332967992888,0,0,0.408030144810329,NA,2,0 -"664",-1.13804602678007,60.1513417560489,0,1,0.205003350637751,NA,4,1 -"665",-1.76800377087565,58.3454582474957,1,0,-2.25286871899928,NA,1,1 -"666",-0.714978202867742,43.3978663147313,1,1,-0.565403743167805,NA,3,0 -"667",-1.17020226435962,51.215656197345,0,0,0.950213555862081,NA,3,0 -"668",-1.17844814841946,69.6581583860238,0,0,-0.372804808189972,NA,2,1 -"669",1.62729133795333,48.44178693266,1,0,-0.1492126428995,NA,5,1 -"670",-1.21214666562853,52.486042883844,0,1,-0.0922176935601109,NA,4,0 -"671",0.997509917441285,52.5491564414526,1,0,-0.574496620028689,NA,2,0 -"672",0.545245641420171,72.2705248547405,1,0,-1.96122664049141,NA,3,1 -"673",-0.88478911517512,58.0914200617858,0,0,0.971325375264889,NA,3,0 -"674",-0.611230697881371,65.5431074597468,0,1,-0.713199115251178,NA,4,1 -"675",-0.410199512833357,67.7143312710373,1,1,-1.51317945394904,NA,1,0 -"676",0.190409369836815,56.1993320020924,0,1,-0.687123937818725,NA,0,0 -"677",-0.348324928959338,48.7144861164402,1,0,0.0118395408472662,NA,4,0 -"678",0.826847530284988,61.7010177703998,0,0,-0.116434978511697,NA,3,1 -"679",-0.292686902302723,56.4145766794893,1,0,-0.0422275015339314,NA,2,0 -"680",-2.16534620996324,74.429384906866,0,0,0.762295840447976,NA,2,0 -"681",-0.0962167762033804,54.8005313675673,0,0,-0.726095094064359,NA,0,1 -"682",-1.85195684624858,56.8935790560068,0,1,-0.0677972470128565,NA,2,1 -"683",0.937811135232089,66.8273746460155,0,0,-1.26107920253219,NA,3,0 -"684",-0.747686481439497,53.3642157963456,0,1,0.206892040110787,NA,1,0 -"685",-1.11524143570017,59.8204945136309,1,0,1.36441064967075,NA,4,1 -"686",-0.591506415298824,64.9659101653161,0,1,1.14095787081644,NA,3,1 -"687",0.585230683909193,55.957252994943,0,0,0.878779615734706,NA,3,0 -"688",-1.91855819930986,47.8835118978589,0,0,0.809802914368629,NA,2,0 -"689",0.253816791504093,63.1274371673616,1,1,0.902337900109669,NA,2,1 -"690",-0.972241766246169,57.2013813581033,1,0,1.38438037496282,NA,2,0 -"691",-0.00851344803801686,39.8014486545528,1,1,-1.14947882024253,NA,4,0 -"692",0.679041911007479,62.6957886317779,1,0,3.04137005820899,NA,3,1 -"693",0.569631366692422,61.3179925086918,1,0,0.308372526803296,NA,2,0 -"694",1.5029616611284,43.2122872961847,1,0,2.63980862992207,NA,4,0 -"695",0.129085401165474,54.7412049669595,1,1,2.15875203633165,NA,4,1 -"696",-0.701182951676618,58.2487537655487,1,1,1.0255478204364,NA,0,1 -"697",2.15370721597723,54.4336923660839,0,0,-0.712636079025502,NA,1,1 -"698",-0.30324613826175,65.5184840910853,1,1,0.547552248004117,NA,2,1 -"699",0.807579776908013,44.2394848671974,0,1,-0.0912041102216604,NA,4,0 -"700",0.930991466126415,50.4570722760887,1,0,-0.0847202595553852,NA,1,1 -"701",0.454902098272933,55.5013657540147,0,1,2.828918800837,NA,3,0 -"702",1.4254206900636,74.4390639515157,0,1,0.386740110051894,NA,1,1 -"703",-0.20899117251275,58.4474591118618,0,1,0.035752971150918,NA,4,1 -"704",-1.23140277909061,49.782810920101,0,1,0.0953234242182932,NA,5,1 -"705",-0.360603577153347,75.3846974718311,1,1,-1.87627347653956,NA,2,1 -"706",-0.547213299484056,57.6882402587354,1,0,-0.836680050194782,NA,4,1 -"707",0.327996007514467,65.5192386215171,1,0,-1.59098150147176,NA,3,1 -"708",-0.933299832093657,71.2179403686998,0,0,0.679609200101788,NA,4,0 -"709",0.680810824800176,64.0954539235249,1,1,0.0394033025755432,NA,4,0 -"710",-1.16388240018636,73.0026152604088,1,0,-1.13110578221052,NA,2,0 -"711",1.75544950988335,52.0907872846478,1,0,1.54246283173142,NA,3,1 -"712",1.44963736425491,75.9861945374855,1,1,0.979382929015041,NA,4,1 -"713",-0.448794941211084,41.7709972649556,0,1,0.479663328297167,NA,0,0 -"714",0.140067288930464,50.2463688602315,1,0,0.640033119859951,NA,2,0 -"715",-0.373650328177582,54.1353934114822,0,1,-1.1331543057143,NA,3,1 -"716",-1.20643411723808,62.3310955180241,1,0,-0.370884779043532,NA,1,1 -"717",-2.66134597494101,55.2655137517466,1,0,0.446182869021833,NA,1,1 -"718",-0.0250944932655905,65.8311098068857,0,1,-1.09814310255957,NA,1,1 -"719",-0.0343732572490637,64.2002944729188,1,0,2.71111481120823,NA,0,0 -"720",1.64235903904485,66.7886090857055,0,0,1.32545606767588,NA,2,1 -"721",-1.99546214753883,74.8246102516947,0,0,1.12590205791424,NA,4,0 -"722",-0.662170758602409,45.7471596223815,0,0,0.0540639496507907,NA,3,0 -"723",0.880037992310033,69.6997498065008,1,0,-1.41726775744781,NA,5,1 -"724",0.342387420001643,57.3453423213813,0,0,1.1157841655732,NA,1,1 -"725",0.121753855523786,51.9689444931483,1,1,1.42687828227381,NA,3,0 -"726",-0.620761034511813,42.2296589808404,0,0,0.317797102510567,NA,1,1 -"727",0.200945267574023,51.5096310837433,0,1,-0.496273411043631,NA,4,0 -"728",-0.235905348023341,57.0052178085398,1,1,0.599696904650334,NA,2,0 -"729",0.113661114970227,40.5975552081015,1,0,0.473425513276744,NA,2,1 -"730",-0.577929195287818,56.3996721236615,1,0,-0.789327412942633,NA,4,0 -"731",1.18856488583833,65.1010618716982,0,0,-0.609905289258947,NA,2,1 -"732",1.05993627812147,47.0202958147036,1,0,0.219664136878533,NA,5,0 -"733",1.33675821086468,59.9515259765206,1,0,0.992447441204347,NA,4,1 -"734",-0.367169171130217,46.5514776511103,0,1,2.26042217472389,NA,3,0 -"735",-0.304336704778395,59.4831581476884,1,0,-1.11485663619506,NA,2,0 -"736",-0.350279624235713,68.9484979335908,1,1,-0.572319724030329,NA,5,1 -"737",1.37414476272803,38.2011772720581,1,0,1.35270086680642,NA,4,0 -"738",-0.299720491875069,83.0641643226721,1,1,0.107868729562373,NA,2,0 -"739",0.0976351442252321,48.3224964444907,1,0,0.633432253650029,NA,1,0 -"740",0.381188028723722,69.4042971867315,0,1,-0.879849328007825,NA,4,1 -"741",0.678212556818783,80.7910159606188,0,0,-0.332886377356052,NA,3,1 -"742",-0.26767327223427,50.5617597672666,0,0,0.324992574993461,NA,3,0 -"743",2.34596566433491,53.2454585487364,1,0,-0.938415415864997,NA,5,1 -"744",-0.927494655524829,70.2165032799741,0,1,1.21315153594942,NA,5,0 -"745",-1.42655717369774,51.8249656933302,0,1,-1.26881966939225,NA,4,0 -"746",0.0423026643898127,50.4287926598901,1,1,0.840155824165012,NA,4,1 -"747",1.76744347000481,75.811192798267,0,0,0.165133587479138,NA,5,1 -"748",0.0527503271453671,77.6644280148737,1,1,-1.3149626812135,NA,2,0 -"749",0.109726039969968,57.7296484111446,1,0,1.70603877388749,NA,5,1 -"750",0.863202615558153,82.8111019497527,0,1,0.177503683488102,NA,2,0 -"751",-0.0989732047696172,79.0374721659382,0,0,-0.393377531770713,NA,4,0 -"752",0.103696546086221,59.3137846349919,1,1,0.578306036031158,NA,5,0 -"753",1.20848487568464,51.8614086073333,1,1,-0.191761909283258,NA,1,0 -"754",0.0937460210527718,53.2674468560267,0,0,-0.282224274118901,NA,3,0 -"755",-0.752196518946083,74.02030463378,0,0,-0.683561574477846,NA,3,1 -"756",0.394924219872661,65.4001660829658,0,0,0.580194679530021,NA,1,1 -"757",-1.09152043398742,66.4227416879848,0,0,-1.06993557582488,NA,3,1 -"758",-1.45829242499569,59.8728023501304,1,1,0.147119675762222,NA,3,1 -"759",-0.122712284955754,61.8287955322963,1,1,-0.17676429931807,NA,1,1 -"760",-1.1009291916684,55.1148541976927,0,0,-0.341774689629685,NA,3,1 -"761",0.580923071839757,71.3523673607646,0,0,0.677529248771208,NA,0,0 -"762",-0.146947489359788,56.7394897775078,1,1,0.42083523424221,NA,3,0 -"763",-0.766334615072354,79.9186253382779,1,1,-0.251818711919797,NA,0,0 -"764",1.62289757854462,65.8677289435209,1,1,0.525188913077835,NA,3,1 -"765",-0.109940544035991,69.8833322371712,1,0,-1.65274959549991,NA,2,1 -"766",1.42170485426206,74.7562393528371,0,0,-0.962670395989128,NA,1,1 -"767",-0.113421048996398,59.5304662014389,1,1,0.0850246742327054,NA,4,1 -"768",-0.329280941581153,42.3246890079977,0,1,1.51904565227881,NA,1,0 -"769",0.373460983597143,53.903660282301,0,0,-1.16116797368342,NA,0,1 -"770",1.02988813093981,65.9325691225055,0,1,-1.60993793483563,NA,3,1 -"771",2.70577504719934,72.0338721999315,0,1,0.0862946021244864,NA,4,0 -"772",-1.03494587124924,62.0825682693907,0,1,0.0107823868250337,NA,4,1 -"773",-0.183429858640727,68.3913935853181,0,0,-1.41869621389183,NA,0,0 -"774",1.07777130745133,69.4304143399232,0,1,1.48418828060195e-05,NA,3,0 -"775",0.341620225298793,70.280931190928,0,1,-0.279330391091714,NA,2,0 -"776",-0.187343624118674,67.4756384421526,1,0,0.173484296127729,NA,3,0 -"777",-1.30285910852947,58.0857293471377,1,1,-1.0706374396176,NA,4,0 -"778",-0.278658261975666,70.2469016466156,0,1,-0.583210546544618,NA,2,1 -"779",-0.184753908458266,54.9577802523955,0,0,-0.678069216586378,NA,4,1 -"780",-0.0817774971735639,63.4854885945376,0,0,1.14598833725543,NA,5,0 -"781",1.61224179372707,54.9062612481006,1,0,-0.51436710134668,NA,3,1 -"782",-0.461939839504668,64.7782445730895,0,1,-0.370051597234955,NA,4,1 -"783",-1.57605030081864,65.0098539560201,1,1,-1.83842242580121,NA,2,0 -"784",0.496433073531127,58.944194827226,1,1,0.0434175847865763,NA,0,0 -"785",-0.111953783818969,60.2730873890106,0,1,-0.623635815887942,NA,3,0 -"786",-0.204346400211973,64.3371580457113,0,1,0.883466273862089,NA,3,0 -"787",0.635568162856401,48.6893551555668,1,0,0.994831062739734,NA,2,1 -"788",-2.90667395407155,70.2068690052451,1,1,-0.381437329858756,NA,2,1 -"789",-0.553658252760955,62.6927428999677,0,1,-1.49760050303526,NA,2,1 -"790",-0.153451828250923,65.485486413912,1,0,-0.354756523414952,NA,3,0 -"791",0.60529051576044,58.3057883764132,0,1,-0.00319690430103726,NA,3,1 -"792",-0.261006596184361,41.2155975004951,1,0,-0.932423892622701,NA,4,1 -"793",-1.00513846241125,67.4830719512744,1,1,-0.669411139308381,NA,5,1 -"794",0.372566114312583,57.3989985999219,0,1,0.467315695223277,NA,0,0 -"795",0.0194461765201242,66.1826065906371,1,1,1.44357713657811,NA,4,0 -"796",0.110528815395276,80.4793696163662,1,0,-1.26792012422636,NA,0,1 -"797",1.15657043692358,48.4437516202214,1,0,-0.55280402285121,NA,4,0 -"798",-0.542220460211763,73.5294140674424,0,0,-2.25236504201458,NA,3,0 -"799",-1.24920128341908,56.3247801839008,1,1,0.789145208708141,NA,3,0 -"800",-1.28001259675319,59.4168953671932,0,0,0.630803743129537,NA,1,0 -"801",-1.02393278779038,54.6778190756221,0,1,0.915461403145339,0.915461403145339,3,0 -"802",-1.38765057998806,55.0149952264239,0,0,0.0217775991973722,0.0217775991973722,2,0 -"803",-0.0492211144061267,48.4986042242414,1,1,-0.538692377632498,-0.538692377632498,4,1 -"804",1.81096073048314,61.2624092272811,0,0,-2.03288160541304,-2.03288160541304,3,1 -"805",-0.0995100657082667,64.7286389917113,0,0,0.635409682093095,0.635409682093095,3,0 -"806",0.777237124297383,58.1347478479356,1,0,1.21794626140681,1.21794626140681,0,1 -"807",-1.10259632518001,43.8071355004172,0,1,0.63796618526594,0.63796618526594,2,0 -"808",-0.221644782701794,55.9724095378059,0,0,-0.634898013775829,-0.634898013775829,0,0 -"809",0.5660949625959,64.1281224180517,0,1,-0.763522359047601,-0.763522359047601,4,1 -"810",-0.354959345255063,52.7928059593948,0,1,-0.101990719944618,-0.101990719944618,5,1 -"811",0.785262601468044,54.0993476958089,1,1,-0.065395507533636,-0.065395507533636,4,0 -"812",0.694047378599404,48.3037338213833,0,1,0.421495213312655,0.421495213312655,0,1 -"813",-0.632729895584955,65.9145664607184,0,0,1.24369083450269,1.24369083450269,2,0 -"814",-0.771958641961308,53.3136105099826,1,0,1.11412556472886,1.11412556472886,2,0 -"815",2.3578173876011,91.6793764721645,1,1,0.352007640693812,0.352007640693812,2,1 -"816",-0.19254933635981,70.1303941735815,0,0,-1.09417696530104,-1.09417696530104,3,1 -"817",0.00661933421419712,58.3401268034862,1,0,-0.852122202550184,-0.852122202550184,1,0 -"818",-0.0545460246273611,60.0241647226761,1,0,-1.17664670710138,-1.17664670710138,1,1 -"819",-0.568206880540376,52.8269801682559,0,0,1.05151524836991,1.05151524836991,0,0 -"820",-0.926975943696645,62.8133802830802,1,1,-1.64170068936338,-1.64170068936338,2,1 -"821",-1.1357351813807,60.0728189711361,1,1,-0.999322055614457,-0.999322055614457,2,0 -"822",0.913962339054452,42.7029977359167,1,0,0.197181052483672,0.197181052483672,1,1 -"823",-0.986062833322873,41.565263835908,1,1,0.858572747161643,0.858572747161643,4,1 -"824",0.66748541348963,66.0120303141026,0,1,0.379268814639385,0.379268814639385,5,0 -"825",1.40659548970662,36.8206242918338,0,0,0.691272621242877,0.691272621242877,5,0 -"826",1.15777896011788,56.1968792628043,0,0,-0.332866679460152,-0.332866679460152,3,1 -"827",-0.138753197573533,76.5516490019156,1,1,-0.702466854631497,-0.702466854631497,4,1 -"828",-0.539321850960138,43.3714495098447,0,1,-1.71827188864524,-1.71827188864524,4,1 -"829",-1.07889349906152,62.9472966578159,1,0,-0.0533594167322972,-0.0533594167322972,4,0 -"830",-0.0185621399511091,75.6990143919341,0,1,1.33033001889087,1.33033001889087,5,0 -"831",-0.232615685678704,65.1194689676205,0,0,0.184433224194206,0.184433224194206,1,0 -"832",0.15629354251416,72.8590240106394,0,0,1.40437355070227,1.40437355070227,2,0 -"833",-0.604009754754033,50.3248524644029,1,0,0.439859671988793,0.439859671988793,0,0 -"834",0.629035120121665,66.3238935450673,0,1,0.215679969692286,0.215679969692286,5,0 -"835",0.714169570852022,61.8307866764632,0,1,-0.839563940423475,-0.839563940423475,2,1 -"836",1.07588610128876,73.5011611692373,1,0,0.173038200664969,0.173038200664969,2,1 -"837",2.24975551737932,84.8732767144425,1,0,-0.985303072019256,-0.985303072019256,3,1 -"838",0.197890598879292,61.1466169708147,1,0,1.10488004265206,1.10488004265206,2,0 -"839",-0.575546318478704,70.9574036400398,1,1,-1.00691086515951,-1.00691086515951,2,1 -"840",-0.0773208650045946,55.9729012880704,0,1,-0.605765264093382,-0.605765264093382,2,1 -"841",-1.62791988910102,56.4093028564553,1,1,1.23631924513615,1.23631924513615,4,0 -"842",0.129923153969037,54.4721177008355,1,1,-0.667938284296919,-0.667938284296919,2,1 -"843",-0.425673133260061,46.4532501972434,0,1,1.11904774776545,1.11904774776545,3,0 -"844",0.406011748521125,60.7333886206226,1,0,-0.172943387963443,-0.172943387963443,1,0 -"845",-0.268810185638072,59.6101147000787,1,0,-0.374972337078608,-0.374972337078608,1,0 -"846",-1.38098453629771,67.2186988398137,0,1,-0.362724158095648,-0.362724158095648,5,0 -"847",0.0365833213445593,57.8508104219382,0,1,0.879702337381518,0.879702337381518,5,1 -"848",-1.35324453117639,50.8441506622652,0,0,-1.50123695347916,-1.50123695347916,3,1 -"849",0.897006193942258,58.7156145919783,1,0,-1.49010928703793,-1.49010928703793,1,0 -"850",-1.82935971226393,78.4021048782139,1,1,0.527434864974974,0.527434864974974,4,1 -"851",-0.141876944171991,64.5053125599064,0,0,1.25170413109103,1.25170413109103,1,1 -"852",0.457981777842779,62.7479367766071,0,1,2.08247273578487,2.08247273578487,5,1 -"853",-1.51961751308999,70.5637641799707,0,1,0.825769355822583,0.825769355822583,3,1 -"854",1.39905531466027,77.9205427905425,1,1,0.831442747299394,0.831442747299394,2,0 -"855",-0.885339190903023,77.1081921609865,0,1,1.12265888209484,1.12265888209484,3,0 -"856",-0.506330291221321,55.6857106606254,0,0,-0.70525925520244,-0.70525925520244,2,0 -"857",0.163332105966331,59.2072313561489,0,1,-0.171084647027825,-0.171084647027825,0,1 -"858",-0.337143856258392,56.3677495549941,1,1,0.134300960325259,0.134300960325259,3,0 -"859",-1.03595153200983,57.990095269631,1,0,-0.0540683683497902,-0.0540683683497902,2,0 -"860",0.409337928216715,64.5128192983634,1,0,1.50517157529026,1.50517157529026,1,1 -"861",-0.524223458216886,55.2005801519779,0,1,1.22713757188919,1.22713757188919,0,1 -"862",0.506500373524185,48.344748198733,1,1,1.47929760840955,1.47929760840955,1,1 -"863",-0.504722862774708,57.6105602870942,0,1,1.20631205327699,1.20631205327699,3,1 -"864",-0.957250328953504,74.4983465190903,1,1,1.1083535507302,1.1083535507302,2,1 -"865",-0.0946343584532563,78.5287387225315,0,0,-1.49938280598482,-1.49938280598482,2,0 -"866",0.649439317610584,55.9676849553722,1,0,0.60989975618154,0.60989975618154,5,0 -"867",0.242321799327066,68.2696831595617,1,1,-1.82115152407692,-1.82115152407692,1,0 -"868",-0.0589340902617142,58.7086509758736,0,0,0.077820357983384,0.077820357983384,1,1 -"869",-1.94779407157618,58.7847311320163,0,0,0.478241320191152,0.478241320191152,1,0 -"870",1.4514436318915,53.2572671519954,1,1,-0.740192918234158,-0.740192918234158,3,0 -"871",-0.0399396776377224,44.2156620120559,1,0,1.44447797881903,1.44447797881903,2,1 -"872",-0.863997691069911,78.0207639644711,0,1,0.323853778514233,0.323853778514233,1,1 -"873",-0.201690239712518,68.0111681386451,0,0,-0.356084214835065,-0.356084214835065,1,0 -"874",0.261255562727432,42.0773527249183,1,0,0.552137107246651,0.552137107246651,3,1 -"875",-1.51835138330586,56.5258127372455,1,1,0.449175119547768,0.449175119547768,5,0 -"876",0.717308632275803,75.5431592932965,0,0,1.23074133402351,1.23074133402351,1,0 -"877",0.246832728096417,61.064927931859,1,0,0.965154616132545,0.965154616132545,4,0 -"878",1.20276704630509,61.9250892067048,0,0,1.56168460211253,1.56168460211253,0,1 -"879",0.986039270447351,68.4922796714094,1,0,-0.316111942044904,-0.316111942044904,5,1 -"880",-0.537940599062239,69.9051461175328,1,0,-0.917320660872748,-0.917320660872748,4,0 -"881",-0.490282741813956,66.275822116448,1,0,1.55722150429562,1.55722150429562,3,0 -"882",-0.995516172562447,64.1521548555604,0,0,-0.399468222498247,-0.399468222498247,2,0 -"883",-0.361702928254615,74.7493906483472,0,1,0.442155462945799,0.442155462945799,4,1 -"884",0.944354811296923,77.0947207391152,0,0,0.781209235101923,0.781209235101923,1,1 -"885",1.79220701161545,62.2835960194932,0,1,-1.47966001438128,-1.47966001438128,5,0 -"886",0.928409395864299,57.9188311793936,0,1,0.514096507221489,0.514096507221489,3,0 -"887",-1.82993766963068,60.1214575527235,1,0,-0.824996216671844,-0.824996216671844,0,1 -"888",0.821878106163116,63.4972266325496,0,0,-0.865937769635399,-0.865937769635399,4,1 -"889",1.0362687620034,53.2235535759013,0,1,2.58007784938081,2.58007784938081,0,0 -"890",0.617420144129801,66.4218267919739,1,0,-1.60328067887141,-1.60328067887141,4,1 -"891",-0.854930363118786,47.1056196473064,1,0,-0.932155695231725,-0.932155695231725,1,1 -"892",1.06451051848471,82.2606725497448,0,1,-1.63416463348034,-1.63416463348034,5,0 -"893",0.729481458103667,35.1658064102412,0,0,1.14526199164209,1.14526199164209,4,1 -"894",0.66265658435809,61.7283427870576,1,0,-0.873270479383926,-0.873270479383926,3,1 -"895",-0.381407202704243,59.4109585234552,0,0,-0.157622793650065,-0.157622793650065,0,1 -"896",0.845891777793581,65.4745229730498,0,0,0.511908207706561,0.511908207706561,2,0 -"897",0.111591884445813,55.8952639789569,0,0,1.18477431013477,1.18477431013477,5,1 -"898",-0.0677702616560359,84.024051603136,1,0,1.66262494013238,1.66262494013238,1,1 -"899",1.48709809651828,58.5577906055765,0,1,-1.44003666766935,-1.44003666766935,5,1 -"900",0.75722337664623,60.2339644283464,0,1,0.0718381042586017,0.0718381042586017,3,0 -"901",-2.87404249339871,71.2412194136023,1,0,1.60026127461792,1.60026127461792,5,1 -"902",-0.426542449077482,66.9777449047275,0,1,1.09024282511803,1.09024282511803,4,1 -"903",-1.48740629563197,61.8090747869435,1,1,0.529549933033616,0.529549933033616,0,0 -"904",0.588791645710936,58.3039716959421,1,0,0.439869052170178,0.439869052170178,2,0 -"905",-0.44259951077578,57.6547195264732,1,1,2.19439783890363,2.19439783890363,1,1 -"906",0.901304802320198,46.4506930716295,0,0,1.21259811758189,1.21259811758189,2,0 -"907",0.467828340367688,66.2995607852048,0,0,-0.583330212349268,-0.583330212349268,4,0 -"908",0.0220403186670302,64.4727015251654,0,0,-0.122696798736509,-0.122696798736509,3,0 -"909",-0.0851078317956122,49.5595814154325,1,1,-1.11572822879677,-1.11572822879677,2,0 -"910",-0.361267142653458,58.1271442537623,1,1,-0.245228769113442,-0.245228769113442,1,0 -"911",0.199506501537268,72.8729087029212,0,0,-0.33479722149108,-0.33479722149108,4,0 -"912",-1.1082542547003,72.7409281326188,0,1,-1.28266300567136,-1.28266300567136,4,0 -"913",0.672794070486354,47.7883864180764,1,1,0.796043792076802,0.796043792076802,1,1 -"914",-0.39411363337364,68.0202190654962,0,1,0.823456399610159,0.823456399610159,4,1 -"915",-0.743967585584992,61.2683070070261,0,1,1.78880270166576,1.78880270166576,3,0 -"916",-0.32611347678341,63.8851485718354,0,1,0.975574368447363,0.975574368447363,0,0 -"917",1.87466882681447,51.2780382678861,1,0,0.412025202686387,0.412025202686387,3,0 -"918",-1.75799914534674,57.9291300999102,1,1,1.97493733334156,1.97493733334156,4,0 -"919",-0.348860909964869,39.4158731102794,0,1,-0.951142928618088,-0.951142928618088,3,0 -"920",0.106589229118385,52.608766828855,1,0,0.567952353137801,0.567952353137801,2,0 -"921",-0.537658781205147,56.9337787562458,0,0,-0.441991599466074,-0.441991599466074,4,0 -"922",-0.411269131911835,73.0341763353321,1,0,-1.76203008968521,-1.76203008968521,1,1 -"923",0.382348337976121,49.4134014494489,1,1,-1.7618261210662,-1.7618261210662,1,1 -"924",0.222514088542642,63.0172018583522,0,0,-0.199306385745731,-0.199306385745731,1,1 -"925",-0.673754508319701,60.8997318683277,1,1,0.474692677271762,0.474692677271762,3,1 -"926",0.567432913870525,46.3660774942888,1,0,0.359995385588833,0.359995385588833,1,1 -"927",0.17948365476,62.825238713036,1,1,-0.132368527220886,-0.132368527220886,1,0 -"928",0.686341749488027,75.8069945043382,1,1,0.12107844105611,0.12107844105611,2,0 -"929",-1.32726366148341,69.4004024632186,1,1,-0.0381938134769495,-0.0381938134769495,2,1 -"930",-0.767916438222332,71.3143023201082,0,0,-0.872905972754614,-0.872905972754614,0,0 -"931",-0.192702508672789,57.8630409930615,0,1,-0.797107844441803,-0.797107844441803,0,0 -"932",0.95905119808221,69.5783890475,0,1,0.323471025211062,0.323471025211062,2,1 -"933",-0.529031361979442,36.2919037046433,0,1,1.65605831529332,1.65605831529332,1,1 -"934",-1.03288699095086,47.4674139064516,0,0,0.0637501511077446,0.0637501511077446,2,1 -"935",0.771745490447405,58.2515204836704,1,0,2.25298948645765,2.25298948645765,3,1 -"936",0.465356664198533,61.0851159290109,0,1,-0.9438673491284,-0.9438673491284,0,0 -"937",-1.22567106691397,65.1692045431265,1,0,0.343178024320818,0.343178024320818,4,1 -"938",2.04667724022033,67.2163438594481,0,0,0.18411953937326,0.18411953937326,2,1 -"939",1.49867444889729,55.400561672237,1,1,-0.239758700034669,-0.239758700034669,2,1 -"940",-1.38894908467684,55.1838845348883,1,0,0.851805632237612,0.851805632237612,0,1 -"941",2.15337646372281,56.5072467774652,0,0,-1.26896125131166,-1.26896125131166,3,0 -"942",-1.72911674054936,54.3823402815629,1,1,-0.148853941632772,-0.148853941632772,4,1 -"943",0.102042485094495,58.7529730160193,1,0,0.023438855961796,0.023438855961796,4,0 -"944",-1.14127912749709,68.7755232786064,0,0,0.860632635277945,0.860632635277945,3,0 -"945",-0.377009088203781,64.6334691360602,1,1,-0.799273980270713,-0.799273980270713,0,1 -"946",-0.348782098762047,55.6063466367055,0,0,-2.83653272169744,-2.83653272169744,5,0 -"947",0.322885081228563,53.075827075032,1,0,1.0722458639509,1.0722458639509,2,0 -"948",-1.12433961794016,58.9554720640944,0,0,0.974138633268488,0.974138633268488,4,1 -"949",-2.75704972232067,57.9200008743287,1,1,1.05385557723761,1.05385557723761,2,1 -"950",-0.0958110991197285,72.1930841222822,1,1,-0.209404913847133,-0.209404913847133,1,1 -"951",-0.746699804956649,37.2956430098056,1,1,-2.53391876785558,-2.53391876785558,4,0 -"952",0.210781418005385,50.2654733095165,0,0,0.324381377840888,0.324381377840888,3,1 -"953",-0.848509808508819,66.2119025904558,0,1,1.52923093464334,1.52923093464334,2,1 -"954",0.792264765704344,66.6256167229789,1,0,-0.49502553937255,-0.49502553937255,3,1 -"955",0.0291874325756546,53.4900975127976,0,1,-0.74992055147891,-0.74992055147891,1,1 -"956",0.603388012483792,57.9126293144572,1,1,-0.0381006103542124,-0.0381006103542124,5,1 -"957",-0.994940104813448,77.9691164714174,0,1,0.153699179840078,0.153699179840078,0,0 -"958",-0.211332458143761,49.0122158510559,0,0,-0.279685204486181,-0.279685204486181,1,1 -"959",0.344021087496185,77.7774209175325,0,1,0.738721035004576,0.738721035004576,1,1 -"960",-0.0893785289835366,63.1694440814458,1,0,1.24054859264869,1.24054859264869,3,0 -"961",1.6540374665165,70.4957642163157,1,1,-0.779622052059768,-0.779622052059768,4,1 -"962",-1.96290871320897,54.7371921499326,1,0,0.265049589448697,0.265049589448697,4,0 -"963",0.756125511499489,48.797874636976,1,0,0.0591445263956009,0.0591445263956009,5,1 -"964",0.0144186579430547,63.9825949406673,1,1,-1.13984338266092,-1.13984338266092,3,1 -"965",0.945713804060792,64.2130680300369,0,1,1.40524178999936,1.40524178999936,5,0 -"966",-0.098808492386284,55.7484635094942,0,1,0.411013833171653,0.411013833171653,4,1 -"967",-2.73975424338714,51.230089940189,1,0,1.13299332536587,1.13299332536587,1,1 -"968",0.420305849423153,72.9999660430953,1,1,-0.543361565575453,-0.543361565575453,4,0 -"969",-0.218379262260984,46.63213243505,1,1,-0.212711776294238,-0.212711776294238,3,1 -"970",-0.989355329598254,74.2924242973265,1,1,-0.238925993307699,-0.238925993307699,2,0 -"971",-0.656468896184539,54.0530687991355,0,0,0.588847108405563,0.588847108405563,4,1 -"972",0.055663548201307,41.1731354217979,0,0,0.238272526252067,0.238272526252067,1,1 -"973",2.45206344477497,81.80311231724,1,0,1.36858573287737,1.36858573287737,2,0 -"974",0.18817357373771,63.7877727864363,1,1,-0.32345239834268,-0.32345239834268,4,1 -"975",-0.979696368377701,49.3068717129501,0,1,-0.205802607928562,-0.205802607928562,1,1 -"976",1.90361046750456,56.2485501837805,0,1,-0.790108072379854,-0.790108072379854,3,1 -"977",-0.438519782195613,52.0520228153762,1,1,-0.300860202232585,-0.300860202232585,2,0 -"978",-0.829726095585463,63.1512286871127,1,0,-0.398807652887792,-0.398807652887792,3,0 -"979",-1.247231318762,61.3393938568366,0,0,1.8097043061288,1.8097043061288,1,0 -"980",-0.673809163087791,47.5944842442115,1,0,-1.11568094665946,-1.11568094665946,4,0 -"981",1.93516726396696,62.56480536921,1,0,1.03671143043759,1.03671143043759,0,1 -"982",0.906561434864075,53.2592979325425,0,0,1.17357458312996,1.17357458312996,3,1 -"983",0.736526771427079,66.3322735378512,1,0,-0.867450686176965,-0.867450686176965,1,1 -"984",0.146625304045126,66.0236487324394,1,0,-0.736363718750934,-0.736363718750934,3,1 -"985",0.283120796739,59.6596631347373,1,0,-0.368193908693371,-0.368193908693371,5,0 -"986",0.589276720770128,52.1029363945098,1,1,0.628323960404366,0.628323960404366,2,1 -"987",0.215849476577042,55.2956444578579,0,1,-0.417201443455236,-0.417201443455236,4,1 -"988",0.155053234874317,49.871818623723,1,0,-1.0960078796469,-1.0960078796469,1,1 -"989",0.953826636303852,68.3355327007342,0,0,1.1488297620837,1.1488297620837,3,0 -"990",-0.165852702479289,73.5402356223174,0,1,0.110196214486804,0.110196214486804,0,0 -"991",0.25500374557096,60.5007271198645,0,0,-1.17523493852895,-1.17523493852895,2,1 -"992",-0.897472359354659,63.3688075564445,0,1,0.838088347089023,0.838088347089023,2,1 -"993",1.58397010470279,57.9100112668061,1,0,-0.171711241613191,-0.171711241613191,2,0 -"994",1.39275862816597,62.1845998789684,1,1,0.145241397802456,0.145241397802456,2,0 -"995",-1.81225823172892,78.7347520792372,1,1,-0.480092901716805,-0.480092901716805,2,0 -"996",-0.216319678589458,62.617218049717,0,0,0.870525638997579,0.870525638997579,5,0 -"997",0.549856056710878,73.3469451935642,1,0,-0.730408660241045,-0.730408660241045,3,1 -"998",0.482736237517578,62.477466461451,0,0,-0.209510059658523,-0.209510059658523,3,0 -"999",0.760848728950074,46.2421513881639,1,1,1.06559159302466,1.06559159302466,5,0 -"1000",-0.445658016934015,48.0946021824269,1,1,1.47733324836137,1.47733324836137,3,0 diff --git a/variable-info/update-outcome-info/toTabs.sh b/variable-info/update-outcome-info/toTabs.sh old mode 100644 new mode 100755 From d4a99d3e5a36d5fc76cd3302a25f1a167f0e0fa7 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 10:29:50 -0400 Subject: [PATCH 02/27] Code refactor: make an R package first --- .Rhistory | 2 + .gitignore | 1 + PHESANT/.Rbuildignore | 2 + PHESANT/.Rhistory | 0 PHESANT/DESCRIPTION | 9 ++ PHESANT/NAMESPACE | 1 + PHESANT.Rproj => PHESANT/PHESANT.Rproj | 4 + PHESANT/R/PHESANT-internal.R | 127 ++++++++++++++++++ {WAS => PHESANT/R}/addToCounts.r | 0 {WAS => PHESANT/R}/binaryLogisticRegression.r | 0 {WAS => PHESANT/R}/equalSizedBins.r | 0 {WAS => PHESANT/R}/fixOddFieldsToCatMul.r | 0 {WAS => PHESANT/R}/getIsCatMultExposure.r | 0 {WAS => PHESANT/R}/getIsExposure.r | 0 .../R}/getNumValuesCatMultExposure.r | 0 {WAS => PHESANT/R}/incrementCounter.r | 0 {WAS => PHESANT/R}/initFunctions.r | 33 ----- {WAS => PHESANT/R}/loadConfounders.r | 0 {WAS => PHESANT/R}/loadData.r | 0 {WAS => PHESANT/R}/loadIndicatorFields.r | 0 {WAS => PHESANT/R}/loadPhenotypes.r | 0 {WAS => PHESANT/R}/loadTraitOfInterest.r | 0 {WAS => PHESANT/R}/makeTestDataFrame.r | 0 PHESANT/R/option_list.R | 20 +++ {WAS => PHESANT/R}/processArgs.r | 0 {WAS => PHESANT/R}/reassignValue.r | 0 {WAS => PHESANT/R}/replaceMissingCodes.r | 0 {WAS => PHESANT/R}/replaceNaN.r | 0 {WAS => PHESANT/R}/saveCounts.r | 0 {WAS => PHESANT/R}/storeNewVar.r | 0 {WAS => PHESANT/R}/testAssociations.r | 0 {WAS => PHESANT/R}/testCatMultiple.r | 0 {WAS => PHESANT/R}/testCatSingle.r | 0 {WAS => PHESANT/R}/testCategoricalOrdered.r | 0 {WAS => PHESANT/R}/testCategoricalUnordered.r | 0 {WAS => PHESANT/R}/testContinuous.r | 0 {WAS => PHESANT/R}/testInteger.r | 0 {WAS => PHESANT/R}/testNumExamples.r | 0 {WAS => PHESANT/R}/unittests/run-tests.sh | 0 .../R}/unittests/test_equalSizedBins.r | 0 .../R}/unittests/test_reassignValue.r | 0 .../R}/unittests/test_testCatMultiple.r | 0 .../R}/unittests/test_testCatSingle.r | 0 {WAS => PHESANT/R}/validatePhenotypeInput.r | 0 {WAS => PHESANT/R}/validateTraitInput.r | 0 PHESANT/man/PHESANT-package.Rd | 34 +++++ PHESANT/opt_only.RData | Bin 0 -> 330 bytes {WAS => PHESANT}/phenomeScan.r | 38 ++---- PHESANT/test.sh | 3 + opt.RData | Bin 0 -> 4811 bytes 50 files changed, 214 insertions(+), 60 deletions(-) create mode 100644 .Rhistory create mode 100644 PHESANT/.Rbuildignore create mode 100644 PHESANT/.Rhistory create mode 100644 PHESANT/DESCRIPTION create mode 100644 PHESANT/NAMESPACE rename PHESANT.Rproj => PHESANT/PHESANT.Rproj (67%) create mode 100644 PHESANT/R/PHESANT-internal.R rename {WAS => PHESANT/R}/addToCounts.r (100%) rename {WAS => PHESANT/R}/binaryLogisticRegression.r (100%) rename {WAS => PHESANT/R}/equalSizedBins.r (100%) rename {WAS => PHESANT/R}/fixOddFieldsToCatMul.r (100%) rename {WAS => PHESANT/R}/getIsCatMultExposure.r (100%) rename {WAS => PHESANT/R}/getIsExposure.r (100%) rename {WAS => PHESANT/R}/getNumValuesCatMultExposure.r (100%) rename {WAS => PHESANT/R}/incrementCounter.r (100%) rename {WAS => PHESANT/R}/initFunctions.r (76%) rename {WAS => PHESANT/R}/loadConfounders.r (100%) rename {WAS => PHESANT/R}/loadData.r (100%) rename {WAS => PHESANT/R}/loadIndicatorFields.r (100%) rename {WAS => PHESANT/R}/loadPhenotypes.r (100%) rename {WAS => PHESANT/R}/loadTraitOfInterest.r (100%) rename {WAS => PHESANT/R}/makeTestDataFrame.r (100%) create mode 100644 PHESANT/R/option_list.R rename {WAS => PHESANT/R}/processArgs.r (100%) rename {WAS => PHESANT/R}/reassignValue.r (100%) rename {WAS => PHESANT/R}/replaceMissingCodes.r (100%) rename {WAS => PHESANT/R}/replaceNaN.r (100%) rename {WAS => PHESANT/R}/saveCounts.r (100%) rename {WAS => PHESANT/R}/storeNewVar.r (100%) rename {WAS => PHESANT/R}/testAssociations.r (100%) rename {WAS => PHESANT/R}/testCatMultiple.r (100%) rename {WAS => PHESANT/R}/testCatSingle.r (100%) rename {WAS => PHESANT/R}/testCategoricalOrdered.r (100%) rename {WAS => PHESANT/R}/testCategoricalUnordered.r (100%) rename {WAS => PHESANT/R}/testContinuous.r (100%) rename {WAS => PHESANT/R}/testInteger.r (100%) rename {WAS => PHESANT/R}/testNumExamples.r (100%) rename {WAS => PHESANT/R}/unittests/run-tests.sh (100%) rename {WAS => PHESANT/R}/unittests/test_equalSizedBins.r (100%) rename {WAS => PHESANT/R}/unittests/test_reassignValue.r (100%) rename {WAS => PHESANT/R}/unittests/test_testCatMultiple.r (100%) rename {WAS => PHESANT/R}/unittests/test_testCatSingle.r (100%) rename {WAS => PHESANT/R}/validatePhenotypeInput.r (100%) rename {WAS => PHESANT/R}/validateTraitInput.r (100%) create mode 100644 PHESANT/man/PHESANT-package.Rd create mode 100644 PHESANT/opt_only.RData rename {WAS => PHESANT}/phenomeScan.r (62%) create mode 100755 PHESANT/test.sh create mode 100644 opt.RData diff --git a/.Rhistory b/.Rhistory new file mode 100644 index 0000000..591e1ef --- /dev/null +++ b/.Rhistory @@ -0,0 +1,2 @@ +package.skeleton(name = "PHESANT") +package.skeleton(name = "PHESANT") diff --git a/.gitignore b/.gitignore index 9bea433..85c0afc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .DS_Store +.Rproj.user diff --git a/PHESANT/.Rbuildignore b/PHESANT/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/PHESANT/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/PHESANT/.Rhistory b/PHESANT/.Rhistory new file mode 100644 index 0000000..e69de29 diff --git a/PHESANT/DESCRIPTION b/PHESANT/DESCRIPTION new file mode 100644 index 0000000..4769eb6 --- /dev/null +++ b/PHESANT/DESCRIPTION @@ -0,0 +1,9 @@ +Package: PHESANT +Type: Package +Title: What the package does (short line) +Version: 1.0 +Date: 2019-03-14 +Author: Who wrote it +Maintainer: Who to complain to +Description: More about what it does (maybe more than one line) +License: What license is it under? diff --git a/PHESANT/NAMESPACE b/PHESANT/NAMESPACE new file mode 100644 index 0000000..d75f824 --- /dev/null +++ b/PHESANT/NAMESPACE @@ -0,0 +1 @@ +exportPattern("^[[:alpha:]]+") diff --git a/PHESANT.Rproj b/PHESANT/PHESANT.Rproj similarity index 67% rename from PHESANT.Rproj rename to PHESANT/PHESANT.Rproj index 8e3c2eb..21a4da0 100644 --- a/PHESANT.Rproj +++ b/PHESANT/PHESANT.Rproj @@ -11,3 +11,7 @@ Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/PHESANT/R/PHESANT-internal.R b/PHESANT/R/PHESANT-internal.R new file mode 100644 index 0000000..3b1d509 --- /dev/null +++ b/PHESANT/R/PHESANT-internal.R @@ -0,0 +1,127 @@ +.Random.seed <- +c(403L, 10L, -2055377151L, -1261003152L, -653149682L, 1435497081L, +1855348235L, -576966854L, 341399100L, 778157439L, -1613771051L, +1531682540L, 412921922L, 1086953373L, 325854743L, -1838982274L, +-1413973704L, -1064806885L, 323434361L, 1468101064L, -205967882L, +-631271375L, -660989341L, -707934670L, 2008175492L, -1116261945L, +385783965L, -399175660L, -563583302L, 625440517L, 1609831647L, +1445418342L, -1909055248L, 1313776659L, 1749100977L, -1356585888L, +1455157758L, 1388664201L, 2074088123L, -1846870390L, -1243661972L, +343572207L, 1950927141L, 1541089244L, -359217742L, -1348217683L, +413594727L, 726502286L, 1026280552L, -622679957L, 1120416265L, +1076264088L, -2134790202L, 2005156641L, 902238131L, -1253660894L, +770078996L, 1084119447L, -1439872627L, 590254244L, -1844983158L, +-1204420363L, -1441798321L, 1447714902L, 1974291200L, -2095841725L, +-585513503L, -1731227312L, 937445294L, 934640985L, -1011479701L, +913148634L, 1909599132L, 1373632415L, 178208245L, -821178228L, +-2087284382L, -1997959875L, 1589640631L, 764332638L, 1962285976L, +261272315L, -1048315239L, -201784536L, -1601408554L, -2012176815L, +668146947L, -1514712942L, 30025252L, -873383705L, 764988733L, +-731944524L, 1454405146L, 904033637L, -33915905L, -515351930L, +2009025040L, 617304243L, 158108497L, -1127143680L, -258595362L, +-1664987607L, 1106011611L, 724134634L, -2001639284L, 262526927L, +499109765L, 1971379260L, 1522683154L, -712324595L, -9821881L, +-1493900498L, 744160904L, -1168410741L, 426661737L, 1753729336L, +-490466330L, 555131777L, 1754946323L, -813456190L, -416563340L, +1511400311L, 1291424877L, 2086259460L, 1989440042L, 735829909L, +-250657745L, -759055050L, -2138913440L, 601457571L, 2098922689L, +1814889648L, -126393394L, 2014160697L, 1337096011L, 2010817786L, +94510588L, -753603009L, -1139018987L, 612225068L, 1440518402L, +1251727837L, -859002665L, 1183175614L, -1121028616L, -388640293L, +850886713L, -2082842232L, -286673610L, -1376702607L, 289504675L, +1370276082L, 2064014404L, 1143236359L, 244238045L, -1750603820L, +2003793658L, 251030085L, -1643296097L, 813770790L, 1153809584L, +368946515L, -1173546895L, 2066941344L, 2125658430L, -1106106295L, +-1619944197L, 56489162L, -1429099988L, -702894545L, 2097114853L, +-1469324388L, -1305795086L, 1942503533L, -783513177L, 979614670L, +-464289496L, -2063539413L, -1102053047L, -571165096L, -1266678010L, +1462313697L, 866272883L, 863080162L, -2029530540L, -1653605545L, +-1657831603L, -1705729308L, -581874998L, -1662598347L, 317032847L, +-1302323562L, -710869440L, 2045748995L, 1157095201L, 1237002256L, +609251310L, 481136537L, -2050820053L, -149807846L, -1435701284L, +712831455L, 124114357L, 1680823116L, 408859042L, -1960140547L, +-134820361L, -212387042L, -1368816168L, 1844444987L, 1914468569L, +1771902056L, -1979266666L, -1699702511L, 427268803L, 1609642706L, +-2067868828L, 137279143L, -1601814531L, -127387148L, 59483866L, +1411659813L, 223425855L, -1177511226L, 1897576272L, 414222195L, +1107216785L, -426198080L, 904228254L, -675370654L, 1407568616L, +-1648012084L, -773699076L, 1481535986L, 1360236672L, 1007529460L, +-1342715256L, -1851472070L, -949733040L, 729675884L, -570554988L, +1833331794L, 2089554400L, -318619444L, 193276576L, 1429041538L, +-2126949144L, -306398484L, 1763143996L, -1277055886L, 1554374256L, +421975588L, 2032640056L, 211760410L, 1924362448L, -1268650692L, +-1928376428L, 1611904898L, -603361344L, -1687638468L, -1350393456L, +1079395362L, 248160520L, 1618027660L, -1988603108L, 665800850L, +1214202240L, -187286508L, -2088061144L, 545226298L, -1092967984L, +-1418993300L, 567848884L, 746534418L, -772762656L, 1200707020L, +1496597728L, 1776060450L, -568946904L, -433355252L, -639612100L, +368270322L, 706859760L, 1953071684L, -1698060712L, -1286869190L, +-834147344L, -415867204L, 1347345684L, -740441918L, -1348860832L, +284594556L, 1966673872L, -535664350L, 656744104L, 677991308L, +-464448324L, -598055118L, -1857457600L, 548719796L, -660904888L, +123257274L, 2067885584L, -2115319572L, 1984745812L, 381529874L, +-298453600L, 2077870796L, -1096423840L, -1618591806L, 1879972648L, +-1549703764L, 2022029628L, 206182258L, 1161427248L, 2028039972L, +1369963832L, 1577974490L, 1466738576L, 1352827644L, -1536618604L, +-1325287358L, 1242152192L, 574403644L, -1861933744L, 1895827618L, +1347786568L, -1926394100L, 604801756L, 1377291090L, -2066652544L, +-127154092L, -1934056472L, 1591219962L, -180620976L, 189311788L, +1286340468L, -37364846L, 1854013024L, 2103385996L, 1819648864L, +543637474L, -1091715160L, -1094826804L, -477037060L, 1789887410L, +255111664L, -654716476L, -1160838824L, 146863482L, -1985572432L, +894295996L, -496635820L, -159435838L, 685579296L, -314557252L, +-8424880L, 438237154L, -1256534424L, -1636173236L, -235171588L, +8093682L, -1930136320L, 695349108L, 1842151432L, -1619477062L, +-851016240L, 881351660L, -1716387948L, -115207214L, -1791137952L, +-33112756L, 324058400L, 157033346L, 208630504L, -594262804L, +1896161852L, 265798514L, -1382278544L, 1861693476L, -602374344L, +1108468890L, 484203216L, 484362428L, -709216620L, -1557066110L, +-1252221760L, 683766076L, -2027837296L, -1045503454L, -228385272L, +2107578636L, 1733003932L, -1349392878L, -558103168L, -496880108L, +1038675496L, 1056519482L, 2093201360L, -2010307988L, -1508511948L, +-377087342L, -1249011360L, -1160312500L, 1870274528L, -232228958L, +1435512744L, -1369185396L, -243099972L, 1927674738L, -2106238736L, +-554463164L, 1986329944L, -1455619014L, 1574239088L, -1919521732L, +914670356L, 445555138L, 2084602080L, -778729604L, -1345243440L, +199575202L, -1920988120L, 1905594636L, -253809732L, -1626538702L, +1548744640L, -677629388L, -2139472952L, 519623866L, -1041137776L, +1637101036L, 2136944724L, -1615245678L, 2128794400L, -59988532L, +-1540817952L, 1147640514L, -44053976L, -882973012L, 1048963772L, +698567154L, -1162995408L, -453493596L, -1281574216L, -413576358L, +846591376L, 1607352700L, -716217068L, 72880450L, 930200320L, +291130300L, -596200624L, 568531234L, 1941716296L, 23336972L, +1518573143L, 1694002593L, 1330335334L, -367935900L, -591315563L, +459321023L, -743172520L, -1153875338L, 1744498307L, -1595034939L, +-2103107614L, -546796976L, -202584199L, 390984875L, 2084477980L, +1812200266L, -1649591489L, -1192012407L, 1048710558L, 440534092L, +222078493L, 670313383L, -1613058352L, 1954390126L, -688183525L, +1469139069L, 1553757802L, -316926200L, -1165826127L, -1885802461L, +1929285924L, -477141550L, -2023655897L, 2073149553L, -958355786L, +-1588544940L, -616261915L, 311055087L, 1339211304L, -692618618L, +-890616653L, -1587672555L, 1473830002L, 1794930880L, -1751978711L, +-706341989L, -174078868L, 1009748090L, -577465937L, -1331312455L, +954243790L, -898918372L, -550806643L, 1301792247L, -1906050816L, +-471410850L, 243503147L, -964325587L, 2040009626L, 165040472L, +4277441L, -1684012173L, -2112323436L, 707749154L, 1246311479L, +1815489793L, -1161228282L, -106256828L, 953040757L, -1505234209L, +-1434948040L, -2082533866L, -1478655453L, 1141738981L, 620266498L, +181872240L, -1541364711L, -1821358581L, -1404373764L, -1721304790L, +430858079L, 403881705L, 62217726L, 1339273708L, -1570424515L, +1231956231L, -1980606224L, 458605710L, -1919336773L, 1922633373L, +-2114897334L, -53110680L, 313559121L, 954907459L, -1235980732L, +-683520782L, -1188665145L, 595888785L, 787797014L, 181722740L, +-550557435L, -172300465L, 1876414792L, -1850278682L, -642551277L, +-1898592907L, -657802926L, 1363939744L, 1281517705L, 1436188859L, +-1660478708L, 526215834L, -359997425L, -1166039975L, 401238254L, +-391058564L, -739133459L, 28807319L, 686529376L, -1191093442L, +-252455925L, -1116826483L, -1645730374L, -1123120136L, 1970423713L, +243630419L, 640639092L, -818695678L, -795186025L, -2076447135L, +1631671974L, -1814995292L, -831882411L, -1899210753L, -1279039848L, +185248694L, 455811139L, -1657661051L, -1742312926L, -1304084208L, +1057480889L, -1839299221L, 2022677596L, 1940418954L, -2044848769L, +1693505737L, 134421854L, 1079041548L, -855240227L, -1810362137L, +-98392816L, -1493717970L, -373067045L, 798565309L, -890258134L, +-1339358776L, 374128369L, -1103474589L, -1752112028L, -1493973102L, +746695783L, -1192483791L, -800576138L, 303344276L, 1196185765L, +-1719902801L, 1792706152L, 1735704134L, -396461069L, -1485459313L +) diff --git a/WAS/addToCounts.r b/PHESANT/R/addToCounts.r similarity index 100% rename from WAS/addToCounts.r rename to PHESANT/R/addToCounts.r diff --git a/WAS/binaryLogisticRegression.r b/PHESANT/R/binaryLogisticRegression.r similarity index 100% rename from WAS/binaryLogisticRegression.r rename to PHESANT/R/binaryLogisticRegression.r diff --git a/WAS/equalSizedBins.r b/PHESANT/R/equalSizedBins.r similarity index 100% rename from WAS/equalSizedBins.r rename to PHESANT/R/equalSizedBins.r diff --git a/WAS/fixOddFieldsToCatMul.r b/PHESANT/R/fixOddFieldsToCatMul.r similarity index 100% rename from WAS/fixOddFieldsToCatMul.r rename to PHESANT/R/fixOddFieldsToCatMul.r diff --git a/WAS/getIsCatMultExposure.r b/PHESANT/R/getIsCatMultExposure.r similarity index 100% rename from WAS/getIsCatMultExposure.r rename to PHESANT/R/getIsCatMultExposure.r diff --git a/WAS/getIsExposure.r b/PHESANT/R/getIsExposure.r similarity index 100% rename from WAS/getIsExposure.r rename to PHESANT/R/getIsExposure.r diff --git a/WAS/getNumValuesCatMultExposure.r b/PHESANT/R/getNumValuesCatMultExposure.r similarity index 100% rename from WAS/getNumValuesCatMultExposure.r rename to PHESANT/R/getNumValuesCatMultExposure.r diff --git a/WAS/incrementCounter.r b/PHESANT/R/incrementCounter.r similarity index 100% rename from WAS/incrementCounter.r rename to PHESANT/R/incrementCounter.r diff --git a/WAS/initFunctions.r b/PHESANT/R/initFunctions.r similarity index 76% rename from WAS/initFunctions.r rename to PHESANT/R/initFunctions.r index e412c6e..03c0f23 100644 --- a/WAS/initFunctions.r +++ b/PHESANT/R/initFunctions.r @@ -16,39 +16,6 @@ # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -# load the required r files -loadSource <- function() { - source("loadData.r") - source("reassignValue.r") - source("validatePhenotypeInput.r") - source("validateTraitInput.r") - source("testNumExamples.r") - source("binaryLogisticRegression.r") - source("equalSizedBins.r") - source("fixOddFieldsToCatMul.r") - source("replaceMissingCodes.r") - source("replaceNaN.r") - source("testAssociations.r") - source("testCatMultiple.r") - source("testCatSingle.r") - source("testContinuous.r") - source("testInteger.r") - source("testCategoricalOrdered.r") - source("testCategoricalUnordered.r") - source("saveCounts.r") - source("incrementCounter.r") - source("getIsCatMultExposure.r") - source("getIsExposure.r") - source("addToCounts.r") - source("getNumValuesCatMultExposure.r") - source("storeNewVar.r") - source("loadPhenotypes.r") - source("loadTraitOfInterest.r") - source("loadConfounders.r") - source("makeTestDataFrame.r") - source("loadIndicatorFields.r") -} - # init the counters used to determine how many variables took each path in the variable processing flow. initCounters <- function() { counters = data.frame(name=character(),countValue=integer(), stringsAsFactors=FALSE) diff --git a/WAS/loadConfounders.r b/PHESANT/R/loadConfounders.r similarity index 100% rename from WAS/loadConfounders.r rename to PHESANT/R/loadConfounders.r diff --git a/WAS/loadData.r b/PHESANT/R/loadData.r similarity index 100% rename from WAS/loadData.r rename to PHESANT/R/loadData.r diff --git a/WAS/loadIndicatorFields.r b/PHESANT/R/loadIndicatorFields.r similarity index 100% rename from WAS/loadIndicatorFields.r rename to PHESANT/R/loadIndicatorFields.r diff --git a/WAS/loadPhenotypes.r b/PHESANT/R/loadPhenotypes.r similarity index 100% rename from WAS/loadPhenotypes.r rename to PHESANT/R/loadPhenotypes.r diff --git a/WAS/loadTraitOfInterest.r b/PHESANT/R/loadTraitOfInterest.r similarity index 100% rename from WAS/loadTraitOfInterest.r rename to PHESANT/R/loadTraitOfInterest.r diff --git a/WAS/makeTestDataFrame.r b/PHESANT/R/makeTestDataFrame.r similarity index 100% rename from WAS/makeTestDataFrame.r rename to PHESANT/R/makeTestDataFrame.r diff --git a/PHESANT/R/option_list.R b/PHESANT/R/option_list.R new file mode 100644 index 0000000..203393e --- /dev/null +++ b/PHESANT/R/option_list.R @@ -0,0 +1,20 @@ +library("optparse") + +option_list = list( + make_option(c("-f", "--phenofile"), type="character", default=NULL, help="Phenotype dataset file name", metavar="character"), + make_option(c("-g", "--traitofinterestfile"), type="character", default=NULL, help="Trait of interest dataset file name", metavar="character"), + make_option(c("-v", "--variablelistfile"), type="character", default=NULL, help="variablelistfile file name (should be tab separated)", metavar="character"), + make_option(c("-d", "--datacodingfile"), type="character", default=NULL, help="datacodingfile file name (should be comma separated)", metavar="character"), + make_option(c("-e", "--traitofinterest"), type="character", default=NULL, help="traitofinterest option should specify trait of interest variable name", metavar="character"), + make_option(c("-r", "--resDir"), type="character", default=NULL, help="resDir option should specify directory where results files should be stored", metavar="character"), + make_option(c("-u", "--userId"), type="character", default="userId", help="userId option should specify user ID column in trait of interest and phenotype files [default= %default]", metavar="character"), + make_option(c("-t", "--test"), action="store_true", default=FALSE, help="Run test phenome scan on test data (see test subfolder) [default= %default]"), + make_option(c("-s", "--sensitivity"), action="store_true", default=FALSE, help="Run sensitivity phenome scan [default= %default]"), + make_option(c("-a", "--partIdx"), type="integer", default=NULL, help="Part index of phenotype (used to parellise)"), + make_option(c("-b", "--numParts"), type="integer", default=NULL, help="Number of phenotype parts (used to parellise)"), + make_option(c("-j", "--genetic"), action="store", default=TRUE, help="Trait of interest is genetic, e.g. a SNP or genetic risk score [default= %default]"), + make_option(c("-z", "--save"), action="store_true", default=FALSE, help="Save generated phenotypes to a file rather than testing associations [default= %default]"), + make_option(c("-c", "--confounderfile"), type="character", default=NULL, help="Confounder file name", metavar="character"), + make_option(c("-i", "--confidenceintervals"), type="logical", default=TRUE, help="Whether confidence intervals should be calculated [default= %default]"), + make_option(c("-k", "--standardise"), action="store", default=TRUE, help="Trait of interest is standardised to have mean=0 and std=1 [default= %default]") +) \ No newline at end of file diff --git a/WAS/processArgs.r b/PHESANT/R/processArgs.r similarity index 100% rename from WAS/processArgs.r rename to PHESANT/R/processArgs.r diff --git a/WAS/reassignValue.r b/PHESANT/R/reassignValue.r similarity index 100% rename from WAS/reassignValue.r rename to PHESANT/R/reassignValue.r diff --git a/WAS/replaceMissingCodes.r b/PHESANT/R/replaceMissingCodes.r similarity index 100% rename from WAS/replaceMissingCodes.r rename to PHESANT/R/replaceMissingCodes.r diff --git a/WAS/replaceNaN.r b/PHESANT/R/replaceNaN.r similarity index 100% rename from WAS/replaceNaN.r rename to PHESANT/R/replaceNaN.r diff --git a/WAS/saveCounts.r b/PHESANT/R/saveCounts.r similarity index 100% rename from WAS/saveCounts.r rename to PHESANT/R/saveCounts.r diff --git a/WAS/storeNewVar.r b/PHESANT/R/storeNewVar.r similarity index 100% rename from WAS/storeNewVar.r rename to PHESANT/R/storeNewVar.r diff --git a/WAS/testAssociations.r b/PHESANT/R/testAssociations.r similarity index 100% rename from WAS/testAssociations.r rename to PHESANT/R/testAssociations.r diff --git a/WAS/testCatMultiple.r b/PHESANT/R/testCatMultiple.r similarity index 100% rename from WAS/testCatMultiple.r rename to PHESANT/R/testCatMultiple.r diff --git a/WAS/testCatSingle.r b/PHESANT/R/testCatSingle.r similarity index 100% rename from WAS/testCatSingle.r rename to PHESANT/R/testCatSingle.r diff --git a/WAS/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r similarity index 100% rename from WAS/testCategoricalOrdered.r rename to PHESANT/R/testCategoricalOrdered.r diff --git a/WAS/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r similarity index 100% rename from WAS/testCategoricalUnordered.r rename to PHESANT/R/testCategoricalUnordered.r diff --git a/WAS/testContinuous.r b/PHESANT/R/testContinuous.r similarity index 100% rename from WAS/testContinuous.r rename to PHESANT/R/testContinuous.r diff --git a/WAS/testInteger.r b/PHESANT/R/testInteger.r similarity index 100% rename from WAS/testInteger.r rename to PHESANT/R/testInteger.r diff --git a/WAS/testNumExamples.r b/PHESANT/R/testNumExamples.r similarity index 100% rename from WAS/testNumExamples.r rename to PHESANT/R/testNumExamples.r diff --git a/WAS/unittests/run-tests.sh b/PHESANT/R/unittests/run-tests.sh similarity index 100% rename from WAS/unittests/run-tests.sh rename to PHESANT/R/unittests/run-tests.sh diff --git a/WAS/unittests/test_equalSizedBins.r b/PHESANT/R/unittests/test_equalSizedBins.r similarity index 100% rename from WAS/unittests/test_equalSizedBins.r rename to PHESANT/R/unittests/test_equalSizedBins.r diff --git a/WAS/unittests/test_reassignValue.r b/PHESANT/R/unittests/test_reassignValue.r similarity index 100% rename from WAS/unittests/test_reassignValue.r rename to PHESANT/R/unittests/test_reassignValue.r diff --git a/WAS/unittests/test_testCatMultiple.r b/PHESANT/R/unittests/test_testCatMultiple.r similarity index 100% rename from WAS/unittests/test_testCatMultiple.r rename to PHESANT/R/unittests/test_testCatMultiple.r diff --git a/WAS/unittests/test_testCatSingle.r b/PHESANT/R/unittests/test_testCatSingle.r similarity index 100% rename from WAS/unittests/test_testCatSingle.r rename to PHESANT/R/unittests/test_testCatSingle.r diff --git a/WAS/validatePhenotypeInput.r b/PHESANT/R/validatePhenotypeInput.r similarity index 100% rename from WAS/validatePhenotypeInput.r rename to PHESANT/R/validatePhenotypeInput.r diff --git a/WAS/validateTraitInput.r b/PHESANT/R/validateTraitInput.r similarity index 100% rename from WAS/validateTraitInput.r rename to PHESANT/R/validateTraitInput.r diff --git a/PHESANT/man/PHESANT-package.Rd b/PHESANT/man/PHESANT-package.Rd new file mode 100644 index 0000000..0178cc4 --- /dev/null +++ b/PHESANT/man/PHESANT-package.Rd @@ -0,0 +1,34 @@ +\name{PHESANT-package} +\alias{PHESANT-package} +\alias{PHESANT} +\docType{package} +\title{ +\packageTitle{PHESANT} +} +\description{ +\packageDescription{PHESANT} +} +\details{ + +The DESCRIPTION file: +\packageDESCRIPTION{PHESANT} +\packageIndices{PHESANT} +~~ An overview of how to use the package, including the most important functions ~~ +} +\author{ +\packageAuthor{PHESANT} + +Maintainer: \packageMaintainer{PHESANT} +} +\references{ +~~ Literature or other references for background information ~~ +} +~~ Optionally other standard keywords, one per line, from file KEYWORDS in the R documentation directory ~~ +\keyword{ package } +\seealso{ +~~ Optional links to other man pages, e.g. ~~ +~~ \code{\link[:-package]{}} ~~ +} +\examples{ +~~ simple examples of the most important functions ~~ +} diff --git a/PHESANT/opt_only.RData b/PHESANT/opt_only.RData new file mode 100644 index 0000000000000000000000000000000000000000..f4d8fa863434c0806fb59d18003a41eb5eb661f2 GIT binary patch literal 330 zcmV-Q0k!@giwFP!000001C3HqOT#b_?$%CS1v}WQ_ydHt;Dhgjz4)qvdyVaN3uH;j zUCZ|8#cS8j5GPnj^5x5yyWDquU2mS^O&kQlGze!rhYX`BPm>@D=DdccCgzt66CMla z=poB;0;G@CTV4thdA$Q|=vV`01-730-dDq+HrQCWpnY_-t*}yjDoB+?!rU}em_4LY zS0Z~A26EoFrIHH9D#!{+%xD6xf}Eqp=vil(!{2r0oxF3{#0xZ9 zz<4dKXUJ(6(U0yHL8BzuN;(eI^%gXcEJiho_WbI?=t`EL3+VH;Lg6_O30(^ADURxB c2dW-$%tgKPU%9ed-hFlB8#^ETW%vRB0GivN*Z=?k literal 0 HcmV?d00001 diff --git a/WAS/phenomeScan.r b/PHESANT/phenomeScan.r similarity index 62% rename from WAS/phenomeScan.r rename to PHESANT/phenomeScan.r index 2be21ca..85fc6a9 100644 --- a/WAS/phenomeScan.r +++ b/PHESANT/phenomeScan.r @@ -19,35 +19,19 @@ ## ## main phenome scan file +## Updated by Quanli Wang + +library(PHESANT) library("optparse") -option_list = list( - make_option(c("-f", "--phenofile"), type="character", default=NULL, help="Phenotype dataset file name", metavar="character"), - make_option(c("-g", "--traitofinterestfile"), type="character", default=NULL, help="Trait of interest dataset file name", metavar="character"), - make_option(c("-v", "--variablelistfile"), type="character", default=NULL, help="variablelistfile file name (should be tab separated)", metavar="character"), - make_option(c("-d", "--datacodingfile"), type="character", default=NULL, help="datacodingfile file name (should be comma separated)", metavar="character"), - make_option(c("-e", "--traitofinterest"), type="character", default=NULL, help="traitofinterest option should specify trait of interest variable name", metavar="character"), - make_option(c("-r", "--resDir"), type="character", default=NULL, help="resDir option should specify directory where results files should be stored", metavar="character"), - make_option(c("-u", "--userId"), type="character", default="userId", help="userId option should specify user ID column in trait of interest and phenotype files [default= %default]", metavar="character"), - make_option(c("-t", "--test"), action="store_true", default=FALSE, help="Run test phenome scan on test data (see test subfolder) [default= %default]"), - make_option(c("-s", "--sensitivity"), action="store_true", default=FALSE, help="Run sensitivity phenome scan [default= %default]"), - make_option(c("-a", "--partIdx"), type="integer", default=NULL, help="Part index of phenotype (used to parellise)"), - make_option(c("-b", "--numParts"), type="integer", default=NULL, help="Number of phenotype parts (used to parellise)"), - make_option(c("-j", "--genetic"), action="store", default=TRUE, help="Trait of interest is genetic, e.g. a SNP or genetic risk score [default= %default]"), - make_option(c("-z", "--save"), action="store_true", default=FALSE, help="Save generated phenotypes to a file rather than testing associations [default= %default]"), - make_option(c("-c", "--confounderfile"), type="character", default=NULL, help="Confounder file name", metavar="character"), - make_option(c("-i", "--confidenceintervals"), type="logical", default=TRUE, help="Whether confidence intervals should be calculated [default= %default]"), - make_option(c("-k", "--standardise"), action="store", default=TRUE, help="Trait of interest is standardised to have mean=0 and std=1 [default= %default]") -); -opt_parser = OptionParser(option_list=option_list); -opt = parse_args(opt_parser); - -save.image(file = "opt.RData") -source("processArgs.r") -processArgs(); - -source("initFunctions.r") -loadSource(); +args <- commandArgs(T) +if (length(args) == 0) { + load(file = "opt_only.RData") +} else { + opt_parser = OptionParser(option_list=option_list); + opt = parse_args(opt_parser); + processArgs(); +} ## load the files we write to and use counters=initCounters(); diff --git a/PHESANT/test.sh b/PHESANT/test.sh new file mode 100755 index 0000000..17d4de0 --- /dev/null +++ b/PHESANT/test.sh @@ -0,0 +1,3 @@ +testDir="~/Documents/Github/PHESANT/testWAS/" +Rscript phenomeScan.r --phenofile="${testDir}data/phenotypes.csv" --traitofinterestfile="${testDir}data/exposure.csv" --variablelistfile="${testDir}variable-lists/outcome-info.tsv" --datacodingfile="${testDir}variable-lists/data-coding-ordinal-info.txt" --traitofinterest="exposure" --resDir="${testDir}results/" --userId="userId" + diff --git a/opt.RData b/opt.RData new file mode 100644 index 0000000000000000000000000000000000000000..fc4e5ffe212e4a0852f27489b7b9f2e116da4e25 GIT binary patch literal 4811 zcmV;+5;W}}iwFP!000001MMAYa}>w5S2q$M0S1gYG?G9F!AjyZ#>f~OY;zc61O^OP z9_@~#HLKmt&a4o=LT(>9o%^VKP4XX7sr-hdlF#{=5BZQPCm&L&Sm~a5v-9R{ckj$f zE2OGb)l=_Izkc_7-80iWbM4%%Bb~Q88DmYXxt0HK<{w*{_`d{eVe5F?nJ+k#1+!>d zMV4sZL3&$DwmD<*4?XDMo*UxeDWfoJt;KELKFvjMvjjd$uN%PSMOb+z~mhGIvwpkv58q z1EHOEQoD6fge^)&>lOo(cF|!2*Tpa+KW+E}Ri0g6O{5Ko&&I{Dm%10DI;MxRiRX!V zg(o&AlROwR=2X_oX6opM9yu4S5023>f$5DQUXx1Mv@vBFjyYx67B_|ISm^_6$th`t zQ+kp~5p1b^I+L5Jn@cvzc<3Dx9wB!}DnB=8u2F|r3Wsc}s8@CINVk-U)+Oi35W}0! z4e*Cuuu_@nhlW#8&HUV2bCs;9@KHxH$rU@7Db~d`ZJ4&!BbPi(g(K3LqLt!;JTxB6 zaz_{(nJ+qQH$JTUo|^S~6`m5E#)Wgdo@GmO zIi5I@BrnO8lkM63Oop#l z6@F}SLiLVyx_xE!E3@oci91C208VoZo1fa4JU7KT2_{?pe@x(EX-|HSba)i_0D86 zSDF(l+jTH|N0_$O?#U~qxhXzDglsMhX~(cUVYn(x|BgD~XRMs%WKuO*USDY1L!?7R zCZTY^waOFpjBNm&LxwdnGh&#=#FeW?zUUt|iW&PpuR>hT7}IoExK-h#ch!@$ZO*Gs z7neRs2gD;y1h*zQzH5Bv}z7jT^S<{=l`N^eW+IBvbG2IK3YizQ1%2q~U@U5P~GifWAvfLUmZ)R(oHuPDs zMad>oAv=F_)^al-AY+v2pff@)K&q5=^C)`lQp>g~e0N{7ymk1=m03(@?3yyA>dKaq zkL0T>(n$x=a_9Y7QT665GdDiwo*~*!di-z#8E&Z(y(6O%ay=HZ zR`CkOe2Uxetb2Q@MckeIAE0jKkQS-R*&YP9JOQq`AmW9Cz~9W_sK{3M-p1kAAVxcf zZ9)v80~`}{-Ud1sg&H0YX2LNaubbd)0$vzzJIJ&f`1Hdu(LO0cg#(XwHVFD)dOJDn zlJMO_@C*)p61)L|w-5MWyv+oMpC_bB z`#6Ua5l_y^eFE{DI9areNdmq4z!fd3sB|0ak35MAB|nI@5+ zZv(E-iJzJ$+KEnSiB2g`r$k3gou)ZTALum0Kkp{v_$y!0kc6P*eYo${d5IMPYT`6YhrWg;gpxo%zXubRC>m2vDM6Yj3=zasyjT3|4;wXOv8+^n+|ACDA7~~N)xb5M)a(%*4{u09$ z-&|L=SJo7NLw{1Ey-1KocFJb%n_ zyNEA-Dxv!mL^o=USNY3dQ~V5M_&KLL1oVCZy8R01{1V0sp55?StUts$M66>N3zj>H z_dFrpj(|8w%}E)yGr8kh3#RQ@ zVX8YR7p&LD;2Pmb%AN~7FY?CBf<6nf0wwh6TxNab*Se%i z>1v}8tt%|TR$3$1hx@nD{tY#%02kP`GBZiPuAXmPZZk_aR7%a*9cEz=lG;MB-dZT+ z?NZSS%0SyWG7BPI=UMik0n1o21mszCX9-3Ah zoR74{;%#B_>1I7#uY#MZp)pU?Dth_$xKxy_HIha5eyggiOYdz=U#hk%Ox2#TM{=cX zmY_cwPn}*rCF6>{Lm^gYp~y39686|5PKiYc2SMp61*I0-$rBo(tQ;t>X$-SCQ<}4K zjxkrV9e4L#sZhvf#QwX`@}pJkw;usovJ{fZx0C4|v`ZnsdldvD?p7dn2^~hP1u+kk zA$!=;y-9V_d9^p(KT7)zrkxN%{@GIq{iedZIa&4@J*(zDw?^7AF+6=QV%;zxGa4_7T zr2YHj$vLPCZJB&SRpdJm?mt5NhwCVxE(H%VeF`38(Nl1&a>r|sijrj@P0HjFZ19+q z600%7^vO8N)F~$V$29$qGX)PbeTW}n`V>4GPtM@PlWtNeOQ&iv*JD+rKF;)sc_N_0xSA*|qY%2QSV$o~x4*%E7!LOPN=@2>yM?B4~=W{T#KvkXTUJ z#+U@$85LuiC1Vu(Qv|wl|IFUd{+W<1hn=~?FssbJCX6(1W=sBsnh3HWGr6>oOedn` zKL37FmdI7uQA-7zWQ1|ZW*sWX)8zXeogkIlZxS#fhN|&77=B!c{Wjvxqu6L5obKOY^T@XJoR#>@m%MQf=ZhgKqZSg$O zi|yoVgRIgU)9zY&RQ7oLb_6xD+o+m${kkZqZP!o&sK&5K9i*U^tbO;8uqxU8RL!ab zI(?SDk0}8hVpzD&5(yiT9aYtAw3kXGo2L_i7emuXbcMVdykq<5%*mjCJ+xiWcC&a@ zubb7c>e2m4g5Sn?_8+8ms5@-;!I*Z}(xdX@O-Y}#ihWYmWZ$ogX4!_P1dNHHPi!7S z53-r6njS-{SoQxv_Mw=vN1`j_-5gKe0}*rfKumd~(jE3=OKidNbFrOO{{D}#^2*rF z)z5=|mv;irgZ%t#f8>q9JkejQT#YN&*05ZM>Lr)pwH?|gq21=u9r5kDgGH)w35>s7 zH7)_y0|ectSY^$KxMeB^H$&S6xV`%D`Y{ciR9W0MFG}*Sfrjum2$-Vi|zOU zX!k-p2FbkP~u$}=t1Z_XG2U&dr_z+VEa4q|V;@UL1LN=wkt)Y+} zg?0?uBhVi8fn#D<0Y;s z&#jOtsY+`o{$GOjWoXYs`-(*7I6G1V4Y5NcuFEd7kjn<@8T}Wby#Vd2tUl3yg{h-I z%1)EGHhgV`Y)ExkLt+0qwAY}Wfc7f1*Cn#Y+HoO>yaDY^Xm3D!OF}%#4g)b(-0_ba z4|iE89(LC=r{0A2HnjIxeRAq8rp_tt&Vjhbn_MB|Q6<(;GMUi6?a6c8KLwbQ$h_p6 z=b&iQ!AE5SxX^?^_^y=&H)VZj2Uv%|zctVWun|EQf^Gm`p)Ck{5NwSJPXX*k@DzeQ z0KEYH2zn6=0PF?Wk6)W!HUeGzIT668Gp~RJfQJa)jQ|*5(0?C+uh$0% z#PJUi==Az>1n}v6MFC#}_&S1bL;#HcA*&vP-$L*a0)O1c2tGo%pCHiL;?oH534!k* z@a6a}0&)C%2z30v9|3$ipDEx+06#|XlL&zE1^p;BUgj?`m;4Oi=LmkO0fZN4Q6+H2 zdb3HaExO5=J#j*lz$AIK5D#L^YQwnr1neW)tYO^jgG>NjaO?v4sX~BZ!})5xk3&w6%J^; zTC<*3pHf)8@oLSwoBkbjz&Bp4xmuj`u6mMgyjt_|x+bM?P2<&?jaO^R&)^%c)@;05 zvx;yl`{KvpMb_fKMp1kZ>VLs|#QcBB@wS}*kof-sr8e}Sls5D*mp1g2ueRVJPtPvG zICA{-Vj!8{hw1J@pZy2Flz%i@p7!V+B)1pa!MhoUnH)!i%TE`=Lx%Y|PEJf!4>HcI z!N+@72W)YjAbdsOlv>1ItR7KwYXmc4Q~1xTN)Vh;ne=CbBh`CE#kHxN;W;(H5|Lis zL@NA0M%DmPR5-nCDp2v{N`w_%k_tiHdjEaNQTOG1zLP|!*!h<@^H0VQ>54(mg$Gp& z_*f=79#Y;eTWHF19$4Hqhu!J~cFg2b*^-^{*LT$n6EzFRiJ%4!Wh!tY8CXFG(MJoG lM3AbPx+#>Y;uK9IK~sJjIfC|XS$cL$;s11|weDh*002)0Pgno| literal 0 HcmV?d00001 From 9fee34e163cacd0ad5956b2348433a40528f6fd0 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 12:16:54 -0400 Subject: [PATCH 03/27] Code Refactor: remove global variable vl --- PHESANT/DESCRIPTION | 12 ++-- PHESANT/NAMESPACE | 4 ++ PHESANT/R/addToCounts.r | 33 ---------- PHESANT/R/counters.r | 56 +++++++++++++++++ PHESANT/R/fixOddFieldsToCatMul.r | 49 --------------- PHESANT/R/getIsCatMultExposure.r | 61 ------------------- PHESANT/R/getIsExposure.r | 80 ++++++++++++++++++++++++- PHESANT/R/getNumValuesCatMultExposure.r | 53 ---------------- PHESANT/R/incrementCounter.r | 34 ----------- PHESANT/R/initFunctions.r | 5 -- PHESANT/R/loadData.r | 26 +++++++- PHESANT/R/loadIndicatorFields.r | 6 +- PHESANT/R/reassignValue.r | 6 +- PHESANT/R/saveCounts.r | 31 ---------- PHESANT/R/testAssociations.r | 14 ++--- PHESANT/R/testCatMultiple.r | 14 ++--- PHESANT/R/testCatSingle.r | 10 ++-- PHESANT/R/testCategoricalOrdered.r | 4 +- PHESANT/R/testCategoricalUnordered.r | 4 +- PHESANT/R/testContinuous.r | 12 ++-- PHESANT/R/testInteger.r | 10 ++-- PHESANT/phenomeScan.r | 6 +- 22 files changed, 211 insertions(+), 319 deletions(-) delete mode 100644 PHESANT/R/addToCounts.r create mode 100644 PHESANT/R/counters.r delete mode 100644 PHESANT/R/fixOddFieldsToCatMul.r delete mode 100644 PHESANT/R/getIsCatMultExposure.r delete mode 100644 PHESANT/R/getNumValuesCatMultExposure.r delete mode 100644 PHESANT/R/incrementCounter.r delete mode 100644 PHESANT/R/saveCounts.r diff --git a/PHESANT/DESCRIPTION b/PHESANT/DESCRIPTION index 4769eb6..62068ae 100644 --- a/PHESANT/DESCRIPTION +++ b/PHESANT/DESCRIPTION @@ -1,9 +1,9 @@ Package: PHESANT Type: Package -Title: What the package does (short line) -Version: 1.0 +Title: Repliments PHESANT +Version: 0.1 Date: 2019-03-14 -Author: Who wrote it -Maintainer: Who to complain to -Description: More about what it does (maybe more than one line) -License: What license is it under? +Author: Quanli Wang +Maintainer: Quanli Wang +Description: This will replement the original PHESANT in a package and potentially with some optimizations +License: GPL (>= 3) diff --git a/PHESANT/NAMESPACE b/PHESANT/NAMESPACE index d75f824..404475d 100644 --- a/PHESANT/NAMESPACE +++ b/PHESANT/NAMESPACE @@ -1 +1,5 @@ exportPattern("^[[:alpha:]]+") +importFrom("stats", "coef", "complete.cases", "confint", "end", "glm", + "lm", "model.matrix", "na.omit", "pnorm", "qnorm", + "quantile", "relevel") +importFrom("utils", "read.table", "type.convert", "write.table") diff --git a/PHESANT/R/addToCounts.r b/PHESANT/R/addToCounts.r deleted file mode 100644 index e41420c..0000000 --- a/PHESANT/R/addToCounts.r +++ /dev/null @@ -1,33 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -# adds given value to counter, that are used to count how many variables go down each route in the data flow -addToCounts <- function(countName, num) { - - idx = which(counters$name==countName) - - if (length(idx)==0) { - # counter does not exist so add with countValue 1 - counters <<- rbind(counters, data.frame(name=countName, countValue=num)) - } - else { - # add to counter that already exists - counters$countValue[idx] <<- counters$countValue[idx]+num - } - -} diff --git a/PHESANT/R/counters.r b/PHESANT/R/counters.r new file mode 100644 index 0000000..15fdc47 --- /dev/null +++ b/PHESANT/R/counters.r @@ -0,0 +1,56 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# adds given value to counter, that are used to count how many variables go down each route in the data flow +addToCounts <- function(countName, num) { + idx = which(counters$name==countName) + if (length(idx)==0) { + # counter does not exist so add with countValue 1 + counters <<- rbind(counters, data.frame(name=countName, countValue=num)) + } else { + # add to counter that already exists + counters$countValue[idx] <<- counters$countValue[idx]+num + } +} + +# increments counters used to count how many variables go down each route in the data flow +incrementCounter <- function(countName) { + idx = which(counters$name==countName) + if (length(idx)==0) { + # counter does not exist so add with countValue 1 + counters <<- rbind(counters, data.frame(name=countName, countValue=1)) + } else { + # increment counter that already exists + counters$countValue[idx] <<- counters$countValue[idx]+1 + } +} + +# Saves the counters stored in count variables, to a file in results directory +saveCounts <- function() { + countFile = paste(opt$resDir,"variable-flow-counts-",opt$varTypeArg,".txt",sep="") + # sort on counter name + sortIdx = order(as.character(counters[,"name"])) + counters <<- counters[sortIdx,] + write.table(counters, file=countFile, sep=",", quote=FALSE, row.names=FALSE) +} + +# init the counters used to determine how many variables took each path in the variable processing flow. +initCounters <- function() { + counters = data.frame(name=character(),countValue=integer(), stringsAsFactors=FALSE) + return(counters) +} \ No newline at end of file diff --git a/PHESANT/R/fixOddFieldsToCatMul.r b/PHESANT/R/fixOddFieldsToCatMul.r deleted file mode 100644 index 2fb25c4..0000000 --- a/PHESANT/R/fixOddFieldsToCatMul.r +++ /dev/null @@ -1,49 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# Changes variable name from instances to arrays -# -# This function changes the format of variable names from instance format to array format (i.e. we treat -# the instances as arrays), for a small subset of Biobank variables. -# -# Some variables are stored in Biobank as categorical (single) fields with the data stored as set of instances, -# but we want to treat these instead as categorical (multiple) with a set of arrays. -# These are indicated by the value "YES-INSTANCES" in the CAT_SINGLE_TO_CAT_MULT column of the variable info file. -# This function changes the format of these variable names from VARID_0_0, VARID_1_0, VARID_2_0 etc (which -# is instance format), to VARID_0_0, VARID_0_1, VARID_0_2 etc. (array format) so they can be treated as categorical (multiple) fields. -fixOddFieldsToCatMul <- function(data) { - # examples are variables: 40006, 40011, 40012, 40013 - # get all variables that need their instances changing to arrays - dataPheno = vl$phenoInfo[which(vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT=="YES-INSTANCES"),]; - for (i in 1:nrow(dataPheno)) { - varID = dataPheno[i,]$FieldID; - varidString = paste("x",varID,"_", sep=""); - - # get all columns in data dataframe for this variable - colIdxs = which(grepl(varidString,names(data))); - - # change format from xvarid_0_0, xvarid_1_0, xvarid_2_0, to xvarid_0_0, xvarid_0_1, xvarid_0_2 - count = 0; - for (j in colIdxs) { - colnames(data)[j] <- paste(varidString, "0_", count, sep="") - count = count + 1; - } - } - return(data) -} diff --git a/PHESANT/R/getIsCatMultExposure.r b/PHESANT/R/getIsCatMultExposure.r deleted file mode 100644 index 560263f..0000000 --- a/PHESANT/R/getIsCatMultExposure.r +++ /dev/null @@ -1,61 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# looks up categorical multiple field in the variable info file, return -# whether field has YES in TRAIT_OF_INTEREST column (i.e. all values in -# this field denote the exposure), or whether varName has varValue stated -# as a trait of interest in the TRAIT_OF_INTEREST column (multiple values are -# separated by "|" in this field -getIsCatMultExposure <- function(varName, varValue) { - - # get row index of field in variable information file - idx=which(vl$phenoInfo$FieldID==varName) - - # may be empty of may contain VALUE1|VALUE2 etc .. to denote those - # cat mult values denoting exposure variable - isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] - - if (!is.na(isExposure) & isExposure!="") { - - isExposure = as.character(isExposure) - - ## first check if value is YES, then all values are exposure traits - if (isExposure == "YES") { - cat("IS_CM_ALL_EXPOSURE || ") - return(TRUE) - } - - ## try to split by |, to set particular values as exposure - - # split into variable Values - exposureValues = unlist(strsplit(isExposure,"\\|")) - - # for each value stated, check whether it is varValue - for (thisVal in exposureValues) { - if (thisVal == varValue) { - cat("IS_CM_EXPOSURE || ") - return(TRUE) - } - } - } - - # varValue is not in list of exposure values - return(FALSE) - -} diff --git a/PHESANT/R/getIsExposure.r b/PHESANT/R/getIsExposure.r index c8df69e..a05d561 100644 --- a/PHESANT/R/getIsExposure.r +++ b/PHESANT/R/getIsExposure.r @@ -20,7 +20,7 @@ # returns boolean var - whether this field denotes the trait of interest, as specified # in the variable information file # to determine if values of cat mult fields (not the whole field) are exposure values, use getIsCatMultExposure function instead. -getIsExposure <- function(varName) { +getIsExposure <- function(vl, varName) { idx=which(vl$phenoInfo$FieldID==varName) isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] @@ -29,3 +29,81 @@ getIsExposure <- function(varName) { } return(FALSE) } + +# looks up categorical multiple field in the variable info file, return +# whether field has YES in TRAIT_OF_INTEREST column (i.e. all values in +# this field denote the exposure), or whether varName has varValue stated +# as a trait of interest in the TRAIT_OF_INTEREST column (multiple values are +# separated by "|" in this field +getIsCatMultExposure <- function(vl, varName, varValue) { + + # get row index of field in variable information file + idx=which(vl$phenoInfo$FieldID==varName) + + # may be empty of may contain VALUE1|VALUE2 etc .. to denote those + # cat mult values denoting exposure variable + isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] + + if (!is.na(isExposure) & isExposure!="") { + + isExposure = as.character(isExposure) + + ## first check if value is YES, then all values are exposure traits + if (isExposure == "YES") { + cat("IS_CM_ALL_EXPOSURE || ") + return(TRUE) + } + + ## try to split by |, to set particular values as exposure + + # split into variable Values + exposureValues = unlist(strsplit(isExposure,"\\|")) + + # for each value stated, check whether it is varValue + for (thisVal in exposureValues) { + if (thisVal == varValue) { + cat("IS_CM_EXPOSURE || ") + return(TRUE) + } + } + } + + # varValue is not in list of exposure values + return(FALSE) + +} + +# looks up categorical multiple field in the variable info file, return +# number of values denoted as trait of interest. +# returns zero if whole field is denoted trait of interest, not particular values. +getNumValuesCatMultExposure <- function(vl, varName) { + + # get row index of field in variable information file + idx=which(vl$phenoInfo$FieldID==varName) + + # may be empty of may contain VALUE1|VALUE2 etc .. to denote those + # cat mult values denoting exposure variable + isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] + + if (!is.na(isExposure) & isExposure!="") { + + isExposure = as.character(isExposure) + + ## first check if value is YES, then no partic values are traits of interest + if (isExposure == "YES") { + return(0) + } + + ## try to split by |, to set particular values as exposure + + # split into variable Values + exposureValues = unlist(strsplit(isExposure,"\\|")) + + return(length(exposureValues)) + + } + + # varValue is not in list of exposure values + return(0) + +} diff --git a/PHESANT/R/getNumValuesCatMultExposure.r b/PHESANT/R/getNumValuesCatMultExposure.r deleted file mode 100644 index 4cae9a9..0000000 --- a/PHESANT/R/getNumValuesCatMultExposure.r +++ /dev/null @@ -1,53 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# looks up categorical multiple field in the variable info file, return -# number of values denoted as trait of interest. -# returns zero if whole field is denoted trait of interest, not particular values. -getNumValuesCatMultExposure <- function(varName) { - - # get row index of field in variable information file - idx=which(vl$phenoInfo$FieldID==varName) - - # may be empty of may contain VALUE1|VALUE2 etc .. to denote those - # cat mult values denoting exposure variable - isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] - - if (!is.na(isExposure) & isExposure!="") { - - isExposure = as.character(isExposure) - - ## first check if value is YES, then no partic values are traits of interest - if (isExposure == "YES") { - return(0) - } - - ## try to split by |, to set particular values as exposure - - # split into variable Values - exposureValues = unlist(strsplit(isExposure,"\\|")) - - return(length(exposureValues)) - - } - - # varValue is not in list of exposure values - return(0) - -} diff --git a/PHESANT/R/incrementCounter.r b/PHESANT/R/incrementCounter.r deleted file mode 100644 index c430a24..0000000 --- a/PHESANT/R/incrementCounter.r +++ /dev/null @@ -1,34 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# increments counters used to count how many variables go down each route in the data flow -incrementCounter <- function(countName) { - - idx = which(counters$name==countName) - - if (length(idx)==0) { - # counter does not exist so add with countValue 1 - counters <<- rbind(counters, data.frame(name=countName, countValue=1)) - } - else { - # increment counter that already exists - counters$countValue[idx] <<- counters$countValue[idx]+1 - } - -} diff --git a/PHESANT/R/initFunctions.r b/PHESANT/R/initFunctions.r index 03c0f23..38a04b5 100644 --- a/PHESANT/R/initFunctions.r +++ b/PHESANT/R/initFunctions.r @@ -16,11 +16,6 @@ # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -# init the counters used to determine how many variables took each path in the variable processing flow. -initCounters <- function() { - counters = data.frame(name=character(),countValue=integer(), stringsAsFactors=FALSE) - return(counters); -} # create new results files and headers initResultsFiles <- function() { diff --git a/PHESANT/R/loadData.r b/PHESANT/R/loadData.r index 68d4eed..450b103 100644 --- a/PHESANT/R/loadData.r +++ b/PHESANT/R/loadData.r @@ -21,7 +21,7 @@ # creates phenotype / trait of interest data frame # creates confounder data frame # returns an object holding these two data frames -loadData <- function() { +loadData <- function(vl) { library(data.table) @@ -69,10 +69,30 @@ loadData <- function() { } # some fields are fixed that have a field type as cat single but we want to treat them like cat mult - phenotype = fixOddFieldsToCatMul(phenotype) - indFields = loadIndicatorFields(colnames(phenotype)) + phenotype = fixOddFieldsToCatMul(vl, phenotype) + indFields = loadIndicatorFields(vl, colnames(phenotype)) d = list(datax=phenotype, confounders=conf, inds=indFields) return(d) } +fixOddFieldsToCatMul <- function(vl, data) { + # examples are variables: 40006, 40011, 40012, 40013 + # get all variables that need their instances changing to arrays + dataPheno = vl$phenoInfo[which(vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT=="YES-INSTANCES"),]; + for (i in 1:nrow(dataPheno)) { + varID = dataPheno[i,]$FieldID; + varidString = paste("x",varID,"_", sep=""); + + # get all columns in data dataframe for this variable + colIdxs = which(grepl(varidString,names(data))); + + # change format from xvarid_0_0, xvarid_1_0, xvarid_2_0, to xvarid_0_0, xvarid_0_1, xvarid_0_2 + count = 0; + for (j in colIdxs) { + colnames(data)[j] <- paste(varidString, "0_", count, sep="") + count = count + 1; + } + } + return(data) +} diff --git a/PHESANT/R/loadIndicatorFields.r b/PHESANT/R/loadIndicatorFields.r index e069c8e..8159ddb 100644 --- a/PHESANT/R/loadIndicatorFields.r +++ b/PHESANT/R/loadIndicatorFields.r @@ -18,7 +18,7 @@ ## ## load data used for data code default value related field, and categorical multiple indicator field -loadIndicatorFields <- function(phenosToTest) { +loadIndicatorFields <- function(vl, phenosToTest) { print("Loading indicator fields from phenotypes file ...") # read pheno file column names @@ -27,7 +27,7 @@ loadIndicatorFields <- function(phenosToTest) { indVars = c(opt$userId) ## add indicator variables to pheno data - indVars = addIndicatorVariables(indVars, phenosToTest, phenoVarsAll) + indVars = addIndicatorVariables(vl, indVars, phenosToTest, phenoVarsAll) if (length(indVars)>1) { # not just user id column print("Loading required related variable(s):") @@ -44,7 +44,7 @@ loadIndicatorFields <- function(phenosToTest) { } -addIndicatorVariables <- function(indVars, phenosToTest, phenoVarsAll) { +addIndicatorVariables <- function(vl, indVars, phenosToTest, phenoVarsAll) { ##### default value related fields for data codes # get list of all indicator variables from outcome info file diff --git a/PHESANT/R/reassignValue.r b/PHESANT/R/reassignValue.r index ed9701c..c9bc289 100644 --- a/PHESANT/R/reassignValue.r +++ b/PHESANT/R/reassignValue.r @@ -18,7 +18,7 @@ # Reassigns values as specified in data coding info file -reassignValue <- function(pheno, varName) { +reassignValue <- function(vl, pheno, varName) { # get data code info - whether this data code is ordinal or not and any reordering and resassignments dataPheno = vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),]; @@ -38,12 +38,12 @@ reassignValue <- function(pheno, varName) { dataDataCode = vl$dataCodeInfo[dataCodeRow,]; reassignments = as.character(dataDataCode$reassignments); - return(reassignValue2(pheno, reassignments)) + return(reassignValue2(vl, pheno, reassignments)) } # Reassigns values in pheno, as specified in resassignments argument -reassignValue2 <- function(pheno, reassignments) { +reassignValue2 <- function(vl, pheno, reassignments) { # can be NA if row not included in data coding info file if (!is.na(reassignments) && nchar(reassignments)>0) { diff --git a/PHESANT/R/saveCounts.r b/PHESANT/R/saveCounts.r deleted file mode 100644 index 5a72916..0000000 --- a/PHESANT/R/saveCounts.r +++ /dev/null @@ -1,31 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# Saves the counters stored in count variables, to a file in results directory -saveCounts <- function() { - - countFile = paste(opt$resDir,"variable-flow-counts-",opt$varTypeArg,".txt",sep="") - - # sort on counter name - sortIdx = order(as.character(counters[,"name"])) - counters <<- counters[sortIdx,] - - write.table(counters, file=countFile, sep=",", quote=FALSE, row.names=FALSE) -} - diff --git a/PHESANT/R/testAssociations.r b/PHESANT/R/testAssociations.r index 71592ab..8a2aba1 100644 --- a/PHESANT/R/testAssociations.r +++ b/PHESANT/R/testAssociations.r @@ -18,7 +18,7 @@ # Tests the association of a field, determined by its field type -testAssociations <- function(currentVar, currentVarShort, thisdata) { +testAssociations <- function(vl, currentVar, currentVarShort, thisdata) { ## call file for variable type @@ -38,7 +38,7 @@ testAssociations <- function(currentVar, currentVarShort, thisdata) { excluded = vl$phenoInfo$EXCLUDED[idx] catSinToMult = vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT[idx] fieldType = vl$phenoInfo$ValueType[idx] - isExposure = getIsExposure(currentVarShort) #vl$phenoInfo$EXPOSURE_PHENOTYPE[idx] + isExposure = getIsExposure(vl, currentVarShort) #vl$phenoInfo$EXPOSURE_PHENOTYPE[idx] if (fieldType=="Integer") { @@ -55,7 +55,7 @@ testAssociations <- function(currentVar, currentVarShort, thisdata) { incrementCounter("start.exposure.int") } - testInteger(currentVarShort, "INTEGER", thisdata); + testInteger(vl, currentVarShort, "INTEGER", thisdata); } cat("\n"); } @@ -73,7 +73,7 @@ testAssociations <- function(currentVar, currentVarShort, thisdata) { if (isExposure==TRUE) { incrementCounter("start.exposure.cont") } - testContinuous(currentVarShort, "CONTINUOUS", thisdata); + testContinuous(vl, currentVarShort, "CONTINUOUS", thisdata); } cat("\n"); } @@ -91,7 +91,7 @@ testAssociations <- function(currentVar, currentVarShort, thisdata) { if (isExposure==TRUE) { incrementCounter("start.exposure.catSin") } - testCategoricalSingle(currentVarShort, "CAT-SIN", thisdata); + testCategoricalSingle(vl, currentVarShort, "CAT-SIN", thisdata); } cat("\n"); } @@ -117,12 +117,12 @@ testAssociations <- function(currentVar, currentVarShort, thisdata) { } else { # get number of cat mult values denoting trait of interest - numVals = getNumValuesCatMultExposure(currentVarShort) + numVals = getNumValuesCatMultExposure(vl, currentVarShort) if (numVals>0) { addToCounts("start.exposure.catMulvalues", numVals) } } - testCategoricalMultiple(currentVarShort, "CAT-MUL", thisdata); + testCategoricalMultiple(vl, currentVarShort, "CAT-MUL", thisdata); } cat("\n"); } diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index 8aaf974..bf0768f 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -23,11 +23,11 @@ # in CAT_MULT_INDICATOR_FIELDS field of variable info file (either NO_NAN, ALL or a field ID) # 3) Checking derived variable has at least 10 cases in each group # 4) Calling binaryLogisticRegression function for this derived binary variable -testCategoricalMultiple <- function(varName, varType, thisdata) { +testCategoricalMultiple <- function(vl, varName, varType, thisdata) { cat("CAT-MULTIPLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] - pheno = reassignValue(pheno, varName) + pheno = reassignValue(vl, pheno, varName) ## get unique values from all columns of this variable uniqueValues = unique(na.omit(pheno[,1])); @@ -65,7 +65,7 @@ testCategoricalMultiple <- function(varName, varType, thisdata) { newthisdata = cbind.data.frame(thisdata[,1:numPreceedingCols], varBinaryFactor) ## one of 3 ways to decide which examples are negative - idxsToRemove = restrictSample(varName, pheno, variableVal, thisdata[,"userID", drop=FALSE]) + idxsToRemove = restrictSample(vl, varName, pheno, variableVal, thisdata[,"userID", drop=FALSE]) if (!is.null(idxsToRemove) & length(idxsToRemove) > 0) { newthisdata = newthisdata[-idxsToRemove,] @@ -80,7 +80,7 @@ testCategoricalMultiple <- function(varName, varType, thisdata) { incrementCounter("catMul.10") } else { - isExposure = getIsCatMultExposure(varName, variableVal) + isExposure = getIsCatMultExposure(vl, varName, variableVal) incrementCounter("catMul.over10") # binary - so logistic regression @@ -92,16 +92,16 @@ testCategoricalMultiple <- function(varName, varType, thisdata) { # restricts sample based on value in CAT_MULT_INDICATOR_FIELDS column of variable info file, # either NO_NAN, ALL or a field ID # returns idx's that should be removed from the sample -restrictSample <- function(varName,pheno,variableVal, userID) { +restrictSample <- function(vl, varName,pheno,variableVal, userID) { # get definition for sample for this variable either NO_NAN, ALL or a variable ID varIndicator = vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS[which(vl$phenoInfo$FieldID==varName)] - return(restrictSample2(varName,pheno,varIndicator,variableVal, userID)) + return(restrictSample2(vl, varName,pheno,varIndicator,variableVal, userID)) } -restrictSample2 <- function(varName,pheno, varIndicator,variableVal, userID) { +restrictSample2 <- function(vl, varName,pheno, varIndicator,variableVal, userID) { if (varIndicator=="NO_NAN") { # remove NAs ## remove all people with no value for this variable diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index 109f88b..e801e44 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -23,16 +23,16 @@ # 3) Replacing missing codes - we assume values < 0 are missing for categorical (single) variables # 4) Remove values with <10 cases # 5) Deterimine correct test to perform, either binary, ordered or unordered. -testCategoricalSingle <- function(varName, varType, thisdata) { +testCategoricalSingle <- function(vl, varName, varType, thisdata) { cat("CAT-SINGLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] - isExposure = getIsExposure(varName) + isExposure = getIsExposure(vl, varName) # assert variable has only one column if (!is.null(dim(pheno))) stop("More than one column for categorical single") - pheno = reassignValue(pheno, varName) + pheno = reassignValue(vl, pheno, varName) # get data code info - whether this data code is ordinal or not and any reordering dataPheno = vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),]; @@ -94,7 +94,7 @@ testCategoricalSingle <- function(varName, varType, thisdata) { incrementCounter("catSin.case2") thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); - testCategoricalUnordered(varName, varType, thisdatanew); + testCategoricalUnordered(vl, varName, varType, thisdatanew); } else if (ordered == 1) { @@ -105,7 +105,7 @@ testCategoricalSingle <- function(varName, varType, thisdata) { ## reorder variable values into increasing order thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); - testCategoricalOrdered(varName, varType, thisdatanew, order) + testCategoricalOrdered(vl, varName, varType, thisdatanew, order) } else if (ordered == -2) { diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index 78debf6..38f86bf 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -18,7 +18,7 @@ # Performs ordered logistic regression test and saves results in ordered logistic results file -testCategoricalOrdered <- function(varName, varType, thisdata, orderStr="") { +testCategoricalOrdered <- function(vl, varName, varType, thisdata, orderStr="") { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -98,7 +98,7 @@ testCategoricalOrdered <- function(varName, varType, thisdata, orderStr="") { cat("SUCCESS results-ordered-logistic"); incrementCounter("success.ordCat") - isExposure = getIsExposure(varName) + isExposure = getIsExposure(vl, varName) if (isExposure == TRUE) { incrementCounter("success.exposure.ordCat") } diff --git a/PHESANT/R/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r index eae9c73..6667e5c 100644 --- a/PHESANT/R/testCategoricalUnordered.r +++ b/PHESANT/R/testCategoricalUnordered.r @@ -19,7 +19,7 @@ # Tests an unordered categorical phenotype with multinomial regression # and saves this result in the multinomial logistic results file -testCategoricalUnordered <- function(varName, varType, thisdata) { +testCategoricalUnordered <- function(vl, varName, varType, thisdata) { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] #geno = thisdata[,"geno"] @@ -131,7 +131,7 @@ testCategoricalUnordered <- function(varName, varType, thisdata) { cat("SUCCESS results-notordered-logistic "); incrementCounter("success.unordCat") - isExposure = getIsExposure(varName) + isExposure = getIsExposure(vl, varName) if (isExposure == TRUE) { incrementCounter("success.exposure.unordCat") } diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index 1532eba..2cc159a 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -18,28 +18,28 @@ # Main function called for continuous fields -testContinuous <- function(varName, varType, thisdata) { +testContinuous <- function(vl, varName, varType, thisdata) { cat("CONTINUOUS MAIN || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] # reassign values - pheno = reassignValue(pheno, varName) + pheno = reassignValue(vl, pheno, varName) thisdata[,phenoStartIdx:ncol(thisdata)] = pheno - testContinuous2(varName, varType, thisdata) + testContinuous2(vl, varName, varType, thisdata) } # Main code used to process continuous fields, or integer fields that have been reassigned as continuous because they have >20 distinct values. # This is needed because we have already reassigned values for integer fields, so do this in the function above for continuous fields. -testContinuous2 <- function(varName, varType, thisdata) { +testContinuous2 <- function(vl, varName, varType, thisdata) { cat("CONTINUOUS || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] - isExposure = getIsExposure(varName) + isExposure = getIsExposure(vl, varName) if (!is.null(dim(pheno))) { phenoAvg = rowMeans(pheno, na.rm=TRUE) @@ -113,7 +113,7 @@ testContinuous2 <- function(varName, varType, thisdata) { incrementCounter("cont.ordcattry.ordcat") thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned); - testCategoricalOrdered(varName, varType, thisdatanew); + testCategoricalOrdered(vl, varName, varType, thisdatanew); } else { # try to treat as binary because not enough examples in each bin diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index 9f87511..0738de0 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -22,18 +22,18 @@ # 2) Generate a single value if there are several values (arrays) by taking the mean # 3) Treating this field as continuous if at least 20 distinct values. # Otherwise treat as binary or ordered categorical if 2 or more than two values. -testInteger <- function(varName, varType, thisdata) { +testInteger <- function(vl, varName, varType, thisdata) { cat("INTEGER || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] - isExposure = getIsExposure(varName) + isExposure = getIsExposure(vl, varName) if (!is.numeric(as.matrix(pheno))) { cat("SKIP Integer type but not numeric",sep=""); return(NULL) } - pheno = reassignValue(pheno, varName) + pheno = reassignValue(vl, pheno, varName) ## average if multiple columns if (!is.null(dim(pheno))) { @@ -52,7 +52,7 @@ testInteger <- function(varName, varType, thisdata) { if (length(uniqVar)>=20) { thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoAvg); - testContinuous2(varName, varType, thisdatanew) + testContinuous2(vl, varName, varType, thisdatanew) incrementCounter("int.continuous") } else { @@ -82,7 +82,7 @@ testInteger <- function(varName, varType, thisdata) { thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); # treat as ordinal categorical - testCategoricalOrdered(varName, varType, thisdatanew); + testCategoricalOrdered(vl, varName, varType, thisdatanew); } } } diff --git a/PHESANT/phenomeScan.r b/PHESANT/phenomeScan.r index 85fc6a9..6435475 100644 --- a/PHESANT/phenomeScan.r +++ b/PHESANT/phenomeScan.r @@ -41,7 +41,7 @@ if (opt$save==FALSE) { vl=initVariableLists(); ## load data -d <- loadData() +d <- loadData(vl) data=d$datax confounders=d$confounders indicatorFields=d$inds @@ -105,7 +105,7 @@ for (var in phenoVars) { if (first==FALSE) { thisdata = makeTestDataFrame(data, confounders, currentVarValues) - testAssociations(currentVar, currentVarShort, thisdata) + testAssociations(vl, currentVar, currentVarShort, thisdata) } first=FALSE; @@ -124,7 +124,7 @@ for (var in phenoVars) { if (phenoIdx>0){ # last variable so test association thisdata = makeTestDataFrame(data, confounders, currentVarValues) - testAssociations(currentVar, currentVarShort, thisdata) + testAssociations(vl, currentVar, currentVarShort, thisdata) } sink() From 8da1087f0a595e05cde1d415874d6212a7c0908e Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 12:37:51 -0400 Subject: [PATCH 04/27] Fix library imports --- PHESANT/DESCRIPTION | 3 ++- PHESANT/NAMESPACE | 4 ++++ PHESANT/R/loadData.r | 2 -- PHESANT/R/testCategoricalOrdered.r | 5 +---- PHESANT/R/testCategoricalUnordered.r | 2 -- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/PHESANT/DESCRIPTION b/PHESANT/DESCRIPTION index 62068ae..5677e5f 100644 --- a/PHESANT/DESCRIPTION +++ b/PHESANT/DESCRIPTION @@ -5,5 +5,6 @@ Version: 0.1 Date: 2019-03-14 Author: Quanli Wang Maintainer: Quanli Wang -Description: This will replement the original PHESANT in a package and potentially with some optimizations +Description: This will replement the original PHESANT in a package and potentially with some optimizations. License: GPL (>= 3) +Depends: MASS, lmtest, data.table, nnet diff --git a/PHESANT/NAMESPACE b/PHESANT/NAMESPACE index 404475d..7331d97 100644 --- a/PHESANT/NAMESPACE +++ b/PHESANT/NAMESPACE @@ -1,4 +1,8 @@ exportPattern("^[[:alpha:]]+") +import(MASS) +import(lmtest) +import(data.table) +import(nnet) importFrom("stats", "coef", "complete.cases", "confint", "end", "glm", "lm", "model.matrix", "na.omit", "pnorm", "qnorm", "quantile", "relevel") diff --git a/PHESANT/R/loadData.r b/PHESANT/R/loadData.r index 450b103..39f24a2 100644 --- a/PHESANT/R/loadData.r +++ b/PHESANT/R/loadData.r @@ -22,8 +22,6 @@ # creates confounder data frame # returns an object holding these two data frames loadData <- function(vl) { - library(data.table) - ##### validating data ## check phenotype file headers diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index 38f86bf..ba025bd 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -61,10 +61,7 @@ testCategoricalOrdered <- function(vl, varName, varType, thisdata, orderStr="") sink(modelFitLogFile, append=TRUE) print("--------------") print(varName) - - require(MASS) - require(lmtest) - + ### BEGIN TRYCATCH tryCatch({ confounders=thisdata[,3:numPreceedingCols, drop = FALSE] diff --git a/PHESANT/R/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r index 6667e5c..c0bb914 100644 --- a/PHESANT/R/testCategoricalUnordered.r +++ b/PHESANT/R/testCategoricalUnordered.r @@ -60,7 +60,6 @@ testCategoricalUnordered <- function(vl, varName, varType, thisdata) { print("--------------") print(varName) - require(nnet) if (opt$standardise==TRUE) { geno = scale(thisdata[,"geno"]) } @@ -80,7 +79,6 @@ testCategoricalUnordered <- function(vl, varName, varType, thisdata) { fitB <- multinom(phenoFactor ~ ., data=confounders, maxit=1000) ## compare model to baseline model - require(lmtest) lres = lrtest(fit, fitB) modelP = lres[2,"Pr(>Chisq)"]; From 0315f17f492094ede96cb797b8004381fd2cef58 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 13:45:16 -0400 Subject: [PATCH 05/27] Code refactor: remove global variables counters for now: might need a better solution later on --- PHESANT/R/binaryLogisticRegression.r | 19 ++++++------ PHESANT/R/counters.r | 16 ++++++----- PHESANT/R/testAssociations.r | 42 +++++++++++++-------------- PHESANT/R/testCatMultiple.r | 11 +++---- PHESANT/R/testCatSingle.r | 20 ++++++------- PHESANT/R/testCategoricalOrdered.r | 15 +++++----- PHESANT/R/testCategoricalUnordered.r | 17 +++++------ PHESANT/R/testContinuous.r | 43 ++++++++++++++-------------- PHESANT/R/testInteger.r | 17 +++++------ PHESANT/phenomeScan.r | 8 +++--- 10 files changed, 108 insertions(+), 100 deletions(-) diff --git a/PHESANT/R/binaryLogisticRegression.r b/PHESANT/R/binaryLogisticRegression.r index 6fbcefd..0ff9ecc 100644 --- a/PHESANT/R/binaryLogisticRegression.r +++ b/PHESANT/R/binaryLogisticRegression.r @@ -21,7 +21,7 @@ # # Performs binary logistic regression on the phenotype stored in thisdata # and stores result in 'results-logistic-binary' results file. -binaryLogisticRegression <- function(varName, varType, thisdata, isExposure) { +binaryLogisticRegression <- function(varName, counters, varType, thisdata, isExposure) { phenoFactor = factor(thisdata[,phenoStartIdx]) @@ -31,7 +31,7 @@ binaryLogisticRegression <- function(varName, varType, thisdata, isExposure) { if (length(facLevels)!=2) { #stop(paste("Not 2 levels: ", length(facLevels), " || ", sep="")) cat("BINARY-NOT2LEVELS- (", length(facLevels), ") || ",sep=""); - incrementCounter("binary.nottwolevels") + counters <- incrementCounter(counters, "binary.nottwolevels") } idxTrue = length(which(phenoFactor==facLevels[1])) @@ -40,11 +40,11 @@ binaryLogisticRegression <- function(varName, varType, thisdata, isExposure) { if (idxTrue<10 || idxFalse<10) { cat("BINARY-LOGISTIC-SKIP-10 (", idxTrue, "/", idxFalse, ") || ", sep="") - incrementCounter("binary.10") + counters <- incrementCounter(counters, "binary.10") } else if (numNotNA<500) { cat("BINARY-LOGISTIC-SKIP-500 (", numNotNA, ") || ",sep=""); - incrementCounter("binary.500") + counters <- incrementCounter(counters, "binary.500") } else { @@ -54,7 +54,7 @@ binaryLogisticRegression <- function(varName, varType, thisdata, isExposure) { # add pheno to dataframe storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'bin') cat("SUCCESS results-logistic-binary "); - incrementCounter("success.binary") + counters <- incrementCounter(counters, "success.binary") } else { @@ -102,10 +102,10 @@ binaryLogisticRegression <- function(varName, varType, thisdata, isExposure) { write(paste(varName,varType,paste(idxTrue,"/",idxFalse,"(",numNotNA,")",sep=""), beta,lower,upper,pvalue, sep=","), file=paste(opt$resDir,"results-logistic-binary-",opt$varTypeArg,".txt",sep=""), append="TRUE"); cat("SUCCESS results-logistic-binary "); - incrementCounter("success.binary") + counters <- incrementCounter(counters, "success.binary") if (isExposure==TRUE) { - incrementCounter("success.exposure.binary") + counters <-incrementCounter(counters,"success.exposure.binary") } ## END TRYCATCH @@ -113,9 +113,10 @@ binaryLogisticRegression <- function(varName, varType, thisdata, isExposure) { sink() sink(resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - incrementCounter("binary.error") + counters <- incrementCounter(counters, "binary.error") }) } - } + } + return(counters) } diff --git a/PHESANT/R/counters.r b/PHESANT/R/counters.r index 15fdc47..a6e22a7 100644 --- a/PHESANT/R/counters.r +++ b/PHESANT/R/counters.r @@ -17,31 +17,33 @@ # DEALINGS IN THE SOFTWARE. # adds given value to counter, that are used to count how many variables go down each route in the data flow -addToCounts <- function(countName, num) { +addToCounts <- function(counters, countName, num) { idx = which(counters$name==countName) if (length(idx)==0) { # counter does not exist so add with countValue 1 - counters <<- rbind(counters, data.frame(name=countName, countValue=num)) + counters <- rbind(counters, data.frame(name=countName, countValue=num)) } else { # add to counter that already exists - counters$countValue[idx] <<- counters$countValue[idx]+num + counters$countValue[idx] <- counters$countValue[idx]+num } + return(counters) } # increments counters used to count how many variables go down each route in the data flow -incrementCounter <- function(countName) { +incrementCounter <- function(counters, countName) { idx = which(counters$name==countName) if (length(idx)==0) { # counter does not exist so add with countValue 1 - counters <<- rbind(counters, data.frame(name=countName, countValue=1)) + counters <- rbind(counters, data.frame(name=countName, countValue=1)) } else { # increment counter that already exists - counters$countValue[idx] <<- counters$countValue[idx]+1 + counters$countValue[idx] <- counters$countValue[idx]+1 } + return(counters) } # Saves the counters stored in count variables, to a file in results directory -saveCounts <- function() { +saveCounts <- function(counters) { countFile = paste(opt$resDir,"variable-flow-counts-",opt$varTypeArg,".txt",sep="") # sort on counter name sortIdx = order(as.character(counters[,"name"])) diff --git a/PHESANT/R/testAssociations.r b/PHESANT/R/testAssociations.r index 8a2aba1..1ecb5eb 100644 --- a/PHESANT/R/testAssociations.r +++ b/PHESANT/R/testAssociations.r @@ -18,7 +18,7 @@ # Tests the association of a field, determined by its field type -testAssociations <- function(vl, currentVar, currentVarShort, thisdata) { +testAssociations <- function(vl, counters, currentVar, currentVarShort, thisdata) { ## call file for variable type @@ -30,7 +30,7 @@ testAssociations <- function(vl, currentVar, currentVarShort, thisdata) { # check if variable info is found for this field if (length(idx)==0) { cat(paste(currentVar, " || Variable could not be found in pheno info file. \n", sep="")) - incrementCounter("notinphenofile") + counters <- incrementCounter(counters, "notinphenofile") } else { @@ -47,15 +47,15 @@ testAssociations <- function(vl, currentVar, currentVarShort, thisdata) { if (excluded!="") { cat(paste("Excluded integer: ", excluded, " || ", sep="")) - incrementCounter("excluded.int") + counters <- incrementCounter(counters, "excluded.int") } else { - incrementCounter("start.int") + counters <- incrementCounter(counters, "start.int") if (isExposure==TRUE) { - incrementCounter("start.exposure.int") + counters <- incrementCounter(counters, "start.exposure.int") } - testInteger(vl, currentVarShort, "INTEGER", thisdata); + counters <- testInteger(vl, couinters, currentVarShort, "INTEGER", thisdata); } cat("\n"); } @@ -66,14 +66,14 @@ testAssociations <- function(vl, currentVar, currentVarShort, thisdata) { if (excluded!="") { cat(paste("Excluded continuous: ", excluded, " || ", sep="")) - incrementCounter("excluded.cont") + counters <- incrementCounter(counters, "excluded.cont") } else { - incrementCounter("start.cont") + counters <- incrementCounter(counters, "start.cont") if (isExposure==TRUE) { - incrementCounter("start.exposure.cont") + counters<- incrementCounter(counters, "start.exposure.cont") } - testContinuous(vl, currentVarShort, "CONTINUOUS", thisdata); + counters <- testContinuous(vl, counters, currentVarShort, "CONTINUOUS", thisdata); } cat("\n"); } @@ -84,14 +84,14 @@ testAssociations <- function(vl, currentVar, currentVarShort, thisdata) { if (excluded!="") { cat(paste("Excluded cat-single: ", excluded, " || ", sep="")) - incrementCounter("excluded.catSin") + counters <- incrementCounter(counters, "excluded.catSin") } else { - incrementCounter("start.catSin") + counters <- incrementCounter(counters, "start.catSin") if (isExposure==TRUE) { - incrementCounter("start.exposure.catSin") + counters <-incrementCounter(counters, "start.exposure.catSin") } - testCategoricalSingle(vl, currentVarShort, "CAT-SIN", thisdata); + counters <- testCategoricalSingle(vl, counters, currentVarShort, "CAT-SIN", thisdata); } cat("\n"); } @@ -102,27 +102,27 @@ testAssociations <- function(vl, currentVar, currentVarShort, thisdata) { if (excluded!="") { cat(paste("Excluded cat-multiple: ", excluded, " || ", sep="")) - incrementCounter("excluded.catMul") + counters <- incrementCounter(counters, "excluded.catMul") } else { if (catSinToMult!="") { cat("cat-single to cat-multiple || ", sep="") - incrementCounter("catSinToCatMul") + counters <- incrementCounter(counters, "catSinToCatMul") } - incrementCounter("start.catMul") + counters <- incrementCounter(counters, "start.catMul") if (isExposure==TRUE) { - incrementCounter("start.exposure.catMul") + counters <- incrementCounter(counters, "start.exposure.catMul") } else { # get number of cat mult values denoting trait of interest numVals = getNumValuesCatMultExposure(vl, currentVarShort) if (numVals>0) { - addToCounts("start.exposure.catMulvalues", numVals) + counters <- addToCounts(addToCounts, "start.exposure.catMulvalues", numVals) } } - testCategoricalMultiple(vl, currentVarShort, "CAT-MUL", thisdata); + counters <- testCategoricalMultiple(vl, counters, currentVarShort, "CAT-MUL", thisdata); } cat("\n"); } @@ -134,7 +134,7 @@ testAssociations <- function(vl, currentVar, currentVarShort, thisdata) { }, error = function(e) { print(paste("ERROR:", currentVar,e)) }) - + return(counters) } diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index bf0768f..4df64fe 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -23,7 +23,7 @@ # in CAT_MULT_INDICATOR_FIELDS field of variable info file (either NO_NAN, ALL or a field ID) # 3) Checking derived variable has at least 10 cases in each group # 4) Calling binaryLogisticRegression function for this derived binary variable -testCategoricalMultiple <- function(vl, varName, varType, thisdata) { +testCategoricalMultiple <- function(vl, counters, varName, varType, thisdata) { cat("CAT-MULTIPLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] @@ -54,7 +54,7 @@ testCategoricalMultiple <- function(vl, varName, varType, thisdata) { idxsTrue = idxForVar[,"row"] cat(" CAT-MUL-BINARY-VAR ", variableVal, " || ", sep=""); - incrementCounter("catMul.binary") + counters <- incrementCounter(counters, "catMul.binary") # make zero vector and set 1s for those with this variable value varBinary = rep.int(0,numRows); @@ -77,16 +77,17 @@ testCategoricalMultiple <- function(vl, varName, varType, thisdata) { if (idxTrue<10 || idxFalse<10) { cat("CAT-MULT-SKIP-10 (", idxTrue, " vs ", idxFalse, ") || ", sep=""); - incrementCounter("catMul.10") + counters <- incrementCounter(counters, "catMul.10") } else { isExposure = getIsCatMultExposure(vl, varName, variableVal) - incrementCounter("catMul.over10") + counters <- incrementCounter(counters, "catMul.over10") # binary - so logistic regression - binaryLogisticRegression(paste(varName, variableVal,sep="#"), varType, newthisdata, isExposure) + counters <- binaryLogisticRegression(paste(varName, variableVal,sep="#"), counters, varType, newthisdata, isExposure) } } + return(counters) } # restricts sample based on value in CAT_MULT_INDICATOR_FIELDS column of variable info file, diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index e801e44..664622d 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -23,7 +23,7 @@ # 3) Replacing missing codes - we assume values < 0 are missing for categorical (single) variables # 4) Remove values with <10 cases # 5) Deterimine correct test to perform, either binary, ordered or unordered. -testCategoricalSingle <- function(vl, varName, varType, thisdata) { +testCategoricalSingle <- function(vl, counters, varName, varType, thisdata) { cat("CAT-SINGLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -68,17 +68,17 @@ testCategoricalSingle <- function(vl, varName, varType, thisdata) { if (length(uniqVar)<=1) { cat("SKIP (only one value) || "); - incrementCounter("catSin.onevalue") + counters <- incrementCounter(counters, "catSin.onevalue") } else if (length(uniqVar)==2) { cat("CAT-SINGLE-BINARY || "); - incrementCounter("catSin.case3") + counters <- incrementCounter(counters, "catSin.case3") # binary so logistic regression phenoFactor = factor(pheno) # binary - so logistic regression thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); - binaryLogisticRegression(varName, varType, thisdatanew, isExposure) + counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure) } else { # > 2 categories @@ -91,33 +91,33 @@ testCategoricalSingle <- function(vl, varName, varType, thisdata) { if (ordered == 0) { cat("CAT-SINGLE-UNORDERED || ") - incrementCounter("catSin.case2") + counters <- incrementCounter(counters, "catSin.case2") thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); - testCategoricalUnordered(vl, varName, varType, thisdatanew); + counters <- testCategoricalUnordered(vl, counters, varName, varType, thisdatanew); } else if (ordered == 1) { ## ordered cat("ordered || "); - incrementCounter("catSin.case1") + counters <- incrementCounter(counters, "catSin.case1") ## reorder variable values into increasing order thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); - testCategoricalOrdered(vl, varName, varType, thisdatanew, order) + counters <- testCategoricalOrdered(vl, counters, varName, varType, thisdatanew, order) } else if (ordered == -2) { cat(" EXCLUDED or BINARY variable: Should not get here in code. ") - incrementCounter("catSin.binaryorexcluded") + counters <- incrementCounter(counters, "catSin.binaryorexcluded") } else { print(paste("ERROR", varName, varType, dataCode)); } } } - + return(counters) } ## values are reordered and assigned values 1:N for N categories diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index ba025bd..efc8386 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -18,14 +18,14 @@ # Performs ordered logistic regression test and saves results in ordered logistic results file -testCategoricalOrdered <- function(vl, varName, varType, thisdata, orderStr="") { +testCategoricalOrdered <- function(vl, counters, varName, varType, thisdata, orderStr="") { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] geno = thisdata[,"geno"] cat("CAT-ORD || "); - incrementCounter("ordCat") + counters <- incrementCounter(counters, "ordCat") doCatOrdAssertions(pheno) @@ -39,7 +39,7 @@ testCategoricalOrdered <- function(vl, varName, varType, thisdata, orderStr="") numNotNA = length(which(!is.na(pheno))) if (numNotNA<500) { cat("CATORD-SKIP-500 (", numNotNA, ") || ",sep=""); - incrementCounter("ordCat.500") + counters <- incrementCounter(counters, "ordCat.500") } else { # test this cat ordered variable with ordered logistic regression @@ -52,7 +52,7 @@ testCategoricalOrdered <- function(vl, varName, varType, thisdata, orderStr="") # add pheno to dataframe storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catOrd') cat("SUCCESS results-ordered-logistic"); - incrementCounter("success.ordCat") + counters <- incrementCounter(counters, "success.ordCat") } else { @@ -93,11 +93,11 @@ testCategoricalOrdered <- function(vl, varName, varType, thisdata, orderStr="") write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-ordered-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE"); cat("SUCCESS results-ordered-logistic"); - incrementCounter("success.ordCat") + counters <- incrementCounter(counters, "success.ordCat") isExposure = getIsExposure(vl, varName) if (isExposure == TRUE) { - incrementCounter("success.exposure.ordCat") + counters <- incrementCounter(counters, "success.exposure.ordCat") } ### END TRYCATCH @@ -105,11 +105,12 @@ testCategoricalOrdered <- function(vl, varName, varType, thisdata, orderStr="") sink() sink(resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - incrementCounter("ordCat.error") + counters <- incrementCounter(counters, "ordCat.error") }) } } + return(counters) } # check that the phenotype is valid - that there are more than two categories diff --git a/PHESANT/R/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r index c0bb914..561ffa3 100644 --- a/PHESANT/R/testCategoricalUnordered.r +++ b/PHESANT/R/testCategoricalUnordered.r @@ -19,7 +19,7 @@ # Tests an unordered categorical phenotype with multinomial regression # and saves this result in the multinomial logistic results file -testCategoricalUnordered <- function(vl, varName, varType, thisdata) { +testCategoricalUnordered <- function(vl, counters, varName, varType, thisdata) { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] #geno = thisdata[,"geno"] @@ -27,7 +27,7 @@ testCategoricalUnordered <- function(vl, varName, varType, thisdata) { numNotNA = length(which(!is.na(pheno))) if (numNotNA<500) { cat("CATUNORD-SKIP-500 (", numNotNA, ") || ",sep=""); - incrementCounter("unordCat.500") + counters <- incrementCounter(counters, "unordCat.500") } else { @@ -38,8 +38,8 @@ testCategoricalUnordered <- function(vl, varName, varType, thisdata) { numWeights=(numUnique-1)*((numPreceedingCols-2)+1+1) if (numWeights>1000) { cat("Too many weights in model: ", numWeights, " > 1000, (num outcomes values: ", numUnique, ") || SKIP ", sep="") - incrementCounter("unordCat.cats") - return(NULL) + counters <-incrementCounter(counters, "unordCat.cats") + return(counters) } phenoFactor = chooseReferenceCategory(pheno); @@ -48,7 +48,7 @@ testCategoricalUnordered <- function(vl, varName, varType, thisdata) { # add pheno to dataframe storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catUnord') cat("SUCCESS results-notordered-logistic "); - incrementCounter("success.unordCat") + counters <- incrementCounter(counters, "success.unordCat") } else { @@ -127,11 +127,11 @@ testCategoricalUnordered <- function(vl, varName, varType, thisdata) { } cat("SUCCESS results-notordered-logistic "); - incrementCounter("success.unordCat") + counters <- incrementCounter(counters, "success.unordCat") isExposure = getIsExposure(vl, varName) if (isExposure == TRUE) { - incrementCounter("success.exposure.unordCat") + counters <- incrementCounter(counters, "success.exposure.unordCat") } ## END TRYCATCH @@ -139,10 +139,11 @@ testCategoricalUnordered <- function(vl, varName, varType, thisdata) { sink() sink(resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - incrementCounter("unordCat.error") + counters <- incrementCounter(counters, "unordCat.error") }) } } + return(counters) } # find reference category - category with most number of examples diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index 2cc159a..865c021 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -18,7 +18,7 @@ # Main function called for continuous fields -testContinuous <- function(vl, varName, varType, thisdata) { +testContinuous <- function(vl, counters, varName, varType, thisdata) { cat("CONTINUOUS MAIN || "); @@ -29,13 +29,13 @@ testContinuous <- function(vl, varName, varType, thisdata) { thisdata[,phenoStartIdx:ncol(thisdata)] = pheno - testContinuous2(vl, varName, varType, thisdata) + counters <- testContinuous2(vl, counters, varName, varType, thisdata) } # Main code used to process continuous fields, or integer fields that have been reassigned as continuous because they have >20 distinct values. # This is needed because we have already reassigned values for integer fields, so do this in the function above for continuous fields. -testContinuous2 <- function(vl, varName, varType, thisdata) { +testContinuous2 <- function(vl, counters, varName, varType, thisdata) { cat("CONTINUOUS || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -86,19 +86,19 @@ testContinuous2 <- function(vl, varName, varType, thisdata) { if (numLevels<=1) { cat("SKIP (number of levels: ",numLevels,")",sep="") - incrementCounter("cont.onevalue") + counters <- incrementCounter(counters,"cont.onevalue") } else if (numLevels==2) { # binary - incrementCounter("cont.binary") + counters <- incrementCounter(counters, "cont.binary") thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); - binaryLogisticRegression(varName, varType, thisdatanew, isExposure); + counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure); } } else { ## try to treat as ordered categorical - incrementCounter("cont.ordcattry") + counters <- incrementCounter(counters, "cont.ordcattry") ## equal sized bins phenoBinned = equalSizedBins(phenoAvg); @@ -111,9 +111,9 @@ testContinuous2 <- function(vl, varName, varType, thisdata) { # successful binning. >=10 examples in each of the 3 bins - incrementCounter("cont.ordcattry.ordcat") + counters <- incrementCounter(counters, "cont.ordcattry.ordcat") thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned); - testCategoricalOrdered(vl, varName, varType, thisdatanew); + counters <- testCategoricalOrdered(vl, counters, varName, varType, thisdatanew); } else { # try to treat as binary because not enough examples in each bin @@ -122,36 +122,36 @@ testContinuous2 <- function(vl, varName, varType, thisdata) { ## skip - not possible to create binary variable because first and third bins are too small ## ie. could merge bin1 with bin 2 but then bin3 still too small etc cat("SKIP 2 bins are too small || ") - incrementCounter("cont.ordcattry.smallbins") + counters <-incrementCounter(counters, "cont.ordcattry.smallbins") } else if ((bin0Num<10 | bin1Num<10) & (bin0Num+bin1Num)>=10) { # combine first and second bin to create binary variable - incrementCounter("cont.ordcattry.binsbinary") + counters <- incrementCounter(counters, "cont.ordcattry.binsbinary") cat("Combine first two bins and treat as binary || ") phenoBinned[which(phenoBinned==0)] = 1 # test binary thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned) - binaryLogisticRegression(varName, varType, thisdatanew, isExposure); + counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure); } else if ((bin2Num<10 | bin1Num<10) & (bin2Num+bin1Num)>=10) { # combine second and last bin to create binary variable - incrementCounter("cont.ordcattry.binsbinary") + counters <- incrementCounter(counters, "cont.ordcattry.binsbinary") cat("Combine last two bins and treat as binary || ") phenoBinned[which(phenoBinned==2)] = 1 # test binary thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned) - binaryLogisticRegression(varName, varType, thisdatanew, isExposure) + counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure) } else { ## skip - not possible to create binary variable because combining bins would still be too small cat("SKIP 2 bins are too small(2) || ") - incrementCounter("cont.ordcattry.smallbins2") + counters <-incrementCounter(counters, "cont.ordcattry.smallbins2") } } @@ -160,13 +160,13 @@ testContinuous2 <- function(vl, varName, varType, thisdata) { } else { cat("IRNT || "); - incrementCounter("cont.main") + counters <- incrementCounter(counters, "cont.main") # check there are at least 500 examples numNotNA = length(which(!is.na(phenoAvg))) if (numNotNA<500) { cat("CONTINUOUS-SKIP-500 (", numNotNA, ") || ",sep=""); - incrementCounter("cont.main.500") + counters <- incrementCounter(counters, "cont.main.500") } else { ## inverse rank normal transformation @@ -176,7 +176,7 @@ testContinuous2 <- function(vl, varName, varType, thisdata) { # add pheno to dataframe storeNewVar(thisdata[,"userID"], phenoIRNT, varName, 'cont') cat("SUCCESS results-linear"); - incrementCounter("success.continuous") + counters <- incrementCounter(counters, "success.continuous") } else { @@ -223,9 +223,9 @@ testContinuous2 <- function(vl, varName, varType, thisdata) { write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt", sep=""), append="TRUE"); cat("SUCCESS results-linear"); - incrementCounter("success.continuous") + counters <- incrementCounter(counters, "success.continuous") if (isExposure == TRUE) { - incrementCounter("success.exposure.continuous") + counters <- incrementCounter(counters, "success.exposure.continuous") } ## END TRYCATCH @@ -233,11 +233,12 @@ testContinuous2 <- function(vl, varName, varType, thisdata) { sink() sink(resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - incrementCounter("continuous.error") + counters <-incrementCounter(counters, "continuous.error") }) } } } + return(counters) } irnt <- function(pheno) { diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index 0738de0..02791bc 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -22,7 +22,7 @@ # 2) Generate a single value if there are several values (arrays) by taking the mean # 3) Treating this field as continuous if at least 20 distinct values. # Otherwise treat as binary or ordered categorical if 2 or more than two values. -testInteger <- function(vl, varName, varType, thisdata) { +testInteger <- function(vl, counters, varName, varType, thisdata) { cat("INTEGER || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -52,8 +52,8 @@ testInteger <- function(vl, varName, varType, thisdata) { if (length(uniqVar)>=20) { thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoAvg); - testContinuous2(vl, varName, varType, thisdatanew) - incrementCounter("int.continuous") + counters <- testContinuous2(vl, counters, varName, varType, thisdatanew) + counters <- incrementCounter(counters, "int.continuous") } else { @@ -65,24 +65,25 @@ testInteger <- function(vl, varName, varType, thisdata) { numLevels = length(levels(phenoFactor)) if (numLevels<=1) { cat("SKIP (number of levels: ",numLevels,")",sep=""); - incrementCounter("int.onevalue") + counters <- incrementCounter(counters, "int.onevalue") } else if (numLevels==2) { - incrementCounter("int.binary") + counters <- incrementCounter(counters, "int.binary") # binary thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); - binaryLogisticRegression(varName, varType, thisdatanew, isExposure); + counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure); } else { - incrementCounter("int.catord") + counters <- incrementCounter(counters, "int.catord") cat("3-20 values || ") # we don't use equal sized bins just the original integers (that have >=10 examples) as categories thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); # treat as ordinal categorical - testCategoricalOrdered(vl, varName, varType, thisdatanew); + counters <- testCategoricalOrdered(vl, counters, varName, varType, thisdatanew); } } + return(counters) } diff --git a/PHESANT/phenomeScan.r b/PHESANT/phenomeScan.r index 6435475..33c2ef5 100644 --- a/PHESANT/phenomeScan.r +++ b/PHESANT/phenomeScan.r @@ -34,7 +34,7 @@ if (length(args) == 0) { } ## load the files we write to and use -counters=initCounters(); +counters <- initCounters() if (opt$save==FALSE) { initResultsFiles(); } @@ -105,7 +105,7 @@ for (var in phenoVars) { if (first==FALSE) { thisdata = makeTestDataFrame(data, confounders, currentVarValues) - testAssociations(vl, currentVar, currentVarShort, thisdata) + counters <- testAssociations(vl, counters, currentVar, currentVarShort, thisdata) } first=FALSE; @@ -124,13 +124,13 @@ for (var in phenoVars) { if (phenoIdx>0){ # last variable so test association thisdata = makeTestDataFrame(data, confounders, currentVarValues) - testAssociations(vl, currentVar, currentVarShort, thisdata) + counters <- testAssociations(vl, counters, currentVar, currentVarShort, thisdata) } sink() # save counters of each path in variable flow -saveCounts() +saveCounts(counters) if (opt$save == TRUE) { write.table(derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); From c40962fc1c78f86ceebc78b86ef9bc9f737807af Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 14:37:23 -0400 Subject: [PATCH 06/27] Code refactor: remove global variable opt --- PHESANT/R/binaryLogisticRegression.r | 2 +- PHESANT/R/counters.r | 2 +- PHESANT/R/initFunctions.r | 4 +- PHESANT/R/loadConfounders.r | 6 +- PHESANT/R/loadData.r | 14 +-- PHESANT/R/loadIndicatorFields.r | 2 +- PHESANT/R/loadPhenotypes.r | 2 +- PHESANT/R/loadTraitOfInterest.r | 2 +- PHESANT/R/processArgs.r | 158 ++++++++++++--------------- PHESANT/R/testAssociations.r | 10 +- PHESANT/R/testCatMultiple.r | 4 +- PHESANT/R/testCatSingle.r | 8 +- PHESANT/R/testCategoricalOrdered.r | 2 +- PHESANT/R/testCategoricalUnordered.r | 2 +- PHESANT/R/testContinuous.r | 14 +-- PHESANT/R/testInteger.r | 8 +- PHESANT/R/validatePhenotypeInput.r | 2 +- PHESANT/R/validateTraitInput.r | 2 +- PHESANT/man/PHESANT-package.Rd | 10 +- PHESANT/phenomeScan.r | 20 ++-- 20 files changed, 128 insertions(+), 146 deletions(-) diff --git a/PHESANT/R/binaryLogisticRegression.r b/PHESANT/R/binaryLogisticRegression.r index 0ff9ecc..a5f9d4f 100644 --- a/PHESANT/R/binaryLogisticRegression.r +++ b/PHESANT/R/binaryLogisticRegression.r @@ -21,7 +21,7 @@ # # Performs binary logistic regression on the phenotype stored in thisdata # and stores result in 'results-logistic-binary' results file. -binaryLogisticRegression <- function(varName, counters, varType, thisdata, isExposure) { +binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, isExposure) { phenoFactor = factor(thisdata[,phenoStartIdx]) diff --git a/PHESANT/R/counters.r b/PHESANT/R/counters.r index a6e22a7..ef706b9 100644 --- a/PHESANT/R/counters.r +++ b/PHESANT/R/counters.r @@ -43,7 +43,7 @@ incrementCounter <- function(counters, countName) { } # Saves the counters stored in count variables, to a file in results directory -saveCounts <- function(counters) { +saveCounts <- function(opt, counters) { countFile = paste(opt$resDir,"variable-flow-counts-",opt$varTypeArg,".txt",sep="") # sort on counter name sortIdx = order(as.character(counters[,"name"])) diff --git a/PHESANT/R/initFunctions.r b/PHESANT/R/initFunctions.r index 38a04b5..7e9f996 100644 --- a/PHESANT/R/initFunctions.r +++ b/PHESANT/R/initFunctions.r @@ -18,7 +18,7 @@ # create new results files and headers -initResultsFiles <- function() { +initResultsFiles <- function(opt) { ## only linear and continuous fields can create linear results file.create(paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt",sep="")); @@ -39,7 +39,7 @@ initResultsFiles <- function() { } # load the variable information and data code information files -initVariableLists <- function() { +initVariableLists <- function(opt) { phenoInfo=read.table(opt$variablelistfile,sep="\t",header=1,comment.char="",quote=""); diff --git a/PHESANT/R/loadConfounders.r b/PHESANT/R/loadConfounders.r index 72c5f28..b2cdda2 100644 --- a/PHESANT/R/loadConfounders.r +++ b/PHESANT/R/loadConfounders.r @@ -19,7 +19,7 @@ ## ## loads confounder variables from phenotype file -loadConfounders <- function(phenotypes) { +loadConfounders <- function(opt, phenotypes) { if (opt$save==TRUE) { # saving not running tests so we add a fake confounder numRows = nrow(phenotypes) @@ -40,7 +40,7 @@ loadConfounders <- function(phenotypes) { colnames(confs)[1] <- "userID" } else { print("Loading confounders from phenotypes file ...") - confNames = getConfounderNames() + confNames = getConfounderNames(opt) ##### ##### extract confounders from data file confs = fread(opt$phenofile, select=confNames, sep=',', header=TRUE, data.table=FALSE) @@ -90,7 +90,7 @@ loadConfounders <- function(phenotypes) { } } -getConfounderNames <- function() { +getConfounderNames <- function(opt) { ##### ##### first get vector of confounder names # age and sex diff --git a/PHESANT/R/loadData.r b/PHESANT/R/loadData.r index 39f24a2..e73ca27 100644 --- a/PHESANT/R/loadData.r +++ b/PHESANT/R/loadData.r @@ -21,25 +21,25 @@ # creates phenotype / trait of interest data frame # creates confounder data frame # returns an object holding these two data frames -loadData <- function(vl) { +loadData <- function(opt, vl) { ##### validating data ## check phenotype file headers - validatePhenotypeInput() + validatePhenotypeInput(opt) ## check trait of interest file headers - validateTraitInput() + validateTraitInput(opt) ##### load data ## load phenotype print("Loading phenotypes ...") - phenotype = loadPhenotypes() + phenotype = loadPhenotypes(opt) ## load trait of interest - toi <- loadTraitOfInterest(phenotype) + toi <- loadTraitOfInterest(opt, phenotype) ## load confounders - conf <- loadConfounders(phenotype) + conf <- loadConfounders(opt, phenotype) ## add trait of interest to phenotype data frame and remove rows with no trait of interest ## merge in toi with phenotype - keep id list from phenotypes file @@ -68,7 +68,7 @@ loadData <- function(vl) { # some fields are fixed that have a field type as cat single but we want to treat them like cat mult phenotype = fixOddFieldsToCatMul(vl, phenotype) - indFields = loadIndicatorFields(vl, colnames(phenotype)) + indFields = loadIndicatorFields(opt, vl, colnames(phenotype)) d = list(datax=phenotype, confounders=conf, inds=indFields) return(d) } diff --git a/PHESANT/R/loadIndicatorFields.r b/PHESANT/R/loadIndicatorFields.r index 8159ddb..1a815cd 100644 --- a/PHESANT/R/loadIndicatorFields.r +++ b/PHESANT/R/loadIndicatorFields.r @@ -18,7 +18,7 @@ ## ## load data used for data code default value related field, and categorical multiple indicator field -loadIndicatorFields <- function(vl, phenosToTest) { +loadIndicatorFields <- function(opt, vl, phenosToTest) { print("Loading indicator fields from phenotypes file ...") # read pheno file column names diff --git a/PHESANT/R/loadPhenotypes.r b/PHESANT/R/loadPhenotypes.r index 4276242..0a55079 100644 --- a/PHESANT/R/loadPhenotypes.r +++ b/PHESANT/R/loadPhenotypes.r @@ -19,7 +19,7 @@ ## ## load phenotypes from phenotype file -loadPhenotypes <- function() { +loadPhenotypes <- function(opt) { ## is not running 'all' then we determine the start and end idxs of phenotypes that we test, so that we can parallelise into multiple jobs if (opt$varTypeArg!="all") { # read pheno file column names diff --git a/PHESANT/R/loadTraitOfInterest.r b/PHESANT/R/loadTraitOfInterest.r index 52cf5b5..7f637f2 100644 --- a/PHESANT/R/loadTraitOfInterest.r +++ b/PHESANT/R/loadTraitOfInterest.r @@ -20,7 +20,7 @@ ## ## load trait of interest, either from separate trait of interest file, or from phenotype file -loadTraitOfInterest <- function(phenotypes) { +loadTraitOfInterest <- function(opt, phenotypes) { if (opt$save==TRUE) { # saving not running tests so we don't have a trait of interest # add pretend trait of interest so other code doesn't break diff --git a/PHESANT/R/processArgs.r b/PHESANT/R/processArgs.r index 2b9a438..0413c78 100644 --- a/PHESANT/R/processArgs.r +++ b/PHESANT/R/processArgs.r @@ -20,96 +20,80 @@ # Parse the arguments input by the user # if argument 'test' is used then run test phenome scan processArgs <- function() { - -if (opt$test==TRUE) { - - # set up the test phenome scan settings - datadir='../testWAS/data/'; - opt$resDir <<- '../testWAS/results/'; - opt$userId <<- 'userId'; - opt$phenofile <<- paste(datadir,'phenotypes.csv', sep=""); - opt$variablelistfile <<- '../testWAS/variable-lists/outcome-info.tsv'; - opt$datacodingfile <<- '../testWAS/variable-lists/data-coding-ordinal-info.txt'; - opt$confidenceintervals <<- TRUE - - if(opt$save == FALSE) { - opt$traitofinterestfile <<- paste(datadir,'exposure.csv', sep=""); - opt$traitofinterest <<- 'exposure'; - opt$sensitivity <<- FALSE; - opt$genetic <<- TRUE; - } - - processParts(opt$partIdx, opt$numParts); -} -else { - - ## check arguments are supplied correctly - - if (is.null(opt$phenofile)){ - print_help(opt_parser) - stop("phenofile argument must be supplied", call.=FALSE) - } - else if (!file.exists(opt$phenofile)) { - stop(paste("phenotype data file phenofile=", opt$phenofile, " does not exist", sep=""), call.=FALSE) - } - -# if (is.null(opt$traitofinterestfile)){ -# print_help(opt_parser) -# stop("traitofinterestfile argument must be supplied", call.=FALSE) -# } - if (opt$save==FALSE && !is.null(opt$traitofinterestfile) && !file.exists(opt$traitofinterestfile)) { - stop(paste("trait of interest data file traitofinterestfile=", opt$traitofinterestfile, " does not exist", sep=""), call.=FALSE) - } - - if (opt$save==FALSE && !is.null(opt$confounderfile) && !file.exists(opt$confounderfile)) { - stop(paste("confounder data file confounderfile=", opt$confounderfile, " does not exist", sep=""), call.=FALSE) - } - - if (is.null(opt$variablelistfile)){ - print_help(opt_parser) - stop("variablelistfile argument must be supplied", call.=FALSE) - } - else if (!file.exists(opt$variablelistfile)) { - stop(paste("variable listing file variablelistfile=", opt$variablelistfile, " does not exist", sep=""), call.=FALSE) - } - - if (is.null(opt$datacodingfile)){ - print_help(opt_parser) - stop("datacodingfile argument must be supplied", call.=FALSE) - } - else if (!file.exists(opt$datacodingfile)) { - stop(paste("data coding file datacodingfile=", opt$datacodingfile, " does not exist", sep=""), call.=FALSE) - } - - if (opt$save==FALSE && is.null(opt$traitofinterest)){ - print_help(opt_parser) - stop("traitofinterest argument must be supplied", call.=FALSE) - } - - if (is.null(opt$resDir)){ - print_help(opt_parser) - stop("resDir argument must be supplied", call.=FALSE) - } - else if (!file.exists(opt$resDir)) { - stop(paste("results directory resDir=", opt$resDir, " does not exist", sep=""), call.=FALSE) - } - - - processParts(opt$partIdx, opt$numParts); -} - - -if (opt$save==TRUE) { - print("Saving phenotypes to file. Tests of association will not run!") -} - + if (opt$test==TRUE) { + # set up the test phenome scan settings + datadir='../testWAS/data/'; + opt$resDir <- '../testWAS/results/'; + opt$userId <- 'userId'; + opt$phenofile <- paste(datadir,'phenotypes.csv', sep=""); + opt$variablelistfile <- '../testWAS/variable-lists/outcome-info.tsv'; + opt$datacodingfile <- '../testWAS/variable-lists/data-coding-ordinal-info.txt'; + opt$confidenceintervals <- TRUE + + if(opt$save == FALSE) { + opt$traitofinterestfile <- paste(datadir,'exposure.csv', sep=""); + opt$traitofinterest <- 'exposure'; + opt$sensitivity <- FALSE; + opt$genetic <- TRUE; + } + opt <- processParts(opt, opt$partIdx, opt$numParts) + } + else { + ## check arguments are supplied correctly + if (is.null(opt$phenofile)){ + print_help(opt_parser) + stop("phenofile argument must be supplied", call.=FALSE) + } else if (!file.exists(opt$phenofile)) { + stop(paste("phenotype data file phenofile=", opt$phenofile, " does not exist", sep=""), call.=FALSE) + } + + if (opt$save==FALSE && !is.null(opt$traitofinterestfile) && !file.exists(opt$traitofinterestfile)) { + stop(paste("trait of interest data file traitofinterestfile=", opt$traitofinterestfile, " does not exist", sep=""), call.=FALSE) + } + + if (opt$save==FALSE && !is.null(opt$confounderfile) && !file.exists(opt$confounderfile)) { + stop(paste("confounder data file confounderfile=", opt$confounderfile, " does not exist", sep=""), call.=FALSE) + } + + if (is.null(opt$variablelistfile)){ + print_help(opt_parser) + stop("variablelistfile argument must be supplied", call.=FALSE) + } else if (!file.exists(opt$variablelistfile)) { + stop(paste("variable listing file variablelistfile=", opt$variablelistfile, " does not exist", sep=""), call.=FALSE) + } + + if (is.null(opt$datacodingfile)){ + print_help(opt_parser) + stop("datacodingfile argument must be supplied", call.=FALSE) + } else if (!file.exists(opt$datacodingfile)) { + stop(paste("data coding file datacodingfile=", opt$datacodingfile, " does not exist", sep=""), call.=FALSE) + } + + if (opt$save==FALSE && is.null(opt$traitofinterest)){ + print_help(opt_parser) + stop("traitofinterest argument must be supplied", call.=FALSE) + } + + if (is.null(opt$resDir)){ + print_help(opt_parser) + stop("resDir argument must be supplied", call.=FALSE) + } + else if (!file.exists(opt$resDir)) { + stop(paste("results directory resDir=", opt$resDir, " does not exist", sep=""), call.=FALSE) + } + opt <-processParts(opt, opt$partIdx, opt$numParts); + } + if (opt$save==TRUE) { + print("Saving phenotypes to file. Tests of association will not run!") + } + return(opt) } # Parse the 'part' arguments and check they are valid -processParts <- function(pIdx, nParts) { +processParts <- function(opt, pIdx, nParts) { if (is.null(pIdx) && is.null(nParts)) { - opt$varTypeArg <<- "all"; + opt$varTypeArg <- "all"; print(paste("Running with all traits in phenotype file:", opt$phenofile)); } else if (is.null(pIdx)) { @@ -125,8 +109,8 @@ processParts <- function(pIdx, nParts) { stop("pIdx arguments must be between 1 and nParts inclusive", call.=FALSE) } else { - opt$varTypeArg <<- paste(pIdx, "-", nParts, sep=""); + opt$varTypeArg <- paste(pIdx, "-", nParts, sep=""); print(paste("Running with part",pIdx,"of",nParts," in phenotype file:", opt$phenofile)); } - + return(opt) } diff --git a/PHESANT/R/testAssociations.r b/PHESANT/R/testAssociations.r index 1ecb5eb..d15c64c 100644 --- a/PHESANT/R/testAssociations.r +++ b/PHESANT/R/testAssociations.r @@ -18,7 +18,7 @@ # Tests the association of a field, determined by its field type -testAssociations <- function(vl, counters, currentVar, currentVarShort, thisdata) { +testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thisdata) { ## call file for variable type @@ -55,7 +55,7 @@ testAssociations <- function(vl, counters, currentVar, currentVarShort, thisdata counters <- incrementCounter(counters, "start.exposure.int") } - counters <- testInteger(vl, couinters, currentVarShort, "INTEGER", thisdata); + counters <- testInteger(opt, vl, couinters, currentVarShort, "INTEGER", thisdata); } cat("\n"); } @@ -73,7 +73,7 @@ testAssociations <- function(vl, counters, currentVar, currentVarShort, thisdata if (isExposure==TRUE) { counters<- incrementCounter(counters, "start.exposure.cont") } - counters <- testContinuous(vl, counters, currentVarShort, "CONTINUOUS", thisdata); + counters <- testContinuous(opt, vl, counters, currentVarShort, "CONTINUOUS", thisdata); } cat("\n"); } @@ -91,7 +91,7 @@ testAssociations <- function(vl, counters, currentVar, currentVarShort, thisdata if (isExposure==TRUE) { counters <-incrementCounter(counters, "start.exposure.catSin") } - counters <- testCategoricalSingle(vl, counters, currentVarShort, "CAT-SIN", thisdata); + counters <- testCategoricalSingle(opt, vl, counters, currentVarShort, "CAT-SIN", thisdata); } cat("\n"); } @@ -122,7 +122,7 @@ testAssociations <- function(vl, counters, currentVar, currentVarShort, thisdata counters <- addToCounts(addToCounts, "start.exposure.catMulvalues", numVals) } } - counters <- testCategoricalMultiple(vl, counters, currentVarShort, "CAT-MUL", thisdata); + counters <- testCategoricalMultiple(opt, vl, counters, currentVarShort, "CAT-MUL", thisdata); } cat("\n"); } diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index 4df64fe..8719511 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -23,7 +23,7 @@ # in CAT_MULT_INDICATOR_FIELDS field of variable info file (either NO_NAN, ALL or a field ID) # 3) Checking derived variable has at least 10 cases in each group # 4) Calling binaryLogisticRegression function for this derived binary variable -testCategoricalMultiple <- function(vl, counters, varName, varType, thisdata) { +testCategoricalMultiple <- function(opt, vl, counters, varName, varType, thisdata) { cat("CAT-MULTIPLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] @@ -84,7 +84,7 @@ testCategoricalMultiple <- function(vl, counters, varName, varType, thisdata) { counters <- incrementCounter(counters, "catMul.over10") # binary - so logistic regression - counters <- binaryLogisticRegression(paste(varName, variableVal,sep="#"), counters, varType, newthisdata, isExposure) + counters <- binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), counters, varType, newthisdata, isExposure) } } return(counters) diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index 664622d..1c595f9 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -23,7 +23,7 @@ # 3) Replacing missing codes - we assume values < 0 are missing for categorical (single) variables # 4) Remove values with <10 cases # 5) Deterimine correct test to perform, either binary, ordered or unordered. -testCategoricalSingle <- function(vl, counters, varName, varType, thisdata) { +testCategoricalSingle <- function(opt, vl, counters, varName, varType, thisdata) { cat("CAT-SINGLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -78,7 +78,7 @@ testCategoricalSingle <- function(vl, counters, varName, varType, thisdata) { phenoFactor = factor(pheno) # binary - so logistic regression thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); - counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure) + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure) } else { # > 2 categories @@ -94,7 +94,7 @@ testCategoricalSingle <- function(vl, counters, varName, varType, thisdata) { counters <- incrementCounter(counters, "catSin.case2") thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); - counters <- testCategoricalUnordered(vl, counters, varName, varType, thisdatanew); + counters <- testCategoricalUnordered(opt, vl, counters, varName, varType, thisdatanew); } else if (ordered == 1) { @@ -105,7 +105,7 @@ testCategoricalSingle <- function(vl, counters, varName, varType, thisdata) { ## reorder variable values into increasing order thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); - counters <- testCategoricalOrdered(vl, counters, varName, varType, thisdatanew, order) + counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew, order) } else if (ordered == -2) { diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index efc8386..b4a87f5 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -18,7 +18,7 @@ # Performs ordered logistic regression test and saves results in ordered logistic results file -testCategoricalOrdered <- function(vl, counters, varName, varType, thisdata, orderStr="") { +testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata, orderStr="") { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] diff --git a/PHESANT/R/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r index 561ffa3..b32d4a4 100644 --- a/PHESANT/R/testCategoricalUnordered.r +++ b/PHESANT/R/testCategoricalUnordered.r @@ -19,7 +19,7 @@ # Tests an unordered categorical phenotype with multinomial regression # and saves this result in the multinomial logistic results file -testCategoricalUnordered <- function(vl, counters, varName, varType, thisdata) { +testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisdata) { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] #geno = thisdata[,"geno"] diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index 865c021..0134787 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -18,7 +18,7 @@ # Main function called for continuous fields -testContinuous <- function(vl, counters, varName, varType, thisdata) { +testContinuous <- function(opt, vl, counters, varName, varType, thisdata) { cat("CONTINUOUS MAIN || "); @@ -29,13 +29,13 @@ testContinuous <- function(vl, counters, varName, varType, thisdata) { thisdata[,phenoStartIdx:ncol(thisdata)] = pheno - counters <- testContinuous2(vl, counters, varName, varType, thisdata) + counters <- testContinuous2(opt, vl, counters, varName, varType, thisdata) } # Main code used to process continuous fields, or integer fields that have been reassigned as continuous because they have >20 distinct values. # This is needed because we have already reassigned values for integer fields, so do this in the function above for continuous fields. -testContinuous2 <- function(vl, counters, varName, varType, thisdata) { +testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata) { cat("CONTINUOUS || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -92,7 +92,7 @@ testContinuous2 <- function(vl, counters, varName, varType, thisdata) { # binary counters <- incrementCounter(counters, "cont.binary") thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); - counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure); + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure); } } else { @@ -113,7 +113,7 @@ testContinuous2 <- function(vl, counters, varName, varType, thisdata) { counters <- incrementCounter(counters, "cont.ordcattry.ordcat") thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned); - counters <- testCategoricalOrdered(vl, counters, varName, varType, thisdatanew); + counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew); } else { # try to treat as binary because not enough examples in each bin @@ -133,7 +133,7 @@ testContinuous2 <- function(vl, counters, varName, varType, thisdata) { # test binary thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned) - counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure); + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure); } else if ((bin2Num<10 | bin1Num<10) & (bin2Num+bin1Num)>=10) { @@ -144,7 +144,7 @@ testContinuous2 <- function(vl, counters, varName, varType, thisdata) { # test binary thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned) - counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure) + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure) } diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index 02791bc..e003e01 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -22,7 +22,7 @@ # 2) Generate a single value if there are several values (arrays) by taking the mean # 3) Treating this field as continuous if at least 20 distinct values. # Otherwise treat as binary or ordered categorical if 2 or more than two values. -testInteger <- function(vl, counters, varName, varType, thisdata) { +testInteger <- function(opt, vl, counters, varName, varType, thisdata) { cat("INTEGER || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -52,7 +52,7 @@ testInteger <- function(vl, counters, varName, varType, thisdata) { if (length(uniqVar)>=20) { thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoAvg); - counters <- testContinuous2(vl, counters, varName, varType, thisdatanew) + counters <- testContinuous2(opt, vl, counters, varName, varType, thisdatanew) counters <- incrementCounter(counters, "int.continuous") } else { @@ -72,7 +72,7 @@ testInteger <- function(vl, counters, varName, varType, thisdata) { # binary thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); - counters <- binaryLogisticRegression(varName, counters, varType, thisdatanew, isExposure); + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure); } else { counters <- incrementCounter(counters, "int.catord") @@ -82,7 +82,7 @@ testInteger <- function(vl, counters, varName, varType, thisdata) { thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); # treat as ordinal categorical - counters <- testCategoricalOrdered(vl, counters, varName, varType, thisdatanew); + counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew); } } return(counters) diff --git a/PHESANT/R/validatePhenotypeInput.r b/PHESANT/R/validatePhenotypeInput.r index 9b95f71..cc3ca87 100644 --- a/PHESANT/R/validatePhenotypeInput.r +++ b/PHESANT/R/validatePhenotypeInput.r @@ -18,7 +18,7 @@ # Validate the contents of the phenotype file -validatePhenotypeInput <- function() { +validatePhenotypeInput <- function(opt) { print("Validating phenotype data ...") diff --git a/PHESANT/R/validateTraitInput.r b/PHESANT/R/validateTraitInput.r index 16a35f4..a42c28b 100644 --- a/PHESANT/R/validateTraitInput.r +++ b/PHESANT/R/validateTraitInput.r @@ -18,7 +18,7 @@ # Validate the contents of the trait of interest file -validateTraitInput <- function(snpIn) { +validateTraitInput <- function(opt) { if (opt$save!=TRUE) { diff --git a/PHESANT/man/PHESANT-package.Rd b/PHESANT/man/PHESANT-package.Rd index 0178cc4..3625692 100644 --- a/PHESANT/man/PHESANT-package.Rd +++ b/PHESANT/man/PHESANT-package.Rd @@ -13,7 +13,7 @@ The DESCRIPTION file: \packageDESCRIPTION{PHESANT} \packageIndices{PHESANT} -~~ An overview of how to use the package, including the most important functions ~~ +An overview of how to use the package, including the most important functions } \author{ \packageAuthor{PHESANT} @@ -21,14 +21,12 @@ The DESCRIPTION file: Maintainer: \packageMaintainer{PHESANT} } \references{ -~~ Literature or other references for background information ~~ +Literature or other references for background information } -~~ Optionally other standard keywords, one per line, from file KEYWORDS in the R documentation directory ~~ \keyword{ package } \seealso{ -~~ Optional links to other man pages, e.g. ~~ -~~ \code{\link[:-package]{}} ~~ +To be done } \examples{ -~~ simple examples of the most important functions ~~ + } diff --git a/PHESANT/phenomeScan.r b/PHESANT/phenomeScan.r index 33c2ef5..57d82ab 100644 --- a/PHESANT/phenomeScan.r +++ b/PHESANT/phenomeScan.r @@ -26,22 +26,22 @@ library("optparse") args <- commandArgs(T) if (length(args) == 0) { - load(file = "opt_only.RData") + load(file = "opt_only.RData") } else { - opt_parser = OptionParser(option_list=option_list); - opt = parse_args(opt_parser); - processArgs(); + opt_parser <- OptionParser(option_list=option_list) + opt <- parse_args(opt_parser) + opt <- processArgs(opt) } ## load the files we write to and use counters <- initCounters() if (opt$save==FALSE) { - initResultsFiles(); + initResultsFiles(opt) } -vl=initVariableLists(); +vl <- initVariableLists(opt) ## load data -d <- loadData(vl) +d <- loadData(opt, vl) data=d$datax confounders=d$confounders indicatorFields=d$inds @@ -105,7 +105,7 @@ for (var in phenoVars) { if (first==FALSE) { thisdata = makeTestDataFrame(data, confounders, currentVarValues) - counters <- testAssociations(vl, counters, currentVar, currentVarShort, thisdata) + counters <- testAssociations(opt, vl, counters, currentVar, currentVarShort, thisdata) } first=FALSE; @@ -124,13 +124,13 @@ for (var in phenoVars) { if (phenoIdx>0){ # last variable so test association thisdata = makeTestDataFrame(data, confounders, currentVarValues) - counters <- testAssociations(vl, counters, currentVar, currentVarShort, thisdata) + counters <- testAssociations(opt, vl, counters, currentVar, currentVarShort, thisdata) } sink() # save counters of each path in variable flow -saveCounts(counters) +saveCounts(opt, counters) if (opt$save == TRUE) { write.table(derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); From 620ea3707a50f9aed6bc9496f8a2818fbe3ca8fe Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 14:37:58 -0400 Subject: [PATCH 07/27] ginire history --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 85c0afc..5fc7a4a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .DS_Store .Rproj.user +PHESANT/.Rhistory +PHESANT/.Rhistory From 5eea081f7b502abe9307dbc8141fd50d9ba694aa Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 16:01:29 -0400 Subject: [PATCH 08/27] hide some global varibles --- PHESANT/R/binaryLogisticRegression.r | 10 +++---- PHESANT/R/option_list.R | 4 ++- PHESANT/R/storeNewVar.r | 31 +++++++++------------- PHESANT/R/testAssociations.r | 10 +++---- PHESANT/R/testCatMultiple.r | 6 ++--- PHESANT/R/testCatSingle.r | 14 +++++----- PHESANT/R/testCategoricalOrdered.r | 10 +++---- PHESANT/R/testCategoricalUnordered.r | 12 ++++----- PHESANT/R/testContinuous.r | 30 ++++++++++----------- PHESANT/R/testInteger.r | 14 +++++----- PHESANT/phenomeScan.r | 39 +++++++++++++--------------- 11 files changed, 86 insertions(+), 94 deletions(-) diff --git a/PHESANT/R/binaryLogisticRegression.r b/PHESANT/R/binaryLogisticRegression.r index a5f9d4f..48ccf85 100644 --- a/PHESANT/R/binaryLogisticRegression.r +++ b/PHESANT/R/binaryLogisticRegression.r @@ -21,7 +21,7 @@ # # Performs binary logistic regression on the phenotype stored in thisdata # and stores result in 'results-logistic-binary' results file. -binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, isExposure) { +binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, isExposure, phenoStartIdx) { phenoFactor = factor(thisdata[,phenoStartIdx]) @@ -65,10 +65,10 @@ binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, else { geno = thisdata[,"geno"] } - confounders=thisdata[,3:numPreceedingCols, drop = FALSE] + confounders=thisdata[,3:(phenoStartIdx -1) , drop = FALSE] sink() - sink(modelFitLogFile, append=TRUE) + sink(pkg.env$modelFitLogFile, append=TRUE) print("--------------") print(varName) @@ -78,7 +78,7 @@ binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, mylogit <- glm(phenoFactor ~ geno + ., data=confounders, family="binomial") sink() - sink(resLogFile, append=TRUE) + sink(pkg.env$resLogFile, append=TRUE) sumx = summary(mylogit) @@ -111,7 +111,7 @@ binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, ## END TRYCATCH }, error = function(e) { sink() - sink(resLogFile, append=TRUE) + sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) counters <- incrementCounter(counters, "binary.error") }) diff --git a/PHESANT/R/option_list.R b/PHESANT/R/option_list.R index 203393e..255bfbe 100644 --- a/PHESANT/R/option_list.R +++ b/PHESANT/R/option_list.R @@ -17,4 +17,6 @@ option_list = list( make_option(c("-c", "--confounderfile"), type="character", default=NULL, help="Confounder file name", metavar="character"), make_option(c("-i", "--confidenceintervals"), type="logical", default=TRUE, help="Whether confidence intervals should be calculated [default= %default]"), make_option(c("-k", "--standardise"), action="store", default=TRUE, help="Trait of interest is standardised to have mean=0 and std=1 [default= %default]") -) \ No newline at end of file +) + +pkg.env <- new.env(parent = emptyenv()) \ No newline at end of file diff --git a/PHESANT/R/storeNewVar.r b/PHESANT/R/storeNewVar.r index 20c4d5a..e2f640a 100644 --- a/PHESANT/R/storeNewVar.r +++ b/PHESANT/R/storeNewVar.r @@ -1,21 +1,14 @@ - - storeNewVar <- function(userIDData, phenoData, varName, type) { - - # add pheno to dataframe - newdata = data.frame(userID=userIDData, newvar=phenoData) - names(newdata)[names(newdata)=="newvar"] = varName - - if (type == "bin") { - derivedBinary <<- merge(derivedBinary, newdata, by="userID", all=TRUE); - } else if (type == "cont") { - derivedCont <<- merge(derivedCont, newdata, by="userID", all=TRUE); - } else if (type == "catOrd") { - derivedCatOrd <<- merge(derivedCatOrd, newdata, by="userID", all=TRUE); - } else if (type == "catUnord") { - derivedCatUnord <<- merge(derivedCatUnord, newdata, by="userID", all=TRUE); - } - - #write.table(phenoFactor, file=paste(opt$resDir, "data-binary-", varName, ".csv", sep=""), row.names=FALSE, col.names=FALSE, na="", quote=FALSE); - + # add pheno to dataframe + newdata = data.frame(userID=userIDData, newvar=phenoData) + names(newdata)[names(newdata)=="newvar"] = varName + if (type == "bin") { + pkg.env$derivedBinary <- merge(pkg.env$derivedBinary, newdata, by="userID", all=TRUE); + } else if (type == "cont") { + pkg.env$derivedCont <- merge(pkg.env$derivedCont, newdata, by="userID", all=TRUE); + } else if (type == "catOrd") { + pkg.env$derivedCatOrd <- merge(pkg.env$derivedCatOrd, newdata, by="userID", all=TRUE); + } else if (type == "catUnord") { + pkg.env$derivedCatUnord <- merge(pkg.env$derivedCatUnord, newdata, by="userID", all=TRUE); + } } diff --git a/PHESANT/R/testAssociations.r b/PHESANT/R/testAssociations.r index d15c64c..04dda72 100644 --- a/PHESANT/R/testAssociations.r +++ b/PHESANT/R/testAssociations.r @@ -18,7 +18,7 @@ # Tests the association of a field, determined by its field type -testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thisdata) { +testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thisdata, phenoStartIdx) { ## call file for variable type @@ -55,7 +55,7 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi counters <- incrementCounter(counters, "start.exposure.int") } - counters <- testInteger(opt, vl, couinters, currentVarShort, "INTEGER", thisdata); + counters <- testInteger(opt, vl, counters, currentVarShort, "INTEGER", thisdata, phenoStartIdx); } cat("\n"); } @@ -73,7 +73,7 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi if (isExposure==TRUE) { counters<- incrementCounter(counters, "start.exposure.cont") } - counters <- testContinuous(opt, vl, counters, currentVarShort, "CONTINUOUS", thisdata); + counters <- testContinuous(opt, vl, counters, currentVarShort, "CONTINUOUS", thisdata, phenoStartIdx); } cat("\n"); } @@ -91,7 +91,7 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi if (isExposure==TRUE) { counters <-incrementCounter(counters, "start.exposure.catSin") } - counters <- testCategoricalSingle(opt, vl, counters, currentVarShort, "CAT-SIN", thisdata); + counters <- testCategoricalSingle(opt, vl, counters, currentVarShort, "CAT-SIN", thisdata, phenoStartIdx); } cat("\n"); } @@ -122,7 +122,7 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi counters <- addToCounts(addToCounts, "start.exposure.catMulvalues", numVals) } } - counters <- testCategoricalMultiple(opt, vl, counters, currentVarShort, "CAT-MUL", thisdata); + counters <- testCategoricalMultiple(opt, vl, counters, currentVarShort, "CAT-MUL", thisdata, phenoStartIdx); } cat("\n"); } diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index 8719511..3a0c9df 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -23,7 +23,7 @@ # in CAT_MULT_INDICATOR_FIELDS field of variable info file (either NO_NAN, ALL or a field ID) # 3) Checking derived variable has at least 10 cases in each group # 4) Calling binaryLogisticRegression function for this derived binary variable -testCategoricalMultiple <- function(opt, vl, counters, varName, varType, thisdata) { +testCategoricalMultiple <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { cat("CAT-MULTIPLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] @@ -62,7 +62,7 @@ testCategoricalMultiple <- function(opt, vl, counters, varName, varType, thisdat varBinaryFactor = factor(varBinary) ## data for this new binary variable - newthisdata = cbind.data.frame(thisdata[,1:numPreceedingCols], varBinaryFactor) + newthisdata = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], varBinaryFactor) ## one of 3 ways to decide which examples are negative idxsToRemove = restrictSample(vl, varName, pheno, variableVal, thisdata[,"userID", drop=FALSE]) @@ -84,7 +84,7 @@ testCategoricalMultiple <- function(opt, vl, counters, varName, varType, thisdat counters <- incrementCounter(counters, "catMul.over10") # binary - so logistic regression - counters <- binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), counters, varType, newthisdata, isExposure) + counters <- binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), counters, varType, newthisdata, isExposure, phenoStartIdx) } } return(counters) diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index 1c595f9..cf8b121 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -23,7 +23,7 @@ # 3) Replacing missing codes - we assume values < 0 are missing for categorical (single) variables # 4) Remove values with <10 cases # 5) Deterimine correct test to perform, either binary, ordered or unordered. -testCategoricalSingle <- function(opt, vl, counters, varName, varType, thisdata) { +testCategoricalSingle <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { cat("CAT-SINGLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -77,8 +77,8 @@ testCategoricalSingle <- function(opt, vl, counters, varName, varType, thisdata) phenoFactor = factor(pheno) # binary - so logistic regression - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure) + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure, phenoStartIdx) } else { # > 2 categories @@ -93,8 +93,8 @@ testCategoricalSingle <- function(opt, vl, counters, varName, varType, thisdata) cat("CAT-SINGLE-UNORDERED || ") counters <- incrementCounter(counters, "catSin.case2") - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); - counters <- testCategoricalUnordered(opt, vl, counters, varName, varType, thisdatanew); + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); + counters <- testCategoricalUnordered(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx); } else if (ordered == 1) { @@ -104,8 +104,8 @@ testCategoricalSingle <- function(opt, vl, counters, varName, varType, thisdata) counters <- incrementCounter(counters, "catSin.case1") ## reorder variable values into increasing order - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); - counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew, order) + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); + counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx, order) } else if (ordered == -2) { diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index b4a87f5..d0b972d 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -18,7 +18,7 @@ # Performs ordered logistic regression test and saves results in ordered logistic results file -testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata, orderStr="") { +testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx, orderStr="") { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -58,13 +58,13 @@ testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata # ordinal logistic regression sink() - sink(modelFitLogFile, append=TRUE) + sink(pkg.env$modelFitLogFile, append=TRUE) print("--------------") print(varName) ### BEGIN TRYCATCH tryCatch({ - confounders=thisdata[,3:numPreceedingCols, drop = FALSE] + confounders=thisdata[,3:(phenoStartIdx -1), drop = FALSE] if (opt$standardise==TRUE) { @@ -75,7 +75,7 @@ testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata ctable <- coef(summary(fit)) sink() - sink(resLogFile, append=TRUE) + sink(pkg.env$resLogFile, append=TRUE) ct = coeftest(fit) pvalue = ct["geno","Pr(>|t|)"] @@ -103,7 +103,7 @@ testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata ### END TRYCATCH }, error = function(e) { sink() - sink(resLogFile, append=TRUE) + sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) counters <- incrementCounter(counters, "ordCat.error") }) diff --git a/PHESANT/R/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r index b32d4a4..8807b26 100644 --- a/PHESANT/R/testCategoricalUnordered.r +++ b/PHESANT/R/testCategoricalUnordered.r @@ -19,7 +19,7 @@ # Tests an unordered categorical phenotype with multinomial regression # and saves this result in the multinomial logistic results file -testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisdata) { +testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] #geno = thisdata[,"geno"] @@ -35,7 +35,7 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda numUnique = length(unique(na.omit(pheno))) # num outcome values * (num confounders and trait of interest and bias term) - numWeights=(numUnique-1)*((numPreceedingCols-2)+1+1) + numWeights=(numUnique-1)*(((phenoStartIdx -1)-2)+1+1) if (numWeights>1000) { cat("Too many weights in model: ", numWeights, " > 1000, (num outcomes values: ", numUnique, ") || SKIP ", sep="") counters <-incrementCounter(counters, "unordCat.cats") @@ -56,7 +56,7 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda reference = levels(phenoFactor)[1]; sink() - sink(modelFitLogFile, append=TRUE) # hide output of model fitting + sink(pkg.env$modelFitLogFile, append=TRUE) # hide output of model fitting print("--------------") print(varName) @@ -68,7 +68,7 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda } #cat("genoMean=", mean(geno), " genoSD=", sd(geno), " || ", sep="") - confounders=thisdata[,3:numPreceedingCols, drop = FALSE] + confounders=thisdata[,3:(phenoStartIdx -1), drop = FALSE] ###### BEGIN TRYCATCH tryCatch({ @@ -88,7 +88,7 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda write(paste(paste(varName,"-",reference,sep=""), varType, paste(maxFreq,"/",numNotNA,sep=""), -999, -999, -999, modelP, sep=","), file=paste(opt$resDir,"results-multinomial-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE") sink() - sink(resLogFile, append=TRUE) + sink(pkg.env$resLogFile, append=TRUE) sumx <- summary(fit) @@ -137,7 +137,7 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda ## END TRYCATCH }, error = function(e) { sink() - sink(resLogFile, append=TRUE) + sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) counters <- incrementCounter(counters, "unordCat.error") }) diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index 0134787..73f6f90 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -18,7 +18,7 @@ # Main function called for continuous fields -testContinuous <- function(opt, vl, counters, varName, varType, thisdata) { +testContinuous <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { cat("CONTINUOUS MAIN || "); @@ -29,13 +29,13 @@ testContinuous <- function(opt, vl, counters, varName, varType, thisdata) { thisdata[,phenoStartIdx:ncol(thisdata)] = pheno - counters <- testContinuous2(opt, vl, counters, varName, varType, thisdata) + counters <- testContinuous2(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) } # Main code used to process continuous fields, or integer fields that have been reassigned as continuous because they have >20 distinct values. # This is needed because we have already reassigned values for integer fields, so do this in the function above for continuous fields. -testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata) { +testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { cat("CONTINUOUS || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -91,8 +91,8 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata) { else if (numLevels==2) { # binary counters <- incrementCounter(counters, "cont.binary") - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure); + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure, phenoStartIdx); } } else { @@ -112,8 +112,8 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata) { # successful binning. >=10 examples in each of the 3 bins counters <- incrementCounter(counters, "cont.ordcattry.ordcat") - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned); - counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew); + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned); + counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx); } else { # try to treat as binary because not enough examples in each bin @@ -132,8 +132,8 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata) { phenoBinned[which(phenoBinned==0)] = 1 # test binary - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned) - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure); + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned) + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure, phenoStartIdx); } else if ((bin2Num<10 | bin1Num<10) & (bin2Num+bin1Num)>=10) { @@ -143,8 +143,8 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata) { phenoBinned[which(phenoBinned==2)] = 1 # test binary - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned) - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure) + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned) + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure, phenoStartIdx) } @@ -187,10 +187,10 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata) { else { geno = thisdata[,"geno"] } - confounders=thisdata[,3:numPreceedingCols, drop = FALSE] + confounders=thisdata[,3:(phenoStartIdx -1), drop = FALSE] sink() - sink(modelFitLogFile, append=TRUE) + sink(pkg.env$modelFitLogFile, append=TRUE) print("--------------") print(varName) @@ -200,7 +200,7 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata) { fit <- lm(phenoIRNT ~ geno + ., data=confounders) sink() - sink(resLogFile, append=TRUE) + sink(pkg.env$resLogFile, append=TRUE) sumx = summary(fit) @@ -231,7 +231,7 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata) { ## END TRYCATCH }, error = function(e) { sink() - sink(resLogFile, append=TRUE) + sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) counters <-incrementCounter(counters, "continuous.error") }) diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index e003e01..190c732 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -22,7 +22,7 @@ # 2) Generate a single value if there are several values (arrays) by taking the mean # 3) Treating this field as continuous if at least 20 distinct values. # Otherwise treat as binary or ordered categorical if 2 or more than two values. -testInteger <- function(opt, vl, counters, varName, varType, thisdata) { +testInteger <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { cat("INTEGER || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -51,8 +51,8 @@ testInteger <- function(opt, vl, counters, varName, varType, thisdata) { # if >=20 separate values then treat as continuous if (length(uniqVar)>=20) { - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoAvg); - counters <- testContinuous2(opt, vl, counters, varName, varType, thisdatanew) + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoAvg); + counters <- testContinuous2(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx) counters <- incrementCounter(counters, "int.continuous") } else { @@ -71,18 +71,18 @@ testInteger <- function(opt, vl, counters, varName, varType, thisdata) { counters <- incrementCounter(counters, "int.binary") # binary - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure); + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); + counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure,phenoStartIdx); } else { counters <- incrementCounter(counters, "int.catord") cat("3-20 values || ") # we don't use equal sized bins just the original integers (that have >=10 examples) as categories - thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); # treat as ordinal categorical - counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew); + counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx); } } return(counters) diff --git a/PHESANT/phenomeScan.r b/PHESANT/phenomeScan.r index 57d82ab..593ebd6 100644 --- a/PHESANT/phenomeScan.r +++ b/PHESANT/phenomeScan.r @@ -60,21 +60,18 @@ currentVarShort=""; first=TRUE; if (opt$save == TRUE) { - - derivedBinary <- data.frame(userID=data$userID) - derivedCont <- data.frame(userID=data$userID) - derivedCatOrd <- data.frame(userID=data$userID) - derivedCatUnord <- data.frame(userID=data$userID) - - resLogFile = paste(opt$resDir,"data-log-",opt$varTypeArg,".txt",sep="") - sink(resLogFile) + pkg.env$derivedBinary <- data.frame(userID=data$userID) + pkg.env$derivedCont <- data.frame(userID=data$userID) + pkg.env$derivedCatOrd <- data.frame(userID=data$userID) + pkg.env$derivedCatUnord <- data.frame(userID=data$userID) + pkg.env$resLogFile = paste(opt$resDir,"data-log-",opt$varTypeArg,".txt",sep="") + sink(pkg.env$resLogFile) } else { - modelFitLogFile = paste(opt$resDir,"modelfit-log-",opt$varTypeArg,".txt",sep="") - sink(modelFitLogFile) - sink() - - resLogFile = paste(opt$resDir,"results-log-",opt$varTypeArg,".txt",sep="") - sink(resLogFile) + pkg.env$modelFitLogFile = paste(opt$resDir,"modelfit-log-",opt$varTypeArg,".txt",sep="") + sink(pkg.env$modelFitLogFile) + sink() + pkg.env$resLogFile = paste(opt$resDir,"results-log-",opt$varTypeArg,".txt",sep="") + sink(pkg.env$resLogFile) } @@ -84,7 +81,7 @@ for (var in phenoVars) { sink() # print(var) - sink(resLogFile, append=TRUE) + sink(pkg.env$resLogFile, append=TRUE) varx = gsub("^x", "", var); varx = gsub("_[0-9]+$", "", varx); @@ -105,7 +102,7 @@ for (var in phenoVars) { if (first==FALSE) { thisdata = makeTestDataFrame(data, confounders, currentVarValues) - counters <- testAssociations(opt, vl, counters, currentVar, currentVarShort, thisdata) + counters <- testAssociations(opt, vl, counters, currentVar, currentVarShort, thisdata, phenoStartIdx) } first=FALSE; @@ -124,7 +121,7 @@ for (var in phenoVars) { if (phenoIdx>0){ # last variable so test association thisdata = makeTestDataFrame(data, confounders, currentVarValues) - counters <- testAssociations(opt, vl, counters, currentVar, currentVarShort, thisdata) + counters <- testAssociations(opt, vl, counters, currentVar, currentVarShort, thisdata, phenoStartIdx) } sink() @@ -133,10 +130,10 @@ sink() saveCounts(opt, counters) if (opt$save == TRUE) { - write.table(derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); - write.table(derivedCont, file=paste(opt$resDir,"data-cont-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); - write.table(derivedCatOrd, file=paste(opt$resDir,"data-catord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); - write.table(derivedCatUnord, file=paste(opt$resDir,"data-catunord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(pkg.env$derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(pkg.env$derivedCont, file=paste(opt$resDir,"data-cont-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(pkg.env$derivedCatOrd, file=paste(opt$resDir,"data-catord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(pkg.env$derivedCatUnord, file=paste(opt$resDir,"data-catunord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); } warnings() From 8a9a3c60737dc475260e297de54c21f0976d5c6f Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 16:27:44 -0400 Subject: [PATCH 09/27] Use package env for counters --- PHESANT/R/binaryLogisticRegression.r | 17 ++++++----- PHESANT/R/counters.r | 27 ++++++++--------- PHESANT/R/testAssociations.r | 41 +++++++++++++------------- PHESANT/R/testCatMultiple.r | 11 ++++--- PHESANT/R/testCatSingle.r | 19 ++++++------ PHESANT/R/testCategoricalOrdered.r | 15 +++++----- PHESANT/R/testCategoricalUnordered.r | 17 ++++++----- PHESANT/R/testContinuous.r | 43 ++++++++++++++-------------- PHESANT/R/testInteger.r | 17 ++++++----- PHESANT/phenomeScan.r | 8 +++--- 10 files changed, 102 insertions(+), 113 deletions(-) diff --git a/PHESANT/R/binaryLogisticRegression.r b/PHESANT/R/binaryLogisticRegression.r index 48ccf85..666b8b5 100644 --- a/PHESANT/R/binaryLogisticRegression.r +++ b/PHESANT/R/binaryLogisticRegression.r @@ -21,7 +21,7 @@ # # Performs binary logistic regression on the phenotype stored in thisdata # and stores result in 'results-logistic-binary' results file. -binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, isExposure, phenoStartIdx) { +binaryLogisticRegression <- function(opt, varName, varType, thisdata, isExposure, phenoStartIdx) { phenoFactor = factor(thisdata[,phenoStartIdx]) @@ -31,7 +31,7 @@ binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, if (length(facLevels)!=2) { #stop(paste("Not 2 levels: ", length(facLevels), " || ", sep="")) cat("BINARY-NOT2LEVELS- (", length(facLevels), ") || ",sep=""); - counters <- incrementCounter(counters, "binary.nottwolevels") + incrementCounter("binary.nottwolevels") } idxTrue = length(which(phenoFactor==facLevels[1])) @@ -40,11 +40,11 @@ binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, if (idxTrue<10 || idxFalse<10) { cat("BINARY-LOGISTIC-SKIP-10 (", idxTrue, "/", idxFalse, ") || ", sep="") - counters <- incrementCounter(counters, "binary.10") + incrementCounter("binary.10") } else if (numNotNA<500) { cat("BINARY-LOGISTIC-SKIP-500 (", numNotNA, ") || ",sep=""); - counters <- incrementCounter(counters, "binary.500") + incrementCounter("binary.500") } else { @@ -54,7 +54,7 @@ binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, # add pheno to dataframe storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'bin') cat("SUCCESS results-logistic-binary "); - counters <- incrementCounter(counters, "success.binary") + incrementCounter("success.binary") } else { @@ -102,10 +102,10 @@ binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, write(paste(varName,varType,paste(idxTrue,"/",idxFalse,"(",numNotNA,")",sep=""), beta,lower,upper,pvalue, sep=","), file=paste(opt$resDir,"results-logistic-binary-",opt$varTypeArg,".txt",sep=""), append="TRUE"); cat("SUCCESS results-logistic-binary "); - counters <- incrementCounter(counters, "success.binary") + incrementCounter("success.binary") if (isExposure==TRUE) { - counters <-incrementCounter(counters,"success.exposure.binary") + incrementCounter("success.exposure.binary") } ## END TRYCATCH @@ -113,10 +113,9 @@ binaryLogisticRegression <- function(opt, varName, counters, varType, thisdata, sink() sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - counters <- incrementCounter(counters, "binary.error") + incrementCounter("binary.error") }) } } - return(counters) } diff --git a/PHESANT/R/counters.r b/PHESANT/R/counters.r index ef706b9..31a586b 100644 --- a/PHESANT/R/counters.r +++ b/PHESANT/R/counters.r @@ -17,42 +17,39 @@ # DEALINGS IN THE SOFTWARE. # adds given value to counter, that are used to count how many variables go down each route in the data flow -addToCounts <- function(counters, countName, num) { - idx = which(counters$name==countName) +addToCounts <- function(countName, num) { + idx = which(pkg.env$counters$name==countName) if (length(idx)==0) { # counter does not exist so add with countValue 1 - counters <- rbind(counters, data.frame(name=countName, countValue=num)) + pkg.env$counters <- rbind(pkg.env$counters, data.frame(name=countName, countValue=num)) } else { # add to counter that already exists - counters$countValue[idx] <- counters$countValue[idx]+num + pkg.env$counters$countValue[idx] <- pkg.env$counters$countValue[idx]+num } - return(counters) } # increments counters used to count how many variables go down each route in the data flow -incrementCounter <- function(counters, countName) { - idx = which(counters$name==countName) +incrementCounter <- function(countName) { + idx = which(pkg.env$counters$name==countName) if (length(idx)==0) { # counter does not exist so add with countValue 1 - counters <- rbind(counters, data.frame(name=countName, countValue=1)) + pkg.env$counters <- rbind(pkg.env$counters, data.frame(name=countName, countValue=1)) } else { # increment counter that already exists - counters$countValue[idx] <- counters$countValue[idx]+1 + pkg.env$counters$countValue[idx] <- pkg.env$counters$countValue[idx]+1 } - return(counters) } # Saves the counters stored in count variables, to a file in results directory -saveCounts <- function(opt, counters) { +saveCounts <- function(opt) { countFile = paste(opt$resDir,"variable-flow-counts-",opt$varTypeArg,".txt",sep="") # sort on counter name - sortIdx = order(as.character(counters[,"name"])) - counters <<- counters[sortIdx,] + sortIdx = order(as.character(pkg.env$counters[,"name"])) + counters <- pkg.env$counters[sortIdx,] write.table(counters, file=countFile, sep=",", quote=FALSE, row.names=FALSE) } # init the counters used to determine how many variables took each path in the variable processing flow. initCounters <- function() { - counters = data.frame(name=character(),countValue=integer(), stringsAsFactors=FALSE) - return(counters) + pkg.env$counters = data.frame(name=character(),countValue=integer(), stringsAsFactors=FALSE) } \ No newline at end of file diff --git a/PHESANT/R/testAssociations.r b/PHESANT/R/testAssociations.r index 04dda72..7386217 100644 --- a/PHESANT/R/testAssociations.r +++ b/PHESANT/R/testAssociations.r @@ -18,7 +18,7 @@ # Tests the association of a field, determined by its field type -testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thisdata, phenoStartIdx) { +testAssociations <- function(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) { ## call file for variable type @@ -30,7 +30,7 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi # check if variable info is found for this field if (length(idx)==0) { cat(paste(currentVar, " || Variable could not be found in pheno info file. \n", sep="")) - counters <- incrementCounter(counters, "notinphenofile") + incrementCounter("notinphenofile") } else { @@ -47,15 +47,15 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi if (excluded!="") { cat(paste("Excluded integer: ", excluded, " || ", sep="")) - counters <- incrementCounter(counters, "excluded.int") + incrementCounter("excluded.int") } else { - counters <- incrementCounter(counters, "start.int") + incrementCounter("start.int") if (isExposure==TRUE) { - counters <- incrementCounter(counters, "start.exposure.int") + incrementCounter("start.exposure.int") } - counters <- testInteger(opt, vl, counters, currentVarShort, "INTEGER", thisdata, phenoStartIdx); + testInteger(opt, vl, currentVarShort, "INTEGER", thisdata, phenoStartIdx); } cat("\n"); } @@ -66,14 +66,14 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi if (excluded!="") { cat(paste("Excluded continuous: ", excluded, " || ", sep="")) - counters <- incrementCounter(counters, "excluded.cont") + incrementCounter("excluded.cont") } else { - counters <- incrementCounter(counters, "start.cont") + incrementCounter("start.cont") if (isExposure==TRUE) { - counters<- incrementCounter(counters, "start.exposure.cont") + incrementCounter("start.exposure.cont") } - counters <- testContinuous(opt, vl, counters, currentVarShort, "CONTINUOUS", thisdata, phenoStartIdx); + testContinuous(opt, vl, currentVarShort, "CONTINUOUS", thisdata, phenoStartIdx); } cat("\n"); } @@ -84,14 +84,14 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi if (excluded!="") { cat(paste("Excluded cat-single: ", excluded, " || ", sep="")) - counters <- incrementCounter(counters, "excluded.catSin") + incrementCounter("excluded.catSin") } else { - counters <- incrementCounter(counters, "start.catSin") + incrementCounter("start.catSin") if (isExposure==TRUE) { - counters <-incrementCounter(counters, "start.exposure.catSin") + incrementCounter("start.exposure.catSin") } - counters <- testCategoricalSingle(opt, vl, counters, currentVarShort, "CAT-SIN", thisdata, phenoStartIdx); + testCategoricalSingle(opt, vl, currentVarShort, "CAT-SIN", thisdata, phenoStartIdx); } cat("\n"); } @@ -102,27 +102,27 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi if (excluded!="") { cat(paste("Excluded cat-multiple: ", excluded, " || ", sep="")) - counters <- incrementCounter(counters, "excluded.catMul") + incrementCounter("excluded.catMul") } else { if (catSinToMult!="") { cat("cat-single to cat-multiple || ", sep="") - counters <- incrementCounter(counters, "catSinToCatMul") + incrementCounter("catSinToCatMul") } - counters <- incrementCounter(counters, "start.catMul") + incrementCounter("start.catMul") if (isExposure==TRUE) { - counters <- incrementCounter(counters, "start.exposure.catMul") + incrementCounter("start.exposure.catMul") } else { # get number of cat mult values denoting trait of interest numVals = getNumValuesCatMultExposure(vl, currentVarShort) if (numVals>0) { - counters <- addToCounts(addToCounts, "start.exposure.catMulvalues", numVals) + addToCounts(addToCounts, "start.exposure.catMulvalues", numVals) } } - counters <- testCategoricalMultiple(opt, vl, counters, currentVarShort, "CAT-MUL", thisdata, phenoStartIdx); + testCategoricalMultiple(opt, vl, currentVarShort, "CAT-MUL", thisdata, phenoStartIdx); } cat("\n"); } @@ -134,7 +134,6 @@ testAssociations <- function(opt, vl, counters, currentVar, currentVarShort, thi }, error = function(e) { print(paste("ERROR:", currentVar,e)) }) - return(counters) } diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index 3a0c9df..b396c4c 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -23,7 +23,7 @@ # in CAT_MULT_INDICATOR_FIELDS field of variable info file (either NO_NAN, ALL or a field ID) # 3) Checking derived variable has at least 10 cases in each group # 4) Calling binaryLogisticRegression function for this derived binary variable -testCategoricalMultiple <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { +testCategoricalMultiple <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { cat("CAT-MULTIPLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] @@ -54,7 +54,7 @@ testCategoricalMultiple <- function(opt, vl, counters, varName, varType, thisdat idxsTrue = idxForVar[,"row"] cat(" CAT-MUL-BINARY-VAR ", variableVal, " || ", sep=""); - counters <- incrementCounter(counters, "catMul.binary") + incrementCounter("catMul.binary") # make zero vector and set 1s for those with this variable value varBinary = rep.int(0,numRows); @@ -77,17 +77,16 @@ testCategoricalMultiple <- function(opt, vl, counters, varName, varType, thisdat if (idxTrue<10 || idxFalse<10) { cat("CAT-MULT-SKIP-10 (", idxTrue, " vs ", idxFalse, ") || ", sep=""); - counters <- incrementCounter(counters, "catMul.10") + incrementCounter("catMul.10") } else { isExposure = getIsCatMultExposure(vl, varName, variableVal) - counters <- incrementCounter(counters, "catMul.over10") + incrementCounter("catMul.over10") # binary - so logistic regression - counters <- binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), counters, varType, newthisdata, isExposure, phenoStartIdx) + binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), varType, newthisdata, isExposure, phenoStartIdx) } } - return(counters) } # restricts sample based on value in CAT_MULT_INDICATOR_FIELDS column of variable info file, diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index cf8b121..fa69be2 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -23,7 +23,7 @@ # 3) Replacing missing codes - we assume values < 0 are missing for categorical (single) variables # 4) Remove values with <10 cases # 5) Deterimine correct test to perform, either binary, ordered or unordered. -testCategoricalSingle <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { +testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { cat("CAT-SINGLE || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -68,17 +68,17 @@ testCategoricalSingle <- function(opt, vl, counters, varName, varType, thisdata, if (length(uniqVar)<=1) { cat("SKIP (only one value) || "); - counters <- incrementCounter(counters, "catSin.onevalue") + incrementCounter("catSin.onevalue") } else if (length(uniqVar)==2) { cat("CAT-SINGLE-BINARY || "); - counters <- incrementCounter(counters, "catSin.case3") + incrementCounter("catSin.case3") # binary so logistic regression phenoFactor = factor(pheno) # binary - so logistic regression thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure, phenoStartIdx) + binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx) } else { # > 2 categories @@ -91,33 +91,32 @@ testCategoricalSingle <- function(opt, vl, counters, varName, varType, thisdata, if (ordered == 0) { cat("CAT-SINGLE-UNORDERED || ") - counters <- incrementCounter(counters, "catSin.case2") + incrementCounter("catSin.case2") thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); - counters <- testCategoricalUnordered(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx); + testCategoricalUnordered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); } else if (ordered == 1) { ## ordered cat("ordered || "); - counters <- incrementCounter(counters, "catSin.case1") + incrementCounter("catSin.case1") ## reorder variable values into increasing order thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); - counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx, order) + testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx, order) } else if (ordered == -2) { cat(" EXCLUDED or BINARY variable: Should not get here in code. ") - counters <- incrementCounter(counters, "catSin.binaryorexcluded") + incrementCounter( "catSin.binaryorexcluded") } else { print(paste("ERROR", varName, varType, dataCode)); } } } - return(counters) } ## values are reordered and assigned values 1:N for N categories diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index d0b972d..1f208db 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -18,14 +18,14 @@ # Performs ordered logistic regression test and saves results in ordered logistic results file -testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx, orderStr="") { +testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoStartIdx, orderStr="") { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] geno = thisdata[,"geno"] cat("CAT-ORD || "); - counters <- incrementCounter(counters, "ordCat") + incrementCounter("ordCat") doCatOrdAssertions(pheno) @@ -39,7 +39,7 @@ testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata numNotNA = length(which(!is.na(pheno))) if (numNotNA<500) { cat("CATORD-SKIP-500 (", numNotNA, ") || ",sep=""); - counters <- incrementCounter(counters, "ordCat.500") + incrementCounter("ordCat.500") } else { # test this cat ordered variable with ordered logistic regression @@ -52,7 +52,7 @@ testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata # add pheno to dataframe storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catOrd') cat("SUCCESS results-ordered-logistic"); - counters <- incrementCounter(counters, "success.ordCat") + incrementCounter("success.ordCat") } else { @@ -93,11 +93,11 @@ testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-ordered-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE"); cat("SUCCESS results-ordered-logistic"); - counters <- incrementCounter(counters, "success.ordCat") + incrementCounter("success.ordCat") isExposure = getIsExposure(vl, varName) if (isExposure == TRUE) { - counters <- incrementCounter(counters, "success.exposure.ordCat") + incrementCounter("success.exposure.ordCat") } ### END TRYCATCH @@ -105,12 +105,11 @@ testCategoricalOrdered <- function(opt, vl, counters, varName, varType, thisdata sink() sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - counters <- incrementCounter(counters, "ordCat.error") + incrementCounter("ordCat.error") }) } } - return(counters) } # check that the phenotype is valid - that there are more than two categories diff --git a/PHESANT/R/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r index 8807b26..dce7cb0 100644 --- a/PHESANT/R/testCategoricalUnordered.r +++ b/PHESANT/R/testCategoricalUnordered.r @@ -19,7 +19,7 @@ # Tests an unordered categorical phenotype with multinomial regression # and saves this result in the multinomial logistic results file -testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { +testCategoricalUnordered <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { pheno = thisdata[,phenoStartIdx:ncol(thisdata)] #geno = thisdata[,"geno"] @@ -27,7 +27,7 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda numNotNA = length(which(!is.na(pheno))) if (numNotNA<500) { cat("CATUNORD-SKIP-500 (", numNotNA, ") || ",sep=""); - counters <- incrementCounter(counters, "unordCat.500") + incrementCounter("unordCat.500") } else { @@ -38,8 +38,8 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda numWeights=(numUnique-1)*(((phenoStartIdx -1)-2)+1+1) if (numWeights>1000) { cat("Too many weights in model: ", numWeights, " > 1000, (num outcomes values: ", numUnique, ") || SKIP ", sep="") - counters <-incrementCounter(counters, "unordCat.cats") - return(counters) + incrementCounter("unordCat.cats") + return(NULL) } phenoFactor = chooseReferenceCategory(pheno); @@ -48,7 +48,7 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda # add pheno to dataframe storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catUnord') cat("SUCCESS results-notordered-logistic "); - counters <- incrementCounter(counters, "success.unordCat") + incrementCounter("success.unordCat") } else { @@ -127,11 +127,11 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda } cat("SUCCESS results-notordered-logistic "); - counters <- incrementCounter(counters, "success.unordCat") + incrementCounter("success.unordCat") isExposure = getIsExposure(vl, varName) if (isExposure == TRUE) { - counters <- incrementCounter(counters, "success.exposure.unordCat") + incrementCounter("success.exposure.unordCat") } ## END TRYCATCH @@ -139,11 +139,10 @@ testCategoricalUnordered <- function(opt, vl, counters, varName, varType, thisda sink() sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - counters <- incrementCounter(counters, "unordCat.error") + incrementCounter("unordCat.error") }) } } - return(counters) } # find reference category - category with most number of examples diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index 73f6f90..0f0b7aa 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -18,7 +18,7 @@ # Main function called for continuous fields -testContinuous <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { +testContinuous <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { cat("CONTINUOUS MAIN || "); @@ -29,13 +29,13 @@ testContinuous <- function(opt, vl, counters, varName, varType, thisdata, phenoS thisdata[,phenoStartIdx:ncol(thisdata)] = pheno - counters <- testContinuous2(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) + testContinuous2(opt, vl, varName, varType, thisdata, phenoStartIdx) } # Main code used to process continuous fields, or integer fields that have been reassigned as continuous because they have >20 distinct values. # This is needed because we have already reassigned values for integer fields, so do this in the function above for continuous fields. -testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { +testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { cat("CONTINUOUS || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -86,19 +86,19 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata, pheno if (numLevels<=1) { cat("SKIP (number of levels: ",numLevels,")",sep="") - counters <- incrementCounter(counters,"cont.onevalue") + incrementCounter("cont.onevalue") } else if (numLevels==2) { # binary - counters <- incrementCounter(counters, "cont.binary") + incrementCounter("cont.binary") thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure, phenoStartIdx); + binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx); } } else { ## try to treat as ordered categorical - counters <- incrementCounter(counters, "cont.ordcattry") + incrementCounter("cont.ordcattry") ## equal sized bins phenoBinned = equalSizedBins(phenoAvg); @@ -111,9 +111,9 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata, pheno # successful binning. >=10 examples in each of the 3 bins - counters <- incrementCounter(counters, "cont.ordcattry.ordcat") + incrementCounter("cont.ordcattry.ordcat") thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned); - counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx); + testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); } else { # try to treat as binary because not enough examples in each bin @@ -122,36 +122,36 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata, pheno ## skip - not possible to create binary variable because first and third bins are too small ## ie. could merge bin1 with bin 2 but then bin3 still too small etc cat("SKIP 2 bins are too small || ") - counters <-incrementCounter(counters, "cont.ordcattry.smallbins") + incrementCounter("cont.ordcattry.smallbins") } else if ((bin0Num<10 | bin1Num<10) & (bin0Num+bin1Num)>=10) { # combine first and second bin to create binary variable - counters <- incrementCounter(counters, "cont.ordcattry.binsbinary") + incrementCounter("cont.ordcattry.binsbinary") cat("Combine first two bins and treat as binary || ") phenoBinned[which(phenoBinned==0)] = 1 # test binary thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned) - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure, phenoStartIdx); + binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx); } else if ((bin2Num<10 | bin1Num<10) & (bin2Num+bin1Num)>=10) { # combine second and last bin to create binary variable - counters <- incrementCounter(counters, "cont.ordcattry.binsbinary") + incrementCounter("cont.ordcattry.binsbinary") cat("Combine last two bins and treat as binary || ") phenoBinned[which(phenoBinned==2)] = 1 # test binary thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned) - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure, phenoStartIdx) + binaryLogisticRegression(opt, varName,varType, thisdatanew, isExposure, phenoStartIdx) } else { ## skip - not possible to create binary variable because combining bins would still be too small cat("SKIP 2 bins are too small(2) || ") - counters <-incrementCounter(counters, "cont.ordcattry.smallbins2") + incrementCounter("cont.ordcattry.smallbins2") } } @@ -160,13 +160,13 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata, pheno } else { cat("IRNT || "); - counters <- incrementCounter(counters, "cont.main") + incrementCounter("cont.main") # check there are at least 500 examples numNotNA = length(which(!is.na(phenoAvg))) if (numNotNA<500) { cat("CONTINUOUS-SKIP-500 (", numNotNA, ") || ",sep=""); - counters <- incrementCounter(counters, "cont.main.500") + incrementCounter("cont.main.500") } else { ## inverse rank normal transformation @@ -176,7 +176,7 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata, pheno # add pheno to dataframe storeNewVar(thisdata[,"userID"], phenoIRNT, varName, 'cont') cat("SUCCESS results-linear"); - counters <- incrementCounter(counters, "success.continuous") + incrementCounter("success.continuous") } else { @@ -223,9 +223,9 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata, pheno write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt", sep=""), append="TRUE"); cat("SUCCESS results-linear"); - counters <- incrementCounter(counters, "success.continuous") + incrementCounter("success.continuous") if (isExposure == TRUE) { - counters <- incrementCounter(counters, "success.exposure.continuous") + incrementCounter("success.exposure.continuous") } ## END TRYCATCH @@ -233,12 +233,11 @@ testContinuous2 <- function(opt, vl, counters, varName, varType, thisdata, pheno sink() sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - counters <-incrementCounter(counters, "continuous.error") + incrementCounter("continuous.error") }) } } } - return(counters) } irnt <- function(pheno) { diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index 190c732..5ffac76 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -22,7 +22,7 @@ # 2) Generate a single value if there are several values (arrays) by taking the mean # 3) Treating this field as continuous if at least 20 distinct values. # Otherwise treat as binary or ordered categorical if 2 or more than two values. -testInteger <- function(opt, vl, counters, varName, varType, thisdata, phenoStartIdx) { +testInteger <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { cat("INTEGER || "); pheno = thisdata[,phenoStartIdx:ncol(thisdata)] @@ -52,8 +52,8 @@ testInteger <- function(opt, vl, counters, varName, varType, thisdata, phenoStar if (length(uniqVar)>=20) { thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoAvg); - counters <- testContinuous2(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx) - counters <- incrementCounter(counters, "int.continuous") + testContinuous2(opt, vl, varName, varType, thisdatanew, phenoStartIdx) + incrementCounter( "int.continuous") } else { @@ -65,25 +65,24 @@ testInteger <- function(opt, vl, counters, varName, varType, thisdata, phenoStar numLevels = length(levels(phenoFactor)) if (numLevels<=1) { cat("SKIP (number of levels: ",numLevels,")",sep=""); - counters <- incrementCounter(counters, "int.onevalue") + incrementCounter("int.onevalue") } else if (numLevels==2) { - counters <- incrementCounter(counters, "int.binary") + incrementCounter("int.binary") # binary thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); - counters <- binaryLogisticRegression(opt, varName, counters, varType, thisdatanew, isExposure,phenoStartIdx); + binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure,phenoStartIdx); } else { - counters <- incrementCounter(counters, "int.catord") + incrementCounter("int.catord") cat("3-20 values || ") # we don't use equal sized bins just the original integers (that have >=10 examples) as categories thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); # treat as ordinal categorical - counters <- testCategoricalOrdered(opt, vl, counters, varName, varType, thisdatanew, phenoStartIdx); + testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); } } - return(counters) } diff --git a/PHESANT/phenomeScan.r b/PHESANT/phenomeScan.r index 593ebd6..4e492f2 100644 --- a/PHESANT/phenomeScan.r +++ b/PHESANT/phenomeScan.r @@ -34,7 +34,7 @@ if (length(args) == 0) { } ## load the files we write to and use -counters <- initCounters() +initCounters() if (opt$save==FALSE) { initResultsFiles(opt) } @@ -102,7 +102,7 @@ for (var in phenoVars) { if (first==FALSE) { thisdata = makeTestDataFrame(data, confounders, currentVarValues) - counters <- testAssociations(opt, vl, counters, currentVar, currentVarShort, thisdata, phenoStartIdx) + testAssociations(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) } first=FALSE; @@ -121,13 +121,13 @@ for (var in phenoVars) { if (phenoIdx>0){ # last variable so test association thisdata = makeTestDataFrame(data, confounders, currentVarValues) - counters <- testAssociations(opt, vl, counters, currentVar, currentVarShort, thisdata, phenoStartIdx) + testAssociations(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) } sink() # save counters of each path in variable flow -saveCounts(opt, counters) +saveCounts(opt) if (opt$save == TRUE) { write.table(pkg.env$derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); From 2d3a28fe7db647ade20aae91ae3707983fd02027 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 16:46:16 -0400 Subject: [PATCH 10/27] Fix all global variables --- PHESANT/DESCRIPTION | 2 +- PHESANT/NAMESPACE | 1 + PHESANT/R/processArgs.r | 8 ++++---- PHESANT/R/testAssociations.r | 2 +- PHESANT/R/testCatMultiple.r | 2 +- PHESANT/R/testCatSingle.r | 8 ++++---- PHESANT/man/PHESANT-package.Rd | 2 +- PHESANT/phenomeScan.r | 4 ++-- 8 files changed, 15 insertions(+), 14 deletions(-) diff --git a/PHESANT/DESCRIPTION b/PHESANT/DESCRIPTION index 5677e5f..0c25a93 100644 --- a/PHESANT/DESCRIPTION +++ b/PHESANT/DESCRIPTION @@ -7,4 +7,4 @@ Author: Quanli Wang Maintainer: Quanli Wang Description: This will replement the original PHESANT in a package and potentially with some optimizations. License: GPL (>= 3) -Depends: MASS, lmtest, data.table, nnet +Depends: MASS, lmtest, data.table, nnet, optparse diff --git a/PHESANT/NAMESPACE b/PHESANT/NAMESPACE index 7331d97..289fcf4 100644 --- a/PHESANT/NAMESPACE +++ b/PHESANT/NAMESPACE @@ -3,6 +3,7 @@ import(MASS) import(lmtest) import(data.table) import(nnet) +import(optparse) importFrom("stats", "coef", "complete.cases", "confint", "end", "glm", "lm", "model.matrix", "na.omit", "pnorm", "qnorm", "quantile", "relevel") diff --git a/PHESANT/R/processArgs.r b/PHESANT/R/processArgs.r index 0413c78..bca4872 100644 --- a/PHESANT/R/processArgs.r +++ b/PHESANT/R/processArgs.r @@ -19,7 +19,7 @@ # Parse the arguments input by the user # if argument 'test' is used then run test phenome scan -processArgs <- function() { +processArgs <- function(opt, opt_parser) { if (opt$test==TRUE) { # set up the test phenome scan settings datadir='../testWAS/data/'; @@ -36,7 +36,7 @@ processArgs <- function() { opt$sensitivity <- FALSE; opt$genetic <- TRUE; } - opt <- processParts(opt, opt$partIdx, opt$numParts) + opt <- processParts(opt, opt_parser,opt$partIdx, opt$numParts) } else { ## check arguments are supplied correctly @@ -81,7 +81,7 @@ processArgs <- function() { else if (!file.exists(opt$resDir)) { stop(paste("results directory resDir=", opt$resDir, " does not exist", sep=""), call.=FALSE) } - opt <-processParts(opt, opt$partIdx, opt$numParts); + opt <-processParts(opt, opt_parser,opt$partIdx, opt$numParts); } if (opt$save==TRUE) { print("Saving phenotypes to file. Tests of association will not run!") @@ -90,7 +90,7 @@ processArgs <- function() { } # Parse the 'part' arguments and check they are valid -processParts <- function(opt, pIdx, nParts) { +processParts <- function(opt, opt_parser, pIdx, nParts) { if (is.null(pIdx) && is.null(nParts)) { opt$varTypeArg <- "all"; diff --git a/PHESANT/R/testAssociations.r b/PHESANT/R/testAssociations.r index 7386217..8309b13 100644 --- a/PHESANT/R/testAssociations.r +++ b/PHESANT/R/testAssociations.r @@ -119,7 +119,7 @@ testAssociations <- function(opt, vl, currentVar, currentVarShort, thisdata, phe # get number of cat mult values denoting trait of interest numVals = getNumValuesCatMultExposure(vl, currentVarShort) if (numVals>0) { - addToCounts(addToCounts, "start.exposure.catMulvalues", numVals) + addToCounts("start.exposure.catMulvalues", numVals) } } testCategoricalMultiple(opt, vl, currentVarShort, "CAT-MUL", thisdata, phenoStartIdx); diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index b396c4c..8184ee3 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -122,7 +122,7 @@ restrictSample2 <- function(vl, varName,pheno, varIndicator,variableVal, userID) # remove people who have no value for indicator variable indName = paste("x",varIndicator,"_0_0",sep=""); cat("Indicator name ", indName, " || ", sep=""); - indvarx = merge(userID, indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + indvarx = merge(userID, vl$indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) indicatorVar = indvarx[,indName] # remove participants with NA value in this related field diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index fa69be2..2f8724c 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -55,7 +55,7 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar ## this is used where there is no zero option e.g. field 100200 defaultValue = dataDataCode$default_value defaultRelatedID = dataDataCode$default_related_field - pheno = setDefaultValue(pheno, defaultValue, defaultRelatedID, thisdata[,"userID", drop=FALSE]) + pheno = setDefaultValue(vl, pheno, defaultValue, defaultRelatedID, thisdata[,"userID", drop=FALSE]) ## all categories coded as <0 we assume are `missing' values pheno = replaceMissingCodes(pheno) @@ -155,7 +155,7 @@ reorderOrderedCategory <- function(pheno,order) { ## sets default value for people with no value in pheno, but with a value in the ## field specified in the default_value_related_field column in the data coding info file. ## the default value is specified in the default_value column in the data coding info file. -setDefaultValue <- function(pheno, defaultValue, defaultRelatedID, userID) { +setDefaultValue <- function(vl, pheno, defaultValue, defaultRelatedID, userID) { if (!is.na(defaultValue) && nchar(defaultValue)>0) { @@ -164,8 +164,8 @@ setDefaultValue <- function(pheno, defaultValue, defaultRelatedID, userID) { indName = paste("x",defaultRelatedID,"_0_0",sep=""); cat("Default related field: ", indName, " || ", sep=""); - indicatorVar = indicatorFields[,indName] - indvarx = merge(userID, indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + indicatorVar = vl$indicatorFields[,indName] + indvarx = merge(userID, vl$indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) indicatorVar = indvarx[,indName] # remove participants with NA value in this related field diff --git a/PHESANT/man/PHESANT-package.Rd b/PHESANT/man/PHESANT-package.Rd index 3625692..fd5d054 100644 --- a/PHESANT/man/PHESANT-package.Rd +++ b/PHESANT/man/PHESANT-package.Rd @@ -28,5 +28,5 @@ Literature or other references for background information To be done } \examples{ - +#example here } diff --git a/PHESANT/phenomeScan.r b/PHESANT/phenomeScan.r index 4e492f2..3aa1ff5 100644 --- a/PHESANT/phenomeScan.r +++ b/PHESANT/phenomeScan.r @@ -30,7 +30,7 @@ if (length(args) == 0) { } else { opt_parser <- OptionParser(option_list=option_list) opt <- parse_args(opt_parser) - opt <- processArgs(opt) + opt <- processArgs(opt, opt_parser) } ## load the files we write to and use @@ -44,7 +44,7 @@ vl <- initVariableLists(opt) d <- loadData(opt, vl) data=d$datax confounders=d$confounders -indicatorFields=d$inds +vl$indicatorFields=d$inds numPreceedingCols = ncol(confounders)-1+2; # confounders,minus id column, plus trait of interest and user ID phenoStartIdx = numPreceedingCols+1; From b13aa1dd77037d82589f36e6518147d0829f4113 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 16:52:56 -0400 Subject: [PATCH 11/27] remove unwanted file --- PHESANT/.Rhistory | 215 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/PHESANT/.Rhistory b/PHESANT/.Rhistory index e69de29..0e542d1 100644 --- a/PHESANT/.Rhistory +++ b/PHESANT/.Rhistory @@ -0,0 +1,215 @@ +library(PHESANT) +rm(list = ls()) +rm(list = ls()) +library(PHESANT) +load(file = "opt.RData") +getwd() +load(file = "../opt.RData") +save(opt, file = "opt_only.RData") +rm(list = ls()) +library(PHESANT) +load(file = "opt_only.RData") +counters=initCounters(); +if (opt$save==FALSE) { +initResultsFiles(); +} +vl=initVariableLists(); +## load data +d <- loadData() +data=d$datax +confounders=d$confounders +indicatorFields=d$inds +numPreceedingCols = ncol(confounders)-1+2; # confounders,minus id column, plus trait of interest and user ID +phenoStartIdx = numPreceedingCols+1; +print("LOADING DONE") +phenoVars=colnames(data); +# remove user id and age and sex columns +phenoVars = phenoVars[-c(1,2)]; # first and second columns are the id and snpScore, respectively, as determined in loadData.r +currentVar=""; +currentVarShort=""; +first=TRUE; +if (opt$save == TRUE) { +derivedBinary <- data.frame(userID=data$userID) +derivedCont <- data.frame(userID=data$userID) +derivedCatOrd <- data.frame(userID=data$userID) +derivedCatUnord <- data.frame(userID=data$userID) +resLogFile = paste(opt$resDir,"data-log-",opt$varTypeArg,".txt",sep="") +sink(resLogFile) +} else { +modelFitLogFile = paste(opt$resDir,"modelfit-log-",opt$varTypeArg,".txt",sep="") +sink(modelFitLogFile) +sink() +resLogFile = paste(opt$resDir,"results-log-",opt$varTypeArg,".txt",sep="") +sink(resLogFile) +} +phenoIdx=0; # zero because then the idx is the position of the previous variable, i.e. the var in currentVar +for (var in phenoVars) { +sink() +# print(var) +sink(resLogFile, append=TRUE) +varx = gsub("^x", "", var); +varx = gsub("_[0-9]+$", "", varx); +varxShort = gsub("^x", "", var); +varxShort = gsub("_[0-9]+_[0-9]+$", "", varxShort); +## test this variable +if (currentVar == varx) { +thisCol = data[,eval(var)] +thisCol = replaceNaN(thisCol) +currentVarValues = cbind.data.frame(currentVarValues, thisCol); +} +else if (currentVarShort == varxShort) { +## different time point of this var so skip +} +else { +## new variable so run test for previous (we have collected all the columns now) +if (first==FALSE) { +thisdata = makeTestDataFrame(data, confounders, currentVarValues) +testAssociations(currentVar, currentVarShort, thisdata) +} +first=FALSE; +## new variable so set values +currentVar = varx; +currentVarShort = varxShort; +currentVarValues = data[,eval(var)] +currentVarValues = replaceNaN(currentVarValues) +} +phenoIdx = phenoIdx + 1; +} +if (phenoIdx>0){ +# last variable so test association +thisdata = makeTestDataFrame(data, confounders, currentVarValues) +testAssociations(currentVar, currentVarShort, thisdata) +} +sink() +# save counters of each path in variable flow +saveCounts() +if (opt$save == TRUE) { +write.table(derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); +write.table(derivedCont, file=paste(opt$resDir,"data-cont-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); +write.table(derivedCatOrd, file=paste(opt$resDir,"data-catord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); +write.table(derivedCatUnord, file=paste(opt$resDir,"data-catunord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); +} +warnings() +library(PHESANT) +rm(list = ls()) +library(PHESANT) +library("optparse") +option_list +library(PHESANT) +rm(list = ls()) +args <- commandArgs(T) +args +commandArgs +? commandArgs +len(args) +lenth(args) +size(args) +class(args) +args[1] +dim(args) +length(args) +rm(list = ls()) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +args <- commandArgs(T) +length(args) == 0 +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +source('~/Documents/GitHub/PHESANT/PHESANT/R/testAssociations.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +rm(list = ls()) +## load data +d <- loadData(vl) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/R/loadData.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +opt$save +View(data) +library(PHESANT) +library(PHESANT) +pkg.env +pkg.env$a +pkg.env$derivedBinary <- data.frame(userID=data$userID) +write.table(pkg.env$derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/R/testContinuous.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +pkg.env$modelFitLogFile +source('~/Documents/GitHub/PHESANT/PHESANT/R/testCategoricalOrdered.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +pkg.env +names(pkg.env) +pkg.env$counters +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/R/storeNewVar.r') +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +? print_help +vl +vl$indicatorFields=d$inds +vl +names(vl) +library(PHESANT) +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') +source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') From f947f6f882d2ac57fdb104b6f6bd99dc44852de1 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 16:55:28 -0400 Subject: [PATCH 12/27] remove histtory file --- .gitignore | 2 - PHESANT/.Rhistory | 215 ---------------------------------------------- 2 files changed, 217 deletions(-) delete mode 100644 PHESANT/.Rhistory diff --git a/.gitignore b/.gitignore index 5fc7a4a..85c0afc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,3 @@ .DS_Store .Rproj.user -PHESANT/.Rhistory -PHESANT/.Rhistory diff --git a/PHESANT/.Rhistory b/PHESANT/.Rhistory deleted file mode 100644 index 0e542d1..0000000 --- a/PHESANT/.Rhistory +++ /dev/null @@ -1,215 +0,0 @@ -library(PHESANT) -rm(list = ls()) -rm(list = ls()) -library(PHESANT) -load(file = "opt.RData") -getwd() -load(file = "../opt.RData") -save(opt, file = "opt_only.RData") -rm(list = ls()) -library(PHESANT) -load(file = "opt_only.RData") -counters=initCounters(); -if (opt$save==FALSE) { -initResultsFiles(); -} -vl=initVariableLists(); -## load data -d <- loadData() -data=d$datax -confounders=d$confounders -indicatorFields=d$inds -numPreceedingCols = ncol(confounders)-1+2; # confounders,minus id column, plus trait of interest and user ID -phenoStartIdx = numPreceedingCols+1; -print("LOADING DONE") -phenoVars=colnames(data); -# remove user id and age and sex columns -phenoVars = phenoVars[-c(1,2)]; # first and second columns are the id and snpScore, respectively, as determined in loadData.r -currentVar=""; -currentVarShort=""; -first=TRUE; -if (opt$save == TRUE) { -derivedBinary <- data.frame(userID=data$userID) -derivedCont <- data.frame(userID=data$userID) -derivedCatOrd <- data.frame(userID=data$userID) -derivedCatUnord <- data.frame(userID=data$userID) -resLogFile = paste(opt$resDir,"data-log-",opt$varTypeArg,".txt",sep="") -sink(resLogFile) -} else { -modelFitLogFile = paste(opt$resDir,"modelfit-log-",opt$varTypeArg,".txt",sep="") -sink(modelFitLogFile) -sink() -resLogFile = paste(opt$resDir,"results-log-",opt$varTypeArg,".txt",sep="") -sink(resLogFile) -} -phenoIdx=0; # zero because then the idx is the position of the previous variable, i.e. the var in currentVar -for (var in phenoVars) { -sink() -# print(var) -sink(resLogFile, append=TRUE) -varx = gsub("^x", "", var); -varx = gsub("_[0-9]+$", "", varx); -varxShort = gsub("^x", "", var); -varxShort = gsub("_[0-9]+_[0-9]+$", "", varxShort); -## test this variable -if (currentVar == varx) { -thisCol = data[,eval(var)] -thisCol = replaceNaN(thisCol) -currentVarValues = cbind.data.frame(currentVarValues, thisCol); -} -else if (currentVarShort == varxShort) { -## different time point of this var so skip -} -else { -## new variable so run test for previous (we have collected all the columns now) -if (first==FALSE) { -thisdata = makeTestDataFrame(data, confounders, currentVarValues) -testAssociations(currentVar, currentVarShort, thisdata) -} -first=FALSE; -## new variable so set values -currentVar = varx; -currentVarShort = varxShort; -currentVarValues = data[,eval(var)] -currentVarValues = replaceNaN(currentVarValues) -} -phenoIdx = phenoIdx + 1; -} -if (phenoIdx>0){ -# last variable so test association -thisdata = makeTestDataFrame(data, confounders, currentVarValues) -testAssociations(currentVar, currentVarShort, thisdata) -} -sink() -# save counters of each path in variable flow -saveCounts() -if (opt$save == TRUE) { -write.table(derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); -write.table(derivedCont, file=paste(opt$resDir,"data-cont-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); -write.table(derivedCatOrd, file=paste(opt$resDir,"data-catord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); -write.table(derivedCatUnord, file=paste(opt$resDir,"data-catunord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); -} -warnings() -library(PHESANT) -rm(list = ls()) -library(PHESANT) -library("optparse") -option_list -library(PHESANT) -rm(list = ls()) -args <- commandArgs(T) -args -commandArgs -? commandArgs -len(args) -lenth(args) -size(args) -class(args) -args[1] -dim(args) -length(args) -rm(list = ls()) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -args <- commandArgs(T) -length(args) == 0 -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -source('~/Documents/GitHub/PHESANT/PHESANT/R/testAssociations.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -rm(list = ls()) -## load data -d <- loadData(vl) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/R/loadData.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -opt$save -View(data) -library(PHESANT) -library(PHESANT) -pkg.env -pkg.env$a -pkg.env$derivedBinary <- data.frame(userID=data$userID) -write.table(pkg.env$derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/R/testContinuous.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -pkg.env$modelFitLogFile -source('~/Documents/GitHub/PHESANT/PHESANT/R/testCategoricalOrdered.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -pkg.env -names(pkg.env) -pkg.env$counters -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/R/storeNewVar.r') -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -? print_help -vl -vl$indicatorFields=d$inds -vl -names(vl) -library(PHESANT) -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') -source('~/Documents/GitHub/PHESANT/PHESANT/phenomeScan.r') From ef7fcb73812922b636c1a20d4c66b20e0cc7aecc Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 16:58:35 -0400 Subject: [PATCH 13/27] ignore Rhistory --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 85c0afc..e97dfd5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .DS_Store .Rproj.user +PHESANT/.Rhistory From e5ddaad45382039fd0d12509a0ea9c8dd86f9686 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 17:00:55 -0400 Subject: [PATCH 14/27] remove Rhisitory at the top level --- .Rhistory | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 .Rhistory diff --git a/.Rhistory b/.Rhistory deleted file mode 100644 index 591e1ef..0000000 --- a/.Rhistory +++ /dev/null @@ -1,2 +0,0 @@ -package.skeleton(name = "PHESANT") -package.skeleton(name = "PHESANT") From 174c5d4469bb07bb811627d979b4a357722f5f12 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Thu, 14 Mar 2019 17:45:41 -0400 Subject: [PATCH 15/27] Code fefactor --- PHESANT/R/{replaceNaN.r => cleanData.R} | 39 +++++++++---------------- PHESANT/R/replaceMissingCodes.r | 37 ----------------------- PHESANT/R/testCatSingle.r | 8 ++++- PHESANT/R/testContinuous.r | 12 ++++---- PHESANT/man/irnt.Rd | 24 +++++++++++++++ 5 files changed, 51 insertions(+), 69 deletions(-) rename PHESANT/R/{replaceNaN.r => cleanData.R} (73%) delete mode 100644 PHESANT/R/replaceMissingCodes.r create mode 100644 PHESANT/man/irnt.Rd diff --git a/PHESANT/R/replaceNaN.r b/PHESANT/R/cleanData.R similarity index 73% rename from PHESANT/R/replaceNaN.r rename to PHESANT/R/cleanData.R index dc3dfd4..5848f2d 100644 --- a/PHESANT/R/replaceNaN.r +++ b/PHESANT/R/cleanData.R @@ -19,29 +19,18 @@ # Replace NaN and empty values with NA in pheno replaceNaN <- function(pheno) { - - if (is.factor(pheno)) { - - phenoReplaced = pheno - - nanStr = which(phenoReplaced=="NaN") - phenoReplaced[nanStr]=NA - - emptyx = which(phenoReplaced=="") - phenoReplaced[emptyx]=NA - - } - else { - - phenoReplaced = pheno - nanx = which(is.nan(phenoReplaced)) - phenoReplaced[nanx] = NA; - - emptyStr = which(phenoReplaced=="") - phenoReplaced[emptyStr] = NA; - - } - - return(phenoReplaced) - + if (is.factor(pheno)) { + phenoReplaced <- pheno + nanStr <- which(phenoReplaced=="NaN") + phenoReplaced[nanStr] <- NA + emptyx <- which(phenoReplaced=="") + phenoReplaced[emptyx] <- NA + } else { + phenoReplaced <- pheno + nanx <- which(is.nan(phenoReplaced)) + phenoReplaced[nanx] <- NA + emptyStr <- which(phenoReplaced=="") + phenoReplaced[emptyStr] <- NA + } + return(phenoReplaced) } diff --git a/PHESANT/R/replaceMissingCodes.r b/PHESANT/R/replaceMissingCodes.r deleted file mode 100644 index f792374..0000000 --- a/PHESANT/R/replaceMissingCodes.r +++ /dev/null @@ -1,37 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# Replace negative values with NA as these are assumed to be missing -replaceMissingCodes <- function(pheno) { - - phenoReplaced = pheno; - - uniqVar = unique(na.omit(phenoReplaced)) - - # variable values <0 are `missing' codes - for (u in uniqVar) { - if (u<0) { - idxU = which(phenoReplaced==u) - phenoReplaced[idxU]=NA - } - } - - return(phenoReplaced) - -} diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index 2f8724c..bd32e4d 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -58,7 +58,7 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar pheno = setDefaultValue(vl, pheno, defaultValue, defaultRelatedID, thisdata[,"userID", drop=FALSE]) ## all categories coded as <0 we assume are `missing' values - pheno = replaceMissingCodes(pheno) + pheno = .replaceMissingCodes(pheno) ## remove categories if < 10 examples pheno = testNumExamples(pheno) @@ -190,3 +190,9 @@ setDefaultValue <- function(vl, pheno, defaultValue, defaultRelatedID, userID) { } +# Replace negative values with NA as these are assumed to be missing +.replaceMissingCodes <- function(pheno) { + phenoReplaced <- pheno + phenoReplaced[phenoReplaced <0 ] <- NA + return(phenoReplaced) +} diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index 0f0b7aa..0f5171d 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -240,11 +240,11 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) } } -irnt <- function(pheno) { - set.seed(1234) - numPhenos = length(which(!is.na(pheno))) - quantilePheno = (rank(pheno, na.last="keep", ties.method="random")-0.5)/numPhenos - phenoIRNT = qnorm(quantilePheno) - return(phenoIRNT); +irnt <- function(pheno, seed = 1234) { + set.seed(seed) + numPhenos <- length(which(!is.na(pheno))) + quantilePheno <- (rank(pheno, na.last="keep", ties.method="random")-0.5)/numPhenos + phenoIRNT <- qnorm(quantilePheno) + return(phenoIRNT) } diff --git a/PHESANT/man/irnt.Rd b/PHESANT/man/irnt.Rd new file mode 100644 index 0000000..ed0323d --- /dev/null +++ b/PHESANT/man/irnt.Rd @@ -0,0 +1,24 @@ +\name{irnt} +\alias{irnt} +\title{ + Inverse Rank Normal Transformation +} +\description{ + Perform an inverse rank normal transformation for input phenotypes +} +\usage{ +irnt(pheno,seed) +} +\arguments{ + \item{pheno}{ + A vector of numeric vlues with NA allowed. +} +\item{seed}{ + Random seed for tie breakers, default to 1234. +} +} + +\value{ + A vector of the transformed values. +} + From 5c2725b5d70b38a4971ba9d447534948f3306b2f Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 09:38:56 -0400 Subject: [PATCH 16/27] Code Refactor: reorgnize files/functions based on objects --- PHESANT/R/binaryLogisticRegression.r | 2 +- PHESANT/R/cleanData.R | 66 +++++++++- PHESANT/R/confounders.r | 121 ++++++++++++++++++ PHESANT/R/equalSizedBins.r | 118 ----------------- PHESANT/R/{getIsExposure.r => exposure.r} | 0 ...oadIndicatorFields.r => indicatorFields.r} | 4 +- PHESANT/R/{initFunctions.r => init.r} | 37 ++++++ PHESANT/R/loadConfounders.r | 4 +- PHESANT/R/loadData.r | 41 +----- PHESANT/R/makeTestDataFrame.r | 30 ----- PHESANT/R/{loadPhenotypes.r => phenotypes.r} | 56 ++++++++ PHESANT/R/reassignValue.r | 73 ----------- PHESANT/R/run.r | 68 ++++++++++ PHESANT/R/storeNewVar.r | 14 -- PHESANT/R/testCatMultiple.r | 2 +- PHESANT/R/testCatSingle.r | 2 +- PHESANT/R/testCategoricalOrdered.r | 2 +- PHESANT/R/testCategoricalUnordered.r | 2 +- PHESANT/R/testContinuous.r | 92 ++++++++++++- PHESANT/R/testInteger.r | 2 +- ...oadTraitOfInterest.r => traitOfInterest.r} | 38 ++++++ PHESANT/R/{processArgs.r => utils.r} | 41 +++--- PHESANT/R/validatePhenotypeInput.r | 85 ------------ PHESANT/R/validateTraitInput.r | 69 ---------- PHESANT/phenomeScan.r | 109 +--------------- .../R/unittests => unittests}/run-tests.sh | 0 .../test_equalSizedBins.r | 0 .../test_reassignValue.r | 0 .../test_testCatMultiple.r | 0 .../test_testCatSingle.r | 0 30 files changed, 514 insertions(+), 564 deletions(-) create mode 100644 PHESANT/R/confounders.r delete mode 100644 PHESANT/R/equalSizedBins.r rename PHESANT/R/{getIsExposure.r => exposure.r} (100%) rename PHESANT/R/{loadIndicatorFields.r => indicatorFields.r} (97%) rename PHESANT/R/{initFunctions.r => init.r} (65%) delete mode 100644 PHESANT/R/makeTestDataFrame.r rename PHESANT/R/{loadPhenotypes.r => phenotypes.r} (69%) delete mode 100644 PHESANT/R/reassignValue.r create mode 100644 PHESANT/R/run.r delete mode 100644 PHESANT/R/storeNewVar.r rename PHESANT/R/{loadTraitOfInterest.r => traitOfInterest.r} (58%) rename PHESANT/R/{processArgs.r => utils.r} (77%) delete mode 100644 PHESANT/R/validatePhenotypeInput.r delete mode 100644 PHESANT/R/validateTraitInput.r rename {PHESANT/R/unittests => unittests}/run-tests.sh (100%) rename {PHESANT/R/unittests => unittests}/test_equalSizedBins.r (100%) rename {PHESANT/R/unittests => unittests}/test_reassignValue.r (100%) rename {PHESANT/R/unittests => unittests}/test_testCatMultiple.r (100%) rename {PHESANT/R/unittests => unittests}/test_testCatSingle.r (100%) diff --git a/PHESANT/R/binaryLogisticRegression.r b/PHESANT/R/binaryLogisticRegression.r index 666b8b5..1112377 100644 --- a/PHESANT/R/binaryLogisticRegression.r +++ b/PHESANT/R/binaryLogisticRegression.r @@ -52,7 +52,7 @@ binaryLogisticRegression <- function(opt, varName, varType, thisdata, isExposure if (opt$save == TRUE) { # add pheno to dataframe - storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'bin') + .storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'bin') cat("SUCCESS results-logistic-binary "); incrementCounter("success.binary") } diff --git a/PHESANT/R/cleanData.R b/PHESANT/R/cleanData.R index 5848f2d..7f27c4d 100644 --- a/PHESANT/R/cleanData.R +++ b/PHESANT/R/cleanData.R @@ -16,9 +16,73 @@ # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. +# Reassigns values as specified in data coding info file +reassignValue <- function(vl, pheno, varName) { + # get data code info - whether this data code is ordinal or not and any reordering and resassignments + dataPheno <- vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),] + dataCode <- dataPheno$DATA_CODING + + # not all variables will have a data code info row + dataCodeRow <- which(vl$dataCodeInfo$dataCode==dataCode) + if (length(dataCodeRow)==0) { #do nothing + } else if (length(dataCodeRow)==1) { + dataDataCode <- vl$dataCodeInfo[dataCodeRow,] + reassignments <- as.character(dataDataCode$reassignments) + + # Reassigns values in pheno, as specified in resassignments argument + # can be NA if row not included in data coding info file + if (!is.na(reassignments) && nchar(reassignments)>0) { + reassignParts <- unlist(strsplit(reassignments,"\\|")) + cat(paste("reassignments: ", reassignments, " || ", sep="")) + + # do each reassignment + for(i in reassignParts) { + reassignParts <- unlist(strsplit(i,"=")) + + # matrix version + idx <- which(pheno==reassignParts[1],arr.ind=TRUE) + pheno[idx] <- strtoi(reassignParts[2]) + } + + ## see if type has changed (this happens for field 216 (X changed to -1)) + ## as.numeric will set non numeric to NA so we know if it's ok to do this by seeing if there are extra NA's after the conversion + pNum = as.numeric(unlist(pheno)) + isNum = length(which(is.na(pheno), arr.ind=TRUE))==length(which(is.na(pNum), arr.ind=TRUE)) + if (isNum) { + pheno = pNum + } + } + } else { + cat("WARNING: >1 ROWS IN DATA CODE INFO FILE || ") + + } + return(pheno) +} + +fixOddFieldsToCatMul <- function(vl, data) { + # examples are variables: 40006, 40011, 40012, 40013 + # get all variables that need their instances changing to arrays + dataPheno = vl$phenoInfo[which(vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT=="YES-INSTANCES"),]; + for (i in 1:nrow(dataPheno)) { + varID = dataPheno[i,]$FieldID; + varidString = paste("x",varID,"_", sep=""); + + # get all columns in data dataframe for this variable + colIdxs = which(grepl(varidString,names(data))); + + # change format from xvarid_0_0, xvarid_1_0, xvarid_2_0, to xvarid_0_0, xvarid_0_1, xvarid_0_2 + count = 0; + for (j in colIdxs) { + colnames(data)[j] <- paste(varidString, "0_", count, sep="") + count = count + 1; + } + } + return(data) +} + # Replace NaN and empty values with NA in pheno -replaceNaN <- function(pheno) { +.replaceNaN <- function(pheno) { if (is.factor(pheno)) { phenoReplaced <- pheno nanStr <- which(phenoReplaced=="NaN") diff --git a/PHESANT/R/confounders.r b/PHESANT/R/confounders.r new file mode 100644 index 0000000..9fc1c10 --- /dev/null +++ b/PHESANT/R/confounders.r @@ -0,0 +1,121 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +## +## loads confounder variables from phenotype file + +loadConfounders <- function(opt, phenotypes) { + if (opt$save==TRUE) { + # saving not running tests so we add a fake confounder + numRows = nrow(phenotypes) + data = cbind.data.frame(phenotypes$userID, rep(-1, numRows)) + colnames(data)[1] <- "userID" + colnames(data)[2] <- "conf1" + return(data) + } else { + if (!is.null(opt$confounderfile)) { + print("Loading confounders from confounder file ...") + confs = fread(opt$confounderfile, sep=',', header=TRUE, data.table=FALSE) + confs = lapply(confs,function(x) type.convert(as.character(x))) + confs = as.data.frame(confs) + + ## find userID column and change name to userID + idx = which(colnames(confs) == opt$userId) + confs = confs[,c(opt$userID,setdiff(colnames(confs),opt$userID))] + colnames(confs)[1] <- "userID" + } else { + print("Loading confounders from phenotypes file ...") + confNames = .getConfounderNames(opt) + ##### + ##### extract confounders from data file + confs = fread(opt$phenofile, select=confNames, sep=',', header=TRUE, data.table=FALSE) + confs = lapply(confs,function(x) type.convert(as.character(x))) + confs = as.data.frame(confs) + + ##### + ##### process genetic batch to create genetic chip variable + if (opt$genetic == TRUE) { + genoBatch = confs[,"x22000_0_0"] + # chip comes from batch field 22000 + genoChip = rep.int(NA,nrow(confs)); + idxForVar = which(genoBatch<0); + genoChip[idxForVar] = 0; + idxForVar = which(genoBatch>=0 & genoBatch<2000); + genoChip[idxForVar] = 1; + + # remove geno batch from and add geno chip to confounders + confs = confs[,-which(names(confs) == "x22000_0_0")] + confs = cbind.data.frame(confs, genoChip) + } + + ##### + ##### Convert assessment centre to an indicator variable + if (opt$sensitivity==TRUE) { + confs$x54_0_0 = as.factor(confs$x54_0_0) + assCentre = model.matrix(~confs$x54_0_0) + assCentre = assCentre[,2:ncol(assCentre)] + confs = cbind(confs, assCentre) + confs$x54_0_0 = NULL + } + colnames(confs)[1] <- "userID" + + } + + # remove any rows with no values + print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) + confsComp = complete.cases(confs) + print(paste("Number of INCOMPLETE rows removed from confounder data: ", length(which(confsComp==FALSE)),sep="")) + confs = confs[confsComp==TRUE,] + print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) + + print("Confounder columns:") + print(names(confs)) + + return(confs) + } +} + +.getConfounderNames <- function(opt) { + ##### + ##### first get vector of confounder names + # age and sex + confNames = c(opt$userId, "x21022_0_0", "x31_0_0") + + # if genetic trait of interest then adjust for genotype chip + # and also let user choose sensitivity analysis that also adjusts for top 10 genetic principal components and assessment centre + if (opt$genetic == TRUE) { + confNames = append(confNames, "x22000_0_0") + if (opt$sensitivity==TRUE) { + confNames = append(confNames, c("x22009_0_1", "x22009_0_2", "x22009_0_3", "x22009_0_4", "x22009_0_5", "x22009_0_6", "x22009_0_7", "x22009_0_8", "x22009_0_9", "x22009_0_10", "x54_0_0")) + print("Adjusting for age, sex, genotype chip, top 10 genetic principal components and assessment centre") + } else { + print("Adjusting for age, sex and genotype chip") + } + } else { + # non genetic trait of interest, then sensitivity adjusts for assessment center + if (opt$sensitivity==TRUE) { + confNames = append(confNames, "x54_0_0") + print("Adjusting for age, sex and assessment centre") + } else { + print("Adjusting for age and sex") + } + } + return(confNames) +} + + diff --git a/PHESANT/R/equalSizedBins.r b/PHESANT/R/equalSizedBins.r deleted file mode 100644 index 15570cd..0000000 --- a/PHESANT/R/equalSizedBins.r +++ /dev/null @@ -1,118 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -# splits the pheno into 3 bins with the cut points between values rather at the exact value for the quantile -equalSizedBins <- function(phenoAvg) { - - ## equal sized bins - q = quantile(phenoAvg, probs=c(1/3,2/3), na.rm=TRUE) - - minX = min(phenoAvg, na.rm=TRUE) - maxX = max(phenoAvg, na.rm=TRUE) - - phenoBinned = phenoAvg; - if (q[1]==minX) { - # edge case - quantile value is lowest value - - # assign min value as cat1 - idx1 = which(phenoAvg==q[1]); - phenoBinned[idx1] = 0; - - # divide remaining values into cat2 and cat3 - phenoAvgRemaining = phenoAvg[which(phenoAvg!=q[1])]; - qx = quantile(phenoAvgRemaining, probs=c(0.5), na.rm=TRUE) - minXX = min(phenoAvgRemaining, na.rm=TRUE) - maxXX = max(phenoAvgRemaining, na.rm=TRUE) - - if (qx[1]==minXX) { - # edge case again - quantile value is lowest value - idx2 = which(phenoAvg==qx[1]); - idx3 = which(phenoAvg>qx[1]); - } - else if (qx[1]==maxXX) { - # edge case again - quantile value is max value - idx2 = which(phenoAvgq[1]); - idx3 = which(phenoAvg==qx[1]); - } - else { - idx2 = which(phenoAvgq[1]); - idx3 = which(phenoAvg>=qx[1]); - } - phenoBinned[idx2] = 1; - phenoBinned[idx3] = 2; - } - else if (q[2]==maxX) { - # edge case - quantile value is highest value - - # assign max value as cat3 - idx3 = which(phenoAvg==q[2]); - phenoBinned[idx3] = 2; - - # divide remaining values into cat1 and cat2 - phenoAvgRemaining = phenoAvg[which(phenoAvg!=q[2])]; - qx = quantile(phenoAvgRemaining, probs=c(0.5), na.rm=TRUE) - minXX = min(phenoAvgRemaining, na.rm=TRUE) - maxXX = max(phenoAvgRemaining, na.rm=TRUE) - - if (qx[1]==minXX) { - # edge case again - quantile value is lowest value - idx1 = which(phenoAvg==qx[1]); - idx2 = which(phenoAvg>qx[1] & phenoAvg=qx[1] & phenoAvg this value - phenoBinned = phenoAvg; - idx1 = which(phenoAvgq[2]); - phenoBinned[idx1] = 0; - phenoBinned[idx2] = 1; - phenoBinned[idx3] = 2; - } - else { - # standard case - split the data into three roughly equal parts where - # cat1=q2 - phenoBinned = phenoAvg; - idx1 = which(phenoAvg=q[1] & phenoAvg=q[2]); - phenoBinned[idx1] = 0; - phenoBinned[idx2] = 1; - phenoBinned[idx3] = 2; - } - - cat("cat N: ", length(idx1),", ",length(idx2),", ",length(idx3), " || ", sep=""); - - return(phenoBinned); -} - diff --git a/PHESANT/R/getIsExposure.r b/PHESANT/R/exposure.r similarity index 100% rename from PHESANT/R/getIsExposure.r rename to PHESANT/R/exposure.r diff --git a/PHESANT/R/loadIndicatorFields.r b/PHESANT/R/indicatorFields.r similarity index 97% rename from PHESANT/R/loadIndicatorFields.r rename to PHESANT/R/indicatorFields.r index 1a815cd..04c860a 100644 --- a/PHESANT/R/loadIndicatorFields.r +++ b/PHESANT/R/indicatorFields.r @@ -27,7 +27,7 @@ loadIndicatorFields <- function(opt, vl, phenosToTest) { indVars = c(opt$userId) ## add indicator variables to pheno data - indVars = addIndicatorVariables(vl, indVars, phenosToTest, phenoVarsAll) + indVars = .addIndicatorVariables(vl, indVars, phenosToTest, phenoVarsAll) if (length(indVars)>1) { # not just user id column print("Loading required related variable(s):") @@ -44,7 +44,7 @@ loadIndicatorFields <- function(opt, vl, phenosToTest) { } -addIndicatorVariables <- function(vl, indVars, phenosToTest, phenoVarsAll) { +.addIndicatorVariables <- function(vl, indVars, phenosToTest, phenoVarsAll) { ##### default value related fields for data codes # get list of all indicator variables from outcome info file diff --git a/PHESANT/R/initFunctions.r b/PHESANT/R/init.r similarity index 65% rename from PHESANT/R/initFunctions.r rename to PHESANT/R/init.r index 7e9f996..abcce78 100644 --- a/PHESANT/R/initFunctions.r +++ b/PHESANT/R/init.r @@ -17,6 +17,43 @@ # DEALINGS IN THE SOFTWARE. +initData <-function(opt) { + ## load the files we write to and use + initCounters() + if (opt$save==FALSE) { + initResultsFiles(opt) + } + vl <- initVariableLists(opt) + + ## load data + d <- loadData(opt, vl) + data <- d$datax + confounders <- d$confounders + vl$indicatorFields <- d$inds + + numPreceedingCols <- ncol(confounders)-1+2 + phenoStartIdx <- numPreceedingCols + 1; # confounders,minus id column, plus trait of interest and user ID + phenoVars <- colnames(data)[-c(1,2)] # first and second columns are the id and snpScore, respectively + + return(list(data = data, vl = vl, confounders = confounders, phenoStartIdx = phenoStartIdx, phenoVars = phenoVars)) +} + +initEnv <- function(opt, data) { + if (opt$save == TRUE) { + pkg.env$derivedBinary <- data.frame(userID=data$userID) + pkg.env$derivedCont <- data.frame(userID=data$userID) + pkg.env$derivedCatOrd <- data.frame(userID=data$userID) + pkg.env$derivedCatUnord <- data.frame(userID=data$userID) + pkg.env$resLogFile = paste(opt$resDir,"data-log-",opt$varTypeArg,".txt",sep="") + sink(pkg.env$resLogFile) + } else { + pkg.env$modelFitLogFile = paste(opt$resDir,"modelfit-log-",opt$varTypeArg,".txt",sep="") + sink(pkg.env$modelFitLogFile) + sink() + pkg.env$resLogFile = paste(opt$resDir,"results-log-",opt$varTypeArg,".txt",sep="") + sink(pkg.env$resLogFile) + } +} # create new results files and headers initResultsFiles <- function(opt) { diff --git a/PHESANT/R/loadConfounders.r b/PHESANT/R/loadConfounders.r index b2cdda2..9fc1c10 100644 --- a/PHESANT/R/loadConfounders.r +++ b/PHESANT/R/loadConfounders.r @@ -40,7 +40,7 @@ loadConfounders <- function(opt, phenotypes) { colnames(confs)[1] <- "userID" } else { print("Loading confounders from phenotypes file ...") - confNames = getConfounderNames(opt) + confNames = .getConfounderNames(opt) ##### ##### extract confounders from data file confs = fread(opt$phenofile, select=confNames, sep=',', header=TRUE, data.table=FALSE) @@ -90,7 +90,7 @@ loadConfounders <- function(opt, phenotypes) { } } -getConfounderNames <- function(opt) { +.getConfounderNames <- function(opt) { ##### ##### first get vector of confounder names # age and sex diff --git a/PHESANT/R/loadData.r b/PHESANT/R/loadData.r index e73ca27..405af81 100644 --- a/PHESANT/R/loadData.r +++ b/PHESANT/R/loadData.r @@ -23,22 +23,12 @@ # returns an object holding these two data frames loadData <- function(opt, vl) { - ##### validating data - ## check phenotype file headers - validatePhenotypeInput(opt) + validatePhenotypeInputHeader(opt) + validateTraitInputHeader(opt) - ## check trait of interest file headers - validateTraitInput(opt) - - ##### load data - ## load phenotype print("Loading phenotypes ...") phenotype = loadPhenotypes(opt) - - ## load trait of interest toi <- loadTraitOfInterest(opt, phenotype) - - ## load confounders conf <- loadConfounders(opt, phenotype) ## add trait of interest to phenotype data frame and remove rows with no trait of interest @@ -47,13 +37,11 @@ loadData <- function(opt, vl) { ## remove any rows with no trait of interest idxNotEmpty = which(!is.na(phenotype[,"geno"])) - if (opt$save == TRUE) { print(paste("Phenotype file has ", nrow(phenotype), " rows.", sep="")) } else { print(paste("Phenotype file has ", nrow(phenotype), " rows with ", length(idxNotEmpty), " not NA for trait of interest (",opt$traitofinterest,").", sep="")) } - phenotype = phenotype[idxNotEmpty,] # match ids from not empty phenotypes list @@ -61,9 +49,9 @@ loadData <- function(opt, vl) { conf = conf[confsIdx,] if (nrow(phenotype)==0) { - stop("No examples with row in both trait of interest and phenotype files", call.=FALSE) + stop("No examples with row in both trait of interest and phenotype files", call.=FALSE) } else { - print(paste("Phenotype and trait of interest data files merged, with", nrow(phenotype),"examples")) + print(paste("Phenotype and trait of interest data files merged, with", nrow(phenotype),"examples")) } # some fields are fixed that have a field type as cat single but we want to treat them like cat mult @@ -73,24 +61,3 @@ loadData <- function(opt, vl) { return(d) } -fixOddFieldsToCatMul <- function(vl, data) { - # examples are variables: 40006, 40011, 40012, 40013 - # get all variables that need their instances changing to arrays - dataPheno = vl$phenoInfo[which(vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT=="YES-INSTANCES"),]; - for (i in 1:nrow(dataPheno)) { - varID = dataPheno[i,]$FieldID; - varidString = paste("x",varID,"_", sep=""); - - # get all columns in data dataframe for this variable - colIdxs = which(grepl(varidString,names(data))); - - # change format from xvarid_0_0, xvarid_1_0, xvarid_2_0, to xvarid_0_0, xvarid_0_1, xvarid_0_2 - count = 0; - for (j in colIdxs) { - colnames(data)[j] <- paste(varidString, "0_", count, sep="") - count = count + 1; - } - } - return(data) -} - diff --git a/PHESANT/R/makeTestDataFrame.r b/PHESANT/R/makeTestDataFrame.r deleted file mode 100644 index 39963b2..0000000 --- a/PHESANT/R/makeTestDataFrame.r +++ /dev/null @@ -1,30 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -## makes a smaller data frame containing the data for a particular test -makeTestDataFrame <- function(datax, confounders, currentVarValues) { - - thisdata = datax[,c("geno", "userID")] - thisdata = merge(thisdata, confounders, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) - currentVarValues = cbind.data.frame(datax$userID, currentVarValues) - colnames(currentVarValues)[1] = "userID" - thisdata = merge(thisdata, currentVarValues, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) - return(thisdata) - -} diff --git a/PHESANT/R/loadPhenotypes.r b/PHESANT/R/phenotypes.r similarity index 69% rename from PHESANT/R/loadPhenotypes.r rename to PHESANT/R/phenotypes.r index 0a55079..16b28e8 100644 --- a/PHESANT/R/loadPhenotypes.r +++ b/PHESANT/R/phenotypes.r @@ -108,4 +108,60 @@ loadPhenotypes <- function(opt) { } +# Validate the contents of the phenotype file +validatePhenotypeInputHeader <- function(opt) { + print("Validating phenotype data ...") + + ## get just first row so we can check the column names + phenoIn = read.table(opt$phenofile, header=1, nrows=1, sep=',') + + ### pheno file validation + print(paste("Number of columns in phenotype file: ", ncol(phenoIn),sep="")) + ## check user id exists in pheno file + idx1 = which(names(phenoIn) == opt$userId); + if (length(idx1)==0) { + stop(paste("phenotype file doesn't contain userID colunn:", opt$userId), call.=FALSE) + } + + # we only need the confounders if we are actually running the tests + if (opt$save==FALSE & is.null(opt$confounderfile)) { + ## confounder variables exist in pheno file + idx = which(names(phenoIn) == "x21022_0_0"); + if (length(idx)==0) { + stop("phenotype file doesn't contain required age colunn: x21022_0_0", call.=FALSE) + } + + idx = which(names(phenoIn) == "x31_0_0"); + if (length(idx)==0) { + stop("phenotype file doesn't contain required sex colunn: x31_0_0", call.=FALSE) + } + + if (opt$genetic ==TRUE) { + idx = which(names(phenoIn) == "x22000_0_0"); + if (length(idx)==0) { + stop("phenotype file doesn't contain required genetic batch colunn: x22000_0_0", call.=FALSE) + } + } + + ## if running with sensitivity option then check extra columns exist in pheno file (genetic PCs and assessment centre) + if (opt$sensitivity==TRUE) { + if (opt$genetic ==TRUE) { + ## check first 10 genetic PCs exist + for (i in 1:10) { + idx = which(names(phenoIn) == paste("x22009_0_", i, sep="")); + if (length(idx)==0) { + stop(paste("phenotype file doesn't contain required genetic principal component colunn: x22009_0_", i, sep=""), call.=FALSE) + } + } + } + ## assessment centre field + idx = which(names(phenoIn) == "x54_0_0"); + if (length(idx)==0) { + stop("phenotype file doesn't contain required assessment centre colunn: x54_0_0", call.=FALSE) + } + } + + } + print("Phenotype file validated") +} diff --git a/PHESANT/R/reassignValue.r b/PHESANT/R/reassignValue.r deleted file mode 100644 index c9bc289..0000000 --- a/PHESANT/R/reassignValue.r +++ /dev/null @@ -1,73 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# Reassigns values as specified in data coding info file -reassignValue <- function(vl, pheno, varName) { - - # get data code info - whether this data code is ordinal or not and any reordering and resassignments - dataPheno = vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),]; - dataCode = dataPheno$DATA_CODING; - - # not all variables will have a data code info row - dataCodeRow = which(vl$dataCodeInfo$dataCode==dataCode); - - if (length(dataCodeRow)==0) { - return(pheno); - } - else if (length(dataCodeRow)>1) { - cat("WARNING: >1 ROWS IN DATA CODE INFO FILE || ") - return(pheno); - } - - dataDataCode = vl$dataCodeInfo[dataCodeRow,]; - reassignments = as.character(dataDataCode$reassignments); - - return(reassignValue2(vl, pheno, reassignments)) - -} - -# Reassigns values in pheno, as specified in resassignments argument -reassignValue2 <- function(vl, pheno, reassignments) { - - # can be NA if row not included in data coding info file - if (!is.na(reassignments) && nchar(reassignments)>0) { - - reassignParts = unlist(strsplit(reassignments,"\\|")); - cat(paste("reassignments: ", reassignments, " || ", sep="")); - - # do each reassignment - for(i in reassignParts) { - reassignParts = unlist(strsplit(i,"=")); - - # matrix version - idx = which(pheno==reassignParts[1],arr.ind=TRUE) - pheno[idx]=strtoi(reassignParts[2]); - } - - ## see if type has changed (this happens for field 216 (X changed to -1)) - ## as.numeric will set non numeric to NA so we know if it's ok to do this by seeing if there are extra NA's after the conversion - pNum = as.numeric(unlist(pheno)) - isNum = length(which(is.na(pheno), arr.ind=TRUE))==length(which(is.na(pNum), arr.ind=TRUE)) - if (isNum) { - pheno = pNum - } - } - - return(pheno) -} diff --git a/PHESANT/R/run.r b/PHESANT/R/run.r new file mode 100644 index 0000000..70ed305 --- /dev/null +++ b/PHESANT/R/run.r @@ -0,0 +1,68 @@ +run <- function(opt) { + input <- initData(opt) + data <- input$data + vl <- input$vl + confounders <- input$confounders + phenoStartIdx <- input$phenoStartIdx + phenoVars <- input$phenoVars + print("LOADING DONE") + + initEnv(opt, data) + + currentVar <- "" + currentVarShort <- "" + first <- TRUE + + phenoIdx=0; # zero because then the idx is the position of the previous variable, i.e. the var in currentVar + for (var in phenoVars) { + sink() + sink(pkg.env$resLogFile, append=TRUE) + + varx <- gsub("^x", "", var) + varx <- gsub("_[0-9]+$", "", varx) + + varxShort <- gsub("^x", "", var) + varxShort <- gsub("_[0-9]+_[0-9]+$", "", varxShort) + + ## test this variable + if (currentVar == varx) { + thisCol <- data[,eval(var)] + thisCol <- .replaceNaN(thisCol) + currentVarValues <- cbind.data.frame(currentVarValues, thisCol) + } else if (currentVarShort == varxShort) { + ## different time point of this var so skip + } else { + ## new variable so run test for previous (we have collected all the columns now) + if (first==FALSE) { + thisdata <- makeTestDataFrame(data, confounders, currentVarValues) + testAssociations(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) + } + first <- FALSE + + ## new variable so set values + currentVar <- varx + currentVarShort <- varxShort + + currentVarValues <- data[,eval(var)] + currentVarValues <- .replaceNaN(currentVarValues) + } + phenoIdx <- phenoIdx + 1 + } + + if (phenoIdx>0){ + # last variable so test association + thisdata = makeTestDataFrame(data, confounders, currentVarValues) + testAssociations(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) + } + sink() + + # save counters of each path in variable flow + saveCounts(opt) + if (opt$save == TRUE) { + write.table(pkg.env$derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(pkg.env$derivedCont, file=paste(opt$resDir,"data-cont-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(pkg.env$derivedCatOrd, file=paste(opt$resDir,"data-catord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(pkg.env$derivedCatUnord, file=paste(opt$resDir,"data-catunord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + } + +} \ No newline at end of file diff --git a/PHESANT/R/storeNewVar.r b/PHESANT/R/storeNewVar.r deleted file mode 100644 index e2f640a..0000000 --- a/PHESANT/R/storeNewVar.r +++ /dev/null @@ -1,14 +0,0 @@ -storeNewVar <- function(userIDData, phenoData, varName, type) { - # add pheno to dataframe - newdata = data.frame(userID=userIDData, newvar=phenoData) - names(newdata)[names(newdata)=="newvar"] = varName - if (type == "bin") { - pkg.env$derivedBinary <- merge(pkg.env$derivedBinary, newdata, by="userID", all=TRUE); - } else if (type == "cont") { - pkg.env$derivedCont <- merge(pkg.env$derivedCont, newdata, by="userID", all=TRUE); - } else if (type == "catOrd") { - pkg.env$derivedCatOrd <- merge(pkg.env$derivedCatOrd, newdata, by="userID", all=TRUE); - } else if (type == "catUnord") { - pkg.env$derivedCatUnord <- merge(pkg.env$derivedCatUnord, newdata, by="userID", all=TRUE); - } -} diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index 8184ee3..508c4e0 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -126,7 +126,7 @@ restrictSample2 <- function(vl, varName,pheno, varIndicator,variableVal, userID) indicatorVar = indvarx[,indName] # remove participants with NA value in this related field - indicatorVar = replaceNaN(indicatorVar) + indicatorVar = .replaceNaN(indicatorVar) naIdxs = which(is.na(indicatorVar)) cat("Remove indicator var NAs: ", length(naIdxs), " || ", sep=""); diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index bd32e4d..d840a22 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -169,7 +169,7 @@ setDefaultValue <- function(vl, pheno, defaultValue, defaultRelatedID, userID) { indicatorVar = indvarx[,indName] # remove participants with NA value in this related field - indicatorVar = replaceNaN(indicatorVar) + indicatorVar = .replaceNaN(indicatorVar) # check if there are already examples with default value and if so display warning numWithDefault = length(which(pheno==defaultValue)) diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index 1f208db..78c8ea2 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -50,7 +50,7 @@ testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoSta if (opt$save == TRUE) { # add pheno to dataframe - storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catOrd') + .storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catOrd') cat("SUCCESS results-ordered-logistic"); incrementCounter("success.ordCat") } diff --git a/PHESANT/R/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r index dce7cb0..320f522 100644 --- a/PHESANT/R/testCategoricalUnordered.r +++ b/PHESANT/R/testCategoricalUnordered.r @@ -46,7 +46,7 @@ testCategoricalUnordered <- function(opt, vl, varName, varType, thisdata, phenoS if (opt$save == TRUE) { # add pheno to dataframe - storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catUnord') + .storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catUnord') cat("SUCCESS results-notordered-logistic "); incrementCounter("success.unordCat") } diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index 0f5171d..883ea75 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -100,7 +100,7 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) incrementCounter("cont.ordcattry") ## equal sized bins - phenoBinned = equalSizedBins(phenoAvg); + phenoBinned = .equalSizedBins(phenoAvg) # check number of people in each bin bin0Num = length(which(phenoBinned==0)) @@ -174,7 +174,7 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) if (opt$save == TRUE) { # add pheno to dataframe - storeNewVar(thisdata[,"userID"], phenoIRNT, varName, 'cont') + .storeNewVar(thisdata[,"userID"], phenoIRNT, varName, 'cont') cat("SUCCESS results-linear"); incrementCounter("success.continuous") } @@ -248,3 +248,91 @@ irnt <- function(pheno, seed = 1234) { return(phenoIRNT) } +# splits the pheno into 3 bins with the cut points between values rather at the exact value for the quantile +.equalSizedBins <- function(phenoAvg) { + ## equal sized bins + q <- quantile(phenoAvg, probs=c(1/3,2/3), na.rm=TRUE) + + minX <- min(phenoAvg, na.rm=TRUE) + maxX <- max(phenoAvg, na.rm=TRUE) + + phenoBinned <- phenoAvg + if (q[1]==minX) { + # edge case - quantile value is lowest value + # assign min value as cat1 + idx1 <- which(phenoAvg==q[1]) + phenoBinned[idx1] <- 0 + + # divide remaining values into cat2 and cat3 + phenoAvgRemaining <- phenoAvg[which(phenoAvg!=q[1])] + qx <- quantile(phenoAvgRemaining, probs=c(0.5), na.rm=TRUE) + minXX <- min(phenoAvgRemaining, na.rm=TRUE) + maxXX <- max(phenoAvgRemaining, na.rm=TRUE) + + if (qx[1]==minXX) { + # edge case again - quantile value is lowest value + idx2 <- which(phenoAvg==qx[1]) + idx3 <- which(phenoAvg>qx[1]) + } else if (qx[1]==maxXX) { + # edge case again - quantile value is max value + idx2 <- which(phenoAvgq[1]) + idx3 <- which(phenoAvg==qx[1]) + } else { + idx2 <- which(phenoAvgq[1]) + idx3 <- which(phenoAvg>=qx[1]) + } + phenoBinned[idx2] <- 1 + phenoBinned[idx3] <- 2 + } else if (q[2]==maxX) { + # edge case - quantile value is highest value + # assign max value as cat3 + idx3 <- which(phenoAvg==q[2]) + phenoBinned[idx3] <- 2 + + # divide remaining values into cat1 and cat2 + phenoAvgRemaining <- phenoAvg[which(phenoAvg!=q[2])] + qx <- quantile(phenoAvgRemaining, probs=c(0.5), na.rm=TRUE) + minXX <- min(phenoAvgRemaining, na.rm=TRUE) + maxXX <- max(phenoAvgRemaining, na.rm=TRUE) + + if (qx[1]==minXX) { + # edge case again - quantile value is lowest value + idx1 <- which(phenoAvg==qx[1]) + idx2 <- which(phenoAvg>qx[1] & phenoAvg=qx[1] & phenoAvg this value + phenoBinned <- phenoAvg + idx1 <- which(phenoAvgq[2]) + phenoBinned[idx1] <- 0 + phenoBinned[idx2] <- 1 + phenoBinned[idx3] <- 2 + } else { + # standard case - split the data into three roughly equal parts where + # cat1=q2 + phenoBinned <- phenoAvg + idx1 <- which(phenoAvg=q[1] & phenoAvg=q[2]) + phenoBinned[idx1] <- 0 + phenoBinned[idx2] <- 1 + phenoBinned[idx3] <- 2 + } + + cat("cat N: ", length(idx1),", ",length(idx2),", ",length(idx3), " || ", sep="") + return(phenoBinned) +} diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index 5ffac76..8591435 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -40,7 +40,7 @@ testInteger <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { phenoAvg = rowMeans(pheno, na.rm=TRUE) # if participant only has NA values then NaN is generated so we convert back to NA - phenoAvg = replaceNaN(phenoAvg) + phenoAvg = .replaceNaN(phenoAvg) } else { phenoAvg = pheno diff --git a/PHESANT/R/loadTraitOfInterest.r b/PHESANT/R/traitOfInterest.r similarity index 58% rename from PHESANT/R/loadTraitOfInterest.r rename to PHESANT/R/traitOfInterest.r index 7f637f2..537686e 100644 --- a/PHESANT/R/loadTraitOfInterest.r +++ b/PHESANT/R/traitOfInterest.r @@ -43,3 +43,41 @@ loadTraitOfInterest <- function(opt, phenotypes) { return(data) } + +# Validate the contents of the trait of interest file +validateTraitInputHeader <- function(opt) { + if (opt$save!=TRUE) { + + ### get header of trait of interest file or pheno file (if no trait of interest file is specified + print("Validating trait of interest data ...") + if (is.null(opt$traitofinterestfile)) { + snpIn = read.table(opt$phenofile, header=1, nrows=1, sep=',') + } else { + snpIn = read.table(opt$traitofinterestfile, header=1, nrows=1, sep=',') + } + + ### trait of interest file validation + print(paste("Number of columns in trait of interest file:", ncol(snpIn),sep="")) + ## check user id exists in snp file + idx1 = which(names(snpIn) == opt$userId); + if (length(idx1)==0) { + if (is.null(opt$traitofinterestfile)) { + stop(paste("Phenotype file doesn't contain userID colunn:", opt$userId), call.=FALSE) + } else { + stop(paste("Trait of interest file doesn't contain userID colunn:", opt$userId), call.=FALSE) + } + } + + ## check trait of interest exists in trait of interest file + idx2 = which(names(snpIn) == opt$traitofinterest); + if (length(idx2)==0) { + if (is.null(opt$traitofinterestfile)) { + stop(paste("No trait of interest file specified, and phenotypes file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) + } else { + stop(paste("Trait of interest file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) + } + } + print("Trait of interest file validated") + } +} + diff --git a/PHESANT/R/processArgs.r b/PHESANT/R/utils.r similarity index 77% rename from PHESANT/R/processArgs.r rename to PHESANT/R/utils.r index bca4872..2e560fb 100644 --- a/PHESANT/R/processArgs.r +++ b/PHESANT/R/utils.r @@ -1,21 +1,28 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. +.storeNewVar <- function(userIDData, phenoData, varName, type) { + # add pheno to dataframe + newdata = data.frame(userID=userIDData, newvar=phenoData) + names(newdata)[names(newdata)=="newvar"] = varName + if (type == "bin") { + pkg.env$derivedBinary <- merge(pkg.env$derivedBinary, newdata, by="userID", all=TRUE); + } else if (type == "cont") { + pkg.env$derivedCont <- merge(pkg.env$derivedCont, newdata, by="userID", all=TRUE); + } else if (type == "catOrd") { + pkg.env$derivedCatOrd <- merge(pkg.env$derivedCatOrd, newdata, by="userID", all=TRUE); + } else if (type == "catUnord") { + pkg.env$derivedCatUnord <- merge(pkg.env$derivedCatUnord, newdata, by="userID", all=TRUE); + } +} + +## makes a smaller data frame containing the data for a particular test +makeTestDataFrame <- function(datax, confounders, currentVarValues) { + thisdata <- datax[,c("geno", "userID")] + thisdata <- merge(thisdata, confounders, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + currentVarValues <- cbind.data.frame(datax$userID, currentVarValues) + colnames(currentVarValues)[1] <- "userID" + thisdata <- merge(thisdata, currentVarValues, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + return(thisdata) +} # Parse the arguments input by the user # if argument 'test' is used then run test phenome scan diff --git a/PHESANT/R/validatePhenotypeInput.r b/PHESANT/R/validatePhenotypeInput.r deleted file mode 100644 index cc3ca87..0000000 --- a/PHESANT/R/validatePhenotypeInput.r +++ /dev/null @@ -1,85 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# Validate the contents of the phenotype file -validatePhenotypeInput <- function(opt) { - - print("Validating phenotype data ...") - - ## get just first row so we can check the column names - phenoIn = read.table(opt$phenofile, header=1, nrows=1, sep=',') - - ### - ### pheno file validation - - print(paste("Number of columns in phenotype file: ", ncol(phenoIn),sep="")) - - ## check user id exists in pheno file - idx1 = which(names(phenoIn) == opt$userId); - if (length(idx1)==0) { - stop(paste("phenotype file doesn't contain userID colunn:", opt$userId), call.=FALSE) - } - - # we only need the confounders if we are actually running the tests - if (opt$save==FALSE & is.null(opt$confounderfile)) { - - ## confounder variables exist in pheno file - idx = which(names(phenoIn) == "x21022_0_0"); - if (length(idx)==0) { - stop("phenotype file doesn't contain required age colunn: x21022_0_0", call.=FALSE) - } - - idx = which(names(phenoIn) == "x31_0_0"); - if (length(idx)==0) { - stop("phenotype file doesn't contain required sex colunn: x31_0_0", call.=FALSE) - } - - - if (opt$genetic ==TRUE) { - idx = which(names(phenoIn) == "x22000_0_0"); - if (length(idx)==0) { - stop("phenotype file doesn't contain required genetic batch colunn: x22000_0_0", call.=FALSE) - } - } - - ## if running with sensitivity option then check extra columns exist in pheno file (genetic PCs and assessment centre) - if (opt$sensitivity==TRUE) { - - if (opt$genetic ==TRUE) { - ## check first 10 genetic PCs exist - for (i in 1:10) { - idx = which(names(phenoIn) == paste("x22009_0_", i, sep="")); - if (length(idx)==0) { - stop(paste("phenotype file doesn't contain required genetic principal component colunn: x22009_0_", i, sep=""), call.=FALSE) - } - } - } - - ## assessment centre field - idx = which(names(phenoIn) == "x54_0_0"); - if (length(idx)==0) { - stop("phenotype file doesn't contain required assessment centre colunn: x54_0_0", call.=FALSE) - } - } - - } - - print("Phenotype file validated") - -} diff --git a/PHESANT/R/validateTraitInput.r b/PHESANT/R/validateTraitInput.r deleted file mode 100644 index a42c28b..0000000 --- a/PHESANT/R/validateTraitInput.r +++ /dev/null @@ -1,69 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# Validate the contents of the trait of interest file -validateTraitInput <- function(opt) { - -if (opt$save!=TRUE) { - - print("Validating trait of interest data ...") - - ### - ### get header of trait of interest file or pheno file (if no trait of interest file is specified - - if (is.null(opt$traitofinterestfile)) { - snpIn = read.table(opt$phenofile, header=1, nrows=1, sep=',') - } - else { - snpIn = read.table(opt$traitofinterestfile, header=1, nrows=1, sep=',') - } - - ### - ### trait of interest file validation - - print(paste("Number of columns in trait of interest file:", ncol(snpIn),sep="")) - - ## check user id exists in snp file - idx1 = which(names(snpIn) == opt$userId); - if (length(idx1)==0) { - if (is.null(opt$traitofinterestfile)) { - stop(paste("Phenotype file doesn't contain userID colunn:", opt$userId), call.=FALSE) - } else { - stop(paste("Trait of interest file doesn't contain userID colunn:", opt$userId), call.=FALSE) - } - - } - - ## check trait of interest exists in trait of interest file - idx2 = which(names(snpIn) == opt$traitofinterest); - if (length(idx2)==0) { - - if (is.null(opt$traitofinterestfile)) { - stop(paste("No trait of interest file specified, and phenotypes file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) - } - else { - stop(paste("Trait of interest file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) - } - } - - print("Trait of interest file validated") - -} - -} diff --git a/PHESANT/phenomeScan.r b/PHESANT/phenomeScan.r index 3aa1ff5..0803a90 100644 --- a/PHESANT/phenomeScan.r +++ b/PHESANT/phenomeScan.r @@ -18,7 +18,6 @@ ## ## main phenome scan file - ## Updated by Quanli Wang library(PHESANT) @@ -33,110 +32,4 @@ if (length(args) == 0) { opt <- processArgs(opt, opt_parser) } -## load the files we write to and use -initCounters() -if (opt$save==FALSE) { - initResultsFiles(opt) -} -vl <- initVariableLists(opt) - -## load data -d <- loadData(opt, vl) -data=d$datax -confounders=d$confounders -vl$indicatorFields=d$inds - -numPreceedingCols = ncol(confounders)-1+2; # confounders,minus id column, plus trait of interest and user ID -phenoStartIdx = numPreceedingCols+1; - -print("LOADING DONE") - -phenoVars=colnames(data); -# remove user id and age and sex columns -phenoVars = phenoVars[-c(1,2)]; # first and second columns are the id and snpScore, respectively, as determined in loadData.r - -currentVar=""; -currentVarShort=""; -first=TRUE; - -if (opt$save == TRUE) { - pkg.env$derivedBinary <- data.frame(userID=data$userID) - pkg.env$derivedCont <- data.frame(userID=data$userID) - pkg.env$derivedCatOrd <- data.frame(userID=data$userID) - pkg.env$derivedCatUnord <- data.frame(userID=data$userID) - pkg.env$resLogFile = paste(opt$resDir,"data-log-",opt$varTypeArg,".txt",sep="") - sink(pkg.env$resLogFile) -} else { - pkg.env$modelFitLogFile = paste(opt$resDir,"modelfit-log-",opt$varTypeArg,".txt",sep="") - sink(pkg.env$modelFitLogFile) - sink() - pkg.env$resLogFile = paste(opt$resDir,"results-log-",opt$varTypeArg,".txt",sep="") - sink(pkg.env$resLogFile) -} - - -phenoIdx=0; # zero because then the idx is the position of the previous variable, i.e. the var in currentVar -for (var in phenoVars) { - - - sink() -# print(var) - sink(pkg.env$resLogFile, append=TRUE) - - varx = gsub("^x", "", var); - varx = gsub("_[0-9]+$", "", varx); - varxShort = gsub("^x", "", var); - varxShort = gsub("_[0-9]+_[0-9]+$", "", varxShort); - - ## test this variable - if (currentVar == varx) { - thisCol = data[,eval(var)] - thisCol = replaceNaN(thisCol) - currentVarValues = cbind.data.frame(currentVarValues, thisCol); - } - else if (currentVarShort == varxShort) { - ## different time point of this var so skip - } - else { - ## new variable so run test for previous (we have collected all the columns now) - if (first==FALSE) { - - thisdata = makeTestDataFrame(data, confounders, currentVarValues) - testAssociations(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) - } - - first=FALSE; - - ## new variable so set values - currentVar = varx; - currentVarShort = varxShort; - - currentVarValues = data[,eval(var)] - currentVarValues = replaceNaN(currentVarValues) - } - - phenoIdx = phenoIdx + 1; -} - -if (phenoIdx>0){ - # last variable so test association - thisdata = makeTestDataFrame(data, confounders, currentVarValues) - testAssociations(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) -} - -sink() - -# save counters of each path in variable flow -saveCounts(opt) - -if (opt$save == TRUE) { - write.table(pkg.env$derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); - write.table(pkg.env$derivedCont, file=paste(opt$resDir,"data-cont-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); - write.table(pkg.env$derivedCatOrd, file=paste(opt$resDir,"data-catord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); - write.table(pkg.env$derivedCatUnord, file=paste(opt$resDir,"data-catunord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); -} - -warnings() - - - +run(opt) diff --git a/PHESANT/R/unittests/run-tests.sh b/unittests/run-tests.sh similarity index 100% rename from PHESANT/R/unittests/run-tests.sh rename to unittests/run-tests.sh diff --git a/PHESANT/R/unittests/test_equalSizedBins.r b/unittests/test_equalSizedBins.r similarity index 100% rename from PHESANT/R/unittests/test_equalSizedBins.r rename to unittests/test_equalSizedBins.r diff --git a/PHESANT/R/unittests/test_reassignValue.r b/unittests/test_reassignValue.r similarity index 100% rename from PHESANT/R/unittests/test_reassignValue.r rename to unittests/test_reassignValue.r diff --git a/PHESANT/R/unittests/test_testCatMultiple.r b/unittests/test_testCatMultiple.r similarity index 100% rename from PHESANT/R/unittests/test_testCatMultiple.r rename to unittests/test_testCatMultiple.r diff --git a/PHESANT/R/unittests/test_testCatSingle.r b/unittests/test_testCatSingle.r similarity index 100% rename from PHESANT/R/unittests/test_testCatSingle.r rename to unittests/test_testCatSingle.r From 2fadc11ded4e216e633c076c25f4bbb6183d3ce9 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 09:41:32 -0400 Subject: [PATCH 17/27] remove duplicate file --- PHESANT/R/loadConfounders.r | 121 ------------------------------------ 1 file changed, 121 deletions(-) delete mode 100644 PHESANT/R/loadConfounders.r diff --git a/PHESANT/R/loadConfounders.r b/PHESANT/R/loadConfounders.r deleted file mode 100644 index 9fc1c10..0000000 --- a/PHESANT/R/loadConfounders.r +++ /dev/null @@ -1,121 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -## -## loads confounder variables from phenotype file - -loadConfounders <- function(opt, phenotypes) { - if (opt$save==TRUE) { - # saving not running tests so we add a fake confounder - numRows = nrow(phenotypes) - data = cbind.data.frame(phenotypes$userID, rep(-1, numRows)) - colnames(data)[1] <- "userID" - colnames(data)[2] <- "conf1" - return(data) - } else { - if (!is.null(opt$confounderfile)) { - print("Loading confounders from confounder file ...") - confs = fread(opt$confounderfile, sep=',', header=TRUE, data.table=FALSE) - confs = lapply(confs,function(x) type.convert(as.character(x))) - confs = as.data.frame(confs) - - ## find userID column and change name to userID - idx = which(colnames(confs) == opt$userId) - confs = confs[,c(opt$userID,setdiff(colnames(confs),opt$userID))] - colnames(confs)[1] <- "userID" - } else { - print("Loading confounders from phenotypes file ...") - confNames = .getConfounderNames(opt) - ##### - ##### extract confounders from data file - confs = fread(opt$phenofile, select=confNames, sep=',', header=TRUE, data.table=FALSE) - confs = lapply(confs,function(x) type.convert(as.character(x))) - confs = as.data.frame(confs) - - ##### - ##### process genetic batch to create genetic chip variable - if (opt$genetic == TRUE) { - genoBatch = confs[,"x22000_0_0"] - # chip comes from batch field 22000 - genoChip = rep.int(NA,nrow(confs)); - idxForVar = which(genoBatch<0); - genoChip[idxForVar] = 0; - idxForVar = which(genoBatch>=0 & genoBatch<2000); - genoChip[idxForVar] = 1; - - # remove geno batch from and add geno chip to confounders - confs = confs[,-which(names(confs) == "x22000_0_0")] - confs = cbind.data.frame(confs, genoChip) - } - - ##### - ##### Convert assessment centre to an indicator variable - if (opt$sensitivity==TRUE) { - confs$x54_0_0 = as.factor(confs$x54_0_0) - assCentre = model.matrix(~confs$x54_0_0) - assCentre = assCentre[,2:ncol(assCentre)] - confs = cbind(confs, assCentre) - confs$x54_0_0 = NULL - } - colnames(confs)[1] <- "userID" - - } - - # remove any rows with no values - print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) - confsComp = complete.cases(confs) - print(paste("Number of INCOMPLETE rows removed from confounder data: ", length(which(confsComp==FALSE)),sep="")) - confs = confs[confsComp==TRUE,] - print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) - - print("Confounder columns:") - print(names(confs)) - - return(confs) - } -} - -.getConfounderNames <- function(opt) { - ##### - ##### first get vector of confounder names - # age and sex - confNames = c(opt$userId, "x21022_0_0", "x31_0_0") - - # if genetic trait of interest then adjust for genotype chip - # and also let user choose sensitivity analysis that also adjusts for top 10 genetic principal components and assessment centre - if (opt$genetic == TRUE) { - confNames = append(confNames, "x22000_0_0") - if (opt$sensitivity==TRUE) { - confNames = append(confNames, c("x22009_0_1", "x22009_0_2", "x22009_0_3", "x22009_0_4", "x22009_0_5", "x22009_0_6", "x22009_0_7", "x22009_0_8", "x22009_0_9", "x22009_0_10", "x54_0_0")) - print("Adjusting for age, sex, genotype chip, top 10 genetic principal components and assessment centre") - } else { - print("Adjusting for age, sex and genotype chip") - } - } else { - # non genetic trait of interest, then sensitivity adjusts for assessment center - if (opt$sensitivity==TRUE) { - confNames = append(confNames, "x54_0_0") - print("Adjusting for age, sex and assessment centre") - } else { - print("Adjusting for age and sex") - } - } - return(confNames) -} - - From f9c36bacf7cb93322422053ff125ca3dc2daaca2 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 11:10:35 -0400 Subject: [PATCH 18/27] Code clean and Rd for input file heders --- PHESANT/R/counters.r | 2 +- PHESANT/R/init.r | 23 ++++---- PHESANT/R/phenotypes.r | 22 +++----- PHESANT/R/run.r | 22 +++----- PHESANT/R/traitOfInterest.r | 61 ++++++++++----------- PHESANT/man/initVariableLists.Rd | 31 +++++++++++ PHESANT/man/validatePhenotypeInputHeader.Rd | 36 ++++++++++++ PHESANT/man/validateTraitInputHeader.Rd | 28 ++++++++++ 8 files changed, 154 insertions(+), 71 deletions(-) create mode 100644 PHESANT/man/initVariableLists.Rd create mode 100644 PHESANT/man/validatePhenotypeInputHeader.Rd create mode 100644 PHESANT/man/validateTraitInputHeader.Rd diff --git a/PHESANT/R/counters.r b/PHESANT/R/counters.r index 31a586b..ec47a29 100644 --- a/PHESANT/R/counters.r +++ b/PHESANT/R/counters.r @@ -50,6 +50,6 @@ saveCounts <- function(opt) { } # init the counters used to determine how many variables took each path in the variable processing flow. -initCounters <- function() { +.initCounters <- function() { pkg.env$counters = data.frame(name=character(),countValue=integer(), stringsAsFactors=FALSE) } \ No newline at end of file diff --git a/PHESANT/R/init.r b/PHESANT/R/init.r index abcce78..05d66ba 100644 --- a/PHESANT/R/init.r +++ b/PHESANT/R/init.r @@ -19,11 +19,11 @@ initData <-function(opt) { ## load the files we write to and use - initCounters() + .initCounters() if (opt$save==FALSE) { - initResultsFiles(opt) + .initResultsFiles(opt) } - vl <- initVariableLists(opt) + vl <- initVariableLists(opt$variablelistfile, opt$datacodingfile) ## load data d <- loadData(opt, vl) @@ -38,7 +38,7 @@ initData <-function(opt) { return(list(data = data, vl = vl, confounders = confounders, phenoStartIdx = phenoStartIdx, phenoVars = phenoVars)) } -initEnv <- function(opt, data) { +.initEnv <- function(opt, data) { if (opt$save == TRUE) { pkg.env$derivedBinary <- data.frame(userID=data$userID) pkg.env$derivedCont <- data.frame(userID=data$userID) @@ -55,7 +55,7 @@ initEnv <- function(opt, data) { } } # create new results files and headers -initResultsFiles <- function(opt) { +.initResultsFiles <- function(opt) { ## only linear and continuous fields can create linear results file.create(paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt",sep="")); @@ -76,14 +76,11 @@ initResultsFiles <- function(opt) { } # load the variable information and data code information files -initVariableLists <- function(opt) { - - phenoInfo=read.table(opt$variablelistfile,sep="\t",header=1,comment.char="",quote=""); - - dataCodeInfo=read.table(opt$datacodingfile,sep=",", header=1); - - vars=list(phenoInfo=phenoInfo, dataCodeInfo=dataCodeInfo); - return(vars); +initVariableLists <- function(variablelistfile, datacodingfile) { + phenoInfo <- read.table(variablelistfile,sep="\t",header=1,comment.char="",quote="") + dataCodeInfo <- read.table(datacodingfile,sep=",", header=1) + vars <- list(phenoInfo=phenoInfo, dataCodeInfo=dataCodeInfo) + return(vars) } diff --git a/PHESANT/R/phenotypes.r b/PHESANT/R/phenotypes.r index 16b28e8..06a970c 100644 --- a/PHESANT/R/phenotypes.r +++ b/PHESANT/R/phenotypes.r @@ -16,9 +16,7 @@ # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -## ## load phenotypes from phenotype file - loadPhenotypes <- function(opt) { ## is not running 'all' then we determine the start and end idxs of phenotypes that we test, so that we can parallelise into multiple jobs if (opt$varTypeArg!="all") { @@ -111,14 +109,13 @@ loadPhenotypes <- function(opt) { # Validate the contents of the phenotype file validatePhenotypeInputHeader <- function(opt) { print("Validating phenotype data ...") - ## get just first row so we can check the column names - phenoIn = read.table(opt$phenofile, header=1, nrows=1, sep=',') + phenoIn <- read.table(opt$phenofile, header=1, nrows=1, sep=',') ### pheno file validation print(paste("Number of columns in phenotype file: ", ncol(phenoIn),sep="")) ## check user id exists in pheno file - idx1 = which(names(phenoIn) == opt$userId); + idx1 <- which(names(phenoIn) == opt$userId) if (length(idx1)==0) { stop(paste("phenotype file doesn't contain userID colunn:", opt$userId), call.=FALSE) } @@ -126,18 +123,18 @@ validatePhenotypeInputHeader <- function(opt) { # we only need the confounders if we are actually running the tests if (opt$save==FALSE & is.null(opt$confounderfile)) { ## confounder variables exist in pheno file - idx = which(names(phenoIn) == "x21022_0_0"); + idx <- which(names(phenoIn) == "x21022_0_0") if (length(idx)==0) { stop("phenotype file doesn't contain required age colunn: x21022_0_0", call.=FALSE) - } - - idx = which(names(phenoIn) == "x31_0_0"); + } + + idx <- which(names(phenoIn) == "x31_0_0") if (length(idx)==0) { stop("phenotype file doesn't contain required sex colunn: x31_0_0", call.=FALSE) } if (opt$genetic ==TRUE) { - idx = which(names(phenoIn) == "x22000_0_0"); + idx <- which(names(phenoIn) == "x22000_0_0") if (length(idx)==0) { stop("phenotype file doesn't contain required genetic batch colunn: x22000_0_0", call.=FALSE) } @@ -148,19 +145,18 @@ validatePhenotypeInputHeader <- function(opt) { if (opt$genetic ==TRUE) { ## check first 10 genetic PCs exist for (i in 1:10) { - idx = which(names(phenoIn) == paste("x22009_0_", i, sep="")); + idx <- which(names(phenoIn) == paste("x22009_0_", i, sep="")) if (length(idx)==0) { stop(paste("phenotype file doesn't contain required genetic principal component colunn: x22009_0_", i, sep=""), call.=FALSE) } } } ## assessment centre field - idx = which(names(phenoIn) == "x54_0_0"); + idx <- which(names(phenoIn) == "x54_0_0") if (length(idx)==0) { stop("phenotype file doesn't contain required assessment centre colunn: x54_0_0", call.=FALSE) } } - } print("Phenotype file validated") } diff --git a/PHESANT/R/run.r b/PHESANT/R/run.r index 70ed305..cda7a06 100644 --- a/PHESANT/R/run.r +++ b/PHESANT/R/run.r @@ -1,20 +1,16 @@ run <- function(opt) { input <- initData(opt) - data <- input$data - vl <- input$vl - confounders <- input$confounders - phenoStartIdx <- input$phenoStartIdx - phenoVars <- input$phenoVars print("LOADING DONE") - initEnv(opt, data) + #initilize package level variables, primarily for passing variables between functions and logging + .initEnv(opt, input$data) currentVar <- "" currentVarShort <- "" first <- TRUE phenoIdx=0; # zero because then the idx is the position of the previous variable, i.e. the var in currentVar - for (var in phenoVars) { + for (var in input$phenoVars) { sink() sink(pkg.env$resLogFile, append=TRUE) @@ -26,7 +22,7 @@ run <- function(opt) { ## test this variable if (currentVar == varx) { - thisCol <- data[,eval(var)] + thisCol <- input$data[,eval(var)] thisCol <- .replaceNaN(thisCol) currentVarValues <- cbind.data.frame(currentVarValues, thisCol) } else if (currentVarShort == varxShort) { @@ -34,8 +30,8 @@ run <- function(opt) { } else { ## new variable so run test for previous (we have collected all the columns now) if (first==FALSE) { - thisdata <- makeTestDataFrame(data, confounders, currentVarValues) - testAssociations(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) + thisdata <- makeTestDataFrame(input$data, input$confounders, currentVarValues) + testAssociations(opt, input$vl, currentVar, currentVarShort, thisdata, input$phenoStartIdx) } first <- FALSE @@ -43,7 +39,7 @@ run <- function(opt) { currentVar <- varx currentVarShort <- varxShort - currentVarValues <- data[,eval(var)] + currentVarValues <- input$data[,eval(var)] currentVarValues <- .replaceNaN(currentVarValues) } phenoIdx <- phenoIdx + 1 @@ -51,8 +47,8 @@ run <- function(opt) { if (phenoIdx>0){ # last variable so test association - thisdata = makeTestDataFrame(data, confounders, currentVarValues) - testAssociations(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) + thisdata = makeTestDataFrame(input$data, input$confounders, currentVarValues) + testAssociations(opt, input$vl, currentVar, currentVarShort, thisdata, input$phenoStartIdx) } sink() diff --git a/PHESANT/R/traitOfInterest.r b/PHESANT/R/traitOfInterest.r index 537686e..8d6ea10 100644 --- a/PHESANT/R/traitOfInterest.r +++ b/PHESANT/R/traitOfInterest.r @@ -47,37 +47,36 @@ loadTraitOfInterest <- function(opt, phenotypes) { # Validate the contents of the trait of interest file validateTraitInputHeader <- function(opt) { if (opt$save!=TRUE) { + ### get header of trait of interest file or pheno file (if no trait of interest file is specified + print("Validating trait of interest data ...") + if (is.null(opt$traitofinterestfile)) { + snpIn = read.table(opt$phenofile, header=1, nrows=1, sep=',') + } else { + snpIn = read.table(opt$traitofinterestfile, header=1, nrows=1, sep=',') + } - ### get header of trait of interest file or pheno file (if no trait of interest file is specified - print("Validating trait of interest data ...") - if (is.null(opt$traitofinterestfile)) { - snpIn = read.table(opt$phenofile, header=1, nrows=1, sep=',') - } else { - snpIn = read.table(opt$traitofinterestfile, header=1, nrows=1, sep=',') - } - - ### trait of interest file validation - print(paste("Number of columns in trait of interest file:", ncol(snpIn),sep="")) - ## check user id exists in snp file - idx1 = which(names(snpIn) == opt$userId); - if (length(idx1)==0) { - if (is.null(opt$traitofinterestfile)) { - stop(paste("Phenotype file doesn't contain userID colunn:", opt$userId), call.=FALSE) - } else { - stop(paste("Trait of interest file doesn't contain userID colunn:", opt$userId), call.=FALSE) - } - } - - ## check trait of interest exists in trait of interest file - idx2 = which(names(snpIn) == opt$traitofinterest); - if (length(idx2)==0) { - if (is.null(opt$traitofinterestfile)) { - stop(paste("No trait of interest file specified, and phenotypes file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) - } else { - stop(paste("Trait of interest file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) - } - } - print("Trait of interest file validated") - } + ### trait of interest file validation + print(paste("Number of columns in trait of interest file:", ncol(snpIn),sep="")) + ## check user id exists in snp file + idx1 = which(names(snpIn) == opt$userId); + if (length(idx1)==0) { + if (is.null(opt$traitofinterestfile)) { + stop(paste("Phenotype file doesn't contain userID colunn:", opt$userId), call.=FALSE) + } else { + stop(paste("Trait of interest file doesn't contain userID colunn:", opt$userId), call.=FALSE) + } + } + + ## check trait of interest exists in trait of interest file + idx2 = which(names(snpIn) == opt$traitofinterest); + if (length(idx2)==0) { + if (is.null(opt$traitofinterestfile)) { + stop(paste("No trait of interest file specified, and phenotypes file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) + } else { + stop(paste("Trait of interest file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) + } + } + print("Trait of interest file validated") + } } diff --git a/PHESANT/man/initVariableLists.Rd b/PHESANT/man/initVariableLists.Rd new file mode 100644 index 0000000..26a7e5a --- /dev/null +++ b/PHESANT/man/initVariableLists.Rd @@ -0,0 +1,31 @@ +\name{initVariableLists} +\alias{initVariableLists} +\title{ +Load the variable information and data code information files +} +\description{ +Load the variable information and data code information files +} +\usage{ +initVariableLists(variablelistfile, datacodingfile) +} +\arguments{ + \item{variablelistfile}{ + the input phenotype variable list file +} + \item{datacodingfile}{ + the input data coding file for the DATA_CODING field for phenotypre +} +} +\details{ +Load the variable information and data code information files +} +\value{ + return a list contain with two variables: + \item{phenoInfo }{A data frame that holds information about phenotype varibles from UKB} + \item{dataCodeInfo }{A data frame that provides further information about the DATA_CODING field of a phenotype from phenoInfo } +} + +\examples{ + #vl <- initVariableLists(opt$variablelistfile, opt$datacodingfile) +} diff --git a/PHESANT/man/validatePhenotypeInputHeader.Rd b/PHESANT/man/validatePhenotypeInputHeader.Rd new file mode 100644 index 0000000..c4a8e4e --- /dev/null +++ b/PHESANT/man/validatePhenotypeInputHeader.Rd @@ -0,0 +1,36 @@ +\name{validatePhenotypeInputHeader} +\alias{validatePhenotypeInputHeader} +\title{ +Validate the contents of the phenotype file +} +\description{ +Validate the contents of the phenotype file +} +\usage{ +validatePhenotypeInputHeader(opt) +} +\arguments{ + \item{opt}{ + The list of input options provided by user. +} +} +\details{ +This function validates the input phenotype file based on user options. +\enumerate{ + \item check if user id field exists in pheno file + \item check if phenotype file contains the required age colunn: x21022_0_0 + \item check if phenotype file contains the required sex colunn: x31_0_0 + \item check if phenotype file contains the required genetic batch colunn: x22000_0_0, when genetic option is used + \item check if phenotype file contains the required genetic principal component colunns(1 to 10): x22009_0_, when sensitivity and genetic options are used + \item check if phenotype file contains the required assessment centre colunn: x54_0_0, when sensitivity option is used + + } +} +\value{ + No return values and reports error and stops the program if the validation fails. +} + + +\examples{ +#validatePhenotypeInputHeader(opt) +} diff --git a/PHESANT/man/validateTraitInputHeader.Rd b/PHESANT/man/validateTraitInputHeader.Rd new file mode 100644 index 0000000..74b640d --- /dev/null +++ b/PHESANT/man/validateTraitInputHeader.Rd @@ -0,0 +1,28 @@ +\name{validateTraitInputHeader} +\alias{validateTraitInputHeader} +\title{ +Validate the contents of the trait of interest file +} +\description{ +Validate the contents of the trait of interest file. +} +\usage{ +validateTraitInputHeader(opt) +} +\arguments{ + \item{opt}{The list of input options provided by user.} +} +\details{ +This function validates the contents of the trait of interest file based on user options. A trait of interest ca nbe either from phrenotype file or trait of interest file. +\enumerate{ + \item check if user id field exists in pheno file or traint of interest file if the trait of interest file is provided. + \item check if phenotype file or trait of interest file contains the required trait of interest variable column. + } +} +\value{ +No return values and reports error and stops the program if the validation fails. +} + +\examples{ +#validateTraitInputHeader(opt) +} From e7d6ecfe92b023217327b54da4addbd001cfbbf9 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 14:19:39 -0400 Subject: [PATCH 19/27] Code refactor and documentation --- PHESANT/R/binaryLogisticRegression.r | 14 ++--- PHESANT/R/cleanData.R | 39 ++++++------ PHESANT/R/confounders.r | 68 ++++++++++----------- PHESANT/R/counters.r | 6 +- PHESANT/R/indicatorFields.r | 86 +++++++++++++------------- PHESANT/R/init.r | 2 +- PHESANT/R/loadData.r | 9 +-- PHESANT/R/phenotypes.r | 90 ++++++++++++++++------------ PHESANT/R/run.r | 2 +- PHESANT/R/testAssociations.r | 30 +++++----- PHESANT/R/testCatMultiple.r | 6 +- PHESANT/R/testCatSingle.r | 12 ++-- PHESANT/R/testCategoricalOrdered.r | 12 ++-- PHESANT/R/testCategoricalUnordered.r | 12 ++-- PHESANT/R/testContinuous.r | 30 +++++----- PHESANT/R/testInteger.r | 10 ++-- PHESANT/R/testNumExamples.r | 37 ------------ PHESANT/R/traitOfInterest.r | 7 +-- PHESANT/man/initData.Rd | 29 +++++++++ PHESANT/man/loadConfounders.Rd | 32 ++++++++++ PHESANT/man/loadData.Rd | 37 ++++++++++++ PHESANT/man/loadPhenotypes.Rd | 25 ++++++++ PHESANT/man/loadTraitOfInterest.Rd | 25 ++++++++ 23 files changed, 365 insertions(+), 255 deletions(-) delete mode 100644 PHESANT/R/testNumExamples.r create mode 100644 PHESANT/man/initData.Rd create mode 100644 PHESANT/man/loadConfounders.Rd create mode 100644 PHESANT/man/loadData.Rd create mode 100644 PHESANT/man/loadPhenotypes.Rd create mode 100644 PHESANT/man/loadTraitOfInterest.Rd diff --git a/PHESANT/R/binaryLogisticRegression.r b/PHESANT/R/binaryLogisticRegression.r index 1112377..705b9b1 100644 --- a/PHESANT/R/binaryLogisticRegression.r +++ b/PHESANT/R/binaryLogisticRegression.r @@ -31,7 +31,7 @@ binaryLogisticRegression <- function(opt, varName, varType, thisdata, isExposure if (length(facLevels)!=2) { #stop(paste("Not 2 levels: ", length(facLevels), " || ", sep="")) cat("BINARY-NOT2LEVELS- (", length(facLevels), ") || ",sep=""); - incrementCounter("binary.nottwolevels") + .incrementCounter("binary.nottwolevels") } idxTrue = length(which(phenoFactor==facLevels[1])) @@ -40,11 +40,11 @@ binaryLogisticRegression <- function(opt, varName, varType, thisdata, isExposure if (idxTrue<10 || idxFalse<10) { cat("BINARY-LOGISTIC-SKIP-10 (", idxTrue, "/", idxFalse, ") || ", sep="") - incrementCounter("binary.10") + .incrementCounter("binary.10") } else if (numNotNA<500) { cat("BINARY-LOGISTIC-SKIP-500 (", numNotNA, ") || ",sep=""); - incrementCounter("binary.500") + .incrementCounter("binary.500") } else { @@ -54,7 +54,7 @@ binaryLogisticRegression <- function(opt, varName, varType, thisdata, isExposure # add pheno to dataframe .storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'bin') cat("SUCCESS results-logistic-binary "); - incrementCounter("success.binary") + .incrementCounter("success.binary") } else { @@ -102,10 +102,10 @@ binaryLogisticRegression <- function(opt, varName, varType, thisdata, isExposure write(paste(varName,varType,paste(idxTrue,"/",idxFalse,"(",numNotNA,")",sep=""), beta,lower,upper,pvalue, sep=","), file=paste(opt$resDir,"results-logistic-binary-",opt$varTypeArg,".txt",sep=""), append="TRUE"); cat("SUCCESS results-logistic-binary "); - incrementCounter("success.binary") + .incrementCounter("success.binary") if (isExposure==TRUE) { - incrementCounter("success.exposure.binary") + .incrementCounter("success.exposure.binary") } ## END TRYCATCH @@ -113,7 +113,7 @@ binaryLogisticRegression <- function(opt, varName, varType, thisdata, isExposure sink() sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - incrementCounter("binary.error") + .incrementCounter("binary.error") }) } } diff --git a/PHESANT/R/cleanData.R b/PHESANT/R/cleanData.R index 7f27c4d..adf98ae 100644 --- a/PHESANT/R/cleanData.R +++ b/PHESANT/R/cleanData.R @@ -59,28 +59,6 @@ reassignValue <- function(vl, pheno, varName) { return(pheno) } -fixOddFieldsToCatMul <- function(vl, data) { - # examples are variables: 40006, 40011, 40012, 40013 - # get all variables that need their instances changing to arrays - dataPheno = vl$phenoInfo[which(vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT=="YES-INSTANCES"),]; - for (i in 1:nrow(dataPheno)) { - varID = dataPheno[i,]$FieldID; - varidString = paste("x",varID,"_", sep=""); - - # get all columns in data dataframe for this variable - colIdxs = which(grepl(varidString,names(data))); - - # change format from xvarid_0_0, xvarid_1_0, xvarid_2_0, to xvarid_0_0, xvarid_0_1, xvarid_0_2 - count = 0; - for (j in colIdxs) { - colnames(data)[j] <- paste(varidString, "0_", count, sep="") - count = count + 1; - } - } - return(data) -} - - # Replace NaN and empty values with NA in pheno .replaceNaN <- function(pheno) { if (is.factor(pheno)) { @@ -98,3 +76,20 @@ fixOddFieldsToCatMul <- function(vl, data) { } return(phenoReplaced) } + +.testNumExamples <- function(pheno) { + ## loop through values and remove if has < 10 examples + uniqVar <- unique(na.omit(pheno)) + for (u in uniqVar) { + withValIdx <- which(pheno==u) + numWithVal <- length(withValIdx); + if (numWithVal<10) { + pheno[withValIdx] <- NA + cat(paste("Removed ",u ,": ", numWithVal, "<10 examples || ", sep="")); + } else { + cat(paste("Inc(>=10): ", u, "(", numWithVal, ") || ", sep="")); + } + } + return(pheno) +} + diff --git a/PHESANT/R/confounders.r b/PHESANT/R/confounders.r index 9fc1c10..c2fa2ce 100644 --- a/PHESANT/R/confounders.r +++ b/PHESANT/R/confounders.r @@ -22,86 +22,80 @@ loadConfounders <- function(opt, phenotypes) { if (opt$save==TRUE) { # saving not running tests so we add a fake confounder - numRows = nrow(phenotypes) - data = cbind.data.frame(phenotypes$userID, rep(-1, numRows)) + numRows <- nrow(phenotypes) + data <- cbind.data.frame(phenotypes$userID, rep(-1, numRows)) colnames(data)[1] <- "userID" colnames(data)[2] <- "conf1" return(data) } else { if (!is.null(opt$confounderfile)) { print("Loading confounders from confounder file ...") - confs = fread(opt$confounderfile, sep=',', header=TRUE, data.table=FALSE) - confs = lapply(confs,function(x) type.convert(as.character(x))) - confs = as.data.frame(confs) + confs <- fread(opt$confounderfile, sep=',', header=TRUE, data.table=FALSE) + confs <- lapply(confs,function(x) type.convert(as.character(x))) + confs <- as.data.frame(confs) ## find userID column and change name to userID - idx = which(colnames(confs) == opt$userId) - confs = confs[,c(opt$userID,setdiff(colnames(confs),opt$userID))] + idx <- which(colnames(confs) == opt$userId) + confs <- confs[,c(opt$userID,setdiff(colnames(confs),opt$userID))] colnames(confs)[1] <- "userID" } else { print("Loading confounders from phenotypes file ...") - confNames = .getConfounderNames(opt) - ##### + confNames <- .getConfounderNames(opt) + ##### extract confounders from data file - confs = fread(opt$phenofile, select=confNames, sep=',', header=TRUE, data.table=FALSE) - confs = lapply(confs,function(x) type.convert(as.character(x))) - confs = as.data.frame(confs) - - ##### + confs <- fread(opt$phenofile, select=confNames, sep=',', header=TRUE, data.table=FALSE) + confs <- lapply(confs,function(x) type.convert(as.character(x))) + confs <- as.data.frame(confs) + ##### process genetic batch to create genetic chip variable if (opt$genetic == TRUE) { - genoBatch = confs[,"x22000_0_0"] + genoBatch <- confs[,"x22000_0_0"] # chip comes from batch field 22000 - genoChip = rep.int(NA,nrow(confs)); - idxForVar = which(genoBatch<0); - genoChip[idxForVar] = 0; - idxForVar = which(genoBatch>=0 & genoBatch<2000); - genoChip[idxForVar] = 1; + genoChip <- rep.int(NA,nrow(confs)) + idxForVar <- which(genoBatch<0) + genoChip[idxForVar] <- 0 + idxForVar <- which(genoBatch>=0 & genoBatch<2000) + genoChip[idxForVar] <- 1 # remove geno batch from and add geno chip to confounders - confs = confs[,-which(names(confs) == "x22000_0_0")] - confs = cbind.data.frame(confs, genoChip) + confs <- confs[,-which(names(confs) == "x22000_0_0")] + confs <- cbind.data.frame(confs, genoChip) } - ##### ##### Convert assessment centre to an indicator variable if (opt$sensitivity==TRUE) { - confs$x54_0_0 = as.factor(confs$x54_0_0) - assCentre = model.matrix(~confs$x54_0_0) - assCentre = assCentre[,2:ncol(assCentre)] - confs = cbind(confs, assCentre) + confs$x54_0_0 <- as.factor(confs$x54_0_0) + assCentre <- model.matrix(~confs$x54_0_0) + assCentre <- assCentre[,2:ncol(assCentre)] + confs <- cbind(confs, assCentre) confs$x54_0_0 = NULL } colnames(confs)[1] <- "userID" - } # remove any rows with no values print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) - confsComp = complete.cases(confs) + confsComp <- complete.cases(confs) print(paste("Number of INCOMPLETE rows removed from confounder data: ", length(which(confsComp==FALSE)),sep="")) - confs = confs[confsComp==TRUE,] + confs <- confs[confsComp==TRUE,] print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) print("Confounder columns:") print(names(confs)) - return(confs) } } .getConfounderNames <- function(opt) { - ##### ##### first get vector of confounder names # age and sex - confNames = c(opt$userId, "x21022_0_0", "x31_0_0") - + confNames <- c(opt$userId, "x21022_0_0", "x31_0_0") # if genetic trait of interest then adjust for genotype chip # and also let user choose sensitivity analysis that also adjusts for top 10 genetic principal components and assessment centre if (opt$genetic == TRUE) { - confNames = append(confNames, "x22000_0_0") + confNames <- append(confNames, "x22000_0_0") if (opt$sensitivity==TRUE) { - confNames = append(confNames, c("x22009_0_1", "x22009_0_2", "x22009_0_3", "x22009_0_4", "x22009_0_5", "x22009_0_6", "x22009_0_7", "x22009_0_8", "x22009_0_9", "x22009_0_10", "x54_0_0")) + confNames <- append(confNames, c("x22009_0_1", "x22009_0_2", "x22009_0_3", "x22009_0_4", "x22009_0_5", "x22009_0_6", "x22009_0_7", "x22009_0_8", "x22009_0_9", "x22009_0_10", "x54_0_0")) print("Adjusting for age, sex, genotype chip, top 10 genetic principal components and assessment centre") } else { print("Adjusting for age, sex and genotype chip") @@ -109,7 +103,7 @@ loadConfounders <- function(opt, phenotypes) { } else { # non genetic trait of interest, then sensitivity adjusts for assessment center if (opt$sensitivity==TRUE) { - confNames = append(confNames, "x54_0_0") + confNames <- append(confNames, "x54_0_0") print("Adjusting for age, sex and assessment centre") } else { print("Adjusting for age and sex") diff --git a/PHESANT/R/counters.r b/PHESANT/R/counters.r index ec47a29..d5a061d 100644 --- a/PHESANT/R/counters.r +++ b/PHESANT/R/counters.r @@ -17,7 +17,7 @@ # DEALINGS IN THE SOFTWARE. # adds given value to counter, that are used to count how many variables go down each route in the data flow -addToCounts <- function(countName, num) { +.addToCounts <- function(countName, num) { idx = which(pkg.env$counters$name==countName) if (length(idx)==0) { # counter does not exist so add with countValue 1 @@ -29,7 +29,7 @@ addToCounts <- function(countName, num) { } # increments counters used to count how many variables go down each route in the data flow -incrementCounter <- function(countName) { +.incrementCounter <- function(countName) { idx = which(pkg.env$counters$name==countName) if (length(idx)==0) { # counter does not exist so add with countValue 1 @@ -41,7 +41,7 @@ incrementCounter <- function(countName) { } # Saves the counters stored in count variables, to a file in results directory -saveCounts <- function(opt) { +.saveCounts <- function(opt) { countFile = paste(opt$resDir,"variable-flow-counts-",opt$varTypeArg,".txt",sep="") # sort on counter name sortIdx = order(as.character(pkg.env$counters[,"name"])) diff --git a/PHESANT/R/indicatorFields.r b/PHESANT/R/indicatorFields.r index 04c860a..2a0178b 100644 --- a/PHESANT/R/indicatorFields.r +++ b/PHESANT/R/indicatorFields.r @@ -18,13 +18,12 @@ ## ## load data used for data code default value related field, and categorical multiple indicator field -loadIndicatorFields <- function(opt, vl, phenosToTest) { +.loadIndicatorFields <- function(opt, vl, phenosToTest) { print("Loading indicator fields from phenotypes file ...") - # read pheno file column names - phenoVarsAll = colnames(read.table(opt$phenofile, header=1, nrows=1, sep=',')) - phenoVarsAll = phenoVarsAll[which(phenoVarsAll!=opt$userId)] - indVars = c(opt$userId) + phenoVarsAll <- colnames(read.table(opt$phenofile, header=1, nrows=1, sep=',')) + phenoVarsAll <- phenoVarsAll[which(phenoVarsAll!=opt$userId)] + indVars <- c(opt$userId) ## add indicator variables to pheno data indVars = .addIndicatorVariables(vl, indVars, phenosToTest, phenoVarsAll) @@ -37,8 +36,8 @@ loadIndicatorFields <- function(opt, vl, phenosToTest) { } ## read in the right table columns - data = fread(opt$phenofile, select=indVars, sep=',', header=TRUE, data.table=FALSE) - data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) + data <- fread(opt$phenofile, select=indVars, sep=',', header=TRUE, data.table=FALSE) + data <- data.frame(lapply(data,function(x) type.convert(as.character(x)))) colnames(data)[1] <- "userID" return(data) } @@ -48,30 +47,29 @@ loadIndicatorFields <- function(opt, vl, phenosToTest) { ##### default value related fields for data codes # get list of all indicator variables from outcome info file - # get datacodes with an indicator variable - dataCodeIdx = which(!is.na(vl$dataCodeInfo$default_related_field) & vl$dataCodeInfo$default_related_field!="") + dataCodeIdx <- which(!is.na(vl$dataCodeInfo$default_related_field) & vl$dataCodeInfo$default_related_field!="") # whether there are any related fields that are not in the phenotype data file when they should be - hasIssue=FALSE + hasIssue <- FALSE if (length(dataCodeIdx)>0) { - dataCodeWithRF = vl$dataCodeInfo[dataCodeIdx,] - defaultFields = c() + dataCodeWithRF <- vl$dataCodeInfo[dataCodeIdx,] + defaultFields <- c() # check there is a field in the phenotypes data for each data code and if so then add this data codes related field to the phenosToTest list if (nrow(dataCodeWithRF)>0) { for (i in 1:nrow(dataCodeWithRF)) { - dc = dataCodeWithRF$dataCode[i] + dc <- dataCodeWithRF$dataCode[i] # get all fields with this datacode - fieldsIdx = which(vl$phenoInfo$DATA_CODING == dc) - fieldIDs = vl$phenoInfo$FieldID[fieldsIdx] - fieldIDs = paste("x",fieldIDs,"_0_0", sep="") + fieldsIdx <- which(vl$phenoInfo$DATA_CODING == dc) + fieldIDs <- vl$phenoInfo$FieldID[fieldsIdx] + fieldIDs <- paste("x",fieldIDs,"_0_0", sep="") # datacode related field - rf = dataCodeWithRF$default_related_field[i] - rf = paste("x",rf,"_0_0", sep="") + rf <- dataCodeWithRF$default_related_field[i] + rf <- paste("x",rf,"_0_0", sep="") # if one of these field IDs are in phenotypeColumns then data code related field is needed if (length(intersect(fieldIDs, phenosToTest))>0) { @@ -80,15 +78,15 @@ loadIndicatorFields <- function(opt, vl, phenosToTest) { } } - defaultFields = unique(defaultFields) - indVars = append(indVars, defaultFields) + defaultFields <- unique(defaultFields) + indVars <- append(indVars, defaultFields) ##### check these required variables exist in phenotype file if(length(defaultFields)>0) { for (i in 1:length(defaultFields)) { if (!(defaultFields[i] %in% phenoVarsAll)) { print(paste("Required variable: Field ",defaultFields[i],"is a data code related field (default_related_field column in data code information file) but was not found in phenotype data")) - hasIssue=TRUE + hasIssue <- TRUE } } } @@ -96,45 +94,43 @@ loadIndicatorFields <- function(opt, vl, phenosToTest) { ##### categorical multiple indicator fields # get field info, for fields with cat mult indicator fields - fieldsIdx = which(!is.na(vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS)) + fieldsIdx <- which(!is.na(vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS)) if (length(fieldsIdx)>0) { - fieldsWithCMIF = vl$phenoInfo[fieldsIdx,] - fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "NO_NAN"),] - fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "ALL"),] - fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == ""),] + fieldsWithCMIF <- vl$phenoInfo[fieldsIdx,] + fieldsWithCMIF <- fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "NO_NAN"),] + fieldsWithCMIF <- fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "ALL"),] + fieldsWithCMIF <- fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == ""),] if (nrow(fieldsWithCMIF)>0) { # turn into variable format not field ID - fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS = paste("x",fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS,"_0_0", sep="") - fieldsWithCMIF$FieldID = paste("x",fieldsWithCMIF$FieldID,"_", sep="") - phenosToTestIds = sub("_.*", "_", phenosToTest) + fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS <- paste("x",fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS,"_0_0", sep="") + fieldsWithCMIF$FieldID <- paste("x",fieldsWithCMIF$FieldID,"_", sep="") + phenosToTestIds <- sub("_.*", "_", phenosToTest) # remove rows where the field isn't in the phenotypes list - idxIn = which(fieldsWithCMIF$FieldID %in% phenosToTestIds) - fieldsWithCMIF = fieldsWithCMIF[idxIn,] + idxIn <- which(fieldsWithCMIF$FieldID %in% phenosToTestIds) + fieldsWithCMIF <- fieldsWithCMIF[idxIn,] - defaultFields = fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS - defaultFields = unique(defaultFields) + defaultFields <- fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS + defaultFields <- unique(defaultFields) - ##### ##### remove those that already exist in phenotypes 'part' - idxExists = which(defaultFields %in% indVars) + idxExists <- which(defaultFields %in% indVars) if (length(idxExists>0)) { - defaultFields = defaultFields[-idxExists] + defaultFields <- defaultFields[-idxExists] } - indVars = unique(append(indVars, defaultFields)) + indVars <- unique(append(indVars, defaultFields)) - ##### ##### check these required variables exist in phenotype file if(length(defaultFields)>0) { - for (i in 1:length(defaultFields)) { - if (!(defaultFields[i] %in% phenoVarsAll)) { - print(paste("Required variable: Field ",defaultFields[i],"is a categorical multiple indicator field (CAT_MULT_INDICATOR_FIELDS column in variable information file) but was not found in phenotype data")) - hasIssue=TRUE - } - } + for (i in 1:length(defaultFields)) { + if (!(defaultFields[i] %in% phenoVarsAll)) { + print(paste("Required variable: Field ",defaultFields[i],"is a categorical multiple indicator field (CAT_MULT_INDICATOR_FIELDS column in variable information file) but was not found in phenotype data")) + hasIssue <- TRUE + } + } } } @@ -145,8 +141,6 @@ loadIndicatorFields <- function(opt, vl, phenosToTest) { print("!!! PHESANT has stopped - add required variables to phenotype file or remove relevant phenotypes (so that required variables are not needed).") quit() } - return(indVars) - } diff --git a/PHESANT/R/init.r b/PHESANT/R/init.r index 05d66ba..830a907 100644 --- a/PHESANT/R/init.r +++ b/PHESANT/R/init.r @@ -27,7 +27,7 @@ initData <-function(opt) { ## load data d <- loadData(opt, vl) - data <- d$datax + data <- d$phenotype confounders <- d$confounders vl$indicatorFields <- d$inds diff --git a/PHESANT/R/loadData.r b/PHESANT/R/loadData.r index 405af81..2a09f76 100644 --- a/PHESANT/R/loadData.r +++ b/PHESANT/R/loadData.r @@ -20,7 +20,8 @@ # loads phenotype and trait of interest data files # creates phenotype / trait of interest data frame # creates confounder data frame -# returns an object holding these two data frames +# create indicator fields data frame +# returns an object holding these three data frames loadData <- function(opt, vl) { validatePhenotypeInputHeader(opt) @@ -55,9 +56,9 @@ loadData <- function(opt, vl) { } # some fields are fixed that have a field type as cat single but we want to treat them like cat mult - phenotype = fixOddFieldsToCatMul(vl, phenotype) - indFields = loadIndicatorFields(opt, vl, colnames(phenotype)) - d = list(datax=phenotype, confounders=conf, inds=indFields) + phenotype = .fixOddFieldsToCatMul(vl, phenotype) + indFields = .loadIndicatorFields(opt, vl, colnames(phenotype)) + d = list(phenotype=phenotype, confounders=conf, inds=indFields) return(d) } diff --git a/PHESANT/R/phenotypes.r b/PHESANT/R/phenotypes.r index 06a970c..d87f16d 100644 --- a/PHESANT/R/phenotypes.r +++ b/PHESANT/R/phenotypes.r @@ -19,88 +19,83 @@ ## load phenotypes from phenotype file loadPhenotypes <- function(opt) { ## is not running 'all' then we determine the start and end idxs of phenotypes that we test, so that we can parallelise into multiple jobs - if (opt$varTypeArg!="all") { + if (opt$varTypeArg != "all") { # read pheno file column names - phenoVars = read.table(opt$phenofile, header=0, nrows=1, sep=',') - phenoVars = phenoVars[which(phenoVars!=opt$userId)] + phenoVars <- read.table(opt$phenofile, header=0, nrows=1, sep=',') + phenoVars <- phenoVars[which(phenoVars!=opt$userId)] - ##### ##### calculate part start and end - partSize = ceiling(length(phenoVars)/opt$numParts); - partStart = (opt$partIdx-1)*partSize + 1; + partSize <- ceiling(length(phenoVars)/opt$numParts) + partStart <- (opt$partIdx-1)*partSize + 1 if (opt$partIdx == opt$numParts) { - partEnd = length(phenoVars); + partEnd <- length(phenoVars) } else { - partEnd = partStart + partSize - 1; + partEnd <- partStart + partSize - 1 } - print(paste(partStart, '-', partEnd)); + print(paste(partStart, '-', partEnd)) - - ##### ##### find range of columns to read in - ## This is more complicated than just reading in a column range, because we need to determine cut points ## such that all columns of a particular field are loaded. ## A field is included in a 'part' if its last column is within the part range. ## e.g. for part 2 of 5 parts and for 100 columns, then fields having their last column at position 21 - 40 (i.e. its column index) are included in this part. ## user ID always included - phenosToTest = c(opt$userId) + phenosToTest <- c(opt$userId) - currentVar="" - currentVarLong="" - currentVarShort="" - first=TRUE - phenoIdx=0 + currentVar <- "" + currentVarLong <- "" + currentVarShort <- "" + first <- TRUE + phenoIdx <- 0 # all columns for a particular field - thisPhenoToTest = c() + thisPhenoToTest <- c() for (var in phenoVars) { - varx = gsub("^x", "", var); - varx = gsub("_[0-9]+$", "", varx); - varxShort = gsub("^x", "", var); - varxShort = gsub("_[0-9]+_[0-9]+$", "", varxShort); - currentVarLong = var + varx <- gsub("^x", "", var) + varx <- gsub("_[0-9]+$", "", varx) + varxShort <- gsub("^x", "", var) + varxShort <- gsub("_[0-9]+_[0-9]+$", "", varxShort) + currentVarLong <- var if (currentVar == varx) { # same variable same timepoint # add current var to pheno list - thisPhenoToTest = append(thisPhenoToTest, as.character(currentVarLong)) + thisPhenoToTest <- append(thisPhenoToTest, as.character(currentVarLong)) } else if (currentVarShort == varxShort) { # save var, diff timepoint ## different time point of this var so skip in testing but add here because some are fixed to cat mult - thisPhenoToTest = append(thisPhenoToTest, as.character(currentVarLong)) + thisPhenoToTest <- append(thisPhenoToTest, as.character(currentVarLong)) } else { ## new variable so run test for previous (we have collected all the columns now) if (first==FALSE) { if (phenoIdx>=partStart && phenoIdx<=partEnd) { # only start new variable processing if last column of it is within the idx range for this part - phenosToTest = append(phenosToTest, thisPhenoToTest) + phenosToTest <- append(phenosToTest, thisPhenoToTest) } } - first=FALSE; + first <- FALSE ## new variable so set values - currentVar = varx - currentVarShort = varxShort - thisPhenoToTest = c(as.character(currentVarLong)) + currentVar <- varx + currentVarShort <- varxShort + thisPhenoToTest <- c(as.character(currentVarLong)) } - phenoIdx = phenoIdx + 1 + phenoIdx <- phenoIdx + 1 } # last variable so test association if (phenoIdx>=partStart && phenoIdx<=partEnd) { - #phenosToTest = append(phenosToTest, as.character(currentVarLong)) - phenosToTest = append(phenosToTest, as.character(thisPhenoToTest)) + phenosToTest <- append(phenosToTest, as.character(thisPhenoToTest)) } ## read in the right table columns - a subset of the data file - data = fread(opt$phenofile, select=phenosToTest, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) + data <- fread(opt$phenofile, select=phenosToTest, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) } else { # reading all data at once - data = fread(opt$phenofile, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) + data <- fread(opt$phenofile, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) } ## this is type conversion as used in the read.table function (that we used to use ((this was changed because read.table cannot read column subsets)) - data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) + data <- data.frame(lapply(data,function(x) type.convert(as.character(x)))) colnames(data)[1] <- "userID" return(data) @@ -161,3 +156,24 @@ validatePhenotypeInputHeader <- function(opt) { print("Phenotype file validated") } +.fixOddFieldsToCatMul <- function(vl, data) { + # examples are variables: 40006, 40011, 40012, 40013 + # get all variables that need their instances changing to arrays + dataPheno <- vl$phenoInfo[which(vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT=="YES-INSTANCES"),] + for (i in 1:nrow(dataPheno)) { + varID <- dataPheno[i,]$FieldID + varidString <- paste("x",varID,"_", sep="") + + # get all columns in data dataframe for this variable + colIdxs <- which(grepl(varidString,names(data))) + + # change format from xvarid_0_0, xvarid_1_0, xvarid_2_0, to xvarid_0_0, xvarid_0_1, xvarid_0_2 + count <- 0 + for (j in colIdxs) { + colnames(data)[j] <- paste(varidString, "0_", count, sep="") + count <- count + 1 + } + } + return(data) +} + diff --git a/PHESANT/R/run.r b/PHESANT/R/run.r index cda7a06..b4cf1c0 100644 --- a/PHESANT/R/run.r +++ b/PHESANT/R/run.r @@ -53,7 +53,7 @@ run <- function(opt) { sink() # save counters of each path in variable flow - saveCounts(opt) + .saveCounts(opt) if (opt$save == TRUE) { write.table(pkg.env$derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); write.table(pkg.env$derivedCont, file=paste(opt$resDir,"data-cont-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); diff --git a/PHESANT/R/testAssociations.r b/PHESANT/R/testAssociations.r index 8309b13..951e704 100644 --- a/PHESANT/R/testAssociations.r +++ b/PHESANT/R/testAssociations.r @@ -30,7 +30,7 @@ testAssociations <- function(opt, vl, currentVar, currentVarShort, thisdata, phe # check if variable info is found for this field if (length(idx)==0) { cat(paste(currentVar, " || Variable could not be found in pheno info file. \n", sep="")) - incrementCounter("notinphenofile") + .incrementCounter("notinphenofile") } else { @@ -47,12 +47,12 @@ testAssociations <- function(opt, vl, currentVar, currentVarShort, thisdata, phe if (excluded!="") { cat(paste("Excluded integer: ", excluded, " || ", sep="")) - incrementCounter("excluded.int") + .incrementCounter("excluded.int") } else { - incrementCounter("start.int") + .incrementCounter("start.int") if (isExposure==TRUE) { - incrementCounter("start.exposure.int") + .incrementCounter("start.exposure.int") } testInteger(opt, vl, currentVarShort, "INTEGER", thisdata, phenoStartIdx); @@ -66,12 +66,12 @@ testAssociations <- function(opt, vl, currentVar, currentVarShort, thisdata, phe if (excluded!="") { cat(paste("Excluded continuous: ", excluded, " || ", sep="")) - incrementCounter("excluded.cont") + .incrementCounter("excluded.cont") } else { - incrementCounter("start.cont") + .incrementCounter("start.cont") if (isExposure==TRUE) { - incrementCounter("start.exposure.cont") + .incrementCounter("start.exposure.cont") } testContinuous(opt, vl, currentVarShort, "CONTINUOUS", thisdata, phenoStartIdx); } @@ -84,12 +84,12 @@ testAssociations <- function(opt, vl, currentVar, currentVarShort, thisdata, phe if (excluded!="") { cat(paste("Excluded cat-single: ", excluded, " || ", sep="")) - incrementCounter("excluded.catSin") + .incrementCounter("excluded.catSin") } else { - incrementCounter("start.catSin") + .incrementCounter("start.catSin") if (isExposure==TRUE) { - incrementCounter("start.exposure.catSin") + .incrementCounter("start.exposure.catSin") } testCategoricalSingle(opt, vl, currentVarShort, "CAT-SIN", thisdata, phenoStartIdx); } @@ -102,24 +102,24 @@ testAssociations <- function(opt, vl, currentVar, currentVarShort, thisdata, phe if (excluded!="") { cat(paste("Excluded cat-multiple: ", excluded, " || ", sep="")) - incrementCounter("excluded.catMul") + .incrementCounter("excluded.catMul") } else { if (catSinToMult!="") { cat("cat-single to cat-multiple || ", sep="") - incrementCounter("catSinToCatMul") + .incrementCounter("catSinToCatMul") } - incrementCounter("start.catMul") + .incrementCounter("start.catMul") if (isExposure==TRUE) { - incrementCounter("start.exposure.catMul") + .incrementCounter("start.exposure.catMul") } else { # get number of cat mult values denoting trait of interest numVals = getNumValuesCatMultExposure(vl, currentVarShort) if (numVals>0) { - addToCounts("start.exposure.catMulvalues", numVals) + .addToCounts("start.exposure.catMulvalues", numVals) } } testCategoricalMultiple(opt, vl, currentVarShort, "CAT-MUL", thisdata, phenoStartIdx); diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index 508c4e0..c2b5ffe 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -54,7 +54,7 @@ testCategoricalMultiple <- function(opt, vl, varName, varType, thisdata, phenoSt idxsTrue = idxForVar[,"row"] cat(" CAT-MUL-BINARY-VAR ", variableVal, " || ", sep=""); - incrementCounter("catMul.binary") + .incrementCounter("catMul.binary") # make zero vector and set 1s for those with this variable value varBinary = rep.int(0,numRows); @@ -77,12 +77,12 @@ testCategoricalMultiple <- function(opt, vl, varName, varType, thisdata, phenoSt if (idxTrue<10 || idxFalse<10) { cat("CAT-MULT-SKIP-10 (", idxTrue, " vs ", idxFalse, ") || ", sep=""); - incrementCounter("catMul.10") + .incrementCounter("catMul.10") } else { isExposure = getIsCatMultExposure(vl, varName, variableVal) - incrementCounter("catMul.over10") + .incrementCounter("catMul.over10") # binary - so logistic regression binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), varType, newthisdata, isExposure, phenoStartIdx) } diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index d840a22..7cfb006 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -61,18 +61,18 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar pheno = .replaceMissingCodes(pheno) ## remove categories if < 10 examples - pheno = testNumExamples(pheno) + pheno = .testNumExamples(pheno) uniqVar = unique(na.omit(pheno)) uniqVar = sort(uniqVar) if (length(uniqVar)<=1) { cat("SKIP (only one value) || "); - incrementCounter("catSin.onevalue") + .incrementCounter("catSin.onevalue") } else if (length(uniqVar)==2) { cat("CAT-SINGLE-BINARY || "); - incrementCounter("catSin.case3") + .incrementCounter("catSin.case3") # binary so logistic regression phenoFactor = factor(pheno) @@ -91,7 +91,7 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar if (ordered == 0) { cat("CAT-SINGLE-UNORDERED || ") - incrementCounter("catSin.case2") + .incrementCounter("catSin.case2") thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); testCategoricalUnordered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); @@ -101,7 +101,7 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar ## ordered cat("ordered || "); - incrementCounter("catSin.case1") + .incrementCounter("catSin.case1") ## reorder variable values into increasing order thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); @@ -110,7 +110,7 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar } else if (ordered == -2) { cat(" EXCLUDED or BINARY variable: Should not get here in code. ") - incrementCounter( "catSin.binaryorexcluded") + .incrementCounter( "catSin.binaryorexcluded") } else { print(paste("ERROR", varName, varType, dataCode)); diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index 78c8ea2..efbce8b 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -25,7 +25,7 @@ testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoSta geno = thisdata[,"geno"] cat("CAT-ORD || "); - incrementCounter("ordCat") + .incrementCounter("ordCat") doCatOrdAssertions(pheno) @@ -39,7 +39,7 @@ testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoSta numNotNA = length(which(!is.na(pheno))) if (numNotNA<500) { cat("CATORD-SKIP-500 (", numNotNA, ") || ",sep=""); - incrementCounter("ordCat.500") + .incrementCounter("ordCat.500") } else { # test this cat ordered variable with ordered logistic regression @@ -52,7 +52,7 @@ testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoSta # add pheno to dataframe .storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catOrd') cat("SUCCESS results-ordered-logistic"); - incrementCounter("success.ordCat") + .incrementCounter("success.ordCat") } else { @@ -93,11 +93,11 @@ testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoSta write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-ordered-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE"); cat("SUCCESS results-ordered-logistic"); - incrementCounter("success.ordCat") + .incrementCounter("success.ordCat") isExposure = getIsExposure(vl, varName) if (isExposure == TRUE) { - incrementCounter("success.exposure.ordCat") + .incrementCounter("success.exposure.ordCat") } ### END TRYCATCH @@ -105,7 +105,7 @@ testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoSta sink() sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - incrementCounter("ordCat.error") + .incrementCounter("ordCat.error") }) } diff --git a/PHESANT/R/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r index 320f522..177020c 100644 --- a/PHESANT/R/testCategoricalUnordered.r +++ b/PHESANT/R/testCategoricalUnordered.r @@ -27,7 +27,7 @@ testCategoricalUnordered <- function(opt, vl, varName, varType, thisdata, phenoS numNotNA = length(which(!is.na(pheno))) if (numNotNA<500) { cat("CATUNORD-SKIP-500 (", numNotNA, ") || ",sep=""); - incrementCounter("unordCat.500") + .incrementCounter("unordCat.500") } else { @@ -38,7 +38,7 @@ testCategoricalUnordered <- function(opt, vl, varName, varType, thisdata, phenoS numWeights=(numUnique-1)*(((phenoStartIdx -1)-2)+1+1) if (numWeights>1000) { cat("Too many weights in model: ", numWeights, " > 1000, (num outcomes values: ", numUnique, ") || SKIP ", sep="") - incrementCounter("unordCat.cats") + .incrementCounter("unordCat.cats") return(NULL) } @@ -48,7 +48,7 @@ testCategoricalUnordered <- function(opt, vl, varName, varType, thisdata, phenoS # add pheno to dataframe .storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catUnord') cat("SUCCESS results-notordered-logistic "); - incrementCounter("success.unordCat") + .incrementCounter("success.unordCat") } else { @@ -127,11 +127,11 @@ testCategoricalUnordered <- function(opt, vl, varName, varType, thisdata, phenoS } cat("SUCCESS results-notordered-logistic "); - incrementCounter("success.unordCat") + .incrementCounter("success.unordCat") isExposure = getIsExposure(vl, varName) if (isExposure == TRUE) { - incrementCounter("success.exposure.unordCat") + .incrementCounter("success.exposure.unordCat") } ## END TRYCATCH @@ -139,7 +139,7 @@ testCategoricalUnordered <- function(opt, vl, varName, varType, thisdata, phenoS sink() sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - incrementCounter("unordCat.error") + .incrementCounter("unordCat.error") }) } } diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index 883ea75..6d5e687 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -78,7 +78,7 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) ## remove categories if < 10 examples to see if this should be binary or not, but if ordered categorical ## then we include all values when generating this - phenoAvgMoreThan10 = testNumExamples(phenoAvg) + phenoAvgMoreThan10 = .testNumExamples(phenoAvg) ## binary if 2 distinct values, else ordered categorical phenoFactor = factor(phenoAvgMoreThan10) @@ -86,11 +86,11 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) if (numLevels<=1) { cat("SKIP (number of levels: ",numLevels,")",sep="") - incrementCounter("cont.onevalue") + .incrementCounter("cont.onevalue") } else if (numLevels==2) { # binary - incrementCounter("cont.binary") + .incrementCounter("cont.binary") thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx); } @@ -98,7 +98,7 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) else { ## try to treat as ordered categorical - incrementCounter("cont.ordcattry") + .incrementCounter("cont.ordcattry") ## equal sized bins phenoBinned = .equalSizedBins(phenoAvg) @@ -111,7 +111,7 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) # successful binning. >=10 examples in each of the 3 bins - incrementCounter("cont.ordcattry.ordcat") + .incrementCounter("cont.ordcattry.ordcat") thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned); testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); } @@ -122,12 +122,12 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) ## skip - not possible to create binary variable because first and third bins are too small ## ie. could merge bin1 with bin 2 but then bin3 still too small etc cat("SKIP 2 bins are too small || ") - incrementCounter("cont.ordcattry.smallbins") + .incrementCounter("cont.ordcattry.smallbins") } else if ((bin0Num<10 | bin1Num<10) & (bin0Num+bin1Num)>=10) { # combine first and second bin to create binary variable - incrementCounter("cont.ordcattry.binsbinary") + .incrementCounter("cont.ordcattry.binsbinary") cat("Combine first two bins and treat as binary || ") phenoBinned[which(phenoBinned==0)] = 1 @@ -138,7 +138,7 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) else if ((bin2Num<10 | bin1Num<10) & (bin2Num+bin1Num)>=10) { # combine second and last bin to create binary variable - incrementCounter("cont.ordcattry.binsbinary") + .incrementCounter("cont.ordcattry.binsbinary") cat("Combine last two bins and treat as binary || ") phenoBinned[which(phenoBinned==2)] = 1 @@ -151,7 +151,7 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) else { ## skip - not possible to create binary variable because combining bins would still be too small cat("SKIP 2 bins are too small(2) || ") - incrementCounter("cont.ordcattry.smallbins2") + .incrementCounter("cont.ordcattry.smallbins2") } } @@ -160,13 +160,13 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) } else { cat("IRNT || "); - incrementCounter("cont.main") + .incrementCounter("cont.main") # check there are at least 500 examples numNotNA = length(which(!is.na(phenoAvg))) if (numNotNA<500) { cat("CONTINUOUS-SKIP-500 (", numNotNA, ") || ",sep=""); - incrementCounter("cont.main.500") + .incrementCounter("cont.main.500") } else { ## inverse rank normal transformation @@ -176,7 +176,7 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) # add pheno to dataframe .storeNewVar(thisdata[,"userID"], phenoIRNT, varName, 'cont') cat("SUCCESS results-linear"); - incrementCounter("success.continuous") + .incrementCounter("success.continuous") } else { @@ -223,9 +223,9 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt", sep=""), append="TRUE"); cat("SUCCESS results-linear"); - incrementCounter("success.continuous") + .incrementCounter("success.continuous") if (isExposure == TRUE) { - incrementCounter("success.exposure.continuous") + .incrementCounter("success.exposure.continuous") } ## END TRYCATCH @@ -233,7 +233,7 @@ testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) sink() sink(pkg.env$resLogFile, append=TRUE) cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - incrementCounter("continuous.error") + .incrementCounter("continuous.error") }) } } diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index 8591435..0759155 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -53,29 +53,29 @@ testInteger <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoAvg); testContinuous2(opt, vl, varName, varType, thisdatanew, phenoStartIdx) - incrementCounter( "int.continuous") + .incrementCounter( "int.continuous") } else { ## remove categories if < 10 examples - phenoAvg = testNumExamples(phenoAvg) + phenoAvg = .testNumExamples(phenoAvg) ## binary if 2 distinct values, else ordered categorical phenoFactor = factor(phenoAvg) numLevels = length(levels(phenoFactor)) if (numLevels<=1) { cat("SKIP (number of levels: ",numLevels,")",sep=""); - incrementCounter("int.onevalue") + .incrementCounter("int.onevalue") } else if (numLevels==2) { - incrementCounter("int.binary") + .incrementCounter("int.binary") # binary thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure,phenoStartIdx); } else { - incrementCounter("int.catord") + .incrementCounter("int.catord") cat("3-20 values || ") # we don't use equal sized bins just the original integers (that have >=10 examples) as categories diff --git a/PHESANT/R/testNumExamples.r b/PHESANT/R/testNumExamples.r deleted file mode 100644 index 37cc992..0000000 --- a/PHESANT/R/testNumExamples.r +++ /dev/null @@ -1,37 +0,0 @@ -# The MIT License (MIT) -# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without -# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions -# of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# Remove variable values if less than 10 examples have this value -testNumExamples <- function(pheno) { - - ## loop through values and remove if has < 10 examples - uniqVar = unique(na.omit(pheno)); - for (u in uniqVar) { - withValIdx = which(pheno==u) - numWithVal = length(withValIdx); - if (numWithVal<10) { - pheno[withValIdx]=NA - cat(paste("Removed ",u ,": ", numWithVal, "<10 examples || ", sep="")); - } - else { - cat(paste("Inc(>=10): ", u, "(", numWithVal, ") || ", sep="")); - } - } - return(pheno); -} diff --git a/PHESANT/R/traitOfInterest.r b/PHESANT/R/traitOfInterest.r index 8d6ea10..ec700e0 100644 --- a/PHESANT/R/traitOfInterest.r +++ b/PHESANT/R/traitOfInterest.r @@ -24,8 +24,8 @@ loadTraitOfInterest <- function(opt, phenotypes) { if (opt$save==TRUE) { # saving not running tests so we don't have a trait of interest # add pretend trait of interest so other code doesn't break - numRows = nrow(phenotypes) - data = cbind.data.frame(phenotypes$userID, rep(-1, numRows)) + numRows <- nrow(phenotypes) + data <- cbind.data.frame(phenotypes$userID, rep(-1, numRows)) } else { # load the trait of interest specified by the user if (is.null(opt$traitofinterestfile)) { @@ -35,12 +35,11 @@ loadTraitOfInterest <- function(opt, phenotypes) { print("Loading trait of interest file ...") data = fread(opt$traitofinterestfile, select=c(opt$userId, opt$traitofinterest), sep=',', header=TRUE, data.table=FALSE) } - data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) + data <- data.frame(lapply(data,function(x) type.convert(as.character(x)))) } colnames(data)[1] <- "userID" colnames(data)[2] <- "geno" - return(data) } diff --git a/PHESANT/man/initData.Rd b/PHESANT/man/initData.Rd new file mode 100644 index 0000000..359918d --- /dev/null +++ b/PHESANT/man/initData.Rd @@ -0,0 +1,29 @@ +\name{initData} +\alias{initData} +\title{ +Validates, loads input data and inilizes working environment to run PHESANT +} +\description{ +Validates, loads input data and inilizes working environment to run PHESANT. +} +\usage{ +initData(opt) +} +\arguments{ + \item{opt}{The list of input options provided by user.} +} + +\value{ +Return a list objerct containing all the data used to run PHESANT. + \enumerate{ + \item data: data frame for all phenotypes + \item vl: variable list for all phenotypes + \item confounders: data frame for all confounders + \item phenoStartIdx: the column index for first phenotype column in data data frame + \item phenoVars: a list of all phenotypes, excluding user ID + } +} + +\examples{ +# input <- initData(opt) +} diff --git a/PHESANT/man/loadConfounders.Rd b/PHESANT/man/loadConfounders.Rd new file mode 100644 index 0000000..1e51a5d --- /dev/null +++ b/PHESANT/man/loadConfounders.Rd @@ -0,0 +1,32 @@ +\name{loadConfounders} +\alias{loadConfounders} +\title{ +Loads confounder variables +} +\description{ +Loads confounder variables from phenotype file or confounder file based on user options. If user provides a confunder file, all confounder variables will be extracted from it. Otherwise, confounder variables will be read from phenotype file and a set of predefined confounders will be extracted. +} +\usage{ +loadConfounders(opt, phenotypes) +} +\arguments{ + \item{opt}{The list of input options provided by user.} + \item{phenotypes}{The phenotype data from loadPhenotypes function.} +} +\details{ +If user provides a confunder file, all confounder variables will be extracted from it. Otherwise, confounder variables will be read from phenotype file and a set of predefined confounders will be extracted. When reading from phenotype file for confounders, the following variables will be extracted: + \enumerate{ + \item age: x21022_0_0 + \item sex: x31_0_0 + \item genetic: x22000_0_0, with genetic batch used to create genetic chip variable, optional, present only if opt$genetic is set. + \item genetic principal components and assessment centre, optional, present only if opt$genetic and opt$sensitivity are set: x22009_0_1 through x22009_0_10, and x54_0_0. + \item assessment centre only if opt$genetic is false and opt$sensitivity is true: x54_0_0. + } +} + +\value{ +Returns a dataframe for all confounder variables, with first column being UserID.} + +\examples{ +#conf <- loadConfounders(opt, phenotype) +} diff --git a/PHESANT/man/loadData.Rd b/PHESANT/man/loadData.Rd new file mode 100644 index 0000000..ca14664 --- /dev/null +++ b/PHESANT/man/loadData.Rd @@ -0,0 +1,37 @@ +\name{loadData} +\alias{loadData} +\title{ +Loads phenotype, trait of interest data files +} +\description{ +Validates and loads phenotype, trait of interest, and relted fields from data files. +} +\usage{ +loadData(opt, vl) +} +\arguments{ + \item{opt}{The list of input options provided by user.} + \item{vl}{The list that holds input phenotype variable list and input data coding list} +} +\details{ +Taking user options as input, this function validates and loads phenotype, trait of interest, and relted fields from data files. + \enumerate{ + \item loads phenotype and trait of interest data files + \item creates phenotype / trait of interest data frame + \item creates confounder data frame + \item parses and creates indicator data frame + \item returns an object holding these three data frames + } +} +\value{ + Returns a list of dataframes: + \enumerate{ + \item phenotype: data frame for all phenotypes + \item confounders: data frame for all confounders + \item inds: data frame for all related indicators + } +} + +\examples{ +#d <- loadData(opt, vl) +} diff --git a/PHESANT/man/loadPhenotypes.Rd b/PHESANT/man/loadPhenotypes.Rd new file mode 100644 index 0000000..1fecd9f --- /dev/null +++ b/PHESANT/man/loadPhenotypes.Rd @@ -0,0 +1,25 @@ +\name{loadPhenotypes} +\alias{loadPhenotypes} +\title{ +Load phenotypes from phenotype file +} +\description{ +Load all or part of phenotypes from phenotype file based on user option. +} +\usage{ +loadPhenotypes(opt) +} +\arguments{ + \item{opt}{The list of input options provided by user.} +} +\details{ +Load all or part of phenotypes from phenotype file based on user option. +} +\value{ +Return a data frame object that conbtains all the required phenotypes. +} + +\examples{ +#phenotype = loadPhenotypes(opt) +} + diff --git a/PHESANT/man/loadTraitOfInterest.Rd b/PHESANT/man/loadTraitOfInterest.Rd new file mode 100644 index 0000000..ef60560 --- /dev/null +++ b/PHESANT/man/loadTraitOfInterest.Rd @@ -0,0 +1,25 @@ +\name{loadTraitOfInterest} +\alias{loadTraitOfInterest} +\title{ +Load trait of interest +} +\description{ +Load trait of interest, either from separate trait of interest file, or from phenotype file. +} +\usage{ +loadTraitOfInterest(opt, phenotypes) +} +\arguments{ + \item{opt}{The list of input options provided by user.} + \item{phenotypes}{The phenotype data from loadPhenotypes function.} +} +\details{ +Load trait of interest, either from separate trait of interest file, or from phenotype file. +} +\value{ +Returns a data frame of two columns for User ID and trait of interest. +} + +\examples{ +#toi <- loadTraitOfInterest(opt, phenotype) +} From 36d191bd18db99f0b5d63cb393dbe043bdcb4826 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 14:29:17 -0400 Subject: [PATCH 20/27] Code refactor --- PHESANT/R/testCatMultiple.r | 248 +++++++++++++++++------------------- 1 file changed, 117 insertions(+), 131 deletions(-) diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index c2b5ffe..ab90ef2 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -24,143 +24,129 @@ # 3) Checking derived variable has at least 10 cases in each group # 4) Calling binaryLogisticRegression function for this derived binary variable testCategoricalMultiple <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { - cat("CAT-MULTIPLE || "); - - pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] - pheno = reassignValue(vl, pheno, varName) - - ## get unique values from all columns of this variable - uniqueValues = unique(na.omit(pheno[,1])); - numCols = ncol(pheno); - numRows = nrow(pheno); - if (numCols>1) { - for (num in 2:numCols) { - u = unique(na.omit(pheno[,num])) - uniqueValues = union(uniqueValues,u); - } - } - - ## for each value create a binary variable and test this - for (variableVal in uniqueValues) { - - ## numeric negative values we assume are missing - check this - if(is.numeric(variableVal) & variableVal<0) { - cat("SKIP_val:", variableVal," < 0", sep=""); - next; - } - - # make variable for this value - idxForVar = which(pheno == variableVal, arr.ind=TRUE) - idxsTrue = idxForVar[,"row"] - - cat(" CAT-MUL-BINARY-VAR ", variableVal, " || ", sep=""); - .incrementCounter("catMul.binary") - - # make zero vector and set 1s for those with this variable value - varBinary = rep.int(0,numRows); - varBinary[idxsTrue] = 1; - varBinaryFactor = factor(varBinary) - - ## data for this new binary variable - newthisdata = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], varBinaryFactor) - - ## one of 3 ways to decide which examples are negative - idxsToRemove = restrictSample(vl, varName, pheno, variableVal, thisdata[,"userID", drop=FALSE]) - - if (!is.null(idxsToRemove) & length(idxsToRemove) > 0) { - newthisdata = newthisdata[-idxsToRemove,] - } - - facLevels = levels(newthisdata[,phenoStartIdx]) - idxTrue = length(which(newthisdata[,phenoStartIdx]==facLevels[1])) - idxFalse = length(which(newthisdata[,phenoStartIdx]==facLevels[2])) - - if (idxTrue<10 || idxFalse<10) { - cat("CAT-MULT-SKIP-10 (", idxTrue, " vs ", idxFalse, ") || ", sep=""); - .incrementCounter("catMul.10") - } - else { - isExposure = getIsCatMultExposure(vl, varName, variableVal) - - .incrementCounter("catMul.over10") - # binary - so logistic regression - binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), varType, newthisdata, isExposure, phenoStartIdx) - } - } + cat("CAT-MULTIPLE || ") + + pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] + pheno = reassignValue(vl, pheno, varName) + + ## get unique values from all columns of this variable + uniqueValues = unique(na.omit(pheno[,1])); + numCols = ncol(pheno); + numRows = nrow(pheno); + if (numCols>1) { + for (num in 2:numCols) { + u = unique(na.omit(pheno[,num])) + uniqueValues = union(uniqueValues,u); + } + } + + ## for each value create a binary variable and test this + for (variableVal in uniqueValues) { + + ## numeric negative values we assume are missing - check this + if(is.numeric(variableVal) & variableVal<0) { + cat("SKIP_val:", variableVal," < 0", sep=""); + next; + } + + # make variable for this value + idxForVar = which(pheno == variableVal, arr.ind=TRUE) + idxsTrue = idxForVar[,"row"] + + cat(" CAT-MUL-BINARY-VAR ", variableVal, " || ", sep=""); + .incrementCounter("catMul.binary") + + # make zero vector and set 1s for those with this variable value + varBinary = rep.int(0,numRows); + varBinary[idxsTrue] = 1; + varBinaryFactor = factor(varBinary) + + ## data for this new binary variable + newthisdata = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], varBinaryFactor) + + ## one of 3 ways to decide which examples are negative + idxsToRemove = .restrictSample(vl, varName, pheno, variableVal, thisdata[,"userID", drop=FALSE]) + + if (!is.null(idxsToRemove) & length(idxsToRemove) > 0) { + newthisdata = newthisdata[-idxsToRemove,] + } + + facLevels = levels(newthisdata[,phenoStartIdx]) + idxTrue = length(which(newthisdata[,phenoStartIdx]==facLevels[1])) + idxFalse = length(which(newthisdata[,phenoStartIdx]==facLevels[2])) + + if (idxTrue<10 || idxFalse<10) { + cat("CAT-MULT-SKIP-10 (", idxTrue, " vs ", idxFalse, ") || ", sep=""); + .incrementCounter("catMul.10") + } + else { + isExposure = getIsCatMultExposure(vl, varName, variableVal) + + .incrementCounter("catMul.over10") + # binary - so logistic regression + binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), varType, newthisdata, isExposure, phenoStartIdx) + } + } } # restricts sample based on value in CAT_MULT_INDICATOR_FIELDS column of variable info file, # either NO_NAN, ALL or a field ID # returns idx's that should be removed from the sample -restrictSample <- function(vl, varName,pheno,variableVal, userID) { - - # get definition for sample for this variable either NO_NAN, ALL or a variable ID - varIndicator = vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS[which(vl$phenoInfo$FieldID==varName)] - - return(restrictSample2(vl, varName,pheno,varIndicator,variableVal, userID)) +.restrictSample <- function(vl, varName,pheno,variableVal, userID) { + # get definition for sample for this variable either NO_NAN, ALL or a variable ID + varIndicator = vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS[which(vl$phenoInfo$FieldID==varName)] + return(.restrictSample2(vl, varName,pheno,varIndicator,variableVal, userID)) } - -restrictSample2 <- function(vl, varName,pheno, varIndicator,variableVal, userID) { - - if (varIndicator=="NO_NAN") { # remove NAs - ## remove all people with no value for this variable - - # row indexes with NA in all columns of this cat mult field - ind <- apply(pheno, 1, function(x) all(is.na(x))) - naIdxs = which(ind==TRUE) - cat("NO_NAN Remove NA participants ", length(naIdxs), " || ", sep=""); - } - else if (varIndicator=="ALL") { - - # use all people (no missing assumed) so return empty vector - # e.g. hospital data and death registry - naIdxs = cbind() - cat("ALL || ") - } - else if (varIndicator!="") { - # remove people who have no value for indicator variable - indName = paste("x",varIndicator,"_0_0",sep=""); - cat("Indicator name ", indName, " || ", sep=""); - indvarx = merge(userID, vl$indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) - indicatorVar = indvarx[,indName] - - # remove participants with NA value in this related field - indicatorVar = .replaceNaN(indicatorVar) - naIdxs = which(is.na(indicatorVar)) - - cat("Remove indicator var NAs: ", length(naIdxs), " || ", sep=""); - - if (is.numeric(as.matrix(indicatorVar))) { - # remove participants with value <0 in this related field - assumed missing indicators - lessZero = which(indicatorVar<0) - naIdxs = union(naIdxs, lessZero) - cat("Remove indicator var <0: ", length(lessZero), " || ", sep="") - } - } - else { - stop("Categorical multiples variables need a value for CAT_MULT_INDICATOR_FIELDS", call.=FALSE) - } - - ## remove people with pheno<0 if they aren't a positive example for this variable indicator - ## because we can't know if they are a negative example or not - if (is.numeric(as.matrix(pheno))) { - idxForVar = which(pheno == variableVal, arr.ind=TRUE) - idxMissing = which(pheno < 0, arr.ind=TRUE) - - # all people with <0 value and not variableVal - naMissing = setdiff(idxMissing,idxForVar) - - # add these people with unknowns to set to remove from sample - naIdxs = union(naIdxs, naMissing) - - cat(paste("Removed ", length(naMissing) ," examples != ", variableVal, " but with missing value (<0) || ", sep="")); - } - else { - cat("Not numeric || ") - } - - return(naIdxs); - +.restrictSample2 <- function(vl, varName,pheno, varIndicator,variableVal, userID) { + if (varIndicator=="NO_NAN") { # remove NAs + ## remove all people with no value for this variable + # row indexes with NA in all columns of this cat mult field + ind <- apply(pheno, 1, function(x) all(is.na(x))) + naIdxs <- which(ind==TRUE) + cat("NO_NAN Remove NA participants ", length(naIdxs), " || ", sep="") + } else if (varIndicator=="ALL") { + # use all people (no missing assumed) so return empty vector + # e.g. hospital data and death registry + naIdxs = cbind() + cat("ALL || ") + } else if (varIndicator!="") { + # remove people who have no value for indicator variable + indName <- paste("x",varIndicator,"_0_0",sep=""); + cat("Indicator name ", indName, " || ", sep=""); + indvarx <- merge(userID, vl$indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + indicatorVar <- indvarx[,indName] + + # remove participants with NA value in this related field + indicatorVar <- .replaceNaN(indicatorVar) + naIdxs <- which(is.na(indicatorVar)) + cat("Remove indicator var NAs: ", length(naIdxs), " || ", sep=""); + + if (is.numeric(as.matrix(indicatorVar))) { + # remove participants with value <0 in this related field - assumed missing indicators + lessZero <- which(indicatorVar<0) + naIdxs <- union(naIdxs, lessZero) + cat("Remove indicator var <0: ", length(lessZero), " || ", sep="") + } + } else { + stop("Categorical multiples variables need a value for CAT_MULT_INDICATOR_FIELDS", call.=FALSE) + } + + ## remove people with pheno<0 if they aren't a positive example for this variable indicator + ## because we can't know if they are a negative example or not + if (is.numeric(as.matrix(pheno))) { + idxForVar <- which(pheno == variableVal, arr.ind=TRUE) + idxMissing <- which(pheno < 0, arr.ind=TRUE) + + # all people with <0 value and not variableVal + naMissing <- setdiff(idxMissing,idxForVar) + + # add these people with unknowns to set to remove from sample + naIdxs <- union(naIdxs, naMissing) + cat(paste("Removed ", length(naMissing) ," examples != ", variableVal, " but with missing value (<0) || ", sep="")) + } else { + cat("Not numeric || ") + } + return(naIdxs) } From e2b121336675a8d74bd0a93a700862d96396cb35 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 14:38:14 -0400 Subject: [PATCH 21/27] Hide testContinuous2 --- PHESANT/R/testContinuous.r | 22 +++++++--------------- PHESANT/R/testInteger.r | 2 +- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index 6d5e687..c19214c 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -16,28 +16,20 @@ # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. - # Main function called for continuous fields testContinuous <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { - - cat("CONTINUOUS MAIN || "); - - pheno = thisdata[,phenoStartIdx:ncol(thisdata)] - - # reassign values - pheno = reassignValue(vl, pheno, varName) - - thisdata[,phenoStartIdx:ncol(thisdata)] = pheno - - testContinuous2(opt, vl, varName, varType, thisdata, phenoStartIdx) - + cat("CONTINUOUS MAIN || ") + pheno <- thisdata[,phenoStartIdx:ncol(thisdata)] + # reassign values + pheno <- reassignValue(vl, pheno, varName) + thisdata[,phenoStartIdx:ncol(thisdata)] <- pheno + .testContinuous2(opt, vl, varName, varType, thisdata, phenoStartIdx) } # Main code used to process continuous fields, or integer fields that have been reassigned as continuous because they have >20 distinct values. # This is needed because we have already reassigned values for integer fields, so do this in the function above for continuous fields. -testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { +.testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { cat("CONTINUOUS || "); - pheno = thisdata[,phenoStartIdx:ncol(thisdata)] isExposure = getIsExposure(vl, varName) diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index 0759155..863b5bf 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -52,7 +52,7 @@ testInteger <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { if (length(uniqVar)>=20) { thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoAvg); - testContinuous2(opt, vl, varName, varType, thisdatanew, phenoStartIdx) + .testContinuous2(opt, vl, varName, varType, thisdatanew, phenoStartIdx) .incrementCounter( "int.continuous") } else { From 08327b89d1cef7740ecaf60fea32ff085e18fdb6 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 16:42:09 -0400 Subject: [PATCH 22/27] Add more documents --- PHESANT/R/testCatSingle.r | 297 ++++++++++++------------- PHESANT/R/testCategoricalUnordered.r | 31 +-- PHESANT/man/chooseReferenceCategory.Rd | 18 ++ PHESANT/man/option_list.Rd | 9 + PHESANT/man/pkg.env.Rd | 11 + 5 files changed, 187 insertions(+), 179 deletions(-) create mode 100644 PHESANT/man/chooseReferenceCategory.Rd create mode 100644 PHESANT/man/option_list.Rd create mode 100644 PHESANT/man/pkg.env.Rd diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index 7cfb006..7b0b509 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -24,175 +24,158 @@ # 4) Remove values with <10 cases # 5) Deterimine correct test to perform, either binary, ordered or unordered. testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { - cat("CAT-SINGLE || "); - - pheno = thisdata[,phenoStartIdx:ncol(thisdata)] - isExposure = getIsExposure(vl, varName) - - # assert variable has only one column - if (!is.null(dim(pheno))) stop("More than one column for categorical single") - - pheno = reassignValue(vl, pheno, varName) - - # get data code info - whether this data code is ordinal or not and any reordering - dataPheno = vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),]; - dataCode = dataPheno$DATA_CODING; - - # get data coding information - dataCodeRow = which(vl$dataCodeInfo$dataCode==dataCode); - if (length(dataCodeRow)==0) { - cat("ERROR: No row in data coding info file || "); - return(NULL); - } - dataDataCode = vl$dataCodeInfo[dataCodeRow,]; - ordered = dataDataCode$ordinal; - order = as.character(dataDataCode$ordering); - - ## reorder variable values into increasing order (we do this now as this may convert variable to binary rather than ordered) - pheno = reorderOrderedCategory(pheno,order); - - ## if data code has a default_value then recode NA's to this value for participants with value in default_related_field - ## this is used where there is no zero option e.g. field 100200 - defaultValue = dataDataCode$default_value - defaultRelatedID = dataDataCode$default_related_field - pheno = setDefaultValue(vl, pheno, defaultValue, defaultRelatedID, thisdata[,"userID", drop=FALSE]) - - ## all categories coded as <0 we assume are `missing' values - pheno = .replaceMissingCodes(pheno) - - ## remove categories if < 10 examples - pheno = .testNumExamples(pheno) - - uniqVar = unique(na.omit(pheno)) - uniqVar = sort(uniqVar) - - if (length(uniqVar)<=1) { - cat("SKIP (only one value) || "); - .incrementCounter("catSin.onevalue") - } - else if (length(uniqVar)==2) { - cat("CAT-SINGLE-BINARY || "); - .incrementCounter("catSin.case3") - # binary so logistic regression - - phenoFactor = factor(pheno) - # binary - so logistic regression - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); - binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx) - } - else { - # > 2 categories - if (is.na(ordered)) { - cat(" ERROR: 'ordered' not found in data code info file") - } - else { - - ## unordered - if (ordered == 0) { - - cat("CAT-SINGLE-UNORDERED || ") - .incrementCounter("catSin.case2") - - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); - testCategoricalUnordered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); - - } - else if (ordered == 1) { - - ## ordered - cat("ordered || "); - .incrementCounter("catSin.case1") - - ## reorder variable values into increasing order - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); - testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx, order) - - } - else if (ordered == -2) { - cat(" EXCLUDED or BINARY variable: Should not get here in code. ") - .incrementCounter( "catSin.binaryorexcluded") - } - else { - print(paste("ERROR", varName, varType, dataCode)); - } - } - } + cat("CAT-SINGLE || ") + pheno <- thisdata[,phenoStartIdx:ncol(thisdata)] + isExposure <- getIsExposure(vl, varName) + + # assert variable has only one column + if (!is.null(dim(pheno))) stop("More than one column for categorical single") + + pheno = reassignValue(vl, pheno, varName) + + # get data code info - whether this data code is ordinal or not and any reordering + dataPheno = vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),]; + dataCode = dataPheno$DATA_CODING; + + # get data coding information + dataCodeRow = which(vl$dataCodeInfo$dataCode==dataCode); + if (length(dataCodeRow)==0) { + cat("ERROR: No row in data coding info file || "); + return(NULL); + } + dataDataCode = vl$dataCodeInfo[dataCodeRow,]; + ordered = dataDataCode$ordinal; + order = as.character(dataDataCode$ordering); + + ## reorder variable values into increasing order (we do this now as this may convert variable to binary rather than ordered) + pheno = .reorderOrderedCategory(pheno,order); + + ## if data code has a default_value then recode NA's to this value for participants with value in default_related_field + ## this is used where there is no zero option e.g. field 100200 + defaultValue = dataDataCode$default_value + defaultRelatedID = dataDataCode$default_related_field + pheno = .setDefaultValue(vl, pheno, defaultValue, defaultRelatedID, thisdata[,"userID", drop=FALSE]) + + ## all categories coded as <0 we assume are `missing' values + pheno = .replaceMissingCodes(pheno) + + ## remove categories if < 10 examples + pheno = .testNumExamples(pheno) + + uniqVar = unique(na.omit(pheno)) + uniqVar = sort(uniqVar) + + if (length(uniqVar)<=1) { + cat("SKIP (only one value) || "); + .incrementCounter("catSin.onevalue") + } + else if (length(uniqVar)==2) { + cat("CAT-SINGLE-BINARY || "); + .incrementCounter("catSin.case3") + # binary so logistic regression + + phenoFactor = factor(pheno) + # binary - so logistic regression + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); + binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx) + } + else { + # > 2 categories + if (is.na(ordered)) { + cat(" ERROR: 'ordered' not found in data code info file") + } + else { + + ## unordered + if (ordered == 0) { + + cat("CAT-SINGLE-UNORDERED || ") + .incrementCounter("catSin.case2") + + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); + testCategoricalUnordered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); + + } + else if (ordered == 1) { + + ## ordered + cat("ordered || "); + .incrementCounter("catSin.case1") + + ## reorder variable values into increasing order + thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); + testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx, order) + + } + else if (ordered == -2) { + cat(" EXCLUDED or BINARY variable: Should not get here in code. ") + .incrementCounter( "catSin.binaryorexcluded") + } + else { + print(paste("ERROR", varName, varType, dataCode)); + } + } + } } ## values are reordered and assigned values 1:N for N categories -reorderOrderedCategory <- function(pheno,order) { - - ## new pheno of NAs (all values not in order are assumed to be NA) - - if (!is.na(order) && nchar(order)>0) { - - # make empty pheno - pheno2 = rep(NA,length(pheno)); - - ## get ordering - orderParts = unlist(strsplit(order,"\\|")); - - # go through values in correct order and set value - # from 1 to the number of values - count=1; - for(i in orderParts) { - idx = which(pheno==i); - pheno2[idx] = count; - - count=count+1; - } - - cat("reorder ",order," || ",sep=""); - - return(pheno2) - } - else { - return(pheno); - } - +.reorderOrderedCategory <- function(pheno,order) { + ## new pheno of NAs (all values not in order are assumed to be NA) + if (!is.na(order) && nchar(order)>0) { + # make empty pheno + pheno2 <- rep(NA,length(pheno)) + ## get ordering + orderParts <- unlist(strsplit(order,"\\|")) + + # go through values in correct order and set value + # from 1 to the number of values + count <- 1 + for (i in orderParts) { + idx <- which(pheno==i) + pheno2[idx] <- count + count <- count + 1 + } + cat("reorder ",order," || ",sep=""); + return(pheno2) + } else { + return(pheno) + } } ## sets default value for people with no value in pheno, but with a value in the ## field specified in the default_value_related_field column in the data coding info file. ## the default value is specified in the default_value column in the data coding info file. -setDefaultValue <- function(vl, pheno, defaultValue, defaultRelatedID, userID) { - - - if (!is.na(defaultValue) && nchar(defaultValue)>0) { - - # remove people who have no value for indicator variable - indName = paste("x",defaultRelatedID,"_0_0",sep=""); - - cat("Default related field: ", indName, " || ", sep=""); - indicatorVar = vl$indicatorFields[,indName] - indvarx = merge(userID, vl$indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) - indicatorVar = indvarx[,indName] - +.setDefaultValue <- function(vl, pheno, defaultValue, defaultRelatedID, userID) { + if (!is.na(defaultValue) && nchar(defaultValue)>0) { + # remove people who have no value for indicator variable + indName <- paste("x",defaultRelatedID,"_0_0",sep="") + + cat("Default related field: ", indName, " || ", sep="") + indicatorVar <- vl$indicatorFields[,indName] + indvarx <- merge(userID, vl$indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + indicatorVar <- indvarx[,indName] + # remove participants with NA value in this related field - indicatorVar = .replaceNaN(indicatorVar) - - # check if there are already examples with default value and if so display warning - numWithDefault = length(which(pheno==defaultValue)) - if (numWithDefault>0) { - cat("(WARNING: already ", numWithDefault, " values with default value) ", sep="") - } - - # set default value in people who have no value in the pheno but do have a value in the default_value_related_field - defaultIdxs = which(!is.na(indicatorVar) & is.na(pheno)) - pheno[defaultIdxs] = defaultValue - - cat("default value ", defaultValue, " set, N= ", length(defaultIdxs), " || ", sep=""); - - } - - return(pheno) - + indicatorVar <- .replaceNaN(indicatorVar) + + # check if there are already examples with default value and if so display warning + numWithDefault <- length(which(pheno==defaultValue)) + if (numWithDefault>0) { + cat("(WARNING: already ", numWithDefault, " values with default value) ", sep="") + } + + # set default value in people who have no value in the pheno but do have a value in the default_value_related_field + defaultIdxs <- which(!is.na(indicatorVar) & is.na(pheno)) + pheno[defaultIdxs] <- defaultValue + cat("default value ", defaultValue, " set, N= ", length(defaultIdxs), " || ", sep=""); + } + return(pheno) } # Replace negative values with NA as these are assumed to be missing .replaceMissingCodes <- function(pheno) { - phenoReplaced <- pheno - phenoReplaced[phenoReplaced <0 ] <- NA - return(phenoReplaced) + phenoReplaced <- pheno + phenoReplaced[phenoReplaced <0 ] <- NA + return(phenoReplaced) } diff --git a/PHESANT/R/testCategoricalUnordered.r b/PHESANT/R/testCategoricalUnordered.r index 177020c..639ecd0 100644 --- a/PHESANT/R/testCategoricalUnordered.r +++ b/PHESANT/R/testCategoricalUnordered.r @@ -147,29 +147,16 @@ testCategoricalUnordered <- function(opt, vl, varName, varType, thisdata, phenoS # find reference category - category with most number of examples chooseReferenceCategory <- function(pheno) { - - uniqVar = unique(na.omit(pheno)); - phenoFactor = factor(pheno) - - maxFreq=0; - maxFreqVar = ""; - for (u in uniqVar) { - withValIdx = which(pheno==u) - numWithVal = length(withValIdx); - if (numWithVal>maxFreq) { - maxFreq = numWithVal; - maxFreqVar = u; - } - } - - cat("reference: ", maxFreqVar,"=",maxFreq, " || ", sep=""); - - ## choose reference (category with largest frequency) - phenoFactor <- relevel(phenoFactor, ref = paste("",maxFreqVar,sep="")) - - return(phenoFactor); + freq <- summary(factor(na.omit(pheno))) + maxFreq <- max(freq) + maxFreqVar <- names(which(freq == max(freq)))[1] + cat("reference: ", maxFreqVar,"=", maxFreq, " || ", sep="") + + ## choose reference (category with largest frequency) + phenoFactor <- factor(pheno) + phenoFactor <- relevel(phenoFactor, ref = maxFreqVar) + return(phenoFactor) } - diff --git a/PHESANT/man/chooseReferenceCategory.Rd b/PHESANT/man/chooseReferenceCategory.Rd new file mode 100644 index 0000000..210767a --- /dev/null +++ b/PHESANT/man/chooseReferenceCategory.Rd @@ -0,0 +1,18 @@ +\name{chooseReferenceCategory} +\alias{chooseReferenceCategory} +\title{ +Choose Reference Category +} +\description{ +Convert an integer vector with possible NA's to a factor and use the category/factor with most number of examples as reference. +} +\usage{ +chooseReferenceCategory(pheno) +} +\arguments{ + \item{pheno}{ An integer vector with possible NA's. } +} + +\value{ + A factor with the most number of examples as reference. +} diff --git a/PHESANT/man/option_list.Rd b/PHESANT/man/option_list.Rd new file mode 100644 index 0000000..21a696c --- /dev/null +++ b/PHESANT/man/option_list.Rd @@ -0,0 +1,9 @@ +\name{option_list} +\alias{option_list} +\docType{data} +\title{ +User Input Option List +} + + + diff --git a/PHESANT/man/pkg.env.Rd b/PHESANT/man/pkg.env.Rd new file mode 100644 index 0000000..0a02f01 --- /dev/null +++ b/PHESANT/man/pkg.env.Rd @@ -0,0 +1,11 @@ +\name{pkg.env} +\alias{pkg.env} +\docType{data} +\title{ +package level envrionment +} +\description{ +A package level envrionment used to hold package level variables. +} + +\keyword{datasets} From 390b2bfbaad079db64e8e95ee8288924dfce66c6 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 17:11:12 -0400 Subject: [PATCH 23/27] Code refactor and fix the buggy "end" word in function setOrderString --- PHESANT/R/cleanData.R | 2 +- PHESANT/R/testCatMultiple.r | 2 +- PHESANT/R/testCatSingle.r | 2 +- PHESANT/R/testCategoricalOrdered.r | 60 +++++++++++++----------------- PHESANT/R/testContinuous.r | 2 +- PHESANT/R/testInteger.r | 2 +- 6 files changed, 31 insertions(+), 39 deletions(-) diff --git a/PHESANT/R/cleanData.R b/PHESANT/R/cleanData.R index adf98ae..fa78a52 100644 --- a/PHESANT/R/cleanData.R +++ b/PHESANT/R/cleanData.R @@ -17,7 +17,7 @@ # DEALINGS IN THE SOFTWARE. # Reassigns values as specified in data coding info file -reassignValue <- function(vl, pheno, varName) { +.reassignValue <- function(vl, pheno, varName) { # get data code info - whether this data code is ordinal or not and any reordering and resassignments dataPheno <- vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),] dataCode <- dataPheno$DATA_CODING diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index ab90ef2..b9f7573 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -27,7 +27,7 @@ testCategoricalMultiple <- function(opt, vl, varName, varType, thisdata, phenoSt cat("CAT-MULTIPLE || ") pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] - pheno = reassignValue(vl, pheno, varName) + pheno = .reassignValue(vl, pheno, varName) ## get unique values from all columns of this variable uniqueValues = unique(na.omit(pheno[,1])); diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index 7b0b509..85d988b 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -31,7 +31,7 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar # assert variable has only one column if (!is.null(dim(pheno))) stop("More than one column for categorical single") - pheno = reassignValue(vl, pheno, varName) + pheno = .reassignValue(vl, pheno, varName) # get data code info - whether this data code is ordinal or not and any reordering dataPheno = vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),]; diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index efbce8b..7089ff7 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -26,13 +26,12 @@ testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoSta cat("CAT-ORD || "); .incrementCounter("ordCat") - - doCatOrdAssertions(pheno) + .doCatOrdAssertions(pheno) uniqVar = unique(na.omit(pheno)); # log the ordering of categories used - orderStr = setOrderString(orderStr, uniqVar); + orderStr = .setOrderString(orderStr, uniqVar); cat("order: ", orderStr, " || ", sep=""); # check sample size @@ -115,46 +114,39 @@ testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoSta # check that the phenotype is valid - that there are more than two categories # and that these all have at least 10 cases # something has gone wrong if this is the case -doCatOrdAssertions <- function(pheno) { - - # assert variable has only one column - if (!is.null(dim(pheno))) stop("More than one column for categorical ordered") - - uniqVar = unique(na.omit(pheno)); - +.doCatOrdAssertions <- function(pheno) { + # assert variable has only one column + if (!is.null(dim(pheno))) stop("More than one column for categorical ordered") + uniqVar <- unique(na.omit(pheno)) # assert more than 2 categories if (length(uniqVar)<=1) stop("1 or zero values") if (length(uniqVar)==2) stop("this variable is binary") # assert each value has >= 10 examples for (u in uniqVar) { - withValIdx = which(pheno==u) - numWithVal = length(withValIdx); - - if (numWithVal<10) stop("value with <10 examples") - } + withValIdx <- which(pheno==u) + numWithVal <- length(withValIdx) + if (numWithVal<10) stop("value with <10 examples") + } } # If data coding file does not specify an order then we use the default order as in coding defined by Biobank # and this function just generates a string with this order for logging purposes -setOrderString <- function(orderStr, uniqVar) { - - if (is.na(orderStr) || nchar(orderStr)==0) { - - orderStr=""; - - # create order str by appending each value - uniqVarSorted = sort(uniqVar); - first=1; - for (i in uniqVarSorted) { - if (first==0) { - orderStr = paste(orderStr, "|", sep=""); - } - if (i>=0) # ignore missing values - orderStr = paste(orderStr, i, sep=""); - first=0; - end - } +.setOrderString <- function(orderStr, uniqVar) { + if (is.na(orderStr) || nchar(orderStr)==0) { + orderStr <- "" + # create order str by appending each value + uniqVarSorted <- sort(uniqVar) + first <- TRUE + for (i in uniqVarSorted) { + if (!first) { + orderStr <- paste(orderStr, "|", sep="") + } + if (i >= 0) {# ignore missing values + orderStr <- paste(orderStr, i, sep="") + } + first <- FALSE } - return(orderStr); + } + return(orderStr) } diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index c19214c..aeef1a6 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -21,7 +21,7 @@ testContinuous <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { cat("CONTINUOUS MAIN || ") pheno <- thisdata[,phenoStartIdx:ncol(thisdata)] # reassign values - pheno <- reassignValue(vl, pheno, varName) + pheno <- .reassignValue(vl, pheno, varName) thisdata[,phenoStartIdx:ncol(thisdata)] <- pheno .testContinuous2(opt, vl, varName, varType, thisdata, phenoStartIdx) } diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index 863b5bf..98735c9 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -33,7 +33,7 @@ testInteger <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { return(NULL) } - pheno = reassignValue(vl, pheno, varName) + pheno = .reassignValue(vl, pheno, varName) ## average if multiple columns if (!is.null(dim(pheno))) { From c55bbea85e6d9a8df61ab45e0e4367705f9f5fe0 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 17:25:05 -0400 Subject: [PATCH 24/27] Code clean for exposure related functions --- PHESANT/R/exposure.r | 113 +++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 68 deletions(-) diff --git a/PHESANT/R/exposure.r b/PHESANT/R/exposure.r index a05d561..7a05247 100644 --- a/PHESANT/R/exposure.r +++ b/PHESANT/R/exposure.r @@ -21,13 +21,9 @@ # in the variable information file # to determine if values of cat mult fields (not the whole field) are exposure values, use getIsCatMultExposure function instead. getIsExposure <- function(vl, varName) { - - idx=which(vl$phenoInfo$FieldID==varName) - isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] - if (!is.na(isExposure) & isExposure=="YES") { - return(TRUE) - } - return(FALSE) + idx <- which(vl$phenoInfo$FieldID==varName) + isExposure <- vl$phenoInfo$TRAIT_OF_INTEREST[idx] + return(!is.na(isExposure) && isExposure=="YES") } # looks up categorical multiple field in the variable info file, return @@ -36,74 +32,55 @@ getIsExposure <- function(vl, varName) { # as a trait of interest in the TRAIT_OF_INTEREST column (multiple values are # separated by "|" in this field getIsCatMultExposure <- function(vl, varName, varValue) { - - # get row index of field in variable information file - idx=which(vl$phenoInfo$FieldID==varName) - - # may be empty of may contain VALUE1|VALUE2 etc .. to denote those - # cat mult values denoting exposure variable - isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] - - if (!is.na(isExposure) & isExposure!="") { - - isExposure = as.character(isExposure) - - ## first check if value is YES, then all values are exposure traits - if (isExposure == "YES") { - cat("IS_CM_ALL_EXPOSURE || ") - return(TRUE) + # get row index of field in variable information file + idx <- which(vl$phenoInfo$FieldID==varName) + # may be empty of may contain VALUE1|VALUE2 etc .. to denote those + # cat mult values denoting exposure variable + isExposure <- vl$phenoInfo$TRAIT_OF_INTEREST[idx] + if (!is.na(isExposure) && isExposure!="") { + isExposure <- as.character(isExposure) + ## first check if value is YES, then all values are exposure traits + if (isExposure == "YES") { + cat("IS_CM_ALL_EXPOSURE || ") + return(TRUE) + } + + ## try to split by |, to set particular values as exposure + # split into variable Values + exposureValues = unlist(strsplit(isExposure,"\\|")) + # for each value stated, check whether it is varValue + for (thisVal in exposureValues) { + if (thisVal == varValue) { + cat("IS_CM_EXPOSURE || ") + return(TRUE) + } + } } - ## try to split by |, to set particular values as exposure - - # split into variable Values - exposureValues = unlist(strsplit(isExposure,"\\|")) - - # for each value stated, check whether it is varValue - for (thisVal in exposureValues) { - if (thisVal == varValue) { - cat("IS_CM_EXPOSURE || ") - return(TRUE) - } - } - } - - # varValue is not in list of exposure values - return(FALSE) - + # varValue is not in list of exposure values + return(FALSE) } # looks up categorical multiple field in the variable info file, return # number of values denoted as trait of interest. # returns zero if whole field is denoted trait of interest, not particular values. getNumValuesCatMultExposure <- function(vl, varName) { - - # get row index of field in variable information file - idx=which(vl$phenoInfo$FieldID==varName) - - # may be empty of may contain VALUE1|VALUE2 etc .. to denote those - # cat mult values denoting exposure variable - isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] - - if (!is.na(isExposure) & isExposure!="") { - - isExposure = as.character(isExposure) - - ## first check if value is YES, then no partic values are traits of interest - if (isExposure == "YES") { - return(0) + # get row index of field in variable information file + idx <- which(vl$phenoInfo$FieldID==varName) + # may be empty of may contain VALUE1|VALUE2 etc .. to denote those + # cat mult values denoting exposure variable + isExposure <- vl$phenoInfo$TRAIT_OF_INTEREST[idx] + if (!is.na(isExposure) & isExposure!="") { + isExposure <- as.character(isExposure) + ## first check if value is YES, then no partic values are traits of interest + if (isExposure == "YES") { + return(0) + } + ## try to split by |, to set particular values as exposure + # split into variable Values + exposureValues <- unlist(strsplit(isExposure,"\\|")) + return(length(exposureValues)) } - - ## try to split by |, to set particular values as exposure - - # split into variable Values - exposureValues = unlist(strsplit(isExposure,"\\|")) - - return(length(exposureValues)) - - } - - # varValue is not in list of exposure values - return(0) - + # varValue is not in list of exposure values + return(0) } From 0871bb48c3dcde9f894027189cc2adebc9ef42a0 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Fri, 15 Mar 2019 17:40:31 -0400 Subject: [PATCH 25/27] Code refactor --- PHESANT/R/run.r | 4 ++-- PHESANT/R/utils.r | 47 +++++++++++++++++++++-------------------------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/PHESANT/R/run.r b/PHESANT/R/run.r index b4cf1c0..8d43b88 100644 --- a/PHESANT/R/run.r +++ b/PHESANT/R/run.r @@ -30,7 +30,7 @@ run <- function(opt) { } else { ## new variable so run test for previous (we have collected all the columns now) if (first==FALSE) { - thisdata <- makeTestDataFrame(input$data, input$confounders, currentVarValues) + thisdata <- .makeTestDataFrame(input$data, input$confounders, currentVarValues) testAssociations(opt, input$vl, currentVar, currentVarShort, thisdata, input$phenoStartIdx) } first <- FALSE @@ -47,7 +47,7 @@ run <- function(opt) { if (phenoIdx>0){ # last variable so test association - thisdata = makeTestDataFrame(input$data, input$confounders, currentVarValues) + thisdata = .makeTestDataFrame(input$data, input$confounders, currentVarValues) testAssociations(opt, input$vl, currentVar, currentVarShort, thisdata, input$phenoStartIdx) } sink() diff --git a/PHESANT/R/utils.r b/PHESANT/R/utils.r index 2e560fb..563d949 100644 --- a/PHESANT/R/utils.r +++ b/PHESANT/R/utils.r @@ -15,7 +15,7 @@ } ## makes a smaller data frame containing the data for a particular test -makeTestDataFrame <- function(datax, confounders, currentVarValues) { +.makeTestDataFrame <- function(datax, confounders, currentVarValues) { thisdata <- datax[,c("geno", "userID")] thisdata <- merge(thisdata, confounders, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) currentVarValues <- cbind.data.frame(datax$userID, currentVarValues) @@ -43,7 +43,7 @@ processArgs <- function(opt, opt_parser) { opt$sensitivity <- FALSE; opt$genetic <- TRUE; } - opt <- processParts(opt, opt_parser,opt$partIdx, opt$numParts) + opt <- .processParts(opt, opt_parser,opt$partIdx, opt$numParts) } else { ## check arguments are supplied correctly @@ -88,7 +88,7 @@ processArgs <- function(opt, opt_parser) { else if (!file.exists(opt$resDir)) { stop(paste("results directory resDir=", opt$resDir, " does not exist", sep=""), call.=FALSE) } - opt <-processParts(opt, opt_parser,opt$partIdx, opt$numParts); + opt <-.processParts(opt, opt_parser,opt$partIdx, opt$numParts); } if (opt$save==TRUE) { print("Saving phenotypes to file. Tests of association will not run!") @@ -97,27 +97,22 @@ processArgs <- function(opt, opt_parser) { } # Parse the 'part' arguments and check they are valid -processParts <- function(opt, opt_parser, pIdx, nParts) { - - if (is.null(pIdx) && is.null(nParts)) { - opt$varTypeArg <- "all"; - print(paste("Running with all traits in phenotype file:", opt$phenofile)); - } - else if (is.null(pIdx)) { - print_help(opt_parser) - stop("pIdx argument must be supplied when nParts argument is supplied", call.=FALSE) - } - else if (is.null(nParts)) { - print_help(opt_parser) - stop("nParts argument must be supplied when pIdx argument is supplied", call.=FALSE) - } - else if (pIdx<1 || pIdx>nParts) { - print_help(opt_parser) - stop("pIdx arguments must be between 1 and nParts inclusive", call.=FALSE) - } - else { - opt$varTypeArg <- paste(pIdx, "-", nParts, sep=""); - print(paste("Running with part",pIdx,"of",nParts," in phenotype file:", opt$phenofile)); - } - return(opt) +.processParts <- function(opt, opt_parser, pIdx, nParts) { + if (is.null(pIdx) && is.null(nParts)) { + opt$varTypeArg <- "all"; + print(paste("Running with all traits in phenotype file:", opt$phenofile)); + } else if (is.null(pIdx)) { + print_help(opt_parser) + stop("pIdx argument must be supplied when nParts argument is supplied", call.=FALSE) + } else if (is.null(nParts)) { + print_help(opt_parser) + stop("nParts argument must be supplied when pIdx argument is supplied", call.=FALSE) + } else if (pIdx<1 || pIdx>nParts) { + print_help(opt_parser) + stop("pIdx arguments must be between 1 and nParts inclusive", call.=FALSE) + } else { + opt$varTypeArg <- paste(pIdx, "-", nParts, sep=""); + print(paste("Running with part",pIdx,"of",nParts," in phenotype file:", opt$phenofile)); + } + return(opt) } From 0bc16c247cdd085cd1a095673275a8e4987c9ca1 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Sat, 16 Mar 2019 13:12:05 -0400 Subject: [PATCH 26/27] Code refactor and documentation: continued --- PHESANT/R/exposure.r | 4 +- PHESANT/R/testAssociations.r | 201 ++++++-------- PHESANT/R/testCatMultiple.r | 88 +++--- PHESANT/R/testCatSingle.r | 133 ++++----- PHESANT/R/testContinuous.r | 361 +++++++++++-------------- PHESANT/R/testInteger.r | 109 ++++---- PHESANT/R/utils.r | 10 +- PHESANT/man/getIsExposure.Rd | 21 ++ PHESANT/man/option_list.Rd | 4 +- PHESANT/man/parseOptions.Rd | 15 + PHESANT/man/run.Rd | 23 ++ PHESANT/man/testAssociations.Rd | 27 ++ PHESANT/man/testCategoricalMultiple.Rd | 34 +++ PHESANT/man/testCategoricalSingle.Rd | 32 +++ PHESANT/man/testContinuous.Rd | 34 +++ PHESANT/man/testInteger.Rd | 32 +++ PHESANT/phenomeScan.r | 4 +- 17 files changed, 630 insertions(+), 502 deletions(-) create mode 100644 PHESANT/man/getIsExposure.Rd create mode 100644 PHESANT/man/parseOptions.Rd create mode 100644 PHESANT/man/run.Rd create mode 100644 PHESANT/man/testAssociations.Rd create mode 100644 PHESANT/man/testCategoricalMultiple.Rd create mode 100644 PHESANT/man/testCategoricalSingle.Rd create mode 100644 PHESANT/man/testContinuous.Rd create mode 100644 PHESANT/man/testInteger.Rd diff --git a/PHESANT/R/exposure.r b/PHESANT/R/exposure.r index 7a05247..fcbfcdf 100644 --- a/PHESANT/R/exposure.r +++ b/PHESANT/R/exposure.r @@ -31,7 +31,7 @@ getIsExposure <- function(vl, varName) { # this field denote the exposure), or whether varName has varValue stated # as a trait of interest in the TRAIT_OF_INTEREST column (multiple values are # separated by "|" in this field -getIsCatMultExposure <- function(vl, varName, varValue) { +.getIsCatMultExposure <- function(vl, varName, varValue) { # get row index of field in variable information file idx <- which(vl$phenoInfo$FieldID==varName) # may be empty of may contain VALUE1|VALUE2 etc .. to denote those @@ -64,7 +64,7 @@ getIsCatMultExposure <- function(vl, varName, varValue) { # looks up categorical multiple field in the variable info file, return # number of values denoted as trait of interest. # returns zero if whole field is denoted trait of interest, not particular values. -getNumValuesCatMultExposure <- function(vl, varName) { +.getNumValuesCatMultExposure <- function(vl, varName) { # get row index of field in variable information file idx <- which(vl$phenoInfo$FieldID==varName) # may be empty of may contain VALUE1|VALUE2 etc .. to denote those diff --git a/PHESANT/R/testAssociations.r b/PHESANT/R/testAssociations.r index 951e704..016e1fc 100644 --- a/PHESANT/R/testAssociations.r +++ b/PHESANT/R/testAssociations.r @@ -16,124 +16,93 @@ # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. - # Tests the association of a field, determined by its field type testAssociations <- function(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) { - - ## call file for variable type - - tryCatch({ - - # retrieve whether phenotype is excluded etc - idx=which(vl$phenoInfo$FieldID==currentVarShort); - - # check if variable info is found for this field - if (length(idx)==0) { - cat(paste(currentVar, " || Variable could not be found in pheno info file. \n", sep="")) - .incrementCounter("notinphenofile") - } - else { - - # get info from variable info file - excluded = vl$phenoInfo$EXCLUDED[idx] - catSinToMult = vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT[idx] - fieldType = vl$phenoInfo$ValueType[idx] - isExposure = getIsExposure(vl, currentVarShort) #vl$phenoInfo$EXPOSURE_PHENOTYPE[idx] - - if (fieldType=="Integer") { - - #### INTEGER - cat(currentVar, "|| ", sep="") - - if (excluded!="") { - cat(paste("Excluded integer: ", excluded, " || ", sep="")) - .incrementCounter("excluded.int") - } - else { - .incrementCounter("start.int") - if (isExposure==TRUE) { - .incrementCounter("start.exposure.int") - } - - testInteger(opt, vl, currentVarShort, "INTEGER", thisdata, phenoStartIdx); - } - cat("\n"); - } - else if (fieldType=="Continuous") { - - #### CONTINUOUS - cat(currentVar, "|| ", sep="") - - if (excluded!="") { - cat(paste("Excluded continuous: ", excluded, " || ", sep="")) - .incrementCounter("excluded.cont") - } - else { - .incrementCounter("start.cont") - if (isExposure==TRUE) { - .incrementCounter("start.exposure.cont") - } - testContinuous(opt, vl, currentVarShort, "CONTINUOUS", thisdata, phenoStartIdx); - } - cat("\n"); - } - else if (fieldType=="Categorical single" && catSinToMult=="") { - - #### CAT SINGLE - cat(currentVar, "|| ", sep="") - - if (excluded!="") { - cat(paste("Excluded cat-single: ", excluded, " || ", sep="")) - .incrementCounter("excluded.catSin") - } - else { - .incrementCounter("start.catSin") - if (isExposure==TRUE) { - .incrementCounter("start.exposure.catSin") - } - testCategoricalSingle(opt, vl, currentVarShort, "CAT-SIN", thisdata, phenoStartIdx); - } - cat("\n"); - } - else if (fieldType=="Categorical multiple" || catSinToMult!="") { - - #### CAT MULTIPLE - cat(currentVar, "|| ", sep="") - - if (excluded!="") { - cat(paste("Excluded cat-multiple: ", excluded, " || ", sep="")) - .incrementCounter("excluded.catMul") - } - else { - - if (catSinToMult!="") { - cat("cat-single to cat-multiple || ", sep="") - .incrementCounter("catSinToCatMul") - } - - .incrementCounter("start.catMul") - if (isExposure==TRUE) { - .incrementCounter("start.exposure.catMul") - } - else { - # get number of cat mult values denoting trait of interest - numVals = getNumValuesCatMultExposure(vl, currentVarShort) - if (numVals>0) { - .addToCounts("start.exposure.catMulvalues", numVals) - } - } - testCategoricalMultiple(opt, vl, currentVarShort, "CAT-MUL", thisdata, phenoStartIdx); - } - cat("\n"); - } - else { - #cat("VAR MISSING ", currentVarShort, "\n", sep=""); - } - } - - }, error = function(e) { - print(paste("ERROR:", currentVar,e)) - }) + ## call coresponding test function for variable type + tryCatch({ + # retrieve whether phenotype is excluded etc + idx=which(vl$phenoInfo$FieldID==currentVarShort); + # check if variable info is found for this field + if (length(idx)==0) { + cat(paste(currentVar, " || Variable could not be found in pheno info file. \n", sep="")) + .incrementCounter("notinphenofile") + } else { + # get info from variable info file + excluded = vl$phenoInfo$EXCLUDED[idx] + catSinToMult = vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT[idx] + fieldType = vl$phenoInfo$ValueType[idx] + isExposure = getIsExposure(vl, currentVarShort) #vl$phenoInfo$EXPOSURE_PHENOTYPE[idx] + + if (fieldType=="Integer") { + #### INTEGER + cat(currentVar, "|| ", sep="") + if (excluded!="") { + cat(paste("Excluded integer: ", excluded, " || ", sep="")) + .incrementCounter("excluded.int") + } else { + .incrementCounter("start.int") + if (isExposure==TRUE) { + .incrementCounter("start.exposure.int") + } + testInteger(opt, vl, currentVarShort, "INTEGER", thisdata, phenoStartIdx) + } + cat("\n") + } else if (fieldType=="Continuous") { + cat(currentVar, "|| ", sep="") + if (excluded!="") { + cat(paste("Excluded continuous: ", excluded, " || ", sep="")) + .incrementCounter("excluded.cont") + } else { + .incrementCounter("start.cont") + if (isExposure==TRUE) { + .incrementCounter("start.exposure.cont") + } + testContinuous(opt, vl, currentVarShort, "CONTINUOUS", thisdata, phenoStartIdx) + } + cat("\n") + } else if (fieldType=="Categorical single" && catSinToMult=="") { + cat(currentVar, "|| ", sep="") + if (excluded!="") { + cat(paste("Excluded cat-single: ", excluded, " || ", sep="")) + .incrementCounter("excluded.catSin") + } else { + .incrementCounter("start.catSin") + if (isExposure==TRUE) { + .incrementCounter("start.exposure.catSin") + } + testCategoricalSingle(opt, vl, currentVarShort, "CAT-SIN", thisdata, phenoStartIdx) + } + cat("\n") + } else if (fieldType=="Categorical multiple" || catSinToMult!="") { + cat(currentVar, "|| ", sep="") + if (excluded!="") { + cat(paste("Excluded cat-multiple: ", excluded, " || ", sep="")) + .incrementCounter("excluded.catMul") + } else { + if (catSinToMult!="") { + cat("cat-single to cat-multiple || ", sep="") + .incrementCounter("catSinToCatMul") + } + .incrementCounter("start.catMul") + if (isExposure==TRUE) { + .incrementCounter("start.exposure.catMul") + } else { + # get number of cat mult values denoting trait of interest + numVals = .getNumValuesCatMultExposure(vl, currentVarShort) + if (numVals>0) { + .addToCounts("start.exposure.catMulvalues", numVals) + } + } + testCategoricalMultiple(opt, vl, currentVarShort, "CAT-MUL", thisdata, phenoStartIdx) + } + cat("\n") + } else { + #cat("VAR MISSING ", currentVarShort, "\n", sep="") + } + } + }, error = function(e) { + print(paste("ERROR:", currentVar,e)) + }) } diff --git a/PHESANT/R/testCatMultiple.r b/PHESANT/R/testCatMultiple.r index b9f7573..b9613af 100644 --- a/PHESANT/R/testCatMultiple.r +++ b/PHESANT/R/testCatMultiple.r @@ -25,67 +25,63 @@ # 4) Calling binaryLogisticRegression function for this derived binary variable testCategoricalMultiple <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { cat("CAT-MULTIPLE || ") - - pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] - pheno = .reassignValue(vl, pheno, varName) + pheno <- thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] + pheno <- .reassignValue(vl, pheno, varName) ## get unique values from all columns of this variable - uniqueValues = unique(na.omit(pheno[,1])); - numCols = ncol(pheno); - numRows = nrow(pheno); + uniqueValues <- unique(na.omit(pheno[,1])) + numCols <- ncol(pheno) + numRows <- nrow(pheno) if (numCols>1) { - for (num in 2:numCols) { - u = unique(na.omit(pheno[,num])) - uniqueValues = union(uniqueValues,u); - } + for (num in 2:numCols) { + u <- unique(na.omit(pheno[,num])) + uniqueValues <- union(uniqueValues,u) + } } ## for each value create a binary variable and test this for (variableVal in uniqueValues) { - - ## numeric negative values we assume are missing - check this - if(is.numeric(variableVal) & variableVal<0) { - cat("SKIP_val:", variableVal," < 0", sep=""); - next; - } + ## numeric negative values we assume are missing - check this + if(is.numeric(variableVal) & variableVal<0) { + cat("SKIP_val:", variableVal," < 0", sep="") + next + } - # make variable for this value - idxForVar = which(pheno == variableVal, arr.ind=TRUE) - idxsTrue = idxForVar[,"row"] + # make variable for this value + idxForVar <- which(pheno == variableVal, arr.ind=TRUE) + idxsTrue <- idxForVar[,"row"] - cat(" CAT-MUL-BINARY-VAR ", variableVal, " || ", sep=""); - .incrementCounter("catMul.binary") + cat(" CAT-MUL-BINARY-VAR ", variableVal, " || ", sep="") + .incrementCounter("catMul.binary") - # make zero vector and set 1s for those with this variable value - varBinary = rep.int(0,numRows); - varBinary[idxsTrue] = 1; - varBinaryFactor = factor(varBinary) + # make zero vector and set 1s for those with this variable value + varBinary <- rep.int(0,numRows) + varBinary[idxsTrue] <- 1 + varBinaryFactor <- factor(varBinary) - ## data for this new binary variable - newthisdata = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], varBinaryFactor) + ## data for this new binary variable + newthisdata <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], varBinaryFactor) - ## one of 3 ways to decide which examples are negative - idxsToRemove = .restrictSample(vl, varName, pheno, variableVal, thisdata[,"userID", drop=FALSE]) + ## one of 3 ways to decide which examples are negative + idxsToRemove <- .restrictSample(vl, varName, pheno, variableVal, thisdata[,"userID", drop=FALSE]) - if (!is.null(idxsToRemove) & length(idxsToRemove) > 0) { - newthisdata = newthisdata[-idxsToRemove,] - } + if (!is.null(idxsToRemove) & length(idxsToRemove) > 0) { + newthisdata <- newthisdata[-idxsToRemove,] + } - facLevels = levels(newthisdata[,phenoStartIdx]) - idxTrue = length(which(newthisdata[,phenoStartIdx]==facLevels[1])) - idxFalse = length(which(newthisdata[,phenoStartIdx]==facLevels[2])) + facLevels <- levels(newthisdata[,phenoStartIdx]) + idxTrue <- length(which(newthisdata[,phenoStartIdx]==facLevels[1])) + idxFalse <- length(which(newthisdata[,phenoStartIdx]==facLevels[2])) - if (idxTrue<10 || idxFalse<10) { - cat("CAT-MULT-SKIP-10 (", idxTrue, " vs ", idxFalse, ") || ", sep=""); - .incrementCounter("catMul.10") - } - else { - isExposure = getIsCatMultExposure(vl, varName, variableVal) - - .incrementCounter("catMul.over10") + if (idxTrue<10 || idxFalse<10) { + cat("CAT-MULT-SKIP-10 (", idxTrue, " vs ", idxFalse, ") || ", sep="") + .incrementCounter("catMul.10") + } else { + isExposure <- .getIsCatMultExposure(vl, varName, variableVal) + .incrementCounter("catMul.over10") # binary - so logistic regression - binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), varType, newthisdata, isExposure, phenoStartIdx) - } + binaryLogisticRegression(opt, paste(varName, variableVal,sep="#"), varType, newthisdata, isExposure, phenoStartIdx) + } } } @@ -94,7 +90,7 @@ testCategoricalMultiple <- function(opt, vl, varName, varType, thisdata, phenoSt # returns idx's that should be removed from the sample .restrictSample <- function(vl, varName,pheno,variableVal, userID) { # get definition for sample for this variable either NO_NAN, ALL or a variable ID - varIndicator = vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS[which(vl$phenoInfo$FieldID==varName)] + varIndicator <- vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS[which(vl$phenoInfo$FieldID==varName)] return(.restrictSample2(vl, varName,pheno,varIndicator,variableVal, userID)) } diff --git a/PHESANT/R/testCatSingle.r b/PHESANT/R/testCatSingle.r index 85d988b..bc812e1 100644 --- a/PHESANT/R/testCatSingle.r +++ b/PHESANT/R/testCatSingle.r @@ -30,91 +30,78 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar # assert variable has only one column if (!is.null(dim(pheno))) stop("More than one column for categorical single") - - pheno = .reassignValue(vl, pheno, varName) - + pheno <- .reassignValue(vl, pheno, varName) + # get data code info - whether this data code is ordinal or not and any reordering - dataPheno = vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),]; - dataCode = dataPheno$DATA_CODING; + dataPheno <- vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),] + dataCode <- dataPheno$DATA_CODING # get data coding information - dataCodeRow = which(vl$dataCodeInfo$dataCode==dataCode); + dataCodeRow <- which(vl$dataCodeInfo$dataCode==dataCode) if (length(dataCodeRow)==0) { - cat("ERROR: No row in data coding info file || "); - return(NULL); - } - dataDataCode = vl$dataCodeInfo[dataCodeRow,]; - ordered = dataDataCode$ordinal; - order = as.character(dataDataCode$ordering); + cat("ERROR: No row in data coding info file || ") + return + } + dataDataCode <- vl$dataCodeInfo[dataCodeRow,] + ordered <- dataDataCode$ordinal + order <- as.character(dataDataCode$ordering) ## reorder variable values into increasing order (we do this now as this may convert variable to binary rather than ordered) - pheno = .reorderOrderedCategory(pheno,order); + pheno <- .reorderOrderedCategory(pheno,order) ## if data code has a default_value then recode NA's to this value for participants with value in default_related_field ## this is used where there is no zero option e.g. field 100200 - defaultValue = dataDataCode$default_value - defaultRelatedID = dataDataCode$default_related_field - pheno = .setDefaultValue(vl, pheno, defaultValue, defaultRelatedID, thisdata[,"userID", drop=FALSE]) + defaultValue <- dataDataCode$default_value + defaultRelatedID <- dataDataCode$default_related_field + pheno <- .setDefaultValue(vl, pheno, defaultValue, defaultRelatedID, thisdata[,"userID", drop=FALSE]) - ## all categories coded as <0 we assume are `missing' values - pheno = .replaceMissingCodes(pheno) + ## all categories coded as <0 we assume are `missing' values + pheno <- .replaceMissingCodes(pheno) ## remove categories if < 10 examples - pheno = .testNumExamples(pheno) - - uniqVar = unique(na.omit(pheno)) - uniqVar = sort(uniqVar) - - if (length(uniqVar)<=1) { - cat("SKIP (only one value) || "); - .incrementCounter("catSin.onevalue") - } - else if (length(uniqVar)==2) { - cat("CAT-SINGLE-BINARY || "); - .incrementCounter("catSin.case3") - # binary so logistic regression + pheno <- .testNumExamples(pheno) - phenoFactor = factor(pheno) - # binary - so logistic regression - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); - binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx) - } - else { - # > 2 categories - if (is.na(ordered)) { - cat(" ERROR: 'ordered' not found in data code info file") - } - else { - - ## unordered - if (ordered == 0) { - - cat("CAT-SINGLE-UNORDERED || ") - .incrementCounter("catSin.case2") - - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); - testCategoricalUnordered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); - - } - else if (ordered == 1) { - - ## ordered - cat("ordered || "); - .incrementCounter("catSin.case1") + uniqVar <- unique(na.omit(pheno)) + uniqVar <- sort(uniqVar) - ## reorder variable values into increasing order - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno); - testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx, order) - - } - else if (ordered == -2) { - cat(" EXCLUDED or BINARY variable: Should not get here in code. ") - .incrementCounter( "catSin.binaryorexcluded") - } - else { - print(paste("ERROR", varName, varType, dataCode)); - } - } + if (length(uniqVar)<=1) { + cat("SKIP (only one value) || ") + .incrementCounter("catSin.onevalue") + } else if (length(uniqVar)==2) { + cat("CAT-SINGLE-BINARY || ") + .incrementCounter("catSin.case3") + # binary so logistic regression + phenoFactor <- factor(pheno) + # binary - so logistic regression + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor) + binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx) + } else { + # > 2 categories + if (is.na(ordered)) { + cat(" ERROR: 'ordered' not found in data code info file") + } else { + ## unordered + if (ordered == 0) { + cat("CAT-SINGLE-UNORDERED || ") + .incrementCounter("catSin.case2") + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno) + testCategoricalUnordered(opt, vl, varName, varType, thisdatanew, phenoStartIdx) + } else if (ordered == 1) { + ## ordered + cat("ordered || ") + .incrementCounter("catSin.case1") + + ## reorder variable values into increasing order + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], pheno) + testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx, order) + + } else if (ordered == -2) { + cat(" EXCLUDED or BINARY variable: Should not get here in code. ") + .incrementCounter( "catSin.binaryorexcluded") + } else { + print(paste("ERROR", varName, varType, dataCode)); + } + } } } @@ -135,7 +122,7 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar pheno2[idx] <- count count <- count + 1 } - cat("reorder ",order," || ",sep=""); + cat("reorder ",order," || ",sep="") return(pheno2) } else { return(pheno) @@ -167,7 +154,7 @@ testCategoricalSingle <- function(opt, vl, varName, varType, thisdata, phenoStar # set default value in people who have no value in the pheno but do have a value in the default_value_related_field defaultIdxs <- which(!is.na(indicatorVar) & is.na(pheno)) pheno[defaultIdxs] <- defaultValue - cat("default value ", defaultValue, " set, N= ", length(defaultIdxs), " || ", sep=""); + cat("default value ", defaultValue, " set, N= ", length(defaultIdxs), " || ", sep="") } return(pheno) } diff --git a/PHESANT/R/testContinuous.r b/PHESANT/R/testContinuous.r index aeef1a6..b60c0c6 100644 --- a/PHESANT/R/testContinuous.r +++ b/PHESANT/R/testContinuous.r @@ -29,207 +29,172 @@ testContinuous <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { # Main code used to process continuous fields, or integer fields that have been reassigned as continuous because they have >20 distinct values. # This is needed because we have already reassigned values for integer fields, so do this in the function above for continuous fields. .testContinuous2 <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { - cat("CONTINUOUS || "); - pheno = thisdata[,phenoStartIdx:ncol(thisdata)] - isExposure = getIsExposure(vl, varName) - - if (!is.null(dim(pheno))) { - phenoAvg = rowMeans(pheno, na.rm=TRUE) - } - else { - phenoAvg = pheno - } - - ## recode NaN to NA, which is generated if all cols of pheno are NA for a given person - idxNan = which(is.nan(phenoAvg)) - phenoAvg[idxNan] = NA; - numNotNA=length(na.omit(phenoAvg)); - - ## check whether >20% examples with same value - uniqVar = unique(na.omit(phenoAvg)) - valid = TRUE - for (uniq in uniqVar) { - numWithValue = length(which(phenoAvg==uniq)) - if (numWithValue/numNotNA >=0.2) { - valid = FALSE; - break - } - } - - if (valid == FALSE) { - - ## treat as ordinal categorical - cat(">20% IN ONE CATEGORY || "); - - # if >2 unique values then treat as ordered categorical - numUniqueValues = length(uniqVar) - - # straight forward case that there are two (or one) values - if (numUniqueValues<=2) { - ## treat as binary or skip (binary requires>=10 per category) - - ## remove categories if < 10 examples to see if this should be binary or not, but if ordered categorical - ## then we include all values when generating this - phenoAvgMoreThan10 = .testNumExamples(phenoAvg) - - ## binary if 2 distinct values, else ordered categorical - phenoFactor = factor(phenoAvgMoreThan10) - numLevels = length(unique(na.omit(phenoAvgMoreThan10))) #length(levels(phenoFactor)) + cat("CONTINUOUS || ") + pheno <- thisdata[,phenoStartIdx:ncol(thisdata)] + isExposure <- getIsExposure(vl, varName) + + if (!is.null(dim(pheno))) { + phenoAvg <- rowMeans(pheno, na.rm=TRUE) + } else { + phenoAvg <- pheno + } + + ## recode NaN to NA, which is generated if all cols of pheno are NA for a given person + idxNan <- which(is.nan(phenoAvg)) + phenoAvg[idxNan] <- NA + numNotNA <- length(na.omit(phenoAvg)) + + ## check whether >20% examples with same value + uniqVar <- unique(na.omit(phenoAvg)) + valid <- TRUE + for (uniq in uniqVar) { + numWithValue <- length(which(phenoAvg==uniq)) + if (numWithValue/numNotNA >=0.2) { + valid <- FALSE + break + } + } + + if (!valid) { + ## treat as ordinal categorical + cat(">20% IN ONE CATEGORY || ") + # if >2 unique values then treat as ordered categorical + numUniqueValues <- length(uniqVar) + # straight forward case that there are two (or one) values + if (numUniqueValues<=2) { + ## treat as binary or skip (binary requires>=10 per category) + ## remove categories if < 10 examples to see if this should be binary or not, but if ordered categorical + ## then we include all values when generating this + phenoAvgMoreThan10 <- .testNumExamples(phenoAvg) + + ## binary if 2 distinct values, else ordered categorical + phenoFactor <- factor(phenoAvgMoreThan10) + numLevels <- length(unique(na.omit(phenoAvgMoreThan10))) #length(levels(phenoFactor)) if (numLevels<=1) { - cat("SKIP (number of levels: ",numLevels,")",sep="") - .incrementCounter("cont.onevalue") - } - else if (numLevels==2) { - # binary - .incrementCounter("cont.binary") - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); - binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx); + cat("SKIP (number of levels: ",numLevels,")",sep="") + .incrementCounter("cont.onevalue") + } else if (numLevels==2) { + # binary + .incrementCounter("cont.binary") + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor) + binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx) } - } - else { - ## try to treat as ordered categorical - - .incrementCounter("cont.ordcattry") - ## equal sized bins - phenoBinned = .equalSizedBins(phenoAvg) - - # check number of people in each bin - bin0Num = length(which(phenoBinned==0)) - bin1Num = length(which(phenoBinned==1)) - bin2Num = length(which(phenoBinned==2)) - - if (bin0Num>=10 & bin1Num>=10 & bin2Num>=10) { - - # successful binning. >=10 examples in each of the 3 bins - - .incrementCounter("cont.ordcattry.ordcat") - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned); - testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); - } - else { - # try to treat as binary because not enough examples in each bin - - if (bin0Num<10 & bin2Num<10) { - ## skip - not possible to create binary variable because first and third bins are too small - ## ie. could merge bin1 with bin 2 but then bin3 still too small etc - cat("SKIP 2 bins are too small || ") - .incrementCounter("cont.ordcattry.smallbins") - } - else if ((bin0Num<10 | bin1Num<10) & (bin0Num+bin1Num)>=10) { - - # combine first and second bin to create binary variable - .incrementCounter("cont.ordcattry.binsbinary") - cat("Combine first two bins and treat as binary || ") - phenoBinned[which(phenoBinned==0)] = 1 - - # test binary - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned) - binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx); - } - else if ((bin2Num<10 | bin1Num<10) & (bin2Num+bin1Num)>=10) { - - # combine second and last bin to create binary variable - .incrementCounter("cont.ordcattry.binsbinary") - cat("Combine last two bins and treat as binary || ") - phenoBinned[which(phenoBinned==2)] = 1 - - # test binary - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned) - binaryLogisticRegression(opt, varName,varType, thisdatanew, isExposure, phenoStartIdx) - } - - - else { - ## skip - not possible to create binary variable because combining bins would still be too small - cat("SKIP 2 bins are too small(2) || ") - .incrementCounter("cont.ordcattry.smallbins2") - } - - } - - } - } - else { - cat("IRNT || "); - .incrementCounter("cont.main") - - # check there are at least 500 examples - numNotNA = length(which(!is.na(phenoAvg))) - if (numNotNA<500) { - cat("CONTINUOUS-SKIP-500 (", numNotNA, ") || ",sep=""); - .incrementCounter("cont.main.500") - } - else { - ## inverse rank normal transformation - phenoIRNT = irnt(phenoAvg) - - if (opt$save == TRUE) { - # add pheno to dataframe - .storeNewVar(thisdata[,"userID"], phenoIRNT, varName, 'cont') - cat("SUCCESS results-linear"); - .incrementCounter("success.continuous") - } - else { - - ## do regression (use standardised geno values) - if (opt$standardise==TRUE) { - geno = scale(thisdata[,"geno"]) - } - else { - geno = thisdata[,"geno"] - } - confounders=thisdata[,3:(phenoStartIdx -1), drop = FALSE] - - sink() - sink(pkg.env$modelFitLogFile, append=TRUE) - print("--------------") - print(varName) - - ###### BEGIN TRYCATCH - tryCatch({ - - fit <- lm(phenoIRNT ~ geno + ., data=confounders) - - sink() - sink(pkg.env$resLogFile, append=TRUE) - - sumx = summary(fit) - - pvalue = sumx$coefficients['geno','Pr(>|t|)'] - beta = sumx$coefficients["geno","Estimate"] - - if (opt$confidenceintervals == TRUE) { - cis = confint(fit, level=0.95) - lower = cis["geno", "2.5 %"] - upper = cis["geno", "97.5 %"] - } - else { - lower = NA - upper = NA - } - - numNotNA = length(which(!is.na(phenoIRNT))) - - ## save result to file - write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt", sep=""), append="TRUE"); - cat("SUCCESS results-linear"); - - .incrementCounter("success.continuous") - if (isExposure == TRUE) { - .incrementCounter("success.exposure.continuous") - } - - ## END TRYCATCH - }, error = function(e) { - sink() - sink(pkg.env$resLogFile, append=TRUE) - cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - .incrementCounter("continuous.error") - }) - } - } - } + } else { + ## try to treat as ordered categorical + .incrementCounter("cont.ordcattry") + ## equal sized bins + phenoBinned <- .equalSizedBins(phenoAvg) + + # check number of people in each bin + bin0Num <- length(which(phenoBinned==0)) + bin1Num <- length(which(phenoBinned==1)) + bin2Num <- length(which(phenoBinned==2)) + + if (bin0Num>=10 & bin1Num>=10 & bin2Num>=10) { + # successful binning. >=10 examples in each of the 3 bins + .incrementCounter("cont.ordcattry.ordcat") + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned) + testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx) + } else { + # try to treat as binary because not enough examples in each bin + if (bin0Num<10 & bin2Num<10) { + ## skip - not possible to create binary variable because first and third bins are too small + ## ie. could merge bin1 with bin 2 but then bin3 still too small etc + cat("SKIP 2 bins are too small || ") + .incrementCounter("cont.ordcattry.smallbins") + } else if ((bin0Num<10 | bin1Num<10) & (bin0Num+bin1Num)>=10) { + # combine first and second bin to create binary variable + .incrementCounter("cont.ordcattry.binsbinary") + cat("Combine first two bins and treat as binary || ") + phenoBinned[which(phenoBinned==0)] <-1 + + # test binary + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned) + binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure, phenoStartIdx) + } else if ((bin2Num<10 | bin1Num<10) & (bin2Num+bin1Num)>=10) { + # combine second and last bin to create binary variable + .incrementCounter("cont.ordcattry.binsbinary") + cat("Combine last two bins and treat as binary || ") + phenoBinned[which(phenoBinned==2)] <- 1 + + # test binary + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoBinned) + binaryLogisticRegression(opt, varName,varType, thisdatanew, isExposure, phenoStartIdx) + } else { + ## skip - not possible to create binary variable because combining bins would still be too small + cat("SKIP 2 bins are too small(2) || ") + .incrementCounter("cont.ordcattry.smallbins2") + } + } + } + } else { + cat("IRNT || ") + .incrementCounter("cont.main") + # check there are at least 500 examples + numNotNA <- length(which(!is.na(phenoAvg))) + if (numNotNA<500) { + cat("CONTINUOUS-SKIP-500 (", numNotNA, ") || ",sep="") + .incrementCounter("cont.main.500") + } else { + ## inverse rank normal transformation + phenoIRNT <- irnt(phenoAvg) + if (opt$save == TRUE) { + # add pheno to dataframe + .storeNewVar(thisdata[,"userID"], phenoIRNT, varName, 'cont') + cat("SUCCESS results-linear") + .incrementCounter("success.continuous") + } else { + ## do regression (use standardised geno values) + if (opt$standardise==TRUE) { + geno <- scale(thisdata[,"geno"]) + } else { + geno <- thisdata[,"geno"] + } + confounders <- thisdata[,3:(phenoStartIdx -1), drop = FALSE] + + sink() + sink(pkg.env$modelFitLogFile, append=TRUE) + print("--------------") + print(varName) + + ###### BEGIN TRYCATCH + tryCatch({ + fit <- lm(phenoIRNT ~ geno + ., data=confounders) + sink() + sink(pkg.env$resLogFile, append=TRUE) + sumx <- summary(fit) + pvalue <- sumx$coefficients['geno','Pr(>|t|)'] + beta <- sumx$coefficients["geno","Estimate"] + + if (opt$confidenceintervals == TRUE) { + cis <- confint(fit, level=0.95) + lower <- cis["geno", "2.5 %"] + upper <- cis["geno", "97.5 %"] + } else { + lower <- NA + upper <- NA + } + + numNotNA <- length(which(!is.na(phenoIRNT))) + ## save result to file + write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt", sep=""), append="TRUE") + cat("SUCCESS results-linear") + .incrementCounter("success.continuous") + if (isExposure == TRUE) { + .incrementCounter("success.exposure.continuous") + } + + ## END TRYCATCH + }, error = function(e) { + sink() + sink(pkg.env$resLogFile, append=TRUE) + cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) + .incrementCounter("continuous.error") + }) + } + } + } } irnt <- function(pheno, seed = 1234) { diff --git a/PHESANT/R/testInteger.r b/PHESANT/R/testInteger.r index 98735c9..de76887 100644 --- a/PHESANT/R/testInteger.r +++ b/PHESANT/R/testInteger.r @@ -16,73 +16,58 @@ # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. - # Processing integer fields, namely: # 1) Reassigning values as specified in the data code information file # 2) Generate a single value if there are several values (arrays) by taking the mean # 3) Treating this field as continuous if at least 20 distinct values. # Otherwise treat as binary or ordered categorical if 2 or more than two values. testInteger <- function(opt, vl, varName, varType, thisdata, phenoStartIdx) { - cat("INTEGER || "); - - pheno = thisdata[,phenoStartIdx:ncol(thisdata)] - isExposure = getIsExposure(vl, varName) - - if (!is.numeric(as.matrix(pheno))) { - cat("SKIP Integer type but not numeric",sep=""); - return(NULL) - } - - pheno = .reassignValue(vl, pheno, varName) - - ## average if multiple columns - if (!is.null(dim(pheno))) { - phenoAvg = rowMeans(pheno, na.rm=TRUE) - - # if participant only has NA values then NaN is generated so we convert back to NA - phenoAvg = .replaceNaN(phenoAvg) - } - else { - phenoAvg = pheno + cat("INTEGER || ") + pheno <- thisdata[,phenoStartIdx:ncol(thisdata)] + isExposure <- getIsExposure(vl, varName) + if (is.numeric(as.matrix(pheno))) { + pheno <- .reassignValue(vl, pheno, varName) + + ## average if multiple columns + if (!is.null(dim(pheno))) { + phenoAvg <- rowMeans(pheno, na.rm=TRUE) + # if participant only has NA values then NaN is generated so we convert back to NA + phenoAvg <- .replaceNaN(phenoAvg) + } else { + phenoAvg <- pheno } - - uniqVar = unique(na.omit(phenoAvg)) - - # if >=20 separate values then treat as continuous - if (length(uniqVar)>=20) { - - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoAvg); - .testContinuous2(opt, vl, varName, varType, thisdatanew, phenoStartIdx) - .incrementCounter( "int.continuous") - } - else { - - ## remove categories if < 10 examples - phenoAvg = .testNumExamples(phenoAvg) - - ## binary if 2 distinct values, else ordered categorical - phenoFactor = factor(phenoAvg) - numLevels = length(levels(phenoFactor)) - if (numLevels<=1) { - cat("SKIP (number of levels: ",numLevels,")",sep=""); - .incrementCounter("int.onevalue") - } - else if (numLevels==2) { - .incrementCounter("int.binary") - - # binary - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); - binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure,phenoStartIdx); - } - else { - .incrementCounter("int.catord") - cat("3-20 values || ") - - # we don't use equal sized bins just the original integers (that have >=10 examples) as categories - thisdatanew = cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); - - # treat as ordinal categorical - testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx); - } - } + + uniqVar <- unique(na.omit(phenoAvg)) + # if >=20 separate values then treat as continuous + if (length(uniqVar)>=20) { + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoAvg) + .testContinuous2(opt, vl, varName, varType, thisdatanew, phenoStartIdx) + .incrementCounter( "int.continuous") + } else { + ## remove categories if < 10 examples + phenoAvg <- .testNumExamples(phenoAvg) + ## binary if 2 distinct values, else ordered categorical + phenoFactor <- factor(phenoAvg) + numLevels <- length(levels(phenoFactor)) + if (numLevels<=1) { + cat("SKIP (number of levels: ",numLevels,")",sep="") + .incrementCounter("int.onevalue") + } else if (numLevels==2) { + .incrementCounter("int.binary") + # binary + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor); + binaryLogisticRegression(opt, varName, varType, thisdatanew, isExposure,phenoStartIdx) + } else { + .incrementCounter("int.catord") + cat("3-20 values || ") + # we don't use equal sized bins just the original integers (that have >=10 examples) as categories + thisdatanew <- cbind.data.frame(thisdata[,1:(phenoStartIdx -1)], phenoFactor) + + # treat as ordinal categorical + testCategoricalOrdered(opt, vl, varName, varType, thisdatanew, phenoStartIdx) + } + } + } else { + cat("SKIP Integer type but not numeric",sep="") + } } diff --git a/PHESANT/R/utils.r b/PHESANT/R/utils.r index 563d949..b64c99d 100644 --- a/PHESANT/R/utils.r +++ b/PHESANT/R/utils.r @@ -24,9 +24,17 @@ return(thisdata) } +parseOptions <- function() { + opt_parser <- OptionParser(option_list=option_list) + opt <- parse_args(opt_parser) + opt <- .processArgs(opt, opt_parser) + return(opt) +} + + # Parse the arguments input by the user # if argument 'test' is used then run test phenome scan -processArgs <- function(opt, opt_parser) { +.processArgs <- function(opt, opt_parser) { if (opt$test==TRUE) { # set up the test phenome scan settings datadir='../testWAS/data/'; diff --git a/PHESANT/man/getIsExposure.Rd b/PHESANT/man/getIsExposure.Rd new file mode 100644 index 0000000..b21ca2a --- /dev/null +++ b/PHESANT/man/getIsExposure.Rd @@ -0,0 +1,21 @@ +\name{getIsExposure} +\alias{getIsExposure} +\title{ +Check field exposure +} +\description{ +Check whether a field denotes the trait of interest, as specified. +} +\usage{ +getIsExposure(vl, varName) +} +\arguments{ + \item{vl}{variable list object +} + \item{varName}{ the field ID to be checked +} +} + +\value{ +Return true if the field has a "YES" value in TRAIT_OF_INTEST column. +} diff --git a/PHESANT/man/option_list.Rd b/PHESANT/man/option_list.Rd index 21a696c..83712e6 100644 --- a/PHESANT/man/option_list.Rd +++ b/PHESANT/man/option_list.Rd @@ -4,6 +4,8 @@ \title{ User Input Option List } - +\description{ +This data object defines the options that the program accepts. +} diff --git a/PHESANT/man/parseOptions.Rd b/PHESANT/man/parseOptions.Rd new file mode 100644 index 0000000..9b59ae2 --- /dev/null +++ b/PHESANT/man/parseOptions.Rd @@ -0,0 +1,15 @@ +\name{parseOptions} +\alias{parseOptions} +\title{ +Validate and parse user options +} +\description{ +Validate and parse user options +} +\usage{ +parseOptions() +} + +\examples{ +# opt <- parseOptions() +} diff --git a/PHESANT/man/run.Rd b/PHESANT/man/run.Rd new file mode 100644 index 0000000..ebd1497 --- /dev/null +++ b/PHESANT/man/run.Rd @@ -0,0 +1,23 @@ +\name{run} +\alias{run} +\title{ +Run an PHESANT analysis with user specificed options +} +\description{ +Run an PHESANT analysis with user specificed options +} +\usage{ +run(opt) +} +\arguments{ + \item{opt}{The list of input options provided by user.} +} + +\value{ +No return value. All output will be writen to the user specificed oiutput folder. +} + +\examples{ +# run(opt) +} + diff --git a/PHESANT/man/testAssociations.Rd b/PHESANT/man/testAssociations.Rd new file mode 100644 index 0000000..b16c7fa --- /dev/null +++ b/PHESANT/man/testAssociations.Rd @@ -0,0 +1,27 @@ +\name{testAssociations} +\alias{testAssociations} + +\title{ +Association Test Dispacher +} +\description{ +The main function to identify the right kind of association test to perfrom based on phenotype variable information. +} +\usage{ +testAssociations(opt, vl, currentVar, currentVarShort, thisdata, phenoStartIdx) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{opt}{The list of input options provided by user.} + \item{vl}{The list that holds input phenotype variable list and input data coding list.} + \item{currentVar}{The phenotype/variable to be tested, in the form of FIELDID_INSTANCE. For example 21022_0.} + \item{currentVarShort}{The phenotype/variable to be tested, in the form of FIELDID. For example 21022.} + \item{thisdata}{The data frame object holds all variables including phenotypes, confounders, depeinding variables and trait of interest.} + \item{phenoStartIdx}{The column index of the first phenotype.} +} +\details{ +For a given variable identified by its FIELDID, it reads from variable list to decide if to run association test and chooses the approriate assocition test based on variable value type. Currently, recognized variable types are "Integer", "Continuous", "Categorical single", and "Categorical multiple" and all other types are ignored. +} +\value{ +Return nothing and all output and lof information are written to output directory specified by user. +} diff --git a/PHESANT/man/testCategoricalMultiple.Rd b/PHESANT/man/testCategoricalMultiple.Rd new file mode 100644 index 0000000..acb10f0 --- /dev/null +++ b/PHESANT/man/testCategoricalMultiple.Rd @@ -0,0 +1,34 @@ +\name{testCategoricalMultiple} +\alias{testCategoricalMultiple} +\title{ +Association Test dispacther for "Categorical multiple" value type +} +\description{ +The function takes the declared "Categorical multiple" value type of a field and validates/decides on the actual type of association test and perform the test accordingly. +} +\usage{ +testCategoricalMultiple(opt, vl, varName, varType, thisdata, phenoStartIdx) +} +\arguments{ + \item{opt}{The list of input options provided by user.} + \item{vl}{The list that holds input phenotype variable list and input data coding list.} + \item{varName}{The phenotype/variable to be tested, in the form of FIELDID. For example 21022.} + \item{varType}{The value type of the variable. Should always be "Categorical multiple" here and used for logging purpose only.} + \item{thisdata}{The data frame object holds all variables including phenotypes, confounders, depeinding variables and trait of interest.} + \item{phenoStartIdx}{The column index of the first phenotype.} +} +\details{ +Performs variable processing for categorical (multiple) fields, namely + \enumerate{ + \item Reassign values as specified in data coding file + \item Generate binary variable for each category in field, restricting to correct set of participants as specified + \item Replace missing codes - we assume values < 0 are missing for categorical (single) variables + \item Check derived variable has at least 10 cases in each group + \item Call binaryLogisticRegression function for this derived binary variable + } +} + +\value{ +Return nothing and all output and lof information are written to output directory specified by user. +} + diff --git a/PHESANT/man/testCategoricalSingle.Rd b/PHESANT/man/testCategoricalSingle.Rd new file mode 100644 index 0000000..1148619 --- /dev/null +++ b/PHESANT/man/testCategoricalSingle.Rd @@ -0,0 +1,32 @@ +\name{testCategoricalSingle} +\alias{testCategoricalSingle} +\title{ +Association Test dispacther for "Categorical single" value type +} +\description{ +The function takes the declared "Categorical single" value type of a field and validates/decides on the actual type of association test and perform the test accordingly. +} +\usage{ +testCategoricalSingle(opt, vl, varName, varType, thisdata, phenoStartIdx) +} +\arguments{ + \item{opt}{The list of input options provided by user.} + \item{vl}{The list that holds input phenotype variable list and input data coding list.} + \item{varName}{The phenotype/variable to be tested, in the form of FIELDID. For example 21022.} + \item{varType}{The value type of the variable. Should always be "Categorical single" here and used for logging purpose only.} + \item{thisdata}{The data frame object holds all variables including phenotypes, confounders, depeinding variables and trait of interest.} + \item{phenoStartIdx}{The column index of the first phenotype.} +} +\details{ +Performs variable processing for categorical (single) fields, namely + \enumerate{ + \item Reassign values as specified in data coding information file + \item Reorder categories for ordered fields + \item Replace missing codes - we assume values < 0 are missing for categorical (single) variables + \item Remove values with <10 cases + \item Deterimine correct test to perform, either binary, ordered or unordered + } +} +\value{ +Return nothing and all output and lof information are written to output directory specified by user. +} \ No newline at end of file diff --git a/PHESANT/man/testContinuous.Rd b/PHESANT/man/testContinuous.Rd new file mode 100644 index 0000000..679d7ba --- /dev/null +++ b/PHESANT/man/testContinuous.Rd @@ -0,0 +1,34 @@ +\name{testContinuous} +\alias{testContinuous} +\title{ +Association Test dispacther for Continuous value type +} +\description{ +The function takes the declared "Continuous" value type of a field and validates/decides on the actual type of association test and perform the test accordingly. +} +\usage{ +testContinuous(opt, vl, varName, varType, thisdata, phenoStartIdx) +} +\arguments{ + \item{opt}{The list of input options provided by user.} + \item{vl}{The list that holds input phenotype variable list and input data coding list.} + \item{varName}{The phenotype/variable to be tested, in the form of FIELDID. For example 21022.} + \item{varType}{The value type of the variable. Should always be "Continuous" here and used for logging purpose only.} + \item{thisdata}{The data frame object holds all variables including phenotypes, confounders, depeinding variables and trait of interest.} + \item{phenoStartIdx}{The column index of the first phenotype.} +} +\details{ + Processing integer fields, namely: + \enumerate{ + \item Reassigning values as specified in the data code information file + \item Generate a single value if there are several values (arrays) by taking the mean + \item Treating this field as ordinal categorical if >20\% examples with same value + \item Otherwise Treat as continuous (with Rank-based Inverse Normal Transformation) if 500 or more samples have values + \item In the case of 3), treat it as binary if only two types of values, or ordinal categorical if its values can be binned approximately into three equl bins. + \item In the case of 5), if more than 3 type of values and can not be binned reasonably into 3 euqal bins, merge them into two bins ans treat it as binary, or give up. + } +} +\value{ +Return nothing and all output and lof information are written to output directory specified by user. +} + diff --git a/PHESANT/man/testInteger.Rd b/PHESANT/man/testInteger.Rd new file mode 100644 index 0000000..c2c85ee --- /dev/null +++ b/PHESANT/man/testInteger.Rd @@ -0,0 +1,32 @@ +\name{testInteger} +\alias{testInteger} +\title{ +Association Test dispacther for Integer value type +} +\description{ +The function takes the declared "Integer" value type of a field and validates/decides on the actual type of association test and perform the test accordingly. +} +\usage{ +testInteger(opt, vl, varName, varType, thisdata, phenoStartIdx) +} +\arguments{ + \item{opt}{The list of input options provided by user.} + \item{vl}{The list that holds input phenotype variable list and input data coding list.} + \item{varName}{The phenotype/variable to be tested, in the form of FIELDID. For example 21022.} + \item{varType}{The value type of the variable. Should always be "INTEGER" here and used for logging purpose only.} + \item{thisdata}{The data frame object holds all variables including phenotypes, confounders, depeinding variables and trait of interest.} + \item{phenoStartIdx}{The column index of the first phenotype.} +} +\details{ +Processing integer fields, namely: + \enumerate{ + \item Reassigning values as specified in the data code information file + \item Generate a single value if there are several values (arrays) by taking the mean + \item Treating this field as continuous if at least 20 distinct values + \item Otherwise treat as binary or ordered categorical if 2 or more than two values + } +} +\value{ +Return nothing and all output and lof information are written to output directory specified by user. +} + diff --git a/PHESANT/phenomeScan.r b/PHESANT/phenomeScan.r index 0803a90..be5f7c6 100644 --- a/PHESANT/phenomeScan.r +++ b/PHESANT/phenomeScan.r @@ -27,9 +27,7 @@ args <- commandArgs(T) if (length(args) == 0) { load(file = "opt_only.RData") } else { - opt_parser <- OptionParser(option_list=option_list) - opt <- parse_args(opt_parser) - opt <- processArgs(opt, opt_parser) + opt <- parseOptions() } run(opt) From 66b719af092d1e4d18e5fa6a6ca92d89645e86c0 Mon Sep 17 00:00:00 2001 From: Quanli Wang Date: Sat, 16 Mar 2019 13:35:30 -0400 Subject: [PATCH 27/27] Restore original code structure --- PHESANT/R/testCategoricalOrdered.r | 156 +++++------ WAS/addToCounts.r | 33 +++ WAS/binaryLogisticRegression.r | 121 +++++++++ WAS/equalSizedBins.r | 118 +++++++++ WAS/fixOddFieldsToCatMul.r | 54 ++++ WAS/getIsCatMultExposure.r | 61 +++++ WAS/getIsExposure.r | 31 +++ WAS/getNumValuesCatMultExposure.r | 53 ++++ WAS/incrementCounter.r | 34 +++ WAS/initFunctions.r | 99 +++++++ WAS/loadConfounders.r | 136 ++++++++++ WAS/loadData.r | 85 ++++++ WAS/loadIndicatorFields.r | 162 ++++++++++++ WAS/loadPhenotypes.r | 127 +++++++++ WAS/loadTraitOfInterest.r | 54 ++++ WAS/makeTestDataFrame.r | 30 +++ WAS/opt.RData | Bin 0 -> 4853 bytes WAS/phenomeScan.r | 161 +++++++++++ WAS/processArgs.r | 132 +++++++++ WAS/reassignValue.r | 73 +++++ WAS/replaceMissingCodes.r | 37 +++ WAS/replaceNaN.r | 47 ++++ WAS/saveCounts.r | 31 +++ WAS/storeNewVar.r | 21 ++ WAS/testAssociations.r | 141 ++++++++++ WAS/testCatMultiple.r | 166 ++++++++++++ WAS/testCatSingle.r | 193 ++++++++++++++ WAS/testCategoricalOrdered.r | 163 ++++++++++++ WAS/testCategoricalUnordered.r | 177 +++++++++++++ WAS/testContinuous.r | 250 ++++++++++++++++++ WAS/testInteger.r | 88 ++++++ WAS/testNumExamples.r | 37 +++ {unittests => WAS/unittests}/run-tests.sh | 0 .../unittests}/test_equalSizedBins.r | 0 .../unittests}/test_reassignValue.r | 0 .../unittests}/test_testCatMultiple.r | 0 .../unittests}/test_testCatSingle.r | 0 WAS/validatePhenotypeInput.r | 85 ++++++ WAS/validateTraitInput.r | 69 +++++ 39 files changed, 3139 insertions(+), 86 deletions(-) create mode 100644 WAS/addToCounts.r create mode 100644 WAS/binaryLogisticRegression.r create mode 100644 WAS/equalSizedBins.r create mode 100644 WAS/fixOddFieldsToCatMul.r create mode 100644 WAS/getIsCatMultExposure.r create mode 100644 WAS/getIsExposure.r create mode 100644 WAS/getNumValuesCatMultExposure.r create mode 100644 WAS/incrementCounter.r create mode 100644 WAS/initFunctions.r create mode 100644 WAS/loadConfounders.r create mode 100644 WAS/loadData.r create mode 100644 WAS/loadIndicatorFields.r create mode 100644 WAS/loadPhenotypes.r create mode 100644 WAS/loadTraitOfInterest.r create mode 100644 WAS/makeTestDataFrame.r create mode 100644 WAS/opt.RData create mode 100644 WAS/phenomeScan.r create mode 100644 WAS/processArgs.r create mode 100644 WAS/reassignValue.r create mode 100644 WAS/replaceMissingCodes.r create mode 100644 WAS/replaceNaN.r create mode 100644 WAS/saveCounts.r create mode 100644 WAS/storeNewVar.r create mode 100644 WAS/testAssociations.r create mode 100644 WAS/testCatMultiple.r create mode 100644 WAS/testCatSingle.r create mode 100644 WAS/testCategoricalOrdered.r create mode 100644 WAS/testCategoricalUnordered.r create mode 100644 WAS/testContinuous.r create mode 100644 WAS/testInteger.r create mode 100644 WAS/testNumExamples.r rename {unittests => WAS/unittests}/run-tests.sh (100%) rename {unittests => WAS/unittests}/test_equalSizedBins.r (100%) rename {unittests => WAS/unittests}/test_reassignValue.r (100%) rename {unittests => WAS/unittests}/test_testCatMultiple.r (100%) rename {unittests => WAS/unittests}/test_testCatSingle.r (100%) create mode 100644 WAS/validatePhenotypeInput.r create mode 100644 WAS/validateTraitInput.r diff --git a/PHESANT/R/testCategoricalOrdered.r b/PHESANT/R/testCategoricalOrdered.r index 7089ff7..5cffe59 100644 --- a/PHESANT/R/testCategoricalOrdered.r +++ b/PHESANT/R/testCategoricalOrdered.r @@ -19,96 +19,80 @@ # Performs ordered logistic regression test and saves results in ordered logistic results file testCategoricalOrdered <- function(opt, vl, varName, varType, thisdata, phenoStartIdx, orderStr="") { - - - pheno = thisdata[,phenoStartIdx:ncol(thisdata)] - geno = thisdata[,"geno"] - - cat("CAT-ORD || "); - .incrementCounter("ordCat") - .doCatOrdAssertions(pheno) - - uniqVar = unique(na.omit(pheno)); - - # log the ordering of categories used - orderStr = .setOrderString(orderStr, uniqVar); - cat("order: ", orderStr, " || ", sep=""); - - # check sample size - numNotNA = length(which(!is.na(pheno))) - if (numNotNA<500) { - cat("CATORD-SKIP-500 (", numNotNA, ") || ",sep=""); - .incrementCounter("ordCat.500") - } - else { - # test this cat ordered variable with ordered logistic regression - - phenoFactor = factor(pheno) - - cat("num categories: ", length(unique(na.omit(phenoFactor))), " || ", sep=""); - - if (opt$save == TRUE) { - # add pheno to dataframe - .storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catOrd') - cat("SUCCESS results-ordered-logistic"); - .incrementCounter("success.ordCat") - } - else { - - # ordinal logistic regression - sink() - sink(pkg.env$modelFitLogFile, append=TRUE) - print("--------------") - print(varName) - - ### BEGIN TRYCATCH - tryCatch({ - confounders=thisdata[,3:(phenoStartIdx -1), drop = FALSE] - - - if (opt$standardise==TRUE) { - geno = scale(geno) - } - - fit <- polr(phenoFactor ~ geno + ., data=confounders, Hess=TRUE) - - ctable <- coef(summary(fit)) - sink() - sink(pkg.env$resLogFile, append=TRUE) - - ct = coeftest(fit) - pvalue = ct["geno","Pr(>|t|)"] - beta = ctable["geno", "Value"]; - - if (opt$confidenceintervals == TRUE) { - se = ctable["geno", "Std. Error"] - lower = beta - 1.96*se; - upper = beta + 1.96*se; + pheno <- thisdata[,phenoStartIdx:ncol(thisdata)] + geno <- thisdata[,"geno"] + + cat("CAT-ORD || ") + .incrementCounter("ordCat") + .doCatOrdAssertions(pheno) + uniqVar <- unique(na.omit(pheno)) + + # log the ordering of categories used + orderStr <-.setOrderString(orderStr, uniqVar) + cat("order: ", orderStr, " || ", sep="") + + # check sample size + numNotNA <- length(which(!is.na(pheno))) + if (numNotNA<500) { + cat("CATORD-SKIP-500 (", numNotNA, ") || ",sep="") + .incrementCounter("ordCat.500") + } else { + # test this cat ordered variable with ordered logistic regression + phenoFactor <- factor(pheno) + cat("num categories: ", length(unique(na.omit(phenoFactor))), " || ", sep=""); + if (opt$save == TRUE) { + # add pheno to dataframe + .storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catOrd') + cat("SUCCESS results-ordered-logistic") + .incrementCounter("success.ordCat") + } else { + # ordinal logistic regression + sink() + sink(pkg.env$modelFitLogFile, append=TRUE) + print("--------------") + print(varName) + + ### BEGIN TRYCATCH + tryCatch({ + confounders=thisdata[,3:(phenoStartIdx -1), drop = FALSE] + if (opt$standardise==TRUE) { + geno = scale(geno) } - else { - lower = NA - upper = NA + fit <- polr(phenoFactor ~ geno + ., data=confounders, Hess=TRUE) + ctable <- coef(summary(fit)) + sink() + sink(pkg.env$resLogFile, append=TRUE) + + ct <- coeftest(fit) + pvalue <- ct["geno","Pr(>|t|)"] + beta <- ctable["geno", "Value"] + + if (opt$confidenceintervals == TRUE) { + se <- ctable["geno", "Std. Error"] + lower <- beta - 1.96*se + upper <- beta + 1.96*se + } else { + lower <- NA + upper <- NA } - - write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-ordered-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE"); - cat("SUCCESS results-ordered-logistic"); - .incrementCounter("success.ordCat") - - isExposure = getIsExposure(vl, varName) + + write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-ordered-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE") + cat("SUCCESS results-ordered-logistic") + .incrementCounter("success.ordCat") + + isExposure <- getIsExposure(vl, varName) if (isExposure == TRUE) { - .incrementCounter("success.exposure.ordCat") + .incrementCounter("success.exposure.ordCat") } - - ### END TRYCATCH - }, error = function(e) { - sink() - sink(pkg.env$resLogFile, append=TRUE) - cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) - .incrementCounter("ordCat.error") - }) - } - - } + ### END TRYCATCH + }, error = function(e) { + sink() + sink(pkg.env$resLogFile, append=TRUE) + cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) + .incrementCounter("ordCat.error") + }) + } + } } # check that the phenotype is valid - that there are more than two categories diff --git a/WAS/addToCounts.r b/WAS/addToCounts.r new file mode 100644 index 0000000..e41420c --- /dev/null +++ b/WAS/addToCounts.r @@ -0,0 +1,33 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# adds given value to counter, that are used to count how many variables go down each route in the data flow +addToCounts <- function(countName, num) { + + idx = which(counters$name==countName) + + if (length(idx)==0) { + # counter does not exist so add with countValue 1 + counters <<- rbind(counters, data.frame(name=countName, countValue=num)) + } + else { + # add to counter that already exists + counters$countValue[idx] <<- counters$countValue[idx]+num + } + +} diff --git a/WAS/binaryLogisticRegression.r b/WAS/binaryLogisticRegression.r new file mode 100644 index 0000000..6fbcefd --- /dev/null +++ b/WAS/binaryLogisticRegression.r @@ -0,0 +1,121 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Perform binary logistic regression +# +# Performs binary logistic regression on the phenotype stored in thisdata +# and stores result in 'results-logistic-binary' results file. +binaryLogisticRegression <- function(varName, varType, thisdata, isExposure) { + + phenoFactor = factor(thisdata[,phenoStartIdx]) + + facLevels = levels(phenoFactor) + + # assert variable has exactly two distinct values + if (length(facLevels)!=2) { + #stop(paste("Not 2 levels: ", length(facLevels), " || ", sep="")) + cat("BINARY-NOT2LEVELS- (", length(facLevels), ") || ",sep=""); + incrementCounter("binary.nottwolevels") + } + + idxTrue = length(which(phenoFactor==facLevels[1])) + idxFalse = length(which(phenoFactor==facLevels[2])) + numNotNA = length(which(!is.na(phenoFactor))) + + if (idxTrue<10 || idxFalse<10) { + cat("BINARY-LOGISTIC-SKIP-10 (", idxTrue, "/", idxFalse, ") || ", sep="") + incrementCounter("binary.10") + } + else if (numNotNA<500) { + cat("BINARY-LOGISTIC-SKIP-500 (", numNotNA, ") || ",sep=""); + incrementCounter("binary.500") + } + else { + + cat("sample ", idxTrue, "/", idxFalse, "(", numNotNA, ") || ", sep=""); + + if (opt$save == TRUE) { + # add pheno to dataframe + storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'bin') + cat("SUCCESS results-logistic-binary "); + incrementCounter("success.binary") + } + else { + + # use standardised geno values + if (opt$standardise==TRUE) { + geno = scale(thisdata[,"geno"]) + } + else { + geno = thisdata[,"geno"] + } + confounders=thisdata[,3:numPreceedingCols, drop = FALSE] + + sink() + sink(modelFitLogFile, append=TRUE) + print("--------------") + print(varName) + + ###### BEGIN TRYCATCH + tryCatch({ + + mylogit <- glm(phenoFactor ~ geno + ., data=confounders, family="binomial") + + sink() + sink(resLogFile, append=TRUE) + + sumx = summary(mylogit) + + pvalue = sumx$coefficients['geno','Pr(>|z|)'] + beta = sumx$coefficients["geno","Estimate"] + + + if (opt$confidenceintervals == TRUE) { + cis = confint(mylogit, "geno", level=0.95) + lower = cis["2.5 %"] + upper = cis["97.5 %"] + } + else { + lower = NA + upper = NA + } + + numNotNA = length(na.omit(phenoFactor)) + + ## save result to file + write(paste(varName,varType,paste(idxTrue,"/",idxFalse,"(",numNotNA,")",sep=""), beta,lower,upper,pvalue, sep=","), file=paste(opt$resDir,"results-logistic-binary-",opt$varTypeArg,".txt",sep=""), append="TRUE"); + cat("SUCCESS results-logistic-binary "); + + incrementCounter("success.binary") + + if (isExposure==TRUE) { + incrementCounter("success.exposure.binary") + } + + ## END TRYCATCH + }, error = function(e) { + sink() + sink(resLogFile, append=TRUE) + cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) + incrementCounter("binary.error") + }) + } + } +} + diff --git a/WAS/equalSizedBins.r b/WAS/equalSizedBins.r new file mode 100644 index 0000000..15570cd --- /dev/null +++ b/WAS/equalSizedBins.r @@ -0,0 +1,118 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# splits the pheno into 3 bins with the cut points between values rather at the exact value for the quantile +equalSizedBins <- function(phenoAvg) { + + ## equal sized bins + q = quantile(phenoAvg, probs=c(1/3,2/3), na.rm=TRUE) + + minX = min(phenoAvg, na.rm=TRUE) + maxX = max(phenoAvg, na.rm=TRUE) + + phenoBinned = phenoAvg; + if (q[1]==minX) { + # edge case - quantile value is lowest value + + # assign min value as cat1 + idx1 = which(phenoAvg==q[1]); + phenoBinned[idx1] = 0; + + # divide remaining values into cat2 and cat3 + phenoAvgRemaining = phenoAvg[which(phenoAvg!=q[1])]; + qx = quantile(phenoAvgRemaining, probs=c(0.5), na.rm=TRUE) + minXX = min(phenoAvgRemaining, na.rm=TRUE) + maxXX = max(phenoAvgRemaining, na.rm=TRUE) + + if (qx[1]==minXX) { + # edge case again - quantile value is lowest value + idx2 = which(phenoAvg==qx[1]); + idx3 = which(phenoAvg>qx[1]); + } + else if (qx[1]==maxXX) { + # edge case again - quantile value is max value + idx2 = which(phenoAvgq[1]); + idx3 = which(phenoAvg==qx[1]); + } + else { + idx2 = which(phenoAvgq[1]); + idx3 = which(phenoAvg>=qx[1]); + } + phenoBinned[idx2] = 1; + phenoBinned[idx3] = 2; + } + else if (q[2]==maxX) { + # edge case - quantile value is highest value + + # assign max value as cat3 + idx3 = which(phenoAvg==q[2]); + phenoBinned[idx3] = 2; + + # divide remaining values into cat1 and cat2 + phenoAvgRemaining = phenoAvg[which(phenoAvg!=q[2])]; + qx = quantile(phenoAvgRemaining, probs=c(0.5), na.rm=TRUE) + minXX = min(phenoAvgRemaining, na.rm=TRUE) + maxXX = max(phenoAvgRemaining, na.rm=TRUE) + + if (qx[1]==minXX) { + # edge case again - quantile value is lowest value + idx1 = which(phenoAvg==qx[1]); + idx2 = which(phenoAvg>qx[1] & phenoAvg=qx[1] & phenoAvg this value + phenoBinned = phenoAvg; + idx1 = which(phenoAvgq[2]); + phenoBinned[idx1] = 0; + phenoBinned[idx2] = 1; + phenoBinned[idx3] = 2; + } + else { + # standard case - split the data into three roughly equal parts where + # cat1=q2 + phenoBinned = phenoAvg; + idx1 = which(phenoAvg=q[1] & phenoAvg=q[2]); + phenoBinned[idx1] = 0; + phenoBinned[idx2] = 1; + phenoBinned[idx3] = 2; + } + + cat("cat N: ", length(idx1),", ",length(idx2),", ",length(idx3), " || ", sep=""); + + return(phenoBinned); +} + diff --git a/WAS/fixOddFieldsToCatMul.r b/WAS/fixOddFieldsToCatMul.r new file mode 100644 index 0000000..84eff53 --- /dev/null +++ b/WAS/fixOddFieldsToCatMul.r @@ -0,0 +1,54 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Changes variable name from instances to arrays +# +# This function changes the format of variable names from instance format to array format (i.e. we treat +# the instances as arrays), for a small subset of Biobank variables. +# +# Some variables are stored in Biobank as categorical (single) fields with the data stored as set of instances, +# but we want to treat these instead as categorical (multiple) with a set of arrays. +# These are indicated by the value "YES-INSTANCES" in the CAT_SINGLE_TO_CAT_MULT column of the variable info file. +# This function changes the format of these variable names from VARID_0_0, VARID_1_0, VARID_2_0 etc (which +# is instance format), to VARID_0_0, VARID_0_1, VARID_0_2 etc. (array format) so they can be treated as categorical (multiple) fields. +fixOddFieldsToCatMul <- function(data) { + + # examples are variables: 40006, 40011, 40012, 40013 + + # get all variables that need their instances changing to arrays + dataPheno = vl$phenoInfo[which(vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT=="YES-INSTANCES"),]; + + for (i in 1:nrow(dataPheno)) { + varID = dataPheno[i,]$FieldID; + varidString = paste("x",varID,"_", sep=""); + + # get all columns in data dataframe for this variable + colIdxs = which(grepl(varidString,names(data))); + + # change format from xvarid_0_0, xvarid_1_0, xvarid_2_0, to xvarid_0_0, xvarid_0_1, xvarid_0_2 + count = 0; + for (j in colIdxs) { + colnames(data)[j] <- paste(varidString, "0_", count, sep="") + count = count + 1; + } + } + + return(data) + +} diff --git a/WAS/getIsCatMultExposure.r b/WAS/getIsCatMultExposure.r new file mode 100644 index 0000000..560263f --- /dev/null +++ b/WAS/getIsCatMultExposure.r @@ -0,0 +1,61 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# looks up categorical multiple field in the variable info file, return +# whether field has YES in TRAIT_OF_INTEREST column (i.e. all values in +# this field denote the exposure), or whether varName has varValue stated +# as a trait of interest in the TRAIT_OF_INTEREST column (multiple values are +# separated by "|" in this field +getIsCatMultExposure <- function(varName, varValue) { + + # get row index of field in variable information file + idx=which(vl$phenoInfo$FieldID==varName) + + # may be empty of may contain VALUE1|VALUE2 etc .. to denote those + # cat mult values denoting exposure variable + isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] + + if (!is.na(isExposure) & isExposure!="") { + + isExposure = as.character(isExposure) + + ## first check if value is YES, then all values are exposure traits + if (isExposure == "YES") { + cat("IS_CM_ALL_EXPOSURE || ") + return(TRUE) + } + + ## try to split by |, to set particular values as exposure + + # split into variable Values + exposureValues = unlist(strsplit(isExposure,"\\|")) + + # for each value stated, check whether it is varValue + for (thisVal in exposureValues) { + if (thisVal == varValue) { + cat("IS_CM_EXPOSURE || ") + return(TRUE) + } + } + } + + # varValue is not in list of exposure values + return(FALSE) + +} diff --git a/WAS/getIsExposure.r b/WAS/getIsExposure.r new file mode 100644 index 0000000..c8df69e --- /dev/null +++ b/WAS/getIsExposure.r @@ -0,0 +1,31 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# returns boolean var - whether this field denotes the trait of interest, as specified +# in the variable information file +# to determine if values of cat mult fields (not the whole field) are exposure values, use getIsCatMultExposure function instead. +getIsExposure <- function(varName) { + + idx=which(vl$phenoInfo$FieldID==varName) + isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] + if (!is.na(isExposure) & isExposure=="YES") { + return(TRUE) + } + return(FALSE) +} diff --git a/WAS/getNumValuesCatMultExposure.r b/WAS/getNumValuesCatMultExposure.r new file mode 100644 index 0000000..4cae9a9 --- /dev/null +++ b/WAS/getNumValuesCatMultExposure.r @@ -0,0 +1,53 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# looks up categorical multiple field in the variable info file, return +# number of values denoted as trait of interest. +# returns zero if whole field is denoted trait of interest, not particular values. +getNumValuesCatMultExposure <- function(varName) { + + # get row index of field in variable information file + idx=which(vl$phenoInfo$FieldID==varName) + + # may be empty of may contain VALUE1|VALUE2 etc .. to denote those + # cat mult values denoting exposure variable + isExposure = vl$phenoInfo$TRAIT_OF_INTEREST[idx] + + if (!is.na(isExposure) & isExposure!="") { + + isExposure = as.character(isExposure) + + ## first check if value is YES, then no partic values are traits of interest + if (isExposure == "YES") { + return(0) + } + + ## try to split by |, to set particular values as exposure + + # split into variable Values + exposureValues = unlist(strsplit(isExposure,"\\|")) + + return(length(exposureValues)) + + } + + # varValue is not in list of exposure values + return(0) + +} diff --git a/WAS/incrementCounter.r b/WAS/incrementCounter.r new file mode 100644 index 0000000..c430a24 --- /dev/null +++ b/WAS/incrementCounter.r @@ -0,0 +1,34 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# increments counters used to count how many variables go down each route in the data flow +incrementCounter <- function(countName) { + + idx = which(counters$name==countName) + + if (length(idx)==0) { + # counter does not exist so add with countValue 1 + counters <<- rbind(counters, data.frame(name=countName, countValue=1)) + } + else { + # increment counter that already exists + counters$countValue[idx] <<- counters$countValue[idx]+1 + } + +} diff --git a/WAS/initFunctions.r b/WAS/initFunctions.r new file mode 100644 index 0000000..e412c6e --- /dev/null +++ b/WAS/initFunctions.r @@ -0,0 +1,99 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# load the required r files +loadSource <- function() { + source("loadData.r") + source("reassignValue.r") + source("validatePhenotypeInput.r") + source("validateTraitInput.r") + source("testNumExamples.r") + source("binaryLogisticRegression.r") + source("equalSizedBins.r") + source("fixOddFieldsToCatMul.r") + source("replaceMissingCodes.r") + source("replaceNaN.r") + source("testAssociations.r") + source("testCatMultiple.r") + source("testCatSingle.r") + source("testContinuous.r") + source("testInteger.r") + source("testCategoricalOrdered.r") + source("testCategoricalUnordered.r") + source("saveCounts.r") + source("incrementCounter.r") + source("getIsCatMultExposure.r") + source("getIsExposure.r") + source("addToCounts.r") + source("getNumValuesCatMultExposure.r") + source("storeNewVar.r") + source("loadPhenotypes.r") + source("loadTraitOfInterest.r") + source("loadConfounders.r") + source("makeTestDataFrame.r") + source("loadIndicatorFields.r") +} + +# init the counters used to determine how many variables took each path in the variable processing flow. +initCounters <- function() { + counters = data.frame(name=character(),countValue=integer(), stringsAsFactors=FALSE) + return(counters); +} + +# create new results files and headers +initResultsFiles <- function() { + + ## only linear and continuous fields can create linear results + file.create(paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt",sep="")); + write("varName,varType,n,beta,lower,upper,pvalue", file=paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt",sep=""), append="TRUE"); + + ## all field types can create binary results + file.create(paste(opt$resDir,"results-logistic-binary-",opt$varTypeArg,".txt",sep="")); + write("varName,varType,n,beta,lower,upper,pvalue", file=paste(opt$resDir,"results-logistic-binary-",opt$varTypeArg,".txt",sep=""), append="TRUE"); + + ## only categorical multiple cannot generate order categorical results + file.create(paste(opt$resDir,"results-ordered-logistic-",opt$varTypeArg,".txt",sep="")); + write("varName,varType,n,beta,lower,upper,pvalue", file=paste(opt$resDir,"results-ordered-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE"); + + ## only categorical single fields can generate unordered categorical results + file.create(paste(opt$resDir,"results-multinomial-logistic-",opt$varTypeArg,".txt",sep="")); + write("varName,varType,n,beta,lower,upper,pvalue", file=paste(opt$resDir,"results-multinomial-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE"); + +} + +# load the variable information and data code information files +initVariableLists <- function() { + + phenoInfo=read.table(opt$variablelistfile,sep="\t",header=1,comment.char="",quote=""); + + dataCodeInfo=read.table(opt$datacodingfile,sep=",", header=1); + + vars=list(phenoInfo=phenoInfo, dataCodeInfo=dataCodeInfo); + return(vars); +} + + + + + + + + + + + diff --git a/WAS/loadConfounders.r b/WAS/loadConfounders.r new file mode 100644 index 0000000..357dbab --- /dev/null +++ b/WAS/loadConfounders.r @@ -0,0 +1,136 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +## +## loads confounder variables from phenotype file + +loadConfounders <- function(phenotypes) { + +if (opt$save==TRUE) { + + # saving not running tests so we add a fake confounder + numRows = nrow(phenotypes) + data = cbind.data.frame(phenotypes$userID, rep(-1, numRows)) + colnames(data)[1] <- "userID" + colnames(data)[2] <- "conf1" + return(data) + +} else { + + if (!is.null(opt$confounderfile)) { + print("Loading confounders from confounder file ...") + + confs = fread(opt$confounderfile, sep=',', header=TRUE, data.table=FALSE) + confs = lapply(confs,function(x) type.convert(as.character(x))) + confs = as.data.frame(confs) + + ## find userID column and change name to userID + idx = which(colnames(confs) == opt$userId) + confs = confs[,c(opt$userID,setdiff(colnames(confs),opt$userID))] + colnames(confs)[1] <- "userID" + + } else { + print("Loading confounders from phenotypes file ...") + confNames = getConfounderNames() + + ##### + ##### extract confounders from data file + confs = fread(opt$phenofile, select=confNames, sep=',', header=TRUE, data.table=FALSE) + confs = lapply(confs,function(x) type.convert(as.character(x))) + confs = as.data.frame(confs) + + ##### + ##### process genetic batch to create genetic chip variable + if (opt$genetic == TRUE) { + genoBatch = confs[,"x22000_0_0"] + + # chip comes from batch field 22000 + genoChip = rep.int(NA,nrow(confs)); + idxForVar = which(genoBatch<0); + genoChip[idxForVar] = 0; + idxForVar = which(genoBatch>=0 & genoBatch<2000); + genoChip[idxForVar] = 1; + + # remove geno batch from and add geno chip to confounders + confs = confs[,-which(names(confs) == "x22000_0_0")] + confs = cbind.data.frame(confs, genoChip) + } + + + ##### + ##### Convert assessment centre to an indicator variable + if (opt$sensitivity==TRUE) { + confs$x54_0_0 = as.factor(confs$x54_0_0) + assCentre = model.matrix(~confs$x54_0_0) + assCentre = assCentre[,2:ncol(assCentre)] + confs = cbind(confs, assCentre) + confs$x54_0_0 = NULL + } + + colnames(confs)[1] <- "userID" + + } + + # remove any rows with no values + print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) + confsComp = complete.cases(confs) + print(paste("Number of INCOMPLETE rows removed from confounder data: ", length(which(confsComp==FALSE)),sep="")) + confs = confs[confsComp==TRUE,] + print(paste("Number of rows in confounder data: ", nrow(confs),sep="")) + + print("Confounder columns:") + print(names(confs)) + + return(confs) +} +} + +getConfounderNames <- function() { + + ##### + ##### first get vector of confounder names + + # age and sex + confNames = c(opt$userId, "x21022_0_0", "x31_0_0") + + # if genetic trait of interest then adjust for genotype chip + # and also let user choose sensitivity analysis that also adjusts for top 10 genetic principal components and assessment centre + if (opt$genetic == TRUE) { + + confNames = append(confNames, "x22000_0_0") + + if (opt$sensitivity==TRUE) { + confNames = append(confNames, c("x22009_0_1", "x22009_0_2", "x22009_0_3", "x22009_0_4", "x22009_0_5", "x22009_0_6", "x22009_0_7", "x22009_0_8", "x22009_0_9", "x22009_0_10", "x54_0_0")) + print("Adjusting for age, sex, genotype chip, top 10 genetic principal components and assessment centre") + } else { + print("Adjusting for age, sex and genotype chip") + } + } else { + # non genetic trait of interest, then sensitivity adjusts for assessment center + if (opt$sensitivity==TRUE) { + confNames = append(confNames, "x54_0_0") + print("Adjusting for age, sex and assessment centre") + } else { + print("Adjusting for age and sex") + } + } + + return(confNames) +} + + diff --git a/WAS/loadData.r b/WAS/loadData.r new file mode 100644 index 0000000..127ea39 --- /dev/null +++ b/WAS/loadData.r @@ -0,0 +1,85 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# loads phenotype and trait of interest data files +# creates phenotype / trait of interest data frame +# creates confounder data frame +# returns an object holding these two data frames +loadData <- function() { + + library(data.table) + + ##### + ##### validating data + + ## check phenotype file headers + validatePhenotypeInput() + + ## check trait of interest file headers + validateTraitInput() + + ##### + ##### load data + + ## load phenotype + print("Loading phenotypes ...") + phenotype = loadPhenotypes() + + ## load trait of interest + toi <- loadTraitOfInterest(phenotype) + + ## load confounders + conf <- loadConfounders(phenotype) + + ## add trait of interest to phenotype data frame and remove rows with no trait of interest + ## merge in toi with phenotype - keep id list from phenotypes file + phenotype = merge(toi, phenotype, by="userID", all.y=TRUE, all.x=FALSE) + + ## remove any rows with no trait of interest + idxNotEmpty = which(!is.na(phenotype[,"geno"])) + + if (opt$save == TRUE) { + print(paste("Phenotype file has ", nrow(phenotype), " rows.", sep="")) + } else { + print(paste("Phenotype file has ", nrow(phenotype), " rows with ", length(idxNotEmpty), " not NA for trait of interest (",opt$traitofinterest,").", sep="")) + } + + phenotype = phenotype[idxNotEmpty,] + + # match ids from not empty phenotypes list + confsIdx = which(conf$userID %in% phenotype$userID) + conf = conf[confsIdx,] + + if (nrow(phenotype)==0) { + stop("No examples with row in both trait of interest and phenotype files", call.=FALSE) + } else { + print(paste("Phenotype and trait of interest data files merged, with", nrow(phenotype),"examples")) + } + + # some fields are fixed that have a field type as cat single but we want to treat them like cat mult + phenotype = fixOddFieldsToCatMul(phenotype) + + indFields = loadIndicatorFields(colnames(phenotype)) + + d = list(datax=phenotype, confounders=conf, inds=indFields) + return(d) + +} + + diff --git a/WAS/loadIndicatorFields.r b/WAS/loadIndicatorFields.r new file mode 100644 index 0000000..31c94e0 --- /dev/null +++ b/WAS/loadIndicatorFields.r @@ -0,0 +1,162 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +## +## load data used for data code default value related field, and categorical multiple indicator field +loadIndicatorFields <- function(phenosToTest) { + + print("Loading indicator fields from phenotypes file ...") + + # read pheno file column names + phenoVarsAll = colnames(read.table(opt$phenofile, header=1, nrows=1, sep=',')) + phenoVarsAll = phenoVarsAll[which(phenoVarsAll!=opt$userId)] + + indVars = c(opt$userId) + + ## add indicator variables to pheno data + indVars = addIndicatorVariables(indVars, phenosToTest, phenoVarsAll) + + if (length(indVars)>1) { + # not just user id column + print("Loading required related variable(s):") + print(indVars[2:length(indVars)]) + } + else { + print("No required related variables.") + } + + ## read in the right table columns + data = fread(opt$phenofile, select=indVars, sep=',', header=TRUE, data.table=FALSE) + data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) + colnames(data)[1] <- "userID" + return(data) +} + + +addIndicatorVariables <- function(indVars, phenosToTest, phenoVarsAll) { + + ##### + ##### default value related fields for data codes + # get list of all indicator variables from outcome info file + + # get datacodes with an indicator variable + dataCodeIdx = which(!is.na(vl$dataCodeInfo$default_related_field) & vl$dataCodeInfo$default_related_field!="") + + # whether there are any related fields that are not in the phenotype data file when they should be + hasIssue=FALSE + + if (length(dataCodeIdx)>0) { + dataCodeWithRF = vl$dataCodeInfo[dataCodeIdx,] + + defaultFields = c() + + # check there is a field in the phenotypes data for each data code and if so then add this data codes related field to the phenosToTest list + if (nrow(dataCodeWithRF)>0) { + for (i in 1:nrow(dataCodeWithRF)) { + dc = dataCodeWithRF$dataCode[i] + + # get all fields with this datacode + fieldsIdx = which(vl$phenoInfo$DATA_CODING == dc) + fieldIDs = vl$phenoInfo$FieldID[fieldsIdx] + fieldIDs = paste("x",fieldIDs,"_0_0", sep="") + + # datacode related field + rf = dataCodeWithRF$default_related_field[i] + rf = paste("x",rf,"_0_0", sep="") + + # if one of these field IDs are in phenotypeColumns then data code related field is needed + if (length(intersect(fieldIDs, phenosToTest))>0) { + defaultFields = append(defaultFields, rf) + } + } + } + + defaultFields = unique(defaultFields) + indVars = append(indVars, defaultFields) + + ##### + ##### check these required variables exist in phenotype file + if(length(defaultFields)>0) { + for (i in 1:length(defaultFields)) { + if (!(defaultFields[i] %in% phenoVarsAll)) { + print(paste("Required variable: Field ",defaultFields[i],"is a data code related field (default_related_field column in data code information file) but was not found in phenotype data")) + hasIssue=TRUE + } + } + } + } + + ##### + ##### categorical multiple indicator fields + + # get field info, for fields with cat mult indicator fields + fieldsIdx = which(!is.na(vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS)) + + if (length(fieldsIdx)>0) { + fieldsWithCMIF = vl$phenoInfo[fieldsIdx,] + fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "NO_NAN"),] + fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == "ALL"),] + fieldsWithCMIF = fieldsWithCMIF[-which(fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS == ""),] + + if (nrow(fieldsWithCMIF)>0) { + + # turn into variable format not field ID + fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS = paste("x",fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS,"_0_0", sep="") + fieldsWithCMIF$FieldID = paste("x",fieldsWithCMIF$FieldID,"_", sep="") + phenosToTestIds = sub("_.*", "_", phenosToTest) + + # remove rows where the field isn't in the phenotypes list + idxIn = which(fieldsWithCMIF$FieldID %in% phenosToTestIds) + fieldsWithCMIF = fieldsWithCMIF[idxIn,] + + defaultFields = fieldsWithCMIF$CAT_MULT_INDICATOR_FIELDS + defaultFields = unique(defaultFields) + + ##### + ##### remove those that already exist in phenotypes 'part' + idxExists = which(defaultFields %in% indVars) + if (length(idxExists>0)) { + defaultFields = defaultFields[-idxExists] + } + + indVars = unique(append(indVars, defaultFields)) + + ##### + ##### check these required variables exist in phenotype file + if(length(defaultFields)>0) { + for (i in 1:length(defaultFields)) { + if (!(defaultFields[i] %in% phenoVarsAll)) { + print(paste("Required variable: Field ",defaultFields[i],"is a categorical multiple indicator field (CAT_MULT_INDICATOR_FIELDS column in variable information file) but was not found in phenotype data")) + hasIssue=TRUE + } + } + } + + } + } + + # stop script if there are missing variables + if (hasIssue==TRUE) { + print("!!! PHESANT has stopped - add required variables to phenotype file or remove relevant phenotypes (so that required variables are not needed).") + quit() + } + + return(indVars) + +} + diff --git a/WAS/loadPhenotypes.r b/WAS/loadPhenotypes.r new file mode 100644 index 0000000..c689ebe --- /dev/null +++ b/WAS/loadPhenotypes.r @@ -0,0 +1,127 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +## +## load phenotypes from phenotype file + +loadPhenotypes <- function() { + + + ## is not running 'all' then we determine the start and end idxs of phenotypes that we test, so that we can parallelise into multiple jobs + if (opt$varTypeArg!="all") { + + # read pheno file column names + phenoVars = read.table(opt$phenofile, header=0, nrows=1, sep=',') + phenoVars = phenoVars[which(phenoVars!=opt$userId)] + + + ##### + ##### calculate part start and end + + partSize = ceiling(length(phenoVars)/opt$numParts); + partStart = (opt$partIdx-1)*partSize + 1; + + if (opt$partIdx == opt$numParts) { + partEnd = length(phenoVars); + } else { + partEnd = partStart + partSize - 1; + } + + print(paste(partStart, '-', partEnd)); + + + ##### + ##### find range of columns to read in + + ## This is more complicated than just reading in a column range, because we need to determine cut points + ## such that all columns of a particular field are loaded. + ## A field is included in a 'part' if its last column is within the part range. + ## e.g. for part 2 of 5 parts and for 100 columns, then fields having their last column at position 21 - 40 (i.e. its column index) are included in this part. + + ## user ID always included + phenosToTest = c(opt$userId) + + currentVar="" + currentVarLong="" + currentVarShort="" + first=TRUE + phenoIdx=0 + + # all columns for a particular field + thisPhenoToTest = c() + + for (var in phenoVars) { + + varx = gsub("^x", "", var); + varx = gsub("_[0-9]+$", "", varx); + varxShort = gsub("^x", "", var); + varxShort = gsub("_[0-9]+_[0-9]+$", "", varxShort); + currentVarLong = var + + if (currentVar == varx) { # same variable same timepoint + # add current var to pheno list + thisPhenoToTest = append(thisPhenoToTest, as.character(currentVarLong)) + } + else if (currentVarShort == varxShort) { # save var, diff timepoint + ## different time point of this var so skip in testing but add here because some are fixed to cat mult + thisPhenoToTest = append(thisPhenoToTest, as.character(currentVarLong)) + } else { + ## new variable so run test for previous (we have collected all the columns now) + + if (first==FALSE) { + if (phenoIdx>=partStart && phenoIdx<=partEnd) { # only start new variable processing if last column of it is within the idx range for this part + phenosToTest = append(phenosToTest, thisPhenoToTest) + } + } + + first=FALSE; + + ## new variable so set values + currentVar = varx + currentVarShort = varxShort + thisPhenoToTest = c(as.character(currentVarLong)) + } + + + phenoIdx = phenoIdx + 1 + } + + # last variable so test association + if (phenoIdx>=partStart && phenoIdx<=partEnd) { + #phenosToTest = append(phenosToTest, as.character(currentVarLong)) + phenosToTest = append(phenosToTest, as.character(thisPhenoToTest)) + } + + ## read in the right table columns - a subset of the data file + data = fread(opt$phenofile, select=phenosToTest, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) + + } else { + # reading all data at once + data = fread(opt$phenofile, sep=',', header=TRUE, data.table=FALSE, na.strings=c("", "NA")) + } + + ## this is type conversion as used in the read.table function (that we used to use ((this was changed because read.table cannot read column subsets)) + data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) + + colnames(data)[1] <- "userID" + + return(data) + +} + + diff --git a/WAS/loadTraitOfInterest.r b/WAS/loadTraitOfInterest.r new file mode 100644 index 0000000..712362c --- /dev/null +++ b/WAS/loadTraitOfInterest.r @@ -0,0 +1,54 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +## +## load trait of interest, either from separate trait of interest file, or from phenotype file + +loadTraitOfInterest <- function(phenotypes) { + + +if (opt$save==TRUE) { + # saving not running tests so we don't have a trait of interest + + # add pretend trait of interest so other code doesn't break + numRows = nrow(phenotypes) + data = cbind.data.frame(phenotypes$userID, rep(-1, numRows)) + +} else { + + # load the trait of interest specified by the user + + if (is.null(opt$traitofinterestfile)) { + print("Extracting trait of interest from pheno file ...") + data = fread(opt$phenofile, select=c(opt$userId, opt$traitofinterest), sep=',', header=TRUE, data.table=FALSE) + + } else { + print("Loading trait of interest file ...") + data = fread(opt$traitofinterestfile, select=c(opt$userId, opt$traitofinterest), sep=',', header=TRUE, data.table=FALSE) + } + + data = data.frame(lapply(data,function(x) type.convert(as.character(x)))) +} + +colnames(data)[1] <- "userID" +colnames(data)[2] <- "geno" + +return(data) + +} diff --git a/WAS/makeTestDataFrame.r b/WAS/makeTestDataFrame.r new file mode 100644 index 0000000..39963b2 --- /dev/null +++ b/WAS/makeTestDataFrame.r @@ -0,0 +1,30 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +## makes a smaller data frame containing the data for a particular test +makeTestDataFrame <- function(datax, confounders, currentVarValues) { + + thisdata = datax[,c("geno", "userID")] + thisdata = merge(thisdata, confounders, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + currentVarValues = cbind.data.frame(datax$userID, currentVarValues) + colnames(currentVarValues)[1] = "userID" + thisdata = merge(thisdata, currentVarValues, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + return(thisdata) + +} diff --git a/WAS/opt.RData b/WAS/opt.RData new file mode 100644 index 0000000000000000000000000000000000000000..0f7822f72517a5db2b74bad89bb1f6abd3ac33e9 GIT binary patch literal 4853 zcmVPSGCkaifBa~Wq0Mtb3Z4usek&Jk8k$ytKCDVxvD$xyqJNmroROx7Y}+RT&?fedK1oqUm^ zwm44;RNqahd9%n#a0?OSv}HRKStUe!+L|>>SqICNn7n=cdV*4&x3UF7b?*&9Rmf(f zO~e_J zHw7MPqsTZA+G!`XTL(qhp=7jaIWTD#9X4=F3^VexhCfi{+0E5N+Jg9OT@HJxdpW9O zdMKNCo>)|PVtX>lgE3>yWUXwbj&A6YbJ6->oQ?@hZw&F8RLZ7}8Ow0Y8N;@?DNM&o zAK6GwNh_SvlT3b^qWLrhOs*6XurA)Lg zxkQE--gIt&KkR~)%FI47oQi7Z=hlX+WJQIKI+97Q*u_k-F0N_Aw6z|&=3y!vkblqII&s=5mQ z&*tYce7&mhV~Z22cdXOxE303bWj9OQA;Jf6nq%1fJjckxK`~MBNy)OjuXbr>HlIyf z#Ul~i!`6{EY=t+rB$Kw4volU+G2=X`ms7ebctrBenD7l68g|jfa)7CDz$Sh=YZjd= z>7`oATDzzqbIPLD9wM!sZH{t{9bS;q){>}9K`|QQ^($>Sd4tokvU~+?sa9fPZtKG8 zGYYGBCX>0+f>7D6gV{U6w6%6mPL>vC_yiHMxiF+1!}5gTx-k8R>VTiKa+Z@x)ns{n zp=l414i%Y%!T~ocPtY^A0d$TT*2vt5VH#7DH;jDIKWr2;_9I?}xSTPj>9BC?!bu;g zCu!SURGluae3A}`N16!kPH}wK`0lA-kSVrE57Rq;CU|6W}ncnqgi3b!ttHLsp@yjvo5nsfr< zktUtqsrme4KAmvKh>_e8wzwbWdcs=DIL~>- zqY`p67P40H3dMYi+wX#Vd#Od-o%|c1?&Odbsmj?N1-Cp0uDK%Og@eG~&f%oUR`}k= z;h#Z_b`HCQ7(xd)Cg^+sbgl|DJRZ!1V?JIt!P^GBFy3yEX+QAkhhw6BUW5t<9`9@r z^uhG@a@Z%~dw}2>9Qq`90|f67@WFW72@XF`V8a9l+w>TRB=cfZ*mM+ZI>zBkqVEaN z^Q1!GuVfdM>ry0ehz;0eRm0TxeV=_Ap090 zev|Oo$6%X(A-1{-{<{V?679bO-RnSa66n2!c)ktxdIx+aY<3gyZh)Ns3Aj^0=Qi}; z0^7U`JnwM$4@6hs-jmS1i|G3LSM$3VbDzWifejvT_+K*aA^7wf$n`$(zYpWzPQ~VA12R zB;oG>f1Bv{82T4Mza`-LgyR;8ULQ;7euU^^y*}Z`J|%j63S)e|KI2_}?TMn-QwjfX z0{?#_dVL%EzXf`I2Y7y$-!SA-$Qic#NZD&${)c7pYzXuAme@r@(3H;^YC4{ zzThbT6~h+)Ad&e;AoG6{U;LOK`yV35Pnb71`o8!n@A`8L{{JZ9|1*z&(`Mp}pF{tb zp!+X?=Px;K7xBffC3Js<=tiyaDu4NFir;_?zvXm?fZp#ww|@dUzlZUHXE%Hn>kqLG z5$hPng5^%)V^2u8<4@2jLu|^-63m1Ob7F#EHmR5&M2NVfnJ>~5U2xaL{rj5WpmUju ze#HN|Wg=fQQ^`6uALnsVV?b5e%w zOzybWf@wQenCf221?#mnxJEdVvKIlf90#^lkPPzwzD6yUu49$WA=%7S!I#Q7jH1dP zo)qKl(R|5Ck5molh(-1;r?B;e@l%jzy)@#%uLd+tLIx++sx7pl~OZyhglee zq_z-jww4NcyHvD-GSIgy!NcZ|$R0a3s$fe+?C@1lh`4Q)MGW7zmx`wgCLz-H zRTe3@ho%(==Ob;gcsrPUy4eiZtKg<;Xv}l9ieA1wE)`{Gjbzci->NF>(t8)vm#WWKY$+%+gQHa%9DDupjhCTL(Q({@dK~Q=@L8--d@`OexD+kJJ z8pABkl@_d=V=Ra|fLetY`%lpR;X2BvOTnW|pMu9&^b{Pc-0>QuqGTCJ zlQOvi8$9Er#A=K%eKL+Rb&84pF-`xoOu^$!AL1vNJ_S$4lQTH+q?=UA(y3a^b-aqy zr8Gc0;y9tpqU!?c6PkY)SD^AbT%lfK`UrW0>GSY%JaY`JRUW>{qUYfd z`90)nXW^A9HonUAS$K`8vMFucq3M!&;i z8GZj+)#AG`jBUWrZdbbo->YKV_n5v0-)H(7{D7_8_Ec_N4SuMxspx;7MX$ko{9i8z zziRfpK#TA63Y1)D#TThDTx0P(DpN1U_+xQSNrm}8VEPy~+1gABakrw(g!|F10&$ao zsbgbe%3Kudg76QH-0S4VEnbOjaZL;T$jEq{$!6V9DwcA65v-d4?Y{yc`1d`_pefe& zE7bZzVnJmaV-j#@RE%ksj8W`Q5$MYOGY3QaXF|3db`}c5tTO+aFw&x#E%_H}BFKWw zdt5Nx+C0s>bJF_;Dfj+lV`l zVxxg@y1)CLd3bKz;5cgcaGPEHrvV=Nv5WWtK{z(8r2;pyEq;V|Mf|{6Vdcs#I}`)B z_4(4Y#q&fjwv(?7vPy4EyKCuD+2ifo5!A?TqiWjq>!P5xT|)_=8p9@akb+vW_T59m zs$}<5HLDKj^jY~nrUY<^Vc|MUBy2=>R8_OlK`N1Ko=yN>3{4}^74mNJjvb;iCxiaY z&~`!F&Ei$PZdSjlNB1WQep}<&e~{Lp?y%hlW7=IykIIiXC4I#z_DNNf{je^YWgDIn zFeZjRv3UqR$Y!c)dJL&z)&B$8hhoYeiLQ`$dpvoMM9kSEG3AX)ci4|Du?5S|#dcQt z`#;9YD`PiTzY6+Y-U&Dl^7FI(sW%4mM1QezHLhGc!*U&~mt2C^ZfKu}c9%zY#JB4n z7OBQ1F#c-QxCC4e5OiN)l{F*cmZ=1sVkh8ohak7V2DcmtyQR0DZrKOzUTAxfygeZA zZs64qvhD|W^n=WzZDqkOs6phm4xnqWU?ES=23Tvg<75vVnR=|5a$OK>HS} zPxMbRb@WHsX%g3lZ?BOJsV*BR?B9X*CbUz~-hlR&MD|!aE(DQxp}hm`ZD{XGh)3CB zAjXP2{&C~s0Sm>${(9!rd(hs4_93fJPQB06Ii=k>5Z8FqYh*mC#0E+x6WR|vc~1GK z05cMqSA6pv6m2^Cv}^zunh*%zwX)!*tPkw~n-KW72D$*YBIrWU4d5%Z13?dhoiX7B zfc*$wKyUz{7oZFyR}lENx{5#?zlPu~4cv$TKAl?% zco*Ofg1Zp_<6mRd{eB<80|frKhX};+_W}Gd?;)60fC=E^%^*lcfEfbSbecslhrk~< zk3by%5WpW}A$X*K1ppr}haevTas+XbA z6al^<@M8qN96v!Ij{g*aj{nahfKTU31^fcwmk53p0WiLxAEn02{3Yg+-vIm;!S6MI z@Zv101g=58*g5kx-fYwyxC8>amuGM9+e8& zh=6Il7J)|f4Lz9RWPFNMHn3po_@+KSel!ACv>y-;4ffIDJMgm4i3rCj4(8NsX0zn^ zxvk=1>NenY!$I^gEzJ{kAp9zxGzEL^$g`Ui&!zzrz@yK?27#w!I00VhxiRFqE&ek# z#Pf+jYT4tKd_w2})-ck2q6a+VgDZ85NBxE3T#!v=n{|1wW17x-XH?$eUH4q-S;cdy zE8jS}$`9M8?-JEMXTSO@LN~_a_8YJCTrZwjRCuECO3%hCJsYp|^c`X;9MX8D=lbzc zQQ@P;D?OjBUrK7fG+ybsaop%A+}L=f=dBcKPpRH?B3fDAV>DhRtr~C}Q@k-CeD?O_Sx3({w9A0KE z{_7jX_n`h4yhq#rl^k!&`46Z6D^O}f4`^va50Pm@PZw(o9xnCrGK?d~&n*X%`F)t~ zKJ?jt;7s|)s^w{q-a&GEu^qgdahS;|M7aD^BRpi7pX21jRP{LH%o}{XcXhxP*9pQ` z1kS2O?8WL4HFriZ6E=naoU8=F8I?(YUN};{H&tAl${C(h11u5g<;|)U8iGksNhj3g|mYbc!8^iSz%@7$RLU=#}uGY5^b1M5jW^+hq&Q zSk7aM+vd1ioxqNnTq;|#GyeLnnqi`5;RF%Xz@bb9P9y^>2qF4t!IB74HB+~RGF60){ + # last variable so test association + thisdata = makeTestDataFrame(data, confounders, currentVarValues) + testAssociations(currentVar, currentVarShort, thisdata) +} + +sink() + +# save counters of each path in variable flow +saveCounts() + +if (opt$save == TRUE) { + write.table(derivedBinary, file=paste(opt$resDir,"data-binary-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(derivedCont, file=paste(opt$resDir,"data-cont-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(derivedCatOrd, file=paste(opt$resDir,"data-catord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); + write.table(derivedCatUnord, file=paste(opt$resDir,"data-catunord-",opt$varTypeArg,".txt", sep=""), append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=TRUE); +} + +warnings() + + + diff --git a/WAS/processArgs.r b/WAS/processArgs.r new file mode 100644 index 0000000..2b9a438 --- /dev/null +++ b/WAS/processArgs.r @@ -0,0 +1,132 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Parse the arguments input by the user +# if argument 'test' is used then run test phenome scan +processArgs <- function() { + +if (opt$test==TRUE) { + + # set up the test phenome scan settings + datadir='../testWAS/data/'; + opt$resDir <<- '../testWAS/results/'; + opt$userId <<- 'userId'; + opt$phenofile <<- paste(datadir,'phenotypes.csv', sep=""); + opt$variablelistfile <<- '../testWAS/variable-lists/outcome-info.tsv'; + opt$datacodingfile <<- '../testWAS/variable-lists/data-coding-ordinal-info.txt'; + opt$confidenceintervals <<- TRUE + + if(opt$save == FALSE) { + opt$traitofinterestfile <<- paste(datadir,'exposure.csv', sep=""); + opt$traitofinterest <<- 'exposure'; + opt$sensitivity <<- FALSE; + opt$genetic <<- TRUE; + } + + processParts(opt$partIdx, opt$numParts); +} +else { + + ## check arguments are supplied correctly + + if (is.null(opt$phenofile)){ + print_help(opt_parser) + stop("phenofile argument must be supplied", call.=FALSE) + } + else if (!file.exists(opt$phenofile)) { + stop(paste("phenotype data file phenofile=", opt$phenofile, " does not exist", sep=""), call.=FALSE) + } + +# if (is.null(opt$traitofinterestfile)){ +# print_help(opt_parser) +# stop("traitofinterestfile argument must be supplied", call.=FALSE) +# } + if (opt$save==FALSE && !is.null(opt$traitofinterestfile) && !file.exists(opt$traitofinterestfile)) { + stop(paste("trait of interest data file traitofinterestfile=", opt$traitofinterestfile, " does not exist", sep=""), call.=FALSE) + } + + if (opt$save==FALSE && !is.null(opt$confounderfile) && !file.exists(opt$confounderfile)) { + stop(paste("confounder data file confounderfile=", opt$confounderfile, " does not exist", sep=""), call.=FALSE) + } + + if (is.null(opt$variablelistfile)){ + print_help(opt_parser) + stop("variablelistfile argument must be supplied", call.=FALSE) + } + else if (!file.exists(opt$variablelistfile)) { + stop(paste("variable listing file variablelistfile=", opt$variablelistfile, " does not exist", sep=""), call.=FALSE) + } + + if (is.null(opt$datacodingfile)){ + print_help(opt_parser) + stop("datacodingfile argument must be supplied", call.=FALSE) + } + else if (!file.exists(opt$datacodingfile)) { + stop(paste("data coding file datacodingfile=", opt$datacodingfile, " does not exist", sep=""), call.=FALSE) + } + + if (opt$save==FALSE && is.null(opt$traitofinterest)){ + print_help(opt_parser) + stop("traitofinterest argument must be supplied", call.=FALSE) + } + + if (is.null(opt$resDir)){ + print_help(opt_parser) + stop("resDir argument must be supplied", call.=FALSE) + } + else if (!file.exists(opt$resDir)) { + stop(paste("results directory resDir=", opt$resDir, " does not exist", sep=""), call.=FALSE) + } + + + processParts(opt$partIdx, opt$numParts); +} + + +if (opt$save==TRUE) { + print("Saving phenotypes to file. Tests of association will not run!") +} + +} + +# Parse the 'part' arguments and check they are valid +processParts <- function(pIdx, nParts) { + + if (is.null(pIdx) && is.null(nParts)) { + opt$varTypeArg <<- "all"; + print(paste("Running with all traits in phenotype file:", opt$phenofile)); + } + else if (is.null(pIdx)) { + print_help(opt_parser) + stop("pIdx argument must be supplied when nParts argument is supplied", call.=FALSE) + } + else if (is.null(nParts)) { + print_help(opt_parser) + stop("nParts argument must be supplied when pIdx argument is supplied", call.=FALSE) + } + else if (pIdx<1 || pIdx>nParts) { + print_help(opt_parser) + stop("pIdx arguments must be between 1 and nParts inclusive", call.=FALSE) + } + else { + opt$varTypeArg <<- paste(pIdx, "-", nParts, sep=""); + print(paste("Running with part",pIdx,"of",nParts," in phenotype file:", opt$phenofile)); + } + +} diff --git a/WAS/reassignValue.r b/WAS/reassignValue.r new file mode 100644 index 0000000..ed9701c --- /dev/null +++ b/WAS/reassignValue.r @@ -0,0 +1,73 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Reassigns values as specified in data coding info file +reassignValue <- function(pheno, varName) { + + # get data code info - whether this data code is ordinal or not and any reordering and resassignments + dataPheno = vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),]; + dataCode = dataPheno$DATA_CODING; + + # not all variables will have a data code info row + dataCodeRow = which(vl$dataCodeInfo$dataCode==dataCode); + + if (length(dataCodeRow)==0) { + return(pheno); + } + else if (length(dataCodeRow)>1) { + cat("WARNING: >1 ROWS IN DATA CODE INFO FILE || ") + return(pheno); + } + + dataDataCode = vl$dataCodeInfo[dataCodeRow,]; + reassignments = as.character(dataDataCode$reassignments); + + return(reassignValue2(pheno, reassignments)) + +} + +# Reassigns values in pheno, as specified in resassignments argument +reassignValue2 <- function(pheno, reassignments) { + + # can be NA if row not included in data coding info file + if (!is.na(reassignments) && nchar(reassignments)>0) { + + reassignParts = unlist(strsplit(reassignments,"\\|")); + cat(paste("reassignments: ", reassignments, " || ", sep="")); + + # do each reassignment + for(i in reassignParts) { + reassignParts = unlist(strsplit(i,"=")); + + # matrix version + idx = which(pheno==reassignParts[1],arr.ind=TRUE) + pheno[idx]=strtoi(reassignParts[2]); + } + + ## see if type has changed (this happens for field 216 (X changed to -1)) + ## as.numeric will set non numeric to NA so we know if it's ok to do this by seeing if there are extra NA's after the conversion + pNum = as.numeric(unlist(pheno)) + isNum = length(which(is.na(pheno), arr.ind=TRUE))==length(which(is.na(pNum), arr.ind=TRUE)) + if (isNum) { + pheno = pNum + } + } + + return(pheno) +} diff --git a/WAS/replaceMissingCodes.r b/WAS/replaceMissingCodes.r new file mode 100644 index 0000000..f792374 --- /dev/null +++ b/WAS/replaceMissingCodes.r @@ -0,0 +1,37 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Replace negative values with NA as these are assumed to be missing +replaceMissingCodes <- function(pheno) { + + phenoReplaced = pheno; + + uniqVar = unique(na.omit(phenoReplaced)) + + # variable values <0 are `missing' codes + for (u in uniqVar) { + if (u<0) { + idxU = which(phenoReplaced==u) + phenoReplaced[idxU]=NA + } + } + + return(phenoReplaced) + +} diff --git a/WAS/replaceNaN.r b/WAS/replaceNaN.r new file mode 100644 index 0000000..dc3dfd4 --- /dev/null +++ b/WAS/replaceNaN.r @@ -0,0 +1,47 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Replace NaN and empty values with NA in pheno +replaceNaN <- function(pheno) { + + if (is.factor(pheno)) { + + phenoReplaced = pheno + + nanStr = which(phenoReplaced=="NaN") + phenoReplaced[nanStr]=NA + + emptyx = which(phenoReplaced=="") + phenoReplaced[emptyx]=NA + + } + else { + + phenoReplaced = pheno + nanx = which(is.nan(phenoReplaced)) + phenoReplaced[nanx] = NA; + + emptyStr = which(phenoReplaced=="") + phenoReplaced[emptyStr] = NA; + + } + + return(phenoReplaced) + +} diff --git a/WAS/saveCounts.r b/WAS/saveCounts.r new file mode 100644 index 0000000..5a72916 --- /dev/null +++ b/WAS/saveCounts.r @@ -0,0 +1,31 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Saves the counters stored in count variables, to a file in results directory +saveCounts <- function() { + + countFile = paste(opt$resDir,"variable-flow-counts-",opt$varTypeArg,".txt",sep="") + + # sort on counter name + sortIdx = order(as.character(counters[,"name"])) + counters <<- counters[sortIdx,] + + write.table(counters, file=countFile, sep=",", quote=FALSE, row.names=FALSE) +} + diff --git a/WAS/storeNewVar.r b/WAS/storeNewVar.r new file mode 100644 index 0000000..20c4d5a --- /dev/null +++ b/WAS/storeNewVar.r @@ -0,0 +1,21 @@ + + +storeNewVar <- function(userIDData, phenoData, varName, type) { + + # add pheno to dataframe + newdata = data.frame(userID=userIDData, newvar=phenoData) + names(newdata)[names(newdata)=="newvar"] = varName + + if (type == "bin") { + derivedBinary <<- merge(derivedBinary, newdata, by="userID", all=TRUE); + } else if (type == "cont") { + derivedCont <<- merge(derivedCont, newdata, by="userID", all=TRUE); + } else if (type == "catOrd") { + derivedCatOrd <<- merge(derivedCatOrd, newdata, by="userID", all=TRUE); + } else if (type == "catUnord") { + derivedCatUnord <<- merge(derivedCatUnord, newdata, by="userID", all=TRUE); + } + + #write.table(phenoFactor, file=paste(opt$resDir, "data-binary-", varName, ".csv", sep=""), row.names=FALSE, col.names=FALSE, na="", quote=FALSE); + +} diff --git a/WAS/testAssociations.r b/WAS/testAssociations.r new file mode 100644 index 0000000..71592ab --- /dev/null +++ b/WAS/testAssociations.r @@ -0,0 +1,141 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Tests the association of a field, determined by its field type +testAssociations <- function(currentVar, currentVarShort, thisdata) { + + ## call file for variable type + + tryCatch({ + + # retrieve whether phenotype is excluded etc + idx=which(vl$phenoInfo$FieldID==currentVarShort); + + # check if variable info is found for this field + if (length(idx)==0) { + cat(paste(currentVar, " || Variable could not be found in pheno info file. \n", sep="")) + incrementCounter("notinphenofile") + } + else { + + # get info from variable info file + excluded = vl$phenoInfo$EXCLUDED[idx] + catSinToMult = vl$phenoInfo$CAT_SINGLE_TO_CAT_MULT[idx] + fieldType = vl$phenoInfo$ValueType[idx] + isExposure = getIsExposure(currentVarShort) #vl$phenoInfo$EXPOSURE_PHENOTYPE[idx] + + if (fieldType=="Integer") { + + #### INTEGER + cat(currentVar, "|| ", sep="") + + if (excluded!="") { + cat(paste("Excluded integer: ", excluded, " || ", sep="")) + incrementCounter("excluded.int") + } + else { + incrementCounter("start.int") + if (isExposure==TRUE) { + incrementCounter("start.exposure.int") + } + + testInteger(currentVarShort, "INTEGER", thisdata); + } + cat("\n"); + } + else if (fieldType=="Continuous") { + + #### CONTINUOUS + cat(currentVar, "|| ", sep="") + + if (excluded!="") { + cat(paste("Excluded continuous: ", excluded, " || ", sep="")) + incrementCounter("excluded.cont") + } + else { + incrementCounter("start.cont") + if (isExposure==TRUE) { + incrementCounter("start.exposure.cont") + } + testContinuous(currentVarShort, "CONTINUOUS", thisdata); + } + cat("\n"); + } + else if (fieldType=="Categorical single" && catSinToMult=="") { + + #### CAT SINGLE + cat(currentVar, "|| ", sep="") + + if (excluded!="") { + cat(paste("Excluded cat-single: ", excluded, " || ", sep="")) + incrementCounter("excluded.catSin") + } + else { + incrementCounter("start.catSin") + if (isExposure==TRUE) { + incrementCounter("start.exposure.catSin") + } + testCategoricalSingle(currentVarShort, "CAT-SIN", thisdata); + } + cat("\n"); + } + else if (fieldType=="Categorical multiple" || catSinToMult!="") { + + #### CAT MULTIPLE + cat(currentVar, "|| ", sep="") + + if (excluded!="") { + cat(paste("Excluded cat-multiple: ", excluded, " || ", sep="")) + incrementCounter("excluded.catMul") + } + else { + + if (catSinToMult!="") { + cat("cat-single to cat-multiple || ", sep="") + incrementCounter("catSinToCatMul") + } + + incrementCounter("start.catMul") + if (isExposure==TRUE) { + incrementCounter("start.exposure.catMul") + } + else { + # get number of cat mult values denoting trait of interest + numVals = getNumValuesCatMultExposure(currentVarShort) + if (numVals>0) { + addToCounts("start.exposure.catMulvalues", numVals) + } + } + testCategoricalMultiple(currentVarShort, "CAT-MUL", thisdata); + } + cat("\n"); + } + else { + #cat("VAR MISSING ", currentVarShort, "\n", sep=""); + } + } + + }, error = function(e) { + print(paste("ERROR:", currentVar,e)) + }) + +} + + + diff --git a/WAS/testCatMultiple.r b/WAS/testCatMultiple.r new file mode 100644 index 0000000..8aaf974 --- /dev/null +++ b/WAS/testCatMultiple.r @@ -0,0 +1,166 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Performs preprocessing of categorical (multiple) fields, namely: +# 1) Reassigning values as specified in data coding file +# 2) Generating binary variable for each category in field, restricting to correct set of participants as specified +# in CAT_MULT_INDICATOR_FIELDS field of variable info file (either NO_NAN, ALL or a field ID) +# 3) Checking derived variable has at least 10 cases in each group +# 4) Calling binaryLogisticRegression function for this derived binary variable +testCategoricalMultiple <- function(varName, varType, thisdata) { + cat("CAT-MULTIPLE || "); + + pheno = thisdata[,phenoStartIdx:ncol(thisdata), drop=FALSE] + pheno = reassignValue(pheno, varName) + + ## get unique values from all columns of this variable + uniqueValues = unique(na.omit(pheno[,1])); + numCols = ncol(pheno); + numRows = nrow(pheno); + if (numCols>1) { + for (num in 2:numCols) { + u = unique(na.omit(pheno[,num])) + uniqueValues = union(uniqueValues,u); + } + } + + ## for each value create a binary variable and test this + for (variableVal in uniqueValues) { + + ## numeric negative values we assume are missing - check this + if(is.numeric(variableVal) & variableVal<0) { + cat("SKIP_val:", variableVal," < 0", sep=""); + next; + } + + # make variable for this value + idxForVar = which(pheno == variableVal, arr.ind=TRUE) + idxsTrue = idxForVar[,"row"] + + cat(" CAT-MUL-BINARY-VAR ", variableVal, " || ", sep=""); + incrementCounter("catMul.binary") + + # make zero vector and set 1s for those with this variable value + varBinary = rep.int(0,numRows); + varBinary[idxsTrue] = 1; + varBinaryFactor = factor(varBinary) + + ## data for this new binary variable + newthisdata = cbind.data.frame(thisdata[,1:numPreceedingCols], varBinaryFactor) + + ## one of 3 ways to decide which examples are negative + idxsToRemove = restrictSample(varName, pheno, variableVal, thisdata[,"userID", drop=FALSE]) + + if (!is.null(idxsToRemove) & length(idxsToRemove) > 0) { + newthisdata = newthisdata[-idxsToRemove,] + } + + facLevels = levels(newthisdata[,phenoStartIdx]) + idxTrue = length(which(newthisdata[,phenoStartIdx]==facLevels[1])) + idxFalse = length(which(newthisdata[,phenoStartIdx]==facLevels[2])) + + if (idxTrue<10 || idxFalse<10) { + cat("CAT-MULT-SKIP-10 (", idxTrue, " vs ", idxFalse, ") || ", sep=""); + incrementCounter("catMul.10") + } + else { + isExposure = getIsCatMultExposure(varName, variableVal) + + incrementCounter("catMul.over10") + # binary - so logistic regression + binaryLogisticRegression(paste(varName, variableVal,sep="#"), varType, newthisdata, isExposure) + } + } +} + +# restricts sample based on value in CAT_MULT_INDICATOR_FIELDS column of variable info file, +# either NO_NAN, ALL or a field ID +# returns idx's that should be removed from the sample +restrictSample <- function(varName,pheno,variableVal, userID) { + + # get definition for sample for this variable either NO_NAN, ALL or a variable ID + varIndicator = vl$phenoInfo$CAT_MULT_INDICATOR_FIELDS[which(vl$phenoInfo$FieldID==varName)] + + return(restrictSample2(varName,pheno,varIndicator,variableVal, userID)) +} + + +restrictSample2 <- function(varName,pheno, varIndicator,variableVal, userID) { + + if (varIndicator=="NO_NAN") { # remove NAs + ## remove all people with no value for this variable + + # row indexes with NA in all columns of this cat mult field + ind <- apply(pheno, 1, function(x) all(is.na(x))) + naIdxs = which(ind==TRUE) + cat("NO_NAN Remove NA participants ", length(naIdxs), " || ", sep=""); + } + else if (varIndicator=="ALL") { + + # use all people (no missing assumed) so return empty vector + # e.g. hospital data and death registry + naIdxs = cbind() + cat("ALL || ") + } + else if (varIndicator!="") { + # remove people who have no value for indicator variable + indName = paste("x",varIndicator,"_0_0",sep=""); + cat("Indicator name ", indName, " || ", sep=""); + indvarx = merge(userID, indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + indicatorVar = indvarx[,indName] + + # remove participants with NA value in this related field + indicatorVar = replaceNaN(indicatorVar) + naIdxs = which(is.na(indicatorVar)) + + cat("Remove indicator var NAs: ", length(naIdxs), " || ", sep=""); + + if (is.numeric(as.matrix(indicatorVar))) { + # remove participants with value <0 in this related field - assumed missing indicators + lessZero = which(indicatorVar<0) + naIdxs = union(naIdxs, lessZero) + cat("Remove indicator var <0: ", length(lessZero), " || ", sep="") + } + } + else { + stop("Categorical multiples variables need a value for CAT_MULT_INDICATOR_FIELDS", call.=FALSE) + } + + ## remove people with pheno<0 if they aren't a positive example for this variable indicator + ## because we can't know if they are a negative example or not + if (is.numeric(as.matrix(pheno))) { + idxForVar = which(pheno == variableVal, arr.ind=TRUE) + idxMissing = which(pheno < 0, arr.ind=TRUE) + + # all people with <0 value and not variableVal + naMissing = setdiff(idxMissing,idxForVar) + + # add these people with unknowns to set to remove from sample + naIdxs = union(naIdxs, naMissing) + + cat(paste("Removed ", length(naMissing) ," examples != ", variableVal, " but with missing value (<0) || ", sep="")); + } + else { + cat("Not numeric || ") + } + + return(naIdxs); + +} + diff --git a/WAS/testCatSingle.r b/WAS/testCatSingle.r new file mode 100644 index 0000000..109f88b --- /dev/null +++ b/WAS/testCatSingle.r @@ -0,0 +1,193 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Performs variable processing for categorical (single) fields, namely: +# 1) Reassigning values as specified in data coding information file +# 2) Reordering categories for ordered fields +# 3) Replacing missing codes - we assume values < 0 are missing for categorical (single) variables +# 4) Remove values with <10 cases +# 5) Deterimine correct test to perform, either binary, ordered or unordered. +testCategoricalSingle <- function(varName, varType, thisdata) { + cat("CAT-SINGLE || "); + + pheno = thisdata[,phenoStartIdx:ncol(thisdata)] + isExposure = getIsExposure(varName) + + # assert variable has only one column + if (!is.null(dim(pheno))) stop("More than one column for categorical single") + + pheno = reassignValue(pheno, varName) + + # get data code info - whether this data code is ordinal or not and any reordering + dataPheno = vl$phenoInfo[which(vl$phenoInfo$FieldID==varName),]; + dataCode = dataPheno$DATA_CODING; + + # get data coding information + dataCodeRow = which(vl$dataCodeInfo$dataCode==dataCode); + if (length(dataCodeRow)==0) { + cat("ERROR: No row in data coding info file || "); + return(NULL); + } + dataDataCode = vl$dataCodeInfo[dataCodeRow,]; + ordered = dataDataCode$ordinal; + order = as.character(dataDataCode$ordering); + + ## reorder variable values into increasing order (we do this now as this may convert variable to binary rather than ordered) + pheno = reorderOrderedCategory(pheno,order); + + ## if data code has a default_value then recode NA's to this value for participants with value in default_related_field + ## this is used where there is no zero option e.g. field 100200 + defaultValue = dataDataCode$default_value + defaultRelatedID = dataDataCode$default_related_field + pheno = setDefaultValue(pheno, defaultValue, defaultRelatedID, thisdata[,"userID", drop=FALSE]) + + ## all categories coded as <0 we assume are `missing' values + pheno = replaceMissingCodes(pheno) + + ## remove categories if < 10 examples + pheno = testNumExamples(pheno) + + uniqVar = unique(na.omit(pheno)) + uniqVar = sort(uniqVar) + + if (length(uniqVar)<=1) { + cat("SKIP (only one value) || "); + incrementCounter("catSin.onevalue") + } + else if (length(uniqVar)==2) { + cat("CAT-SINGLE-BINARY || "); + incrementCounter("catSin.case3") + # binary so logistic regression + + phenoFactor = factor(pheno) + # binary - so logistic regression + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); + binaryLogisticRegression(varName, varType, thisdatanew, isExposure) + } + else { + # > 2 categories + if (is.na(ordered)) { + cat(" ERROR: 'ordered' not found in data code info file") + } + else { + + ## unordered + if (ordered == 0) { + + cat("CAT-SINGLE-UNORDERED || ") + incrementCounter("catSin.case2") + + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); + testCategoricalUnordered(varName, varType, thisdatanew); + + } + else if (ordered == 1) { + + ## ordered + cat("ordered || "); + incrementCounter("catSin.case1") + + ## reorder variable values into increasing order + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], pheno); + testCategoricalOrdered(varName, varType, thisdatanew, order) + + } + else if (ordered == -2) { + cat(" EXCLUDED or BINARY variable: Should not get here in code. ") + incrementCounter("catSin.binaryorexcluded") + } + else { + print(paste("ERROR", varName, varType, dataCode)); + } + } + } + +} + +## values are reordered and assigned values 1:N for N categories +reorderOrderedCategory <- function(pheno,order) { + + ## new pheno of NAs (all values not in order are assumed to be NA) + + if (!is.na(order) && nchar(order)>0) { + + # make empty pheno + pheno2 = rep(NA,length(pheno)); + + ## get ordering + orderParts = unlist(strsplit(order,"\\|")); + + # go through values in correct order and set value + # from 1 to the number of values + count=1; + for(i in orderParts) { + idx = which(pheno==i); + pheno2[idx] = count; + + count=count+1; + } + + cat("reorder ",order," || ",sep=""); + + return(pheno2) + } + else { + return(pheno); + } + +} + +## sets default value for people with no value in pheno, but with a value in the +## field specified in the default_value_related_field column in the data coding info file. +## the default value is specified in the default_value column in the data coding info file. +setDefaultValue <- function(pheno, defaultValue, defaultRelatedID, userID) { + + + if (!is.na(defaultValue) && nchar(defaultValue)>0) { + + # remove people who have no value for indicator variable + indName = paste("x",defaultRelatedID,"_0_0",sep=""); + + cat("Default related field: ", indName, " || ", sep=""); + indicatorVar = indicatorFields[,indName] + indvarx = merge(userID, indicatorFields, by="userID", all.x=TRUE, all.y=FALSE, sort=FALSE) + indicatorVar = indvarx[,indName] + + # remove participants with NA value in this related field + indicatorVar = replaceNaN(indicatorVar) + + # check if there are already examples with default value and if so display warning + numWithDefault = length(which(pheno==defaultValue)) + if (numWithDefault>0) { + cat("(WARNING: already ", numWithDefault, " values with default value) ", sep="") + } + + # set default value in people who have no value in the pheno but do have a value in the default_value_related_field + defaultIdxs = which(!is.na(indicatorVar) & is.na(pheno)) + pheno[defaultIdxs] = defaultValue + + cat("default value ", defaultValue, " set, N= ", length(defaultIdxs), " || ", sep=""); + + } + + return(pheno) + +} + + diff --git a/WAS/testCategoricalOrdered.r b/WAS/testCategoricalOrdered.r new file mode 100644 index 0000000..78debf6 --- /dev/null +++ b/WAS/testCategoricalOrdered.r @@ -0,0 +1,163 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Performs ordered logistic regression test and saves results in ordered logistic results file +testCategoricalOrdered <- function(varName, varType, thisdata, orderStr="") { + + + pheno = thisdata[,phenoStartIdx:ncol(thisdata)] + geno = thisdata[,"geno"] + + cat("CAT-ORD || "); + incrementCounter("ordCat") + + doCatOrdAssertions(pheno) + + uniqVar = unique(na.omit(pheno)); + + # log the ordering of categories used + orderStr = setOrderString(orderStr, uniqVar); + cat("order: ", orderStr, " || ", sep=""); + + # check sample size + numNotNA = length(which(!is.na(pheno))) + if (numNotNA<500) { + cat("CATORD-SKIP-500 (", numNotNA, ") || ",sep=""); + incrementCounter("ordCat.500") + } + else { + # test this cat ordered variable with ordered logistic regression + + phenoFactor = factor(pheno) + + cat("num categories: ", length(unique(na.omit(phenoFactor))), " || ", sep=""); + + if (opt$save == TRUE) { + # add pheno to dataframe + storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catOrd') + cat("SUCCESS results-ordered-logistic"); + incrementCounter("success.ordCat") + } + else { + + # ordinal logistic regression + sink() + sink(modelFitLogFile, append=TRUE) + print("--------------") + print(varName) + + require(MASS) + require(lmtest) + + ### BEGIN TRYCATCH + tryCatch({ + confounders=thisdata[,3:numPreceedingCols, drop = FALSE] + + + if (opt$standardise==TRUE) { + geno = scale(geno) + } + + fit <- polr(phenoFactor ~ geno + ., data=confounders, Hess=TRUE) + + ctable <- coef(summary(fit)) + sink() + sink(resLogFile, append=TRUE) + + ct = coeftest(fit) + pvalue = ct["geno","Pr(>|t|)"] + beta = ctable["geno", "Value"]; + + if (opt$confidenceintervals == TRUE) { + se = ctable["geno", "Std. Error"] + lower = beta - 1.96*se; + upper = beta + 1.96*se; + } + else { + lower = NA + upper = NA + } + + write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-ordered-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE"); + cat("SUCCESS results-ordered-logistic"); + incrementCounter("success.ordCat") + + isExposure = getIsExposure(varName) + if (isExposure == TRUE) { + incrementCounter("success.exposure.ordCat") + } + + ### END TRYCATCH + }, error = function(e) { + sink() + sink(resLogFile, append=TRUE) + cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) + incrementCounter("ordCat.error") + }) + } + + } +} + +# check that the phenotype is valid - that there are more than two categories +# and that these all have at least 10 cases +# something has gone wrong if this is the case +doCatOrdAssertions <- function(pheno) { + + # assert variable has only one column + if (!is.null(dim(pheno))) stop("More than one column for categorical ordered") + + uniqVar = unique(na.omit(pheno)); + + # assert more than 2 categories + if (length(uniqVar)<=1) stop("1 or zero values") + if (length(uniqVar)==2) stop("this variable is binary") + + # assert each value has >= 10 examples + for (u in uniqVar) { + withValIdx = which(pheno==u) + numWithVal = length(withValIdx); + + if (numWithVal<10) stop("value with <10 examples") + } +} + +# If data coding file does not specify an order then we use the default order as in coding defined by Biobank +# and this function just generates a string with this order for logging purposes +setOrderString <- function(orderStr, uniqVar) { + + if (is.na(orderStr) || nchar(orderStr)==0) { + + orderStr=""; + + # create order str by appending each value + uniqVarSorted = sort(uniqVar); + first=1; + for (i in uniqVarSorted) { + if (first==0) { + orderStr = paste(orderStr, "|", sep=""); + } + if (i>=0) # ignore missing values + orderStr = paste(orderStr, i, sep=""); + first=0; + end + } + } + return(orderStr); +} diff --git a/WAS/testCategoricalUnordered.r b/WAS/testCategoricalUnordered.r new file mode 100644 index 0000000..eae9c73 --- /dev/null +++ b/WAS/testCategoricalUnordered.r @@ -0,0 +1,177 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Tests an unordered categorical phenotype with multinomial regression +# and saves this result in the multinomial logistic results file +testCategoricalUnordered <- function(varName, varType, thisdata) { + + pheno = thisdata[,phenoStartIdx:ncol(thisdata)] + #geno = thisdata[,"geno"] + + numNotNA = length(which(!is.na(pheno))) + if (numNotNA<500) { + cat("CATUNORD-SKIP-500 (", numNotNA, ") || ",sep=""); + incrementCounter("unordCat.500") + } + else { + + # check there are not too many levels and skip if there are + numUnique = length(unique(na.omit(pheno))) + + # num outcome values * (num confounders and trait of interest and bias term) + numWeights=(numUnique-1)*((numPreceedingCols-2)+1+1) + if (numWeights>1000) { + cat("Too many weights in model: ", numWeights, " > 1000, (num outcomes values: ", numUnique, ") || SKIP ", sep="") + incrementCounter("unordCat.cats") + return(NULL) + } + + phenoFactor = chooseReferenceCategory(pheno); + + if (opt$save == TRUE) { + # add pheno to dataframe + storeNewVar(thisdata[,"userID"], phenoFactor, varName, 'catUnord') + cat("SUCCESS results-notordered-logistic "); + incrementCounter("success.unordCat") + } + else { + + + reference = levels(phenoFactor)[1]; + + sink() + sink(modelFitLogFile, append=TRUE) # hide output of model fitting + print("--------------") + print(varName) + + require(nnet) + if (opt$standardise==TRUE) { + geno = scale(thisdata[,"geno"]) + } + else { + geno = thisdata[,"geno"] + } + #cat("genoMean=", mean(geno), " genoSD=", sd(geno), " || ", sep="") + + confounders=thisdata[,3:numPreceedingCols, drop = FALSE] + + ###### BEGIN TRYCATCH + tryCatch({ + + fit <- multinom(phenoFactor ~ geno + ., data=confounders, maxit=1000) + + ## baseline model with only confounders, to which we compare the model above + fitB <- multinom(phenoFactor ~ ., data=confounders, maxit=1000) + + ## compare model to baseline model + require(lmtest) + lres = lrtest(fit, fitB) + modelP = lres[2,"Pr(>Chisq)"]; + + ## save result to file + maxFreq = length(which(phenoFactor==reference)); + numNotNA = length(which(!is.na(pheno))) + write(paste(paste(varName,"-",reference,sep=""), varType, paste(maxFreq,"/",numNotNA,sep=""), -999, -999, -999, modelP, sep=","), file=paste(opt$resDir,"results-multinomial-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE") + + sink() + sink(resLogFile, append=TRUE) + + sumx <- summary(fit) + + z <- sumx$coefficients/sumx$standard.errors + p = (1 - pnorm(abs(z), 0, 1))*2 + + ci <- confint(fit, "geno", level=0.95) + ci = data.frame(ci) + + ## get result for each variable category + uniqVar = unique(na.omit(pheno)) + for (u in uniqVar) { + + ## no coef for baseline value, and values <0 are assumed to be missing + if (u == reference || u<0) { + next + } + + pvalue = p[paste(eval(u),sep=""),"geno"] + beta = sumx$coefficients[paste(eval(u),sep=""),"geno"] + + if (opt$confidenceintervals == TRUE) { + lower = ci[1, paste("X2.5...", u, sep="")] + upper = ci[1, paste("X97.5...", u, sep="")] + } + else { + lower = NA + upper = NA + } + + numThisValue = length(which(phenoFactor==u)); + + ## save result to file + write(paste(paste(varName,"-",reference,"#",u,sep=""), varType, paste(maxFreq,"#",numThisValue,sep=""), beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-multinomial-logistic-",opt$varTypeArg,".txt",sep=""), append="TRUE") + + } + + cat("SUCCESS results-notordered-logistic "); + incrementCounter("success.unordCat") + + isExposure = getIsExposure(varName) + if (isExposure == TRUE) { + incrementCounter("success.exposure.unordCat") + } + + ## END TRYCATCH + }, error = function(e) { + sink() + sink(resLogFile, append=TRUE) + cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) + incrementCounter("unordCat.error") + }) + } + } +} + +# find reference category - category with most number of examples +chooseReferenceCategory <- function(pheno) { + + uniqVar = unique(na.omit(pheno)); + phenoFactor = factor(pheno) + + maxFreq=0; + maxFreqVar = ""; + for (u in uniqVar) { + withValIdx = which(pheno==u) + numWithVal = length(withValIdx); + if (numWithVal>maxFreq) { + maxFreq = numWithVal; + maxFreqVar = u; + } + } + + cat("reference: ", maxFreqVar,"=",maxFreq, " || ", sep=""); + + ## choose reference (category with largest frequency) + phenoFactor <- relevel(phenoFactor, ref = paste("",maxFreqVar,sep="")) + + return(phenoFactor); +} + + + + diff --git a/WAS/testContinuous.r b/WAS/testContinuous.r new file mode 100644 index 0000000..1532eba --- /dev/null +++ b/WAS/testContinuous.r @@ -0,0 +1,250 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Main function called for continuous fields +testContinuous <- function(varName, varType, thisdata) { + + cat("CONTINUOUS MAIN || "); + + pheno = thisdata[,phenoStartIdx:ncol(thisdata)] + + # reassign values + pheno = reassignValue(pheno, varName) + + thisdata[,phenoStartIdx:ncol(thisdata)] = pheno + + testContinuous2(varName, varType, thisdata) + +} + +# Main code used to process continuous fields, or integer fields that have been reassigned as continuous because they have >20 distinct values. +# This is needed because we have already reassigned values for integer fields, so do this in the function above for continuous fields. +testContinuous2 <- function(varName, varType, thisdata) { + cat("CONTINUOUS || "); + + pheno = thisdata[,phenoStartIdx:ncol(thisdata)] + isExposure = getIsExposure(varName) + + if (!is.null(dim(pheno))) { + phenoAvg = rowMeans(pheno, na.rm=TRUE) + } + else { + phenoAvg = pheno + } + + ## recode NaN to NA, which is generated if all cols of pheno are NA for a given person + idxNan = which(is.nan(phenoAvg)) + phenoAvg[idxNan] = NA; + numNotNA=length(na.omit(phenoAvg)); + + ## check whether >20% examples with same value + uniqVar = unique(na.omit(phenoAvg)) + valid = TRUE + for (uniq in uniqVar) { + numWithValue = length(which(phenoAvg==uniq)) + if (numWithValue/numNotNA >=0.2) { + valid = FALSE; + break + } + } + + if (valid == FALSE) { + + ## treat as ordinal categorical + cat(">20% IN ONE CATEGORY || "); + + # if >2 unique values then treat as ordered categorical + numUniqueValues = length(uniqVar) + + # straight forward case that there are two (or one) values + if (numUniqueValues<=2) { + ## treat as binary or skip (binary requires>=10 per category) + + ## remove categories if < 10 examples to see if this should be binary or not, but if ordered categorical + ## then we include all values when generating this + phenoAvgMoreThan10 = testNumExamples(phenoAvg) + + ## binary if 2 distinct values, else ordered categorical + phenoFactor = factor(phenoAvgMoreThan10) + numLevels = length(unique(na.omit(phenoAvgMoreThan10))) #length(levels(phenoFactor)) + + if (numLevels<=1) { + cat("SKIP (number of levels: ",numLevels,")",sep="") + incrementCounter("cont.onevalue") + } + else if (numLevels==2) { + # binary + incrementCounter("cont.binary") + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); + binaryLogisticRegression(varName, varType, thisdatanew, isExposure); + } + } + else { + ## try to treat as ordered categorical + + incrementCounter("cont.ordcattry") + ## equal sized bins + phenoBinned = equalSizedBins(phenoAvg); + + # check number of people in each bin + bin0Num = length(which(phenoBinned==0)) + bin1Num = length(which(phenoBinned==1)) + bin2Num = length(which(phenoBinned==2)) + + if (bin0Num>=10 & bin1Num>=10 & bin2Num>=10) { + + # successful binning. >=10 examples in each of the 3 bins + + incrementCounter("cont.ordcattry.ordcat") + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned); + testCategoricalOrdered(varName, varType, thisdatanew); + } + else { + # try to treat as binary because not enough examples in each bin + + if (bin0Num<10 & bin2Num<10) { + ## skip - not possible to create binary variable because first and third bins are too small + ## ie. could merge bin1 with bin 2 but then bin3 still too small etc + cat("SKIP 2 bins are too small || ") + incrementCounter("cont.ordcattry.smallbins") + } + else if ((bin0Num<10 | bin1Num<10) & (bin0Num+bin1Num)>=10) { + + # combine first and second bin to create binary variable + incrementCounter("cont.ordcattry.binsbinary") + cat("Combine first two bins and treat as binary || ") + phenoBinned[which(phenoBinned==0)] = 1 + + # test binary + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned) + binaryLogisticRegression(varName, varType, thisdatanew, isExposure); + } + else if ((bin2Num<10 | bin1Num<10) & (bin2Num+bin1Num)>=10) { + + # combine second and last bin to create binary variable + incrementCounter("cont.ordcattry.binsbinary") + cat("Combine last two bins and treat as binary || ") + phenoBinned[which(phenoBinned==2)] = 1 + + # test binary + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoBinned) + binaryLogisticRegression(varName, varType, thisdatanew, isExposure) + } + + + else { + ## skip - not possible to create binary variable because combining bins would still be too small + cat("SKIP 2 bins are too small(2) || ") + incrementCounter("cont.ordcattry.smallbins2") + } + + } + + } + } + else { + cat("IRNT || "); + incrementCounter("cont.main") + + # check there are at least 500 examples + numNotNA = length(which(!is.na(phenoAvg))) + if (numNotNA<500) { + cat("CONTINUOUS-SKIP-500 (", numNotNA, ") || ",sep=""); + incrementCounter("cont.main.500") + } + else { + ## inverse rank normal transformation + phenoIRNT = irnt(phenoAvg) + + if (opt$save == TRUE) { + # add pheno to dataframe + storeNewVar(thisdata[,"userID"], phenoIRNT, varName, 'cont') + cat("SUCCESS results-linear"); + incrementCounter("success.continuous") + } + else { + + ## do regression (use standardised geno values) + if (opt$standardise==TRUE) { + geno = scale(thisdata[,"geno"]) + } + else { + geno = thisdata[,"geno"] + } + confounders=thisdata[,3:numPreceedingCols, drop = FALSE] + + sink() + sink(modelFitLogFile, append=TRUE) + print("--------------") + print(varName) + + ###### BEGIN TRYCATCH + tryCatch({ + + fit <- lm(phenoIRNT ~ geno + ., data=confounders) + + sink() + sink(resLogFile, append=TRUE) + + sumx = summary(fit) + + pvalue = sumx$coefficients['geno','Pr(>|t|)'] + beta = sumx$coefficients["geno","Estimate"] + + if (opt$confidenceintervals == TRUE) { + cis = confint(fit, level=0.95) + lower = cis["geno", "2.5 %"] + upper = cis["geno", "97.5 %"] + } + else { + lower = NA + upper = NA + } + + numNotNA = length(which(!is.na(phenoIRNT))) + + ## save result to file + write(paste(varName, varType, numNotNA, beta, lower, upper, pvalue, sep=","), file=paste(opt$resDir,"results-linear-",opt$varTypeArg,".txt", sep=""), append="TRUE"); + cat("SUCCESS results-linear"); + + incrementCounter("success.continuous") + if (isExposure == TRUE) { + incrementCounter("success.exposure.continuous") + } + + ## END TRYCATCH + }, error = function(e) { + sink() + sink(resLogFile, append=TRUE) + cat(paste("ERROR:", varName,gsub("[\r\n]", "", e), sep=" ")) + incrementCounter("continuous.error") + }) + } + } + } +} + +irnt <- function(pheno) { + set.seed(1234) + numPhenos = length(which(!is.na(pheno))) + quantilePheno = (rank(pheno, na.last="keep", ties.method="random")-0.5)/numPhenos + phenoIRNT = qnorm(quantilePheno) + return(phenoIRNT); +} + diff --git a/WAS/testInteger.r b/WAS/testInteger.r new file mode 100644 index 0000000..9f87511 --- /dev/null +++ b/WAS/testInteger.r @@ -0,0 +1,88 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Processing integer fields, namely: +# 1) Reassigning values as specified in the data code information file +# 2) Generate a single value if there are several values (arrays) by taking the mean +# 3) Treating this field as continuous if at least 20 distinct values. +# Otherwise treat as binary or ordered categorical if 2 or more than two values. +testInteger <- function(varName, varType, thisdata) { + cat("INTEGER || "); + + pheno = thisdata[,phenoStartIdx:ncol(thisdata)] + isExposure = getIsExposure(varName) + + if (!is.numeric(as.matrix(pheno))) { + cat("SKIP Integer type but not numeric",sep=""); + return(NULL) + } + + pheno = reassignValue(pheno, varName) + + ## average if multiple columns + if (!is.null(dim(pheno))) { + phenoAvg = rowMeans(pheno, na.rm=TRUE) + + # if participant only has NA values then NaN is generated so we convert back to NA + phenoAvg = replaceNaN(phenoAvg) + } + else { + phenoAvg = pheno + } + + uniqVar = unique(na.omit(phenoAvg)) + + # if >=20 separate values then treat as continuous + if (length(uniqVar)>=20) { + + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoAvg); + testContinuous2(varName, varType, thisdatanew) + incrementCounter("int.continuous") + } + else { + + ## remove categories if < 10 examples + phenoAvg = testNumExamples(phenoAvg) + + ## binary if 2 distinct values, else ordered categorical + phenoFactor = factor(phenoAvg) + numLevels = length(levels(phenoFactor)) + if (numLevels<=1) { + cat("SKIP (number of levels: ",numLevels,")",sep=""); + incrementCounter("int.onevalue") + } + else if (numLevels==2) { + incrementCounter("int.binary") + + # binary + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); + binaryLogisticRegression(varName, varType, thisdatanew, isExposure); + } + else { + incrementCounter("int.catord") + cat("3-20 values || ") + + # we don't use equal sized bins just the original integers (that have >=10 examples) as categories + thisdatanew = cbind.data.frame(thisdata[,1:numPreceedingCols], phenoFactor); + + # treat as ordinal categorical + testCategoricalOrdered(varName, varType, thisdatanew); + } + } +} diff --git a/WAS/testNumExamples.r b/WAS/testNumExamples.r new file mode 100644 index 0000000..37cc992 --- /dev/null +++ b/WAS/testNumExamples.r @@ -0,0 +1,37 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Remove variable values if less than 10 examples have this value +testNumExamples <- function(pheno) { + + ## loop through values and remove if has < 10 examples + uniqVar = unique(na.omit(pheno)); + for (u in uniqVar) { + withValIdx = which(pheno==u) + numWithVal = length(withValIdx); + if (numWithVal<10) { + pheno[withValIdx]=NA + cat(paste("Removed ",u ,": ", numWithVal, "<10 examples || ", sep="")); + } + else { + cat(paste("Inc(>=10): ", u, "(", numWithVal, ") || ", sep="")); + } + } + return(pheno); +} diff --git a/unittests/run-tests.sh b/WAS/unittests/run-tests.sh similarity index 100% rename from unittests/run-tests.sh rename to WAS/unittests/run-tests.sh diff --git a/unittests/test_equalSizedBins.r b/WAS/unittests/test_equalSizedBins.r similarity index 100% rename from unittests/test_equalSizedBins.r rename to WAS/unittests/test_equalSizedBins.r diff --git a/unittests/test_reassignValue.r b/WAS/unittests/test_reassignValue.r similarity index 100% rename from unittests/test_reassignValue.r rename to WAS/unittests/test_reassignValue.r diff --git a/unittests/test_testCatMultiple.r b/WAS/unittests/test_testCatMultiple.r similarity index 100% rename from unittests/test_testCatMultiple.r rename to WAS/unittests/test_testCatMultiple.r diff --git a/unittests/test_testCatSingle.r b/WAS/unittests/test_testCatSingle.r similarity index 100% rename from unittests/test_testCatSingle.r rename to WAS/unittests/test_testCatSingle.r diff --git a/WAS/validatePhenotypeInput.r b/WAS/validatePhenotypeInput.r new file mode 100644 index 0000000..9b95f71 --- /dev/null +++ b/WAS/validatePhenotypeInput.r @@ -0,0 +1,85 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Validate the contents of the phenotype file +validatePhenotypeInput <- function() { + + print("Validating phenotype data ...") + + ## get just first row so we can check the column names + phenoIn = read.table(opt$phenofile, header=1, nrows=1, sep=',') + + ### + ### pheno file validation + + print(paste("Number of columns in phenotype file: ", ncol(phenoIn),sep="")) + + ## check user id exists in pheno file + idx1 = which(names(phenoIn) == opt$userId); + if (length(idx1)==0) { + stop(paste("phenotype file doesn't contain userID colunn:", opt$userId), call.=FALSE) + } + + # we only need the confounders if we are actually running the tests + if (opt$save==FALSE & is.null(opt$confounderfile)) { + + ## confounder variables exist in pheno file + idx = which(names(phenoIn) == "x21022_0_0"); + if (length(idx)==0) { + stop("phenotype file doesn't contain required age colunn: x21022_0_0", call.=FALSE) + } + + idx = which(names(phenoIn) == "x31_0_0"); + if (length(idx)==0) { + stop("phenotype file doesn't contain required sex colunn: x31_0_0", call.=FALSE) + } + + + if (opt$genetic ==TRUE) { + idx = which(names(phenoIn) == "x22000_0_0"); + if (length(idx)==0) { + stop("phenotype file doesn't contain required genetic batch colunn: x22000_0_0", call.=FALSE) + } + } + + ## if running with sensitivity option then check extra columns exist in pheno file (genetic PCs and assessment centre) + if (opt$sensitivity==TRUE) { + + if (opt$genetic ==TRUE) { + ## check first 10 genetic PCs exist + for (i in 1:10) { + idx = which(names(phenoIn) == paste("x22009_0_", i, sep="")); + if (length(idx)==0) { + stop(paste("phenotype file doesn't contain required genetic principal component colunn: x22009_0_", i, sep=""), call.=FALSE) + } + } + } + + ## assessment centre field + idx = which(names(phenoIn) == "x54_0_0"); + if (length(idx)==0) { + stop("phenotype file doesn't contain required assessment centre colunn: x54_0_0", call.=FALSE) + } + } + + } + + print("Phenotype file validated") + +} diff --git a/WAS/validateTraitInput.r b/WAS/validateTraitInput.r new file mode 100644 index 0000000..16a35f4 --- /dev/null +++ b/WAS/validateTraitInput.r @@ -0,0 +1,69 @@ +# The MIT License (MIT) +# Copyright (c) 2017 Louise AC Millard, MRC Integrative Epidemiology Unit, University of Bristol +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without +# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Validate the contents of the trait of interest file +validateTraitInput <- function(snpIn) { + +if (opt$save!=TRUE) { + + print("Validating trait of interest data ...") + + ### + ### get header of trait of interest file or pheno file (if no trait of interest file is specified + + if (is.null(opt$traitofinterestfile)) { + snpIn = read.table(opt$phenofile, header=1, nrows=1, sep=',') + } + else { + snpIn = read.table(opt$traitofinterestfile, header=1, nrows=1, sep=',') + } + + ### + ### trait of interest file validation + + print(paste("Number of columns in trait of interest file:", ncol(snpIn),sep="")) + + ## check user id exists in snp file + idx1 = which(names(snpIn) == opt$userId); + if (length(idx1)==0) { + if (is.null(opt$traitofinterestfile)) { + stop(paste("Phenotype file doesn't contain userID colunn:", opt$userId), call.=FALSE) + } else { + stop(paste("Trait of interest file doesn't contain userID colunn:", opt$userId), call.=FALSE) + } + + } + + ## check trait of interest exists in trait of interest file + idx2 = which(names(snpIn) == opt$traitofinterest); + if (length(idx2)==0) { + + if (is.null(opt$traitofinterestfile)) { + stop(paste("No trait of interest file specified, and phenotypes file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) + } + else { + stop(paste("Trait of interest file doesn't contain trait of interest variable column:", opt$traitofinterest), call.=FALSE) + } + } + + print("Trait of interest file validated") + +} + +}