IBPA · ameenetemady · Nov 13, 2019 · Nov 13, 2019 · Jan 27, 2020 · Jan 27, 2020
diff --git a/README.md b/README.md
@@ -0,0 +1,56 @@
+### What is OPEX?
+
+OPEX is an optimal experimental design framework to help biologists to select the most informative experiments to conduct given the experiments conducted up to now. This repo demonstrates the application of OPEX on collecting gene expression data of <em>E. coli<em> under the stress of various antibiotics and biocides.
+
+### Dependencies
+* [mlegp](https://cran.r-project.org/web/packages/mlegp/index.html)
+
+### Structure of this project
+
+### Input data
+The input data is a table, in which the first 14 columns define the culture conditions in each row and the other 1123 columns represents the gene expression profile for each condition. (Genes that did not have a sufficient sequecing depth were excluded). 
+
+A culutre condition is defined by a binary vecotr, representing the presence (with 0) or absence (with 1) of 10 biocides and 4 antibiotics: ```Chlorexidine, Phenol, H2O2, Isopropanol, Bezalkonium_chloride, Ethanol, Glutaraldehyde, Percetic_acid, Sodium_hypochlorite, Povidone_iodine, Kanamycin, Rifampicin, Norfloxacin, Ampicillin```
+
+### How to run
+* Step1: generate a file that include the configurations for running OPEX.
+  ```
+  cd generate_setting
+  Rscript generate_setting.R
+  ```
+  After running the above commands, a file named ```generate_setting.csv``` is generated in ```./out_data```. The ```generate_setting.csv``` specifies the value for hyper-parameters: random_seed, exploration frequency, adaptive , start size, add, dataset id, noise, iter_num, and method.
+
+* Step2: Run the simulation on one configuration.
+```
+cd code
+bash run_one_setting.sh
+```
+
+Upon completion, a folder named ```expert_sample``` will be created in ```./out_data```. The folder has the same name as the Rscript used for generating configurations in Step 1. Inside the folder, ```expert_sample```, there is a folder for each sampling approach.
+
+The result is a csv file named by the configuration and contains the order of each culture condition selected by expert sampling.
+
+### Code architecture
+In this dir, ```./code/workhorse```, there are seven R scripts as follows. In ```main.R```, an object of the ```Simulate``` class is defined according to a configuration determined by a command line argument. The defintion of the ```Simulate``` class is defined in ```OOPGP.R```. The major method in the ```Simulate``` class is the ```simulate``` method. All the other methods and functions in other files are for supporting the ```simulate``` method of the ```Simulate``` class.
+
+```
+├── generate_setting
+│   └── expert_sampling.R
+├── run_one_setting.sh
+├── setpath.R
+└── workhorse
+    ├── add_noise.R
+    ├── main.R
+    ├── max_dist.R
+    ├── OOPGP.R
+    ├── prepare_data.R
+    ├── screen_index_helper.R
+    └── update_train_pool.R
+
+```
+
+
+### Acknowledgement
+This work was supported by an NSF award (#1743101).
+
+
diff --git a/code/generate_setting/.Rhistory b/code/generate_setting/.Rhistory
diff --git a/code/generate_setting/generate_setting.R b/code/generate_setting/generate_setting.R
@@ -0,0 +1,20 @@
+args = commandArgs(trailingOnly=FALSE)
+print(args[4])
+setting_file = paste(gsub(".R$", "", gsub("--file=", "", args[4])), ".csv", sep="")
+print(setting_file)
+
+setting = list(
+               random_seed = c(51:100),
+               exploration=c(1),
+               anti_batch = c(0),
+               start_size = c(15),
+               add=c(1, 3),
+               data=14,
+               noise=0,
+               iter_num = 30, # this depends on the start_size
+               method = c("Tan", "Random")
+               )
+setting = expand.grid(setting)
+#setting["iter_num"] = 24/setting["add"]
+write.csv(setting, paste("../../out_data/",setting_file,sep=""), row.names = FALSE)
+print(nrow(setting))
diff --git a/code/run_one_setting.sh b/code/run_one_setting.sh
@@ -0,0 +1 @@
+Rscript ./workhorse/main.R expert_sampling.csv 100
diff --git a/code/setpath.R b/code/setpath.R
@@ -0,0 +1,2 @@
+data_path="/home/wxk/Git/OED2019/data"
+out_data_path="/home/wxk/Git/OED2019/out_data"
diff --git a/code/workhorse/.nfs0000000007b8271b00000630 b/code/workhorse/.nfs0000000007b8271b00000630
diff --git a/code/workhorse/OOPGP.R b/code/workhorse/OOPGP.R
@@ -0,0 +1,118 @@
+# Define a class named Simulator.
+# The main method used in main.R is the simulate method. Other methods of this class and modules in other files are helper functions 
+# for the simulate method.
+
+
+library(mlegp)
+Simulator <- function(file_path_,setting_){
+  setting = setting_
+  input_path = file_path_
+  dists = NA 
+  gene_id = 15 
+  res_path = NA
+  res_name = NA
+  res_path = NA
+  data_all = NA
+  train = NA
+  train_backup = NA
+  pool = NA
+  pool_all  = NA
+  benchmark = NA
+  model = NA
+  errors = NA
+  structure(class="Simulator", environment())
+}
+
+
+source("prepare_data.R")
+
+train_simulator <- function(simulator) {
+    train = simulator$train[, c(1:14, simulator$gene_id)]
+    print("training data gene id")
+    print(dim(train))
+    num = ncol(train)
+    simulator$model = mlegp(train[,2:num-1], train[,num])
+
+  }
+
+
+source("update_train_pool.R")
+
+simulate_index <- function(simulator, metric_fun){
+   select_index = as.numeric()
+   for (i in 1:simulator$setting[["add"]]) {
+        metric = metric_fun(simulator)
+        index=sort(metric, decreasing = TRUE,index.return=TRUE)$ix
+
+        select_index = c(select_index,index[1])
+        update_train_by_prediction(simulator, index[1])
+        train_simulator(simulator)
+   }
+   return (select_index)
+}
+
+
+source("screen_index_helper.R")
+source("max_dist.R")
+
+screen_index <- function(simulator) {
+  method = simulator$setting[["method"]]
+  if (method != "Random" && method != "Expert") {
+      metric_fun = switch(simulator$setting[["method"]],
+                          "EN" = EN_metric,
+                          "MI" = MI_metric,
+                          "COV"= COV_metric,
+                          "DIV"=diversity_metric,
+                          "Tan"=tanimoto_metric)
+
+      if (simulator$setting[["anti_batch"]]==1) {
+         return(simulate_index(simulator,metric_fun))
+      } else {
+         metric = metric_fun(simulator)
+         index=sort(metric, decreasing = TRUE,index.return=TRUE)$ix
+         return(index[1:simulator$setting[["add"]]])
+      }
+  } else {
+      index = seq(1,nrow(simulator$pool),1)
+      return(index[1:simulator$setting[["add"]]])
+  }
+}
+
+
+simulate <- function(simulator){
+  iter_num = simulator$setting[["iter_num"]]
+  simulator$errors = as.numeric()
+  prepare_data(simulator)
+  for (i in 1:iter_num) {
+    # when exploration==1, no exploration; otherwise the frequency of using OED is 1/["exploration"]
+    if ((i%%simulator$setting[["exploration"]]!=0 | i==iter_num) & simulator$setting[["exploration"]] != 1) {
+      cat("random sampling-----------------------")
+      common_index = 1
+    } else {
+     cat("OED---------------------")    
+     indexes = as.numeric()
+     set.seed(i + simulator$setting[["random_seed"]])
+     gene_ids = sample(15:1137, 800, replace=FALSE)
+     for (gene_id in gene_ids) {
+        simulator$gene_id = gene_id
+        try(train_simulator(simulator))
+        select_index = try(screen_index(simulator))
+        if (isTRUE(class(select_index)=="try-error")) next
+        indexes = c(indexes, select_index)
+        # all the genes are the same for the "Tani" metric
+        if (simulator$setting[["method"]] == "Tan" | simulator$setting[["method"]] == "Random") {
+          cat("only one gene needed-----------------")
+          break
+        }
+     }
+     common_index = names(sort(table(indexes), decreasing=TRUE)[1])
+     common_index = as.numeric(common_index)
+     }
+     cat("common index: ", common_index,"\n") 
+     update_train_pool(simulator,common_index)
+     print("------------dimension of the training set")
+     print(dim(simulator$train))
+  } 
+}
+
+
diff --git a/code/workhorse/add_noise.R b/code/workhorse/add_noise.R
@@ -0,0 +1,13 @@
+add_noise <- function(noise_level,adddata) {
+
+  noises = rep(0,nrow(adddata))
+  for (dup in 1:4) {
+    noise = rnorm(nrow(adddata),0,adddata[,3]*noise_level*0.01)
+    noises = noises + noise
+  }
+  noises = noises/4
+  adddata[,3] = adddata[,3] + noises
+
+  return(adddata)
+}
+
diff --git a/code/workhorse/main.R b/code/workhorse/main.R
@@ -0,0 +1,44 @@
+library(mlegp)
+
+setwd("./workhorse")
+source("../setpath.R")
+source("OOPGP.R")
+
+args = commandArgs(TRUE)
+setting_file = args[1]
+setting_id = as.integer(args[2])
+
+out_data_path = paste(out_data_path, sub("\\..*$",'',setting_file),sep="/")
+dir.create(out_data_path, showWarnings = FALSE)
+
+# read in the configuration file and slice one setting
+settings = read.csv(paste("../../out_data/",setting_file,sep=""), stringsAsFactors=FALSE)
+s = settings[setting_id,,drop=TRUE]
+
+# parse the setting into a list
+setting_ = list(
+                 "random_seed" = s[["random_seed"]],
+                 "exploration"=s[["exploration"]],      
+                 "anti_batch" = s[["anti_batch"]],
+                 "start_size" = s[["start_size"]],
+                 "add"= s[["add"]],
+                 "data"=s[["data"]],
+                  "noise"=s[["noise"]],
+                  "iter_num" = s[["iter_num"]],
+                  "method"= s[["method"]]
+                   )
+
+# create the path for the input data
+file_path =  file.path(data_path, paste(setting_[["data"]],".csv",sep=""))
+
+# create an object of the Simulator class
+simulator = Simulator(file_path,setting_)
+
+# load the support data for expert sampling
+dists = read.csv(file.path(data_path, "pairwise_similarity_bio.csv"), row.names=1)
+simulator$dists = dists
+
+# run the simulator
+simulate(simulator)
+
+
diff --git a/code/workhorse/max_dist.R b/code/workhorse/max_dist.R
@@ -0,0 +1,28 @@
+max_dist<- function (simulator,index) {
+
+  add = simulator$setting[["add"]]
+  pool = simulator$pool
+  select = as.numeric()
+  select[1] = index[1]
+  removed = as.numeric()
+  left = setdiff(index,select)
+
+  while(length(select)<add){
+    for (l in left){
+      d = 10 # a random large number
+      # compute the distance of each left datapoint to all the select 
+        # datapoints and store the minimum
+      for (s in select) {
+      d = min(d,sqrt(sum((pool[l,1:ncol(pool)]- pool[s,1:ncol(pool)])**2)))
+      }
+
+      left = setdiff(left,l) # no matter selected or not no need to consider it in the 
+                             # next iteration of while loop
+      if (d > 0.35) {
+        select = c(select,l)
+        break
+      }
+    }
+  }
+  return(select)
+}
diff --git a/code/workhorse/prepare_data.R b/code/workhorse/prepare_data.R
@@ -0,0 +1,85 @@
+set_res_path <- function(simulator) {
+  cols = names(simulator$setting)[c(1,2)]
+  values = unlist(simulator$setting)[c(1,2)]
+  folder = paste0(cols,values,sep="",collapse = "-")
+  dir.create(file.path(out_data_path,folder), showWarnings = FALSE)
+  simulator$res_path = file.path(out_data_path,folder,simulator$setting["method"])
+  cat("create folder")
+  cat(simulator$res_path)
+  dir.create(simulator$res_path,showWarnings = FALSE)
+}
+
+set_res_name <- function(simulator){
+
+  simulator$res_name = paste("every",simulator$setting["add"],"iter_num",simulator$setting["iter_num"]
+                             ,sep = "_")
+}
+
+save_benchmark <- function(simulator) {
+
+  benchmark_file = paste(simulator$res_name,"_bench.csv",sep="")
+  write.csv(simulator$benchmark,file.path(simulator$res_path,benchmark_file))
+}
+
+
+
+load_data <- function(simulator) {
+  # load and sort the data, the first 14 rows covers all the 
+
+  # gene_id = simulator$setting[["gene_id"]]
+  data = read.csv(simulator$input_path, stringsAsFactors=FALSE)
+  con_names = data[,1]
+  data = data[,-1]
+  once_rows = which(apply(data[,1:14],1,sum)==1)
+
+  #you can use a 2-d table to image this
+  set.seed(simulator$setting[["random_seed"]]) # shuffle before splitting
+  antiseptics = sample(colnames(data)[1:10], 10, replace = FALSE)
+  antibiotics = sample(colnames(data)[11:14], 4, replace = FALSE)
+
+  row_index = sample(1:10, 10, replace = FALSE)
+  col_index = c(1,2,3,4,3,2,1,2,4,1)
+  start_condition = c(paste(antiseptics[row_index], antibiotics[col_index],sep="_"),"Control")
+  selected = match(start_condition,con_names)
+  selected = c(selected, once_rows)  
+  data_all = as.matrix(data)
+
+  print(apply(data_all[selected,1:14],2,sum))
+  cat("number of selected points", length(selected))
+
+  left = setdiff(sample(1:nrow(data_all)), selected)
+  simulator$data_all = data_all[c(selected, left),]
+}
+
+split_data <- function(simulator){
+
+  start_size = simulator$setting[["start_size"]]
+  simulator$train_backup = simulator$data_all[1:start_size,]
+  print("number of columns")
+  print(ncol(simulator$train_backup))
+  print("starting data for training")
+  print(simulator$train_backup[, 1:15])
+  simulator$train = simulator$train_backup
+  print("data for pool")
+  simulator$pool  = simulator$data_all[(start_size+1):nrow(simulator$data_all),]
+
+  if (simulator$setting[["method"]] == "Expert") {
+     expert_path = file.path(dirname(simulator$input_path), paste(simulator$setting[["random_seed"]], "_expert.csv", sep=""))
+     expert_file = read.csv(expert_path)
+     expert_order = expert_file[ (simulator$setting[["start_size"]]+1):nrow(expert_file), 2]
+     print(simulator$pool[, 1:15])
+     simulator$pool = simulator$pool[expert_order, ]
+  }
+  print(simulator$pool[, 1:15])
+  simulator$benchmark  = simulator$pool
+
+}
+
+prepare_data <- function(simulator){
+  set_res_name(simulator)
+  set_res_path(simulator) 
+  load_data(simulator)
+  split_data(simulator)
+#  save_benchmark(simulator)
+}
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Rscript ./workhorse/main.R expert_sampling.csv 100
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		data_path="/home/wxk/Git/OED2019/data"
ameenetemady marked this conversation as resolved. Show resolved Hide resolved
		out_data_path="/home/wxk/Git/OED2019/out_data"