-
Notifications
You must be signed in to change notification settings - Fork 19
Now it works! #11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Now it works! #11
Changes from all commits
76b6aba
9592757
749c0f5
57ae3ba
346253e
eb1b620
119c85b
484f940
9427115
201f8db
b869639
f701e68
761ef29
3c70a61
aaa8f89
303e1ca
89ca197
ef5ca78
92ffdad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
# Graphing ----
# Plots built from the compiled screening data: running case totals over time,
# new cases per day, and marker abundance by country.
library(ggplot2)

# Reading in data
allData <- read.csv(file = "allData.csv", header = TRUE)

# The data had strangely high ages that were likely errors; rows with an age of
# 110 or more are dropped. which() also drops rows whose age is NA instead of
# turning them into all-NA rows.
removed_strange_ages <- allData[which(allData$age < 110), ]

# Finding entries with a marker present; removes any entries that weren't
# infected (every marker equal to 0)
marker_cols <- sprintf("marker%02d", 1:10)
has_marker <- rowSums(removed_strange_ages[, marker_cols] != 0, na.rm = TRUE) > 0
sick <- removed_strange_ages[has_marker, ]

# Sort by country and day so the running case number below does not depend on
# the row order of the input file
sick <- sick[order(sick$country, sick$dayofYear), ]

# Sub-setting data by country
Xdata <- sick[sick$country == "X", ]
Ydata <- sick[sick$country == "Y", ]

# Running case number within each country (1, 2, 3, ... in day order)
sick$Casenum <- ave(seq_len(nrow(sick)), sick$country, FUN = seq_along)

# Plot cumulative cases in each country against time
ggplot(data = sick, aes(x = dayofYear, y = Casenum, colour = country)) +
  geom_smooth() +
  xlab("Day of the Year") +
  ylab("Running Case Total") +
  theme_classic()

# Plot new cases against time
ggplot(data = sick, aes(x = dayofYear)) +
  geom_bar(position = "dodge", aes(fill = country), show.legend = FALSE) +
  xlab("Day of the Year") +
  ylab("New Cases Per Day") +
  facet_grid(. ~ country) +
  theme_classic()

# Marker comparison by country: one row per marker holding its column total.
# The country label is written directly, so it is correct even when a country
# has fewer than ten infected patients.
XmarkerData <- data.frame(
  MarkerNum = marker_cols,
  MarkerSum = colSums(Xdata[, marker_cols]),
  Country = "X",
  row.names = NULL
)
YmarkerData <- data.frame(
  MarkerNum = marker_cols,
  MarkerSum = colSums(Ydata[, marker_cols]),
  Country = "Y",
  row.names = NULL
)
MarkerforPlot <- rbind(XmarkerData, YmarkerData)

# Plot
ggplot(data = MarkerforPlot, aes(x = Country, y = MarkerSum, fill = MarkerNum)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Country") +
  ylab("Marker Abundance") +
  theme_minimal()
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
# Age distribution histogram of patients (this goes in supportingFunctions.R script)

# Load the ggplot2 package
library(ggplot2)

# Reading in data
allData <- read.csv(file = "allData.csv", header = TRUE)

# The data had strangely high ages that were likely errors; rows with an age of
# 120 or more are dropped. which() also drops rows whose age is NA instead of
# turning them into all-NA rows.
removed_strange_ages <- allData[which(allData$age < 120), ]

# Finding entries with a marker present (at least one non-zero marker)
marker_cols <- sprintf("marker%02d", 1:10)
has_marker <- rowSums(removed_strange_ages[, marker_cols] != 0, na.rm = TRUE) > 0
sick <- removed_strange_ages[has_marker, ]

# Visualize age distribution in 10-year bins.
# The 0-120 window is set with coord_cartesian(): an xlim() call would be
# silently replaced by the later scale_x_continuous().
ggplot(sick, aes(x = age)) +
  geom_histogram(binwidth = 10, center = 5) +
  xlab("Age Groups") +
  ylab("Number of People Infected") +
  scale_x_continuous(breaks = seq(0, 120, 10)) +
  scale_y_continuous(breaks = seq(0, 15000, 1250)) +
  coord_cartesian(xlim = c(0, 120))
|
|
||
# analysis.R script

# Load functions in supportingFunctions.R file.
# Sourced once, by relative path, so the script does not depend on one user's
# absolute folder layout; run it from the project folder.
source("RProject2022_Submission.R")

# Compile all data into single CSV by calling function
# NOTE(review): compiledData() is not defined in the code visible here -- confirm
# the function name and the arguments it needs in RProject2022_Submission.R.
compiledData()

# Process data
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,157 @@ | ||
| #github was very uncooperative, we all did pretty equal work though | ||
| #1) Our figures provide good evidence, based on the cases per day, that the disease originated in country X. | ||
| #2) We also determined that a vaccine developed in country Y would have limited use for country X. | ||
| #The disparity in markers present between the two countries means that the little overlap that does occur would not confer good immunity to people in country X.
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Which graph are you referring to? |
||
|
|
||
# Analysis R script ----

# Folder that holds RProject2022_Submission.R.
# The path must be a quoted string; it is machine-specific, so edit it to match
# your own checkout.
setwd("C:/Users/natal/Desktop/shell-lesson-data/shell-lesson-data/Rproject")

# Load functions in RProject2022_Submission.R file (file name quoted; sourced
# once, relative to the working directory set above)
source("RProject2022_Submission.R")
#-------------------------
# Compile all data into single CSV by calling function
# compile data from all .csv files in a directory into a single .csv file
# same columns, also add "country" and "dayofYear" columns.
# user should be able to remove NA rows, include NA rows but be warned,
# or include NAs without warning

# Path directory
setwd("/Users/avivalund/Desktop/Biocomputing/FinalProject/RProject/")
|
|
||
| #Compile; couldn't get source to work, so here is the code that would compile the CSVs | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can use source only if the file exists in the same directory |
||
# Compile every screening .csv file in a directory into a single data set.
#
# Arguments:
#   directory - folder containing the per-day .csv files
#   country   - label stored in the added `country` column and used in the
#               output file name
#   naOption  - "remove":  drop rows that contain an NA in any column
#               "warn":    keep NA rows, but warn if any NA is present
#               "include": keep NA rows without warning
#
# Side effect: writes the compiled table into `directory` under the name
#   paste("country", country, "_alldata.csv")
# Returns the compiled data frame invisibly. Stops with an error if `naOption`
# is not one of the three values above or if no input files are found.
compileData <- function(directory, country, naOption) {
  # Reject a bad option before any file is read or written
  valid_option <- is.character(naOption) && length(naOption) == 1 &&
    naOption %in% c("remove", "warn", "include")
  if (!valid_option) {
    stop("Invalid value for naOption parameter")
  }

  # paste() separates its pieces with spaces (e.g. "country X _alldata.csv");
  # the name is kept as-is so existing readers of the file still find it
  output_name <- paste("country", country, "_alldata.csv")

  # Anchor the pattern so only names ending in ".csv" match, and skip our own
  # output so a re-run does not fold the previous result back in
  csvlist <- list.files(path = directory, pattern = "\\.csv$")
  csvlist <- setdiff(csvlist, output_name)
  if (length(csvlist) == 0) {
    stop("No .csv files found in directory")
  }

  # Read one file and tag its rows. Files are addressed by full path, so the
  # caller's working directory is never changed.
  read_one <- function(csv_name) {
    screens <- read.csv(file.path(directory, csv_name))
    screens$country <- rep(country, nrow(screens))
    # Characters 8-10 of the file name are read as the day of year
    # NOTE(review): assumes names shaped like "screen_120.csv" -- confirm
    screens$dayofYear <- rep(as.numeric(substr(csv_name, 8, 10)), nrow(screens))
    screens
  }
  # Bind once instead of growing the table inside a loop; also correct when
  # the directory holds a single file
  input <- do.call(rbind, lapply(csvlist, read_one))

  if (naOption == "remove") {
    # Remove rows with NA's in any columns
    input <- na.omit(input)
  } else if (naOption == "warn" && any(is.na(input))) {
    warning("Data contains NA values")
  }
  # naOption == "include": NA rows are kept without warning

  write.csv(input, file.path(directory, output_name), row.names = FALSE)
  invisible(input)
}
| #------------------------------------------------------------------------------------------------------------------ | ||
| #Script 2: analyzing script, source("supportingFunctions.R") | ||
| # Write a function to summarize the compiled data set in terms of | ||
| # number of screens run, percent of patients screened that were infected, | ||
| # male vs. female patients, and the age distribution of patients. | ||
|
|
||
| #percentage male, percentage female, percentage positive, percentage negative, age distribution | ||
# Read the compiled data and drop rows whose age is 110 or more (likely entry
# errors). The subset must index `summarydata`: a bare `data` here refers to the
# base R function data(), not to the table. which() also drops NA ages instead
# of producing all-NA rows.
summarydata <- read.csv("./alldata.csv")
removed_strange_ages <- summarydata[which(summarydata$age < 110), ]
# Print a text summary of the compiled screening data and build an age
# histogram of infected patients, split by gender.
#
# Arguments:
#   data - compiled data frame with `age`, `gender` and marker columns
#          (marker01, marker02, ...)
#
# Prints: number of screens run, number of positive screenings, and the
#   percentages positive / negative / male / female.
# Returns: the ggplot histogram (shown automatically when the function is
#   called at the console).
summarizedCompileData <- function(data) {
  cat("Summarized Data")

  # Number of screens run
  cat("\nNumber of Screens Run:", nrow(data))

  # Remove ages that were likely errors (110 or more). which() drops NA ages
  # rather than turning them into all-NA rows that would inflate the counts.
  removed_strange_ages <- data[which(data$age < 110), ]

  # A patient is infected when at least one marker column is non-zero; one
  # vectorised rowSums() replaces a ten-way chain of comparisons. Everyone
  # else is healthy, so the two groups always add up to the whole.
  marker_cols <- grep("^marker[0-9]+$", names(removed_strange_ages))
  marker_hits <- removed_strange_ages[, marker_cols, drop = FALSE] != 0
  infected <- rowSums(marker_hits, na.rm = TRUE) > 0
  sick <- removed_strange_ages[infected, ]
  healthy <- removed_strange_ages[!infected, ]
  cat("\nNumber of Positive Screenings:", nrow(sick))

  # Percentages
  n_screened <- nrow(removed_strange_ages)
  n_sick <- nrow(sick)
  cat("\n\nPercentage of screenings that were positive:",
      (n_sick / n_screened) * 100, "%")
  cat("\nPercentage of positive screens that were male:",
      (sum(sick$gender == "male", na.rm = TRUE) / n_sick * 100), "%")
  cat("\n\nPercentage of positive screens that were female:",
      (sum(sick$gender == "female", na.rm = TRUE) / n_sick * 100), "%")
  cat("\nPercentage of screenings that were negative:",
      (nrow(healthy) / n_screened) * 100, "%")

  # Visualize age distribution
  library(ggplot2)
  # Histogram in 10-year bins. The 0-120 window is set with coord_cartesian():
  # an xlim() call would be silently replaced by the later scale_x_continuous().
  ggplot(sick, aes(x = age)) +
    geom_histogram(binwidth = 10, center = 5, color = "white") +
    xlab("Age Groups") +
    ylab("Number of People Infected") +
    ggtitle("Age Distribution by Sex of Patients") +
    facet_grid(. ~ gender) +
    scale_x_continuous(breaks = seq(0, 120, 10)) +
    scale_y_continuous(breaks = seq(0, 15000, 1250)) +
    coord_cartesian(xlim = c(0, 120)) +
    theme_bw()
}
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +2 |
||
|
|
||
|
|
||
# Graphing ----
# Plots built from the compiled screening data: running case totals over time,
# new cases per day, and marker abundance by country.
library(ggplot2)

# Reading in data
allData <- read.csv(file = "allData.csv", header = TRUE)

# The data had strangely high ages that were likely errors; rows with an age of
# 110 or more are dropped. which() also drops rows whose age is NA instead of
# turning them into all-NA rows.
removed_strange_ages <- allData[which(allData$age < 110), ]

# Finding entries with a marker present; removes any entries that weren't
# infected (every marker equal to 0)
marker_cols <- sprintf("marker%02d", 1:10)
has_marker <- rowSums(removed_strange_ages[, marker_cols] != 0, na.rm = TRUE) > 0
sick <- removed_strange_ages[has_marker, ]

# Sort by country and day so the running case number below does not depend on
# the row order of the input file
sick <- sick[order(sick$country, sick$dayofYear), ]

# Sub-setting data by country
Xdata <- sick[sick$country == "X", ]
Ydata <- sick[sick$country == "Y", ]

# Running case number within each country (1, 2, 3, ... in day order)
sick$Casenum <- ave(seq_len(nrow(sick)), sick$country, FUN = seq_along)

# Plot cumulative cases in each country against time
ggplot(data = sick, aes(x = dayofYear, y = Casenum, colour = country)) +
  geom_smooth() +
  xlab("Day of the Year") +
  ylab("Running Case Total") +
  theme_classic()

# Plot new cases against time
ggplot(data = sick, aes(x = dayofYear)) +
  geom_bar(position = "dodge", aes(fill = country), show.legend = FALSE) +
  xlab("Day of the Year") +
  ylab("New Cases Per Day") +
  facet_grid(. ~ country) +
  theme_classic()

# Marker comparison by country: one row per marker holding its column total.
# The country label is written directly, so it is correct even when a country
# has fewer than ten infected patients.
XmarkerData <- data.frame(
  MarkerNum = marker_cols,
  MarkerSum = colSums(Xdata[, marker_cols]),
  Country = "X",
  row.names = NULL
)
YmarkerData <- data.frame(
  MarkerNum = marker_cols,
  MarkerSum = colSums(Ydata[, marker_cols]),
  Country = "Y",
  row.names = NULL
)
MarkerforPlot <- rbind(XmarkerData, YmarkerData)

# Plot
ggplot(data = MarkerforPlot, aes(x = Country, y = MarkerSum, fill = MarkerNum)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Country") +
  ylab("Marker Abundance") +
  theme_minimal()
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You did not use any support functions in the script -2
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good commenting and efficient code that uses coding concepts that we covered in class – 3.75 points (max is 4) |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not specific enough. You need to point to the evidence from the graph and say why the cases matter.
-0.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also it's unclear which graph you are referring to