-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCleanVIA.R
More file actions
92 lines (67 loc) · 3.2 KB
/
CleanVIA.R
File metadata and controls
92 lines (67 loc) · 3.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
library(dplyr)
library(plyr)
library(zoo)
library(reshape2)
#############################################################################################
#' Title: "Clean VIA Excel Datasets"
#' Mausam Duggal, Systems Analysis Group, WSP|PB", April 19, 2016"
#############################################################################################
# set working directory
wd <- setwd("c:/personal/R")
##############################################################################################
#' set the years the data is available
year.lst <- c(2015, 2014, 2013, 2012, 2011, 2010, 2009)
for (i in 1: length(year.lst)) {
# create filename handle
filename <- paste0(year.lst[i], sep="")
#' read in csv file of VIA (2015 per Weekday Origin Tab in this case). We can loop
#' this to read all the tabs.
via <- read.csv(file = paste0(filename[i], "VIA.csv"), stringsAsFactors = FALSE)
#' rename columns to make them more readable
via <- rename(via, c("X"="Month", "Origin"="Day", "X.1"="Stn", "X.2"="Name"))
via$Month[via$Month == ""] <- NA
via$Day[via$Day == ""] <- NA
# make copy to do transformation on
via1 <- via
#' copy in the first month because the excel file when exported to CSV does not populate the
#' first row due to formatting issues. I am trying to do the least amount of formatting in Excel
#' except deleting the first 17 rows
via1$Month[1] <- "Jan"
#' #' Now use the ZOO package to populate with the earlier non-NA value
#' via1$Origin <- na.locf(via1$Origin) # first do it for the weekdays
#' via1$X <- na.locf(via1$X) # next do it for the month
################################################################################################
#' using a function instead of Zoo to populate WEEKDAY columns
################################################################################################
fill.vals = function(x) {
ind = which(!is.na(x$Day)) # get positions of nonmissing values
#ind1 <- as.data.frame(ind)
if(is.na(x$Day[1])) # if it begins with a missing, add the
ind = c(1,ind) # first position to the indices
#ind2 <- as.data.frame(ind)
x$Day <- rep(x$Day[ind], times = diff( # repeat the values at these indices
c(ind, nrow(x) + 1)))
}
#' call for via1
via1$Day <- fill.vals(via1)
#' using a function instead of Zoo to populate MONTH columns
fill.vals1 = function(x) {
ind = which(!is.na(x$Month)) # get positions of nonmissing values
#ind1 <- as.data.frame(ind)
if(is.na(x$Month[1])) # if it begins with a missing, add the
ind = c(1,ind) # first position to the indices
#ind2 <- as.data.frame(ind)
x$Month <- rep(x$Month[ind], times = diff( # repeat the values at these indices
c(ind, nrow(x) + 1)))
}
#' call for via1 to fill in the Month
via1$Month <- fill.vals1(via1)
#' Now use Grep function to clean the dataset by
#' getting rid of the total rows
via1.clean <- via1[-grep("Total|Origin", via1$Day),]
#' Now let's melth this data and add year to the cleaned dataset
via1.clean.m <- melt(via1.clean, id.vars = c("Month", "Day", "Stn", "Name"))
via1.clean.m$year <- year.lst[i]
#' write csv file
write.csv(via1.clean.m, file = paste0(year.lst[i],"cleanedVIA.csv"))
}