model/gapfill_functions.R at master · amcarter/model · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
library(dplyr)
# as numeric shortcut
a.n <- function(x) as.numeric(x)

# gap fill
na_fill <- function(x, tol=Inf){
    # x is a vector of data
    # tol is max number of steps missing (if greater, it retains NA)
    ina <- is.na(x)
    csum <- cumsum(!ina)
    # which gaps are too long
    x[ina] <- approx(x, xout = which(ina))$y
    if(!is.infinite(tol)){ # if not infinite tol, remove threshold values
        wg <- a.n(names(which(table(csum) > tol)))
        x[which(csum%in%wg)[-1]] <- NA
    }
    x
}

# linearly fill dataframe - works but could be optimized to not "apply"
linear_fill <- function(df, tol=Inf){
    # df is dataframe
    apply(df, 2, na_fill, tol=tol)
}

# find similar k days
top_k <- function(df, k, minobs=3){
    # df is dataframe
    # k is number of similar days to find
    # minobs is the min. observations required for a day
    D <- matrix(NA, nrow(df), k) # empty dataframe for matches for each day
    # rows with missing data, exclude first and last
    cc <- complete.cases(df)
    narows <- which(!cc)
    narows <- narows[1:(length(narows)-1)]
    # for each row with missing data,
    #  find similar rows without missing data
    for (i in narows){
        # skip days with less than n observations... hard to say anything about similarity
        if(sum(!is.na(df[i,])) < minobs){ next } # alternative: skip if all na
        # difference between the observations and the rest of the dataframe
        xdelt <- t(a.n(df[i,]) - t(df))
        # initialize NA row indexes
        delt <- rep(NA, nrow(df))
        # candidate rows are only rows without nan
        for(j in which(cc)){
            tt <- xdelt[j,] # the candidate row
            # skip comparisons with all NA rows (or self) or where limited comparison data
            if(j == i || all(is.na(tt))){ next }
            wn <- which(is.na(tt)) # which are NA
            # euclidean distance, not normalized by the length of the vectors
            #  (b/c we don't care about absolute distance)
            delt[j] <- tt[-wn] %*% tt[-wn]
        }
        # get top k matches
        D[i,] <- order(delt)[1:k]
    }
    D
}


# data prep function for fill_gaps
# adds snap points, gets average data
prep_missing <- function(df, ns, xd, mm){
    # df is the data frame
    # xd is the data frame of days
    # mm is the missing days
    # ns is the matching neighbors
    #
    ### MISSING DATA
    missing <- filter(df, date%in%xd$date[mm])
    # if missing first and last obs, extend timeseries to include neighbor days
    #  unless first/last day
    if(any(!complete.cases(missing)[c(1,nrow(missing))])){
        if(mm[1]!=1) mm = c(mm[1]-1, mm) # if not first day, extend obs range
        if(tail(mm,1)!=nrow(xd)) mm = c(mm, tail(mm,1)+1) # if not last day, extend obs range
        missing <- filter(df, date%in%xd$date[mm]) # missing one step interpolation
        # if any data missing still (i.e., first and last day), add avg of first and last obs
        #  this should catch most NAs for filling missing
        if(any(!complete.cases(missing)[c(1,nrow(missing))])){
            missing[c(1,nrow(missing)),-c(1,2)] <- apply(missing[c(1,nrow(missing)),-c(1,2)], 2, mean, na.rm=T)
        }
    }
    ndays <- length(mm)
    ### SIMILAR DATA
    # grab similar days - tuples of missing day and similar day
    ss <- data.frame(date=xd$date[mm], match=xd$date[t(ns[mm,])])
    ss <- ss[complete.cases(ss),]
    similar <- left_join(ss, df, by=c("match"="date")) %>%
        select(-match) %>% group_by(date, time) %>% summarize_all(mean) %>% ungroup()
    # make sure that the dates in similar and missing line up
    missing <- right_join(missing, select(similar,date,time), by=c("date","time"))
    ### DAILY SNAP POINTS
    # add snap points at each new day to rescale
    newdaypoints <- which(missing$time %in% missing$time[c(1,nrow(missing))])
    daypoints <- missing[newdaypoints,]
    if(any(is.na(daypoints))){
        dayfill <- linear_fill(select(daypoints,-date,-time))
        missing[newdaypoints,] <- data.frame(date=daypoints$date, time=daypoints$time, dayfill)
    }
    list(missing=select(missing,-date,-time), similar=select(similar,-date,-time), index=select(similar,date,time))
}

fill_missing <- function(df, ns, xd, ids, maxspan, lim=0){
    # df is the data frame
    # ns are the similar days for each day
    # xd is the daily data
    # ids are the identifiers
    # maxspan is the maximum number of days to gap fill
    # lim is the minimum number of days to fill
    #     will not fill gaps that are less than this
    #     used for testing (b/c the test data have pre-existing gaps)
    #
    # the days that need filling in xd
    filld <- which(complete.cases(ns))
    # groups for blocks of missing data
    group <- cumsum(c(T,diff(filld)>1))
    for(g in unique(group)){
        # grab missing days
        mm <- filld[group==g]
        if(length(mm)>=lim && length(mm)<maxspan){
            pp <- prep_missing(df, ns, xd, mm)
            dy <- (pp$missing - pp$similar)
            dyhat <- linear_fill(dy)
            filled <- pp$similar + dyhat
            df[which(paste(df$date,df$time)%in%paste(pp$index$date,pp$index$time)),-c(1,2)] <- filled
        }
    }
    data.frame(ids,select(df,-date,-time), stringsAsFactors=FALSE)
}

gap_fill <- function(df, maxspan_days=5, knn=3){
    df <- select(df, -DateTime_UTC)
    # require 15 min interval

    # kind of goofy to do this by date and time, but that's because I translated the code from Python
    dds <- df %>% mutate(date=as.Date(solar.time), time=strftime(solar.time, format="%H:%M:%S")) %>%
        select(-c(region, site, solar.time)) %>% select(date, time, everything())
    # index data
    ids <- df %>% select(c(region, site, solar.time))

    # linearly interpolate df
    ddl <- dds %>% select(-date, -time) %>% linear_fill(tol=12)
    dds <- data.frame(select(dds,date,time),ddl)

    # get daily averages if full day observations, otherwise NA
    xd <- dds %>% select(-time) %>% group_by(date) %>%
        summarize_all(funs((n()==96)*mean(.)))

    ns <- top_k(select(xd,-date), k=knn, minobs=2)

    filled <- fill_missing(dds, ns, xd, ids, maxspan=maxspan_days)

    filled
}

# test
# df <- read.csv("CT_Farmington.csv")
# df_filled <- gap_fill(df)