Center for Renal Precision Medicine

#Requirements
requirements <- c("tableone", "reshape2", "ggplot2", 
                  "Boruta", "caret", "caretEnsemble", "kernlab")

#CRAN repository
repos <- "http://cran.us.r-project.org"

#install and load missing requirements
for (pack in requirements){
  if( !is.element(pack, .packages(all.available = TRUE)) ) {
    install.packages(pack, repos = repos)
  }
  library(pack, character.only = TRUE)
}

##uncomment this block of code to install RSeqR with git personal access token
##install RSeqR with git PAtoken
#if (! require("RSeqR")){
#  try(remotes::install_github("Bishop-Laboratory/RSeqR", dependencies = "Imports",
#                              force = TRUE,
#                              auth_token = getPass(msg = "Enter git personal access token: ")))
#  require("RSeqR") #use require to give warning if try failed and avoid error
#}
##assert RSeqR is loaded
#if (! "package:RSeqR" %in% search()){
#  #throw error and exit
#  stop("RSeqR package is not installed. Do you have a git personal access token?")
#}


#globals
set.seed(params$seed)
#define a function that returns an array of Zscores given a filename
getZscores <- function(fname){
  #init Zscores
  Zscores <- c()
  #get path the file
  fpath <- file.path(params$datadir, fname)
  #assert file exists
  if(file.exists(fpath)){
    #init rdata
    rdata <- NULL
    #try to load the rdata
    try(load(fpath))
    #try to get the Zscores
    try(Zscores <- res$`Z-scores`$`regioneR::numOverlaps`$shifted.z.scores)
  }
  #return Zscores
  return(Zscores)
}

#load manifest
metadata <- read.csv(file.path(params$datadir,'manifest.csv'))

#create dataframe that holds the Zscore data
Zdata <- data.frame(row.names = metadata$id)
Zdata$group <- as.factor(metadata$group) #make sure label is a factor
Zdata$Z <- lapply(basename(metadata$filename), getZscores)

# Remove nulls
keep <- sapply(Zdata$Z, is.numeric)
Zdata <- Zdata[keep,]
#compute Fourier transform of Z
Zdata$W <- lapply(Zdata$Z, fft)

#compute autocorrelation on Z
Zdata$Zacf <- lapply(Zdata$Z, function(x){
  #y <- acf(x, lag.max = length(x)/1, plot = FALSE, type = "correlation")
  y <- acf(x, lag.max = length(x)/1, plot = FALSE, type = "covariance")
  return(drop(y$acf))
  })

#compute Fourier transform of the autocorrelation 
Zdata$Wacf <- lapply(Zdata$Zacf, fft)
#compute first and second moments for Z, Zacf, Re(W), Im(W), Re(Wacf), Im(Wacf) 
Zdata$Z1 <- as.numeric(lapply(Zdata$Z, mean))
Zdata$Z2 <- as.numeric(lapply(Zdata$Z, function(x){sqrt(sum(x*x))}))
Zdata$Zacf1 <- as.numeric(lapply(Zdata$Zacf, mean))
Zdata$Zacf2 <- as.numeric(lapply(Zdata$Zacf, function(x){sqrt(sum(x*x))}))
Zdata$ReW1 <- as.numeric(lapply(Zdata$W, function(x){mean(Re(x))}))
Zdata$ReW2 <- as.numeric(lapply(Zdata$W, function(x){sqrt(sum(Re(x)*Re(x)))}))
Zdata$ImW1 <- as.numeric(lapply(Zdata$W, function(x){mean(Im(x))}))
Zdata$ImW2 <- as.numeric(lapply(Zdata$W, function(x){sqrt(sum(Im(x)*Im(x)))}))
Zdata$ReWacf1 <- as.numeric(lapply(Zdata$Wacf, function(x){mean(Re(x))}))
Zdata$ReWacf2 <- as.numeric(lapply(Zdata$Wacf, function(x){sqrt(sum(Re(x)*Re(x)))}))
Zdata$ImWacf1 <- as.numeric(lapply(Zdata$Wacf, function(x){mean(Im(x))}))
Zdata$ImWacf2 <- as.numeric(lapply(Zdata$Wacf, function(x){sqrt(sum(Im(x)*Im(x)))}))

#declare engineered features
feats <- c("Z1", "Z2", "Zacf1", "Zacf2", "ReW1", "ReW2", "ImW1", "ImW2", "ReWacf1", "ReWacf2", "ImWacf1", "ImWacf2")
#power transform and Zscore
prep <- preProcess(Zdata[, feats], method = c("center", "scale", "YeoJohnson"))

#Standardize features
Zdata[, feats] <- predict(prep, Zdata[, feats])
CreateTableOne(vars = feats, strata = "group" , data = Zdata, addOverall = TRUE)
##                      Stratified by group
##                       Overall     NEG          POS          p      test
##   n                    547          205          342                   
##   Z1 (mean (SD))      0.00 (1.00) -1.01 (0.60)  0.61 (0.63) <0.001     
##   Z2 (mean (SD))      0.00 (1.00) -1.06 (0.61)  0.64 (0.55) <0.001     
##   Zacf1 (mean (SD))   0.00 (1.00) -0.74 (0.48)  0.44 (0.97) <0.001     
##   Zacf2 (mean (SD))   0.00 (1.00) -0.90 (0.72)  0.54 (0.72) <0.001     
##   ReW1 (mean (SD))    0.00 (1.00) -0.84 (0.64)  0.50 (0.83) <0.001     
##   ReW2 (mean (SD))    0.00 (1.00) -1.06 (0.62)  0.64 (0.54) <0.001     
##   ImW1 (mean (SD))    0.00 (1.00) -0.35 (1.18)  0.21 (0.80) <0.001     
##   ImW2 (mean (SD))    0.00 (1.00)  0.04 (0.59) -0.02 (1.18)  0.471     
##   ReWacf1 (mean (SD)) 0.00 (1.00) -0.87 (0.72)  0.52 (0.75) <0.001     
##   ReWacf2 (mean (SD)) 0.00 (1.00) -0.90 (0.72)  0.54 (0.71) <0.001     
##   ImWacf1 (mean (SD)) 0.00 (1.00) -0.52 (0.15)  0.31 (1.15) <0.001     
##   ImWacf2 (mean (SD)) 0.00 (1.00) -0.89 (0.74)  0.53 (0.72) <0.001
#plot feature densities
df.m <- melt(Zdata[, feats], id.vars = NULL)
ggplot(df.m, aes(x = variable, y = value)) + geom_violin()

#partition off a 40% discovery set for feature selection
indexes = createDataPartition(Zdata$group, p = .50, list = F)
train = Zdata[indexes, ]
discoTest = Zdata[-indexes, ]
indexes2 = createDataPartition(discoTest$group, p = .50, list = F)
disco = discoTest[indexes2, ]
test = discoTest[-indexes2, ]

#formula
eqn = formula(paste("group", paste(feats, collapse = " + "), sep = " ~ "))

#feature selection with boruta
tentativeboruta <- Boruta(eqn, data = disco)
print(tentativeboruta)
## Boruta performed 15 iterations in 0.7821624 secs.
##  11 attributes confirmed important: ImW2, ImWacf1, ImWacf2, ReW1, ReW2
## and 6 more;
##  1 attributes confirmed unimportant: ImW1;
finalboruta <- TentativeRoughFix(tentativeboruta)
## Warning in TentativeRoughFix(tentativeboruta): There are no Tentative
## attributes! Returning original object.
print(finalboruta)
## Boruta performed 15 iterations in 0.7821624 secs.
##  11 attributes confirmed important: ImW2, ImWacf1, ImWacf2, ReW1, ReW2
## and 6 more;
##  1 attributes confirmed unimportant: ImW1;
#declare selected features
selfeats <- names(finalboruta$finalDecision[which(finalboruta$finalDecision=='Confirmed')])
#plot feature selection
plot(finalboruta, xlab = "", xaxt = "n")
lz<-lapply(1:ncol(finalboruta$ImpHistory), function(i){
  finalboruta$ImpHistory[is.finite(finalboruta$ImpHistory[,i]),i]
  })
names(lz) <- colnames(finalboruta$ImpHistory)
Labels <- sort(sapply(lz, median))
axis(side = 1, las=2, labels = names(Labels),
     at = 1:ncol(finalboruta$ImpHistory), cex.axis = 0.7)

#regresion formula using selected features 
eqn = formula(paste("group", paste(selfeats, collapse = " + "), sep = " ~ "))

#Use 5-fold cross validation repeated 15 times for training control
kfolds= 10
nrep= 5
tctrl <- trainControl(method = "repeatedcv", number=kfolds, repeats = nrep,
                      #index = createFolds(disco$group, kfolds),
                      classProbs = TRUE, summaryFunction = twoClassSummary,
                      savePredictions='all', search="random")

# create submodels using data excluding the disco set
algorithmList <- c('lda', 'rpart', 'glm', 'knn', 'svmRadial')
models <- caretList(eqn, data=train, trControl=tctrl,
                    methodList=algorithmList, metric="ROC")
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl. Attempting to set them ourselves, so each model in the ensemble will
## have the same resampling indexes.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
results <- resamples(models)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: lda, rpart, glm, knn, svmRadial 
## Number of resamples: 50 
## 
## ROC 
##                Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## lda       0.9470588 0.9829768 0.9941176 0.9901283 1.0000000 1.0000000    0
## rpart     0.7444444 0.8653743 0.9000000 0.8975205 0.9314840 0.9973262    0
## glm       0.9304813 0.9834893 0.9920083 0.9868473 1.0000000 1.0000000    0
## knn       0.9235294 0.9654412 0.9818182 0.9787011 0.9941176 1.0000000    0
## svmRadial 0.9411765 0.9841800 0.9917112 0.9872620 1.0000000 1.0000000    0
## 
## Sens 
##           Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## lda        0.7 0.9000000 0.9000000 0.9030909 0.9772727    1    0
## rpart      0.6 0.8000000 0.8181818 0.8367273 0.9000000    1    0
## glm        0.8 0.9000000 0.9045455 0.9229091 1.0000000    1    0
## knn        0.7 0.8000000 0.8090909 0.8363636 0.9000000    1    0
## svmRadial  0.7 0.8181818 0.9000000 0.8954545 0.9090909    1    0
## 
## Spec 
##                Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## lda       0.8823529 0.9411765 1.0000000 0.9744444       1    1    0
## rpart     0.7058824 0.8888889 0.9411765 0.9370588       1    1    0
## glm       0.6470588 0.9411765 0.9428105 0.9566667       1    1    0
## knn       0.8823529 0.9411765 1.0000000 0.9663399       1    1    0
## svmRadial 0.8235294 0.9411765 1.0000000 0.9639869       1    1    0
dotplot(results)

# stack using rf using 10-fold cv repeated 5 times
stacked_model <- caretStack(models, method="rf", metric="ROC", 
                        trControl=tctrl)
print(stacked_model)
## A rf ensemble of 5 base models: lda, rpart, glm, knn, svmRadial
## 
## Ensemble results:
## Random Forest 
## 
## 1370 samples
##    5 predictor
##    2 classes: 'NEG', 'POS' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1233, 1233, 1233, 1233, 1233, 1232, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##   3     0.9978880  0.9716591  0.9780137
##   4     0.9977290  0.9724359  0.9763748
##   5     0.9974904  0.9720739  0.9763803
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
#predict on the disco set (would be best to have an external validation set)
yhat <- predict(stacked_model, test)
confusionMatrix(yhat, test$group)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NEG POS
##        NEG  47   7
##        POS   4  78
##                                           
##                Accuracy : 0.9191          
##                  95% CI : (0.8599, 0.9589)
##     No Information Rate : 0.625           
##     P-Value [Acc > NIR] : 3.58e-15        
##                                           
##                   Kappa : 0.8295          
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.9216          
##             Specificity : 0.9176          
##          Pos Pred Value : 0.8704          
##          Neg Pred Value : 0.9512          
##              Prevalence : 0.3750          
##          Detection Rate : 0.3456          
##    Detection Prevalence : 0.3971          
##       Balanced Accuracy : 0.9196          
##                                           
##        'Positive' Class : NEG             
## 
# library(MLeval)
# evalm(stacked_model)
fftModel <- stacked_model
prepFeatures <- prep

save(prepFeatures, file = "../misc-data/model/prepFeatures.rda", compress = "xz")
save(fftModel, file = "../misc-data/model/fftModel.rda", compress = "xz")