#Requirements
requirements <- c("tableone", "reshape2", "ggplot2",
                  "Boruta", "caret", "caretEnsemble", "kernlab",
                  "randomForest") #randomForest is needed by caret for the "rf" stacking step below
#CRAN repository
repos <- "https://cran.us.r-project.org"
#install and load missing requirements
for (pack in requirements){
  if (!is.element(pack, .packages(all.available = TRUE))) {
    install.packages(pack, repos = repos)
  }
  library(pack, character.only = TRUE)
}
##uncomment this block of code to install RSeqR with a git personal access token
##(the 'remotes' and 'getPass' packages must be installed for this to work)
#if (!require("RSeqR")){
#  try(remotes::install_github("Bishop-Laboratory/RSeqR", dependencies = "Imports",
#                              force = TRUE,
#                              auth_token = getPass::getPass(msg = "Enter git personal access token: ")))
#  require("RSeqR") #use require to give a warning (not an error) if the try above failed
#}
##assert RSeqR is loaded
#if (!"package:RSeqR" %in% search()){
#  #throw an error and exit
#  stop("RSeqR package is not installed. Do you have a git personal access token?")
#}
#globals
set.seed(params$seed)
#define a function that returns a numeric vector of Z-scores given a filename
getZscores <- function(fname){
  #init Zscores
  Zscores <- c()
  #get path to the file
  fpath <- file.path(params$datadir, fname)
  #assert file exists
  if(file.exists(fpath)){
    #init res so the extraction below fails gracefully if load() fails
    res <- NULL
    #try to load the rdata (expected to contain an object named 'res')
    try(load(fpath))
    #try to extract the shifted Z-scores
    try(Zscores <- res$`Z-scores`$`regioneR::numOverlaps`$shifted.z.scores)
  }
  #return Zscores
  return(Zscores)
}
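#example usage (illustrative sketch; "example.rda" is a hypothetical file under params$datadir):
#z_example <- getZscores("example.rda")
#str(z_example) #numeric vector of shifted Z-scores, or NULL if the file/object is missing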
#load manifest
metadata <- read.csv(file.path(params$datadir,'manifest.csv'))
#create dataframe that holds the Zscore data
Zdata <- data.frame(row.names = metadata$id)
Zdata$group <- as.factor(metadata$group) #make sure label is a factor
Zdata$Z <- lapply(basename(metadata$filename), getZscores)
# Remove nulls
keep <- sapply(Zdata$Z, is.numeric)
Zdata <- Zdata[keep,]
#compute Fourier transform of Z
Zdata$W <- lapply(Zdata$Z, fft)
#compute the autocovariance of Z (type = "covariance"; use type = "correlation" for autocorrelation)
Zdata$Zacf <- lapply(Zdata$Z, function(x){
  y <- acf(x, lag.max = length(x), plot = FALSE, type = "covariance")
  return(drop(y$acf))
})
#compute the Fourier transform of the autocovariance
Zdata$Wacf <- lapply(Zdata$Zacf, fft)
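#optional (illustrative): inspect the magnitude spectrum of a single sample, e.g.
#plot(Mod(Zdata$W[[1]]), type = "l", xlab = "frequency index", ylab = "|W|")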
#compute first and second moments (mean and Euclidean norm) for Z, Zacf, Re(W), Im(W), Re(Wacf), Im(Wacf)
Zdata$Z1 <- as.numeric(lapply(Zdata$Z, mean))
Zdata$Z2 <- as.numeric(lapply(Zdata$Z, function(x){sqrt(sum(x*x))}))
Zdata$Zacf1 <- as.numeric(lapply(Zdata$Zacf, mean))
Zdata$Zacf2 <- as.numeric(lapply(Zdata$Zacf, function(x){sqrt(sum(x*x))}))
Zdata$ReW1 <- as.numeric(lapply(Zdata$W, function(x){mean(Re(x))}))
Zdata$ReW2 <- as.numeric(lapply(Zdata$W, function(x){sqrt(sum(Re(x)*Re(x)))}))
Zdata$ImW1 <- as.numeric(lapply(Zdata$W, function(x){mean(Im(x))}))
Zdata$ImW2 <- as.numeric(lapply(Zdata$W, function(x){sqrt(sum(Im(x)*Im(x)))}))
Zdata$ReWacf1 <- as.numeric(lapply(Zdata$Wacf, function(x){mean(Re(x))}))
Zdata$ReWacf2 <- as.numeric(lapply(Zdata$Wacf, function(x){sqrt(sum(Re(x)*Re(x)))}))
Zdata$ImWacf1 <- as.numeric(lapply(Zdata$Wacf, function(x){mean(Im(x))}))
Zdata$ImWacf2 <- as.numeric(lapply(Zdata$Wacf, function(x){sqrt(sum(Im(x)*Im(x)))}))
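#optional refactor sketch (not run): the repeated mean / Euclidean-norm pattern above
#could be wrapped in a small helper, e.g.
#moments <- function(x) c(m1 = mean(x), m2 = sqrt(sum(x * x)))
#Zdata[, c("Z1", "Z2")] <- t(sapply(Zdata$Z, moments))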
#declare engineered features
feats <- c("Z1", "Z2", "Zacf1", "Zacf2", "ReW1", "ReW2", "ImW1", "ImW2", "ReWacf1", "ReWacf2", "ImWacf1", "ImWacf2")
#Yeo-Johnson power transform, then center and scale (z-score) the engineered features
prep <- preProcess(Zdata[, feats], method = c("center", "scale", "YeoJohnson"))
#Standardize features
Zdata[, feats] <- predict(prep, Zdata[, feats])
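#the fitted 'prep' object can be reused to transform new samples with the same
#parameters, e.g. (illustrative; 'newdata' is a hypothetical data frame with the same columns):
#newdata[, feats] <- predict(prep, newdata[, feats])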
CreateTableOne(vars = feats, strata = "group" , data = Zdata, addOverall = TRUE)
## Stratified by group
## Overall NEG POS p test
## n 547 205 342
## Z1 (mean (SD)) 0.00 (1.00) -1.01 (0.60) 0.61 (0.63) <0.001
## Z2 (mean (SD)) 0.00 (1.00) -1.06 (0.61) 0.64 (0.55) <0.001
## Zacf1 (mean (SD)) 0.00 (1.00) -0.74 (0.48) 0.44 (0.97) <0.001
## Zacf2 (mean (SD)) 0.00 (1.00) -0.90 (0.72) 0.54 (0.72) <0.001
## ReW1 (mean (SD)) 0.00 (1.00) -0.84 (0.64) 0.50 (0.83) <0.001
## ReW2 (mean (SD)) 0.00 (1.00) -1.06 (0.62) 0.64 (0.54) <0.001
## ImW1 (mean (SD)) 0.00 (1.00) -0.35 (1.18) 0.21 (0.80) <0.001
## ImW2 (mean (SD)) 0.00 (1.00) 0.04 (0.59) -0.02 (1.18) 0.471
## ReWacf1 (mean (SD)) 0.00 (1.00) -0.87 (0.72) 0.52 (0.75) <0.001
## ReWacf2 (mean (SD)) 0.00 (1.00) -0.90 (0.72) 0.54 (0.71) <0.001
## ImWacf1 (mean (SD)) 0.00 (1.00) -0.52 (0.15) 0.31 (1.15) <0.001
## ImWacf2 (mean (SD)) 0.00 (1.00) -0.89 (0.74) 0.53 (0.72) <0.001
#plot feature densities
df.m <- melt(Zdata[, feats], id.vars = NULL)
ggplot(df.m, aes(x = variable, y = value)) + geom_violin()
#partition the data: 50% training set, then split the remaining 50% evenly into
#a discovery set (25%, used for feature selection) and a test set (25%)
indexes <- createDataPartition(Zdata$group, p = .50, list = FALSE)
train <- Zdata[indexes, ]
discoTest <- Zdata[-indexes, ]
indexes2 <- createDataPartition(discoTest$group, p = .50, list = FALSE)
disco <- discoTest[indexes2, ]
test <- discoTest[-indexes2, ]
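#optional sanity check (illustrative): confirm the split sizes and class balance
#lapply(list(train = train, disco = disco, test = test), function(d) table(d$group))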
#formula
eqn <- formula(paste("group", paste(feats, collapse = " + "), sep = " ~ "))
#feature selection with boruta
tentativeboruta <- Boruta(eqn, data = disco)
print(tentativeboruta)
## Boruta performed 15 iterations in 0.7821624 secs.
## 11 attributes confirmed important: ImW2, ImWacf1, ImWacf2, ReW1, ReW2
## and 6 more;
## 1 attributes confirmed unimportant: ImW1;
finalboruta <- TentativeRoughFix(tentativeboruta)
## Warning in TentativeRoughFix(tentativeboruta): There are no Tentative
## attributes! Returning original object.
print(finalboruta)
## Boruta performed 15 iterations in 0.7821624 secs.
## 11 attributes confirmed important: ImW2, ImWacf1, ImWacf2, ReW1, ReW2
## and 6 more;
## 1 attributes confirmed unimportant: ImW1;
#declare selected features
selfeats <- names(finalboruta$finalDecision[which(finalboruta$finalDecision=='Confirmed')])
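#optional (illustrative): list any engineered features that Boruta did not confirm
#print(setdiff(feats, selfeats))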
#plot feature selection
plot(finalboruta, xlab = "", xaxt = "n")
lz <- lapply(1:ncol(finalboruta$ImpHistory), function(i){
  finalboruta$ImpHistory[is.finite(finalboruta$ImpHistory[, i]), i]
})
names(lz) <- colnames(finalboruta$ImpHistory)
Labels <- sort(sapply(lz, median))
axis(side = 1, las = 2, labels = names(Labels),
     at = 1:ncol(finalboruta$ImpHistory), cex.axis = 0.7)
#regression formula using the selected features
eqn <- formula(paste("group", paste(selfeats, collapse = " + "), sep = " ~ "))
#use 10-fold cross validation repeated 5 times for training control
kfolds <- 10
nrep <- 5
tctrl <- trainControl(method = "repeatedcv", number = kfolds, repeats = nrep,
                      #index = createFolds(disco$group, kfolds),
                      classProbs = TRUE, summaryFunction = twoClassSummary,
                      savePredictions = 'all', search = "random")
# train the base models on the training set (which excludes both the disco and test sets)
algorithmList <- c('lda', 'rpart', 'glm', 'knn', 'svmRadial')
models <- caretList(eqn, data = train, trControl = tctrl,
                    methodList = algorithmList, metric = "ROC")
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl. Attempting to set them ourselves, so each model in the ensemble will
## have the same resampling indexes.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
results <- resamples(models)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, rpart, glm, knn, svmRadial
## Number of resamples: 50
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.9470588 0.9829768 0.9941176 0.9901283 1.0000000 1.0000000 0
## rpart 0.7444444 0.8653743 0.9000000 0.8975205 0.9314840 0.9973262 0
## glm 0.9304813 0.9834893 0.9920083 0.9868473 1.0000000 1.0000000 0
## knn 0.9235294 0.9654412 0.9818182 0.9787011 0.9941176 1.0000000 0
## svmRadial 0.9411765 0.9841800 0.9917112 0.9872620 1.0000000 1.0000000 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.7 0.9000000 0.9000000 0.9030909 0.9772727 1 0
## rpart 0.6 0.8000000 0.8181818 0.8367273 0.9000000 1 0
## glm 0.8 0.9000000 0.9045455 0.9229091 1.0000000 1 0
## knn 0.7 0.8000000 0.8090909 0.8363636 0.9000000 1 0
## svmRadial 0.7 0.8181818 0.9000000 0.8954545 0.9090909 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.8823529 0.9411765 1.0000000 0.9744444 1 1 0
## rpart 0.7058824 0.8888889 0.9411765 0.9370588 1 1 0
## glm 0.6470588 0.9411765 0.9428105 0.9566667 1 1 0
## knn 0.8823529 0.9411765 1.0000000 0.9663399 1 1 0
## svmRadial 0.8235294 0.9411765 1.0000000 0.9639869 1 1 0
dotplot(results)
# stack the base models with a random forest meta-learner, using the same 10-fold CV repeated 5 times
stacked_model <- caretStack(models, method = "rf", metric = "ROC",
                            trControl = tctrl)
print(stacked_model)
## A rf ensemble of 5 base models: lda, rpart, glm, knn, svmRadial
##
## Ensemble results:
## Random Forest
##
## 1370 samples
## 5 predictor
## 2 classes: 'NEG', 'POS'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1233, 1233, 1233, 1233, 1233, 1232, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 3 0.9978880 0.9716591 0.9780137
## 4 0.9977290 0.9724359 0.9763748
## 5 0.9974904 0.9720739 0.9763803
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
#predict on the held-out test set (it would be even better to have an external validation set)
yhat <- predict(stacked_model, test)
confusionMatrix(yhat, test$group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NEG POS
## NEG 47 7
## POS 4 78
##
## Accuracy : 0.9191
## 95% CI : (0.8599, 0.9589)
## No Information Rate : 0.625
## P-Value [Acc > NIR] : 3.58e-15
##
## Kappa : 0.8295
##
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.9216
## Specificity : 0.9176
## Pos Pred Value : 0.8704
## Neg Pred Value : 0.9512
## Prevalence : 0.3750
## Detection Rate : 0.3456
## Detection Prevalence : 0.3971
## Balanced Accuracy : 0.9196
##
## 'Positive' Class : NEG
##
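# class probabilities (rather than hard labels) can also be obtained from the stack;
# illustrative sketch, assuming the installed caretEnsemble version passes 'type'
# through to predict.train:
# phat <- predict(stacked_model, newdata = test, type = "prob")
# head(phat)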
# library(MLeval)
# evalm(stacked_model)
fftModel <- stacked_model
prepFeatures <- prep
save(prepFeatures, file = "../misc-data/model/prepFeatures.rda", compress = "xz")
save(fftModel, file = "../misc-data/model/fftModel.rda", compress = "xz")
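#to reuse the saved objects in a later session (illustrative sketch; 'newdata' is hypothetical):
#load("../misc-data/model/prepFeatures.rda") #restores 'prepFeatures'
#load("../misc-data/model/fftModel.rda")     #restores 'fftModel'
#newdata[, feats] <- predict(prepFeatures, newdata[, feats])
#predict(fftModel, newdata)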