rm(list=ls())
library(tidyverse)
library(caret)
################################################################################
set.seed(2022)  #sets the random number generator (to ensures that we all get the same result)
################################################################################
#read data
dataF<-read.csv("https://dl.dropbox.com/s/5qe56ysmmigh398/ml_lecture_data.csv?dl=1", stringsAsFactors=T)
################################################################################
#generate training and testing datasets
train_rows <- createDataPartition(dataF$y, list=F) # this step involves randomly choosing rowns from dataF
train_set <- dataF[train_rows, ]
test_set <- dataF[-train_rows, ]
################################################################################
#explore the data
dataF %>%   ggplot(aes(x, fill = y)) + geom_density() 
dataF %>% ggplot(aes(y, x)) + geom_boxplot()
################################################################################
#random guessing
y_predict <- sample(c("cancer", "normal"), nrow(test_set), replace = TRUE) %>% factor(levels = levels(test_set$y))
#Overall Accuracy
cat("1) random guessing overall accuracy: ", mean(y_predict == test_set$y), "\n\n")
################################################################################
#explore our training dataset, calculate mean and standard deviation
train_set %>% group_by(y) %>% summarize(mean(x), sd(x))
################################################################################
#predict cancer if expression is within two standard deviations: 694-(2*39) from the average expression of cancer samples
y_predict <- ifelse(train_set$x > (694-(2*39)), "cancer", "normal") %>%  factor(levels = levels(train_set$y))
cat("2) mean/sd overall accuracy: ", mean(y_predict == train_set$y), "\n\n") #0.773
################################################################################
#predict using multiple cutoffs
cutoff <- seq(610, 655)
#####
mcutoffs_map_fun<-function(x){
  y_predict <- ifelse(train_set$x > x, "cancer", "normal") %>% factor(levels = levels(train_set$y))
  mean(y_predict == train_set$y)
}
#####
mcutoffs_accuracy <- sapply(cutoff, mcutoffs_map_fun)
plot(cutoff, mcutoffs_accuracy, type="b")
##### find cutoff that resulted in the highest accuracy
cat("3a) multiple cutoff highest accuracy (training dataset): ", max(mcutoffs_accuracy), "\n")
cat("3b) cutoff that resulted in the highest accuracy (training dataset): ", cutoff[which.max(mcutoffs_accuracy)], "\n")
##### add both cutoffs to the graph as vertical lines:
abline(v=694-(2*39), col="red")
abline(v=cutoff[which.max(mcutoffs_accuracy)], col="blue") 
################################################################################
#predict using best cutoff on test dataset
best_cutoff <- cutoff[which.max(mcutoffs_accuracy)]
y_predict <- ifelse(test_set$x > best_cutoff, "cancer", "normal") %>% factor(levels = levels(test_set$y))
cat("3c) best cutoff overall accuracy on test dataset: ", mean(y_predict == test_set$y), "\n\n") #0.836
################################################################################
#calculate the accuracy for each sample type alone
################################################################################
cat("4) accuracy for each sample type alone:\n")
test_set %>% mutate(y_predict = y_predict) %>% group_by(y) %>% summarize(accuracy = mean(y_predict == y)) %>% print()
cat("\n\n")
################################################################################
#Confusion matrix
################################################################################
#generate confusion matrix
cm <- confusionMatrix(data = y_predict, reference = test_set$y)
cat("5) best cutoff evaluation metrics:\n")
cat("confusion matrix:\n")
print(cm$table)
cat("\n\n")
print(cm$overall["Accuracy"])
cat("\n")
print(cm$byClass[c("Sensitivity","Specificity", "Prevalence")])
cat("\n\n")
################################################################################