rm(list=ls()) library(tidyverse) library(caret) ################################################################################ set.seed(2022) #sets the random number generator (to ensures that we all get the same result) ################################################################################ #read data dataF<-read.csv("https://dl.dropbox.com/s/5qe56ysmmigh398/ml_lecture_data.csv?dl=1", stringsAsFactors=T) ################################################################################ #generate training and testing datasets train_rows <- createDataPartition(dataF$y, list=F) # this step involves randomly choosing rowns from dataF train_set <- dataF[train_rows, ] test_set <- dataF[-train_rows, ] ################################################################################ #explore the data dataF %>% ggplot(aes(x, fill = y)) + geom_density() dataF %>% ggplot(aes(y, x)) + geom_boxplot() ################################################################################ #random guessing y_predict <- sample(c("cancer", "normal"), nrow(test_set), replace = TRUE) %>% factor(levels = levels(test_set$y)) #Overall Accuracy cat("1) random guessing overall accuracy: ", mean(y_predict == test_set$y), "\n\n") ################################################################################ #explore our training dataset, calculate mean and standard deviation train_set %>% group_by(y) %>% summarize(mean(x), sd(x)) ################################################################################ #predict cancer if expression is within two standard deviations: 694-(2*39) from the average expression of cancer samples y_predict <- ifelse(train_set$x > (694-(2*39)), "cancer", "normal") %>% factor(levels = levels(train_set$y)) cat("2) mean/sd overall accuracy: ", mean(y_predict == train_set$y), "\n\n") #0.773 ################################################################################ #predict using multiple cutoffs cutoff <- seq(610, 655) ##### mcutoffs_map_fun<-function(x){ y_predict <- ifelse(train_set$x > x, "cancer", "normal") %>% factor(levels = levels(train_set$y)) mean(y_predict == train_set$y) } ##### mcutoffs_accuracy <- sapply(cutoff, mcutoffs_map_fun) plot(cutoff, mcutoffs_accuracy, type="b") ##### find cutoff that resulted in the highest accuracy cat("3a) multiple cutoff highest accuracy (training dataset): ", max(mcutoffs_accuracy), "\n") cat("3b) cutoff that resulted in the highest accuracy (training dataset): ", cutoff[which.max(mcutoffs_accuracy)], "\n") ##### add both cutoffs to the graph as vertical lines: abline(v=694-(2*39), col="red") abline(v=cutoff[which.max(mcutoffs_accuracy)], col="blue") ################################################################################ #predict using best cutoff on test dataset best_cutoff <- cutoff[which.max(mcutoffs_accuracy)] y_predict <- ifelse(test_set$x > best_cutoff, "cancer", "normal") %>% factor(levels = levels(test_set$y)) cat("3c) best cutoff overall accuracy on test dataset: ", mean(y_predict == test_set$y), "\n\n") #0.836 ################################################################################ #calculate the accuracy for each sample type alone ################################################################################ cat("4) accuracy for each sample type alone:\n") test_set %>% mutate(y_predict = y_predict) %>% group_by(y) %>% summarize(accuracy = mean(y_predict == y)) %>% print() cat("\n\n") ################################################################################ #Confusion matrix ################################################################################ #generate confusion matrix cm <- confusionMatrix(data = y_predict, reference = test_set$y) cat("5) best cutoff evaluation metrics:\n") cat("confusion matrix:\n") print(cm$table) cat("\n\n") print(cm$overall["Accuracy"]) cat("\n") print(cm$byClass[c("Sensitivity","Specificity", "Prevalence")]) cat("\n\n") ################################################################################