Source Code used in the Project
Table of Contents
1 Source Code
1.1 Variable selection in R
Variable selection is based on http://www.uccor.edu.ar/paginas/seminarios/Software/SVM_RFE_R_implementation.pdf
####################################
# RFE parameters
####################################
# Recursive feature elimination (RFE) driven by a linear SVM.
#
# NOTE: `trainset` and `xnames` are not created in this snippet; they come
# from the data-preparation code of the training sections (see 1.2), which
# must be run first.

library(caret)    # caretFuncs, rfe(), rfeControl(), trainControl()
library(kernlab)  # loaded before the 'svmLinear' fit in section 1.3 as well
library(ipred)
library(e1071)

# Custom functions: start from caret's default helper set and override only
# the ranking step.
svmFuncs <- caretFuncs

# SVM ranking function.
#
# Ranks predictors by the squared entries of the vector obtained from the
# fitted model's coefficients multiplied by its `xmatrix` slot (presumably
# the primal weight vector of the linear SVM -- confirm against kernlab).
#
# Arguments:
#   object  fitted caret model; only `object$finalModel` is used.
#   x, y    unused here; kept because rfe() calls rank(object, x, y).
# Returns:
#   data.frame with columns `vimp` (squared weight), `var` (predictor name)
#   and `Overall` (n for the top-ranked predictor down to 1), sorted from
#   largest to smallest squared weight.
svmFuncs$rank <- function(object, x, y) {
  fit <- object$finalModel
  w <- t(coef(fit)[[1]]) %*% fit@xmatrix[[1]]

  vimp <- data.frame(t(w) * t(w))
  names(vimp)[1] <- "vimp"
  vimp$var <- row.names(vimp)

  # Sorting ascending on 1 / vimp puts the largest squared weight first and
  # pushes zero weights (1 / 0 = Inf) to the end. The helper is deliberately
  # not called `order`, so it no longer shadows base::order().
  inverse_importance <- 1 / vimp$vimp
  vimp <- vimp[order(inverse_importance), ]

  # rev(seq_len(n)) equals seq(n, 1) for n >= 1 but is also safe for n == 0.
  vimp$Overall <- rev(seq_len(nrow(vimp)))
  vimp
}

# Outer resampling used by rfe(): 10 folds, repeated 10 times.
RFEcontrol <- rfeControl(
  functions = svmFuncs,
  method = "repeatedCV",
  number = 10,
  repeats = 10,
  rerank = FALSE,
  returnResamp = "final",
  saveDetails = FALSE,
  verbose = TRUE
)

# Inner resampling used by train() for each candidate subset; class
# probabilities are required because the models are scored on ROC.
TrainControl <- trainControl(
  method = "repeatedCV",
  number = 10,
  repeats = 1,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

x <- trainset[, xnames]
y <- trainset$Target

# Evaluate subsets of 130, 140, 150 and 160 predictors with C fixed at 1.
RFE <- rfe(
  x, y,
  sizes = seq(130, 160, by = 10),
  method = "svmLinear",
  tuneGrid = expand.grid(.C = 1),
  metric = "ROC",
  maximize = TRUE,
  rfeControl = RFEcontrol,
  trControl = TrainControl
)

# Selected predictors; section 1.3 can build its formula from these.
NewVars <- RFE$optVariables
RFE
plot(RFE)
1.2 Logistic Regression Training
More information about glmnet is available at: http://cran.r-project.org/web/packages/glmnet/index.html
# Regularised logistic regression (glmnet) tuned with repeated
# cross-validation and scored by AUC on the held-out rows.

# Load required packages
library("caTools")
library("caret")
library("glmnet")
library("ipred")
library("e1071")

# Data reading and setup
Data <- read.csv("overfitting.csv", header = TRUE)

# Recode the 0/1 practice target as a two-level factor with labels X0 / X1,
# then drop the original target columns.
Data$Target <- as.factor(ifelse(Data$Target_Practice == 1, "X1", "X0"))
Data$Target_Evaluate <- NULL
Data$Target_Leaderboard <- NULL
Data$Target_Practice <- NULL

# Every remaining column except the target and the bookkeeping columns is a
# predictor.
xnames <- setdiff(names(Data), c("Target", "case_id", "train"))

# Order
Data <- Data[, c("Target", "case_id", "train", xnames)]

# Split to train and test
trainset <- Data[Data$train == 1, ]
testset <- Data[Data$train == 0, ]

# Remove unwanted columns so that `Target ~ .` only sees real predictors
trainset$case_id <- NULL
trainset$train <- NULL

# 10-fold cross-validation repeated 5 times; class probabilities are needed
# for the ROC-based summary.
MyTrainControl <- trainControl(
  method = "repeatedCV",
  number = 10,
  repeats = 5,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

theTarget <- "Target"
theFormula <- as.formula(paste(theTarget, " ~ . "))

# Train the model using the training control defined above.
# The grid covers alpha in {0, 1} and lambda from 0 to 0.25 in steps of 0.005.
model <- train(
  theFormula,
  data = trainset,
  method = "glmnet",
  metric = "ROC",
  tuneGrid = expand.grid(.alpha = c(0, 1), .lambda = seq(0, .25, by = 0.005)),
  trControl = MyTrainControl
)

# Testing: predict class probabilities on the held-out rows and get the AUC
test <- predict(model, newdata = testset, type = "prob")
colAUC(test, testset$Target)
1.3 Linear and RBF SVM with cross validation
# Linear (and optionally radial) SVM tuned with repeated cross-validation
# and scored by AUC on the held-out rows.

# Load required packages
library("caTools")
library("caret")
library("glmnet")
library("ipred")
library("e1071")
library(kernlab)

# Data reading and setup
Data <- read.csv("overfitting.csv", header = TRUE)

# Recode the 0/1 practice target as a two-level factor with labels X0 / X1,
# then drop the original target columns.
Data$Target <- as.factor(ifelse(Data$Target_Practice == 1, "X1", "X0"))
Data$Target_Evaluate <- NULL
Data$Target_Leaderboard <- NULL
Data$Target_Practice <- NULL

# Every remaining column except the target and the bookkeeping columns is a
# predictor.
vars <- setdiff(names(Data), c("Target", "case_id", "train"))

# Order
Data <- Data[, c("Target", "case_id", "train", vars)]

# Split to train and test
trainset <- Data[Data$train == 1, ]
testset <- Data[Data$train == 0, ]

# Remove unwanted columns
trainset$case_id <- NULL
trainset$train <- NULL

# 10-fold cross-validation, repeated 5 times
MyTrainControl <- trainControl(
  method = "repeatedCV",
  number = 10,
  repeats = 5,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Use all variables.
FL <- as.formula(paste("Target ~ ", paste(vars, collapse = "+")))
# Use the new variables selected by the RFE step (section 1.1) instead:
# FL <- as.formula(paste("Target ~ ", paste(NewVars, collapse = "+")))

# SVM Linear
# NOTE(review): `probability = TRUE` is forwarded through train()'s `...`;
# confirm that the kernlab backend actually uses it.
model <- train(
  FL,
  data = trainset,
  method = "svmLinear",
  metric = "ROC",
  probability = TRUE,
  tuneLength = 7,
  trControl = MyTrainControl
)

# SVM Radial
# model <- train(FL, data = trainset, method = "svmRadial",
#                metric = "ROC",
#                probability = TRUE,
#                tuneLength = 7,
#                trControl = MyTrainControl)

plot(model, metric = "ROC")

# Predict class probabilities on the held-out rows and get the AUC
test <- predict(model, newdata = testset, type = "prob")
colAUC(test, testset$Target)
1.4 Python code for generating some graphs
Python ROC curves look much nicer. Beware of the indentation.
import numpy as np
import pylab as pl
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc

# Plot one ROC curve per value of the C parameter of the transductive SVM.
#
# Each entry pairs the suffix of a "classification.test.<suffix>" score file
# with the C value shown in the legend. The list order is the plotting
# (and therefore legend) order.
#
# NOTE(review): `test` (the true labels passed to roc_curve) is not defined
# anywhere in this snippet; it must be loaded before this code runs.
CURVES = [
    ("c0", "Auto"),
    ("c1", "1"),
    ("c3", "3"),
    ("c5", "5"),
    ("c7", "7"),
    ("c9", "9"),
    ("cd25", "0.25"),
    ("cd50", "0.50"),
]

pl.clf()

# Calculate the AUC for each value of C and draw its curve. Keeping the score
# file and the legend label in one table removes the mismatched-name problem
# of the hand-copied version (roc_auc25 / roc_auc50 were never defined).
for suffix, c_label in CURVES:
    ar = np.loadtxt("classification.test." + suffix)
    fpr, tpr, thresholds = roc_curve(test, ar)
    roc_auc = auc(fpr, tpr)
    pl.plot(fpr, tpr,
            label='C=%s ROC curve (area = %0.2f)' % (c_label, roc_auc))

# Diagonal reference line from (0, 0) to (1, 1).
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic Transductive SVM')
pl.legend(loc="lower right")
pl.show()
Date: 2012-03-16 09:13:22 CET
HTML generated by org-mode 6.34trans in emacs 23