Source Code used in the Project
Table of Contents
1 Source Code
1.1 Variable selection in R
Variable selection is based on http://www.uccor.edu.ar/paginas/seminarios/Software/SVM_RFE_R_implementation.pdf
####################################
# RFE parameters
####################################
library(ipred)
library(e1071)
# Custom RFE helper functions ----
# Start from caret's stock helper list and override only the ranking step.
svmFuncs <- caretFuncs

# Variable ranking for a fitted linear SVM.
#
# object: fitted model wrapper; only object$finalModel is read, through
#         coef() and its @xmatrix slot.
# x, y:   accepted for interface compatibility but not used here.
#
# Returns a data frame sorted from the largest to the smallest squared
# weight, with columns:
#   vimp    - squared weight of the variable
#   var     - variable name (taken from the row names)
#   Overall - score running from nrow down to 1 (first row scores highest)
svmFuncs$rank <- function(object, x, y) {
  fit <- object$finalModel
  # coefficients' %*% xmatrix; presumably the primal weight vector of the
  # linear SVM (see the SVM-RFE write-up referenced above) - TODO confirm.
  weights <- t(coef(fit)[[1]]) %*% fit@xmatrix[[1]]
  sq_weights <- t(weights) * t(weights)
  vimp <- data.frame(sq_weights)
  names(vimp)[1] <- "vimp"
  vimp$var <- row.names(vimp)
  # Ascending order of 1 / weight^2 places the largest squared weight first.
  by_importance <- order(1 / (vimp$vimp))
  vimp <- vimp[by_importance, ]
  vimp$Overall <- seq(nrow(vimp), 1)
  vimp
}
# Outer resampling settings for recursive feature elimination:
# 10-fold cross-validation repeated 10 times, using the custom svmFuncs
# helpers. Variables are ranked once (rerank = FALSE), only the resampling
# results of the final subset are kept, and progress is logged.
RFEcontrol <- rfeControl(
  functions = svmFuncs,
  # "repeatedcv" is the spelling documented by caret; the mixed-case
  # "repeatedCV" is not a listed value.
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  rerank = FALSE,
  returnResamp = "final",
  saveDetails = FALSE,
  verbose = TRUE
)
# Inner resampling settings handed to train() for every model fitted during
# RFE: a single round of 10-fold cross-validation, keeping all resamples.
# Class probabilities are requested because twoClassSummary reports ROC-based
# metrics.
TrainControl <- trainControl(
  # "repeatedcv" is the spelling documented by caret; the mixed-case
  # "repeatedCV" is not a listed value.
  method = "repeatedcv",
  number = 10,
  repeats = 1,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
# Run recursive feature elimination with a linear SVM (C fixed at 1).
# NOTE(review): trainset and xnames are not created in this section; they are
# built by the data-preparation code of the later sections - run that first.
x <- trainset[, xnames]
y <- trainset[["Target"]]

# Candidate subset sizes: 130, 140, 150 and 160 predictors.
# NOTE(review): metric = "ROC" is consumed by rfe() itself; confirm that the
# summary function in svmFuncs actually produces a ROC column.
RFE <- rfe(
  x, y,
  sizes = seq(from = 130, to = 160, by = 10),
  method = "svmLinear",
  tuneGrid = expand.grid(.C = 1),
  metric = "ROC",
  maximize = TRUE,
  rfeControl = RFEcontrol,
  trControl = TrainControl
)

# Predictors retained in the best subset; reused by the SVM section.
NewVars <- RFE$optVariables

# Show the RFE summary and the performance profile over subset sizes.
RFE
plot(RFE)
1.2 Logistic Regression Training
More info on this on: http://cran.r-project.org/web/packages/glmnet/index.html
library('caTools')
library('caret')
library('glmnet')
library('ipred')
library('e1071')
# Read the data and derive a two-level factor target (X1 / X0) from
# Target_Practice.
Data <- read.csv("overfitting.csv", header = TRUE)
Data$Target <- as.factor(ifelse(Data$Target_Practice == 1, "X1", "X0"))

# Drop the raw target columns; only the derived Target is kept.
for (raw_target in c("Target_Evaluate", "Target_Leaderboard", "Target_Practice")) {
  Data[[raw_target]] <- NULL
}

# Predictor names: every column except the target and bookkeeping columns.
xnames <- setdiff(names(Data), c("Target", "case_id", "train"))

# Reorder so that target and bookkeeping columns come first.
Data <- Data[, c("Target", "case_id", "train", xnames)]

# Split into training and test rows using the train flag.
trainset <- Data[Data$train == 1, ]
testset <- Data[Data$train == 0, ]

# The training frame must contain only Target and the predictors.
for (bookkeeping in c("case_id", "train")) {
  trainset[[bookkeeping]] <- NULL
}
# Resampling settings for model tuning: 10-fold cross-validation repeated
# 5 times, keeping all resamples. Class probabilities are requested because
# twoClassSummary reports ROC-based metrics.
MyTrainControl <- trainControl(
  # "repeatedcv" is the spelling documented by caret; the mixed-case
  # "repeatedCV" is not a listed value.
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
# Model formula: the target regressed on every other column of trainset.
theTarget <- "Target"
theFormula <- as.formula(paste0(theTarget, " ~ ."))

# Tuning grid: alpha 0 and 1 (ridge and lasso in glmnet) crossed with
# lambda from 0 to 0.25 in steps of 0.005.
glmnet_grid <- expand.grid(
  .alpha = c(0, 1),
  .lambda = seq(0, .25, by = 0.005)
)

# Fit the penalised logistic regression, selecting the tuning values by ROC
# under the resampling scheme defined in MyTrainControl.
model <- train(
  theFormula,
  data = trainset,
  method = "glmnet",
  metric = "ROC",
  tuneGrid = glmnet_grid,
  trControl = MyTrainControl
)

# Class probabilities on the held-out rows, then the AUC of each
# probability column against the true labels.
test <- predict(model, newdata = testset, type = "prob")
colAUC(test, testset$Target)
1.3 Linear and RBF with cross validation
#Load Required Packages
library('caTools')
library('caret')
library('glmnet')
library('ipred')
library('e1071')
# Data reading and setup ----
# Read the data and derive a two-level factor target (X1 / X0) from
# Target_Practice.
Data <- read.csv("overfitting.csv", header = TRUE)
Data$Target <- as.factor(ifelse(Data$Target_Practice == 1, "X1", "X0"))

# Remove the raw target columns now that Target has been derived.
for (raw_target in c("Target_Evaluate", "Target_Leaderboard", "Target_Practice")) {
  Data[[raw_target]] <- NULL
}

# Predictors are all columns other than the target and bookkeeping columns.
vars <- setdiff(names(Data), c("Target", "case_id", "train"))

# Put target and bookkeeping columns first, predictors after them.
Data <- Data[, c("Target", "case_id", "train", vars)]

# Split to train and test using the train flag.
trainset <- Data[Data$train == 1, ]
testset <- Data[Data$train == 0, ]

# Remove bookkeeping columns from the training frame.
for (bookkeeping in c("case_id", "train")) {
  trainset[[bookkeeping]] <- NULL
}
# 10-fold cross-validation repeated 5 times, keeping all resamples.
# Class probabilities are requested because twoClassSummary reports
# ROC-based metrics.
MyTrainControl <- trainControl(
  # "repeatedcv" is the spelling documented by caret; the mixed-case
  # "repeatedCV" is not a listed value.
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
# Formula built from every predictor in vars.
rhs <- paste(vars, collapse = "+")
FL <- as.formula(paste("Target ~ ", rhs))
# To train on the variables selected by RFE instead, use:
# FL <- as.formula(paste("Target ~ ", paste(NewVars, collapse= "+")))

library(kernlab)

# Linear SVM tuned over 7 candidate values, selected by ROC under the
# resampling scheme in MyTrainControl.
# NOTE(review): `probability` looks like an e1071::svm argument; kernlab's
# equivalent is prob.model - confirm that it has any effect here.
model <- train(
  FL,
  data = trainset,
  method = "svmLinear",
  metric = "ROC",
  probability = TRUE,
  tuneLength = 7,
  trControl = MyTrainControl
)

# Alternative: SVM with a radial (RBF) kernel.
# model <- train(FL,data=trainset,method='svmRadial',
# metric = "ROC",
# probability=TRUE,
# tuneLength=7,
# trControl=MyTrainControl)

# ROC across the tuning values.
plot(model, metric = "ROC")

# Class probabilities on the held-out rows, then the AUC of each
# probability column against the true labels.
test <- predict(model, newdata = testset, type = "prob")
colAUC(test, testset$Target)
1.4 Python code for generating some graphs
Python ROC curves look much nicer. Beware of the indentation.
import numpy as np
import pylab as pl
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc

# NOTE(review): `test` (the true labels passed to roc_curve) is not defined
# in this snippet; it must already exist before the code below runs.

# (legend prefix, score file) for each value of the TSVM C parameter, listed
# in the order in which the curves are drawn. The "c0" file holds the run
# with the automatically chosen C.
CURVES = [
    ("C=Auto", "classification.test.c0"),
    ("C=1", "classification.test.c1"),
    ("C=3", "classification.test.c3"),
    ("C=5", "classification.test.c5"),
    ("C=7", "classification.test.c7"),
    ("C=9", "classification.test.c9"),
    ("C=0.25", "classification.test.cd25"),
    ("C=0.50", "classification.test.cd50"),
]


def roc_for(path, labels):
    """Load the scores stored in `path` and return (fpr, tpr, auc) against `labels`."""
    scores = np.loadtxt(path)
    fpr, tpr, _thresholds = roc_curve(labels, scores)
    return fpr, tpr, auc(fpr, tpr)


pl.clf()

# One ROC curve per C value. The original referenced the undefined names
# roc_auc25 / roc_auc50 for the last two curves; computing the AUC alongside
# each curve removes that mismatch.
for legend_prefix, score_file in CURVES:
    fpr, tpr, roc_auc = roc_for(score_file, test)
    pl.plot(fpr, tpr,
            label='%s ROC curve (area = %0.2f)' % (legend_prefix, roc_auc))

# Diagonal of a random classifier, for reference.
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic Transductive SVM')
pl.legend(loc="lower right")
pl.show()
Date: 2012-03-16 09:13:22 CET
HTML generated by org-mode 6.34trans in emacs 23