Source Code used in the Project

Table of Contents

1 Source Code

1.1 Variable selection in R

Variable selection is based on http://www.uccor.edu.ar/paginas/seminarios/Software/SVM_RFE_R_implementation.pdf

 
####################################
# RFE parameters
####################################
# caret supplies caretFuncs, rfe(), rfeControl() and trainControl(), all of
# which are used below; it has to be attached before this section runs.
library(caret)
library(ipred)
library(e1071)

# Custom functions: start from caret's default helper set for rfe().
# Its ranking function is overridden below.
svmFuncs <- caretFuncs


#SVM Ranking function
svmFuncs$rank <- function (object, x, y) {
  # Ranks predictors by the squared weight they receive in the fitted SVM.
  #
  # object: fitted model; only object$finalModel is used (its coefficients
  #         and its @xmatrix slot).
  # x, y:   not used here; kept because the signature is fixed by the caller.
  #
  # Returns a data frame with columns 'vimp' (squared weight), 'var' (row
  # name of the weight) and 'Overall' (nrow for the first row, down to 1).
  fit <- object$finalModel

  # Weight vector: coefficients combined with the stored x matrix.
  weights <- t(coef(fit)[[1]]) %*% fit@xmatrix[[1]]

  vimp <- data.frame(t(weights) * t(weights))
  names(vimp)[1] <- 'vimp'
  vimp$var <- row.names(vimp)

  # Ascending order of 1 / vimp puts the largest squared weight first.
  vimp <- vimp[order(1 / vimp$vimp), ]
  vimp$Overall <- seq(nrow(vimp), 1)
  vimp
}


# Resampling scheme for recursive feature elimination:
# 10-fold cross-validation, repeated 10 times, using the custom svmFuncs.
RFEcontrol <- rfeControl(
  functions = svmFuncs,
  # "repeatedcv" is the spelling documented by caret; using it avoids
  # depending on how the package normalises the method name internally.
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  rerank = FALSE,
  returnResamp = "final",
  saveDetails = FALSE,
  verbose = TRUE
)

# Resampling scheme for the model fitted inside each RFE step:
# a single round of 10-fold cross-validation, scored with twoClassSummary
# on class probabilities.
TrainControl <- trainControl(
  # "repeatedcv" is the spelling documented by caret; using it avoids
  # depending on how the package normalises the method name internally.
  method = "repeatedcv",
  number = 10,
  repeats = 1,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)




# Predictors and class labels for the feature-selection run.
# NOTE: trainset and xnames must already exist; they are created by the data
# preparation code of section 1.2.
x <- trainset[, xnames]
y <- trainset$Target

# Recursive feature elimination with a linear SVM (cost fixed at C = 1),
# evaluating subsets of 130, 140, 150 and 160 predictors and keeping the
# subset with the highest ROC value.
RFE <- rfe(
  x, y,
  sizes = seq(130, 160, by = 10),
  method = 'svmLinear',
  tuneGrid = expand.grid(.C = 1),
  metric = 'ROC',
  maximize = TRUE,
  rfeControl = RFEcontrol,
  trControl = TrainControl
)

# Names of the selected variables, then a printed summary and a plot.
NewVars <- RFE$optVariables
RFE
plot(RFE)

1.2 Logistic Regression Training

More information on glmnet: http://cran.r-project.org/web/packages/glmnet/index.html

 
 library('caTools')
 library('caret')
 library('glmnet')
 library('ipred')
 library('e1071')

 # Read the raw data; the first line of the file holds the column names.
 Data <- read.csv("overfitting.csv", header = TRUE)

 # Recode Target_Practice as a factor ('X1' where it equals 1, 'X0'
 # otherwise), then drop the three raw target columns.
 Data$Target <- as.factor(ifelse(Data$Target_Practice == 1, 'X1', 'X0'))
 Data$Target_Evaluate <- NULL
 Data$Target_Leaderboard <- NULL
 Data$Target_Practice <- NULL

 # Every remaining column except the bookkeeping ones is a predictor.
 keyCols <- c('Target', 'case_id', 'train')
 xnames <- setdiff(names(Data), keyCols)

 # Reorder: bookkeeping columns first, predictors after.
 Data <- Data[, c(keyCols, xnames)]

 # Split on the 'train' flag: 1 = training rows, 0 = test rows.
 trainset <- Data[Data$train == 1, ]
 testset <- Data[Data$train == 0, ]

 # The training set keeps only the target and the predictors.
 trainset$case_id <- NULL
 trainset$train <- NULL
 

# Resampling scheme for model tuning: 10-fold cross-validation repeated
# 5 times, scored with twoClassSummary on class probabilities.
MyTrainControl <- trainControl(
  # "repeatedcv" is the spelling documented by caret; using it avoids
  # depending on how the package normalises the method name internally.
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

 # Model formula: the target regressed on every other column of trainset.
 theTarget <- 'Target'
 theFormula <- as.formula(paste(theTarget, " ~ . "))

 # Fit glmnet with the training control defined above, picking the
 # combination with the best ROC value. The grid crosses alpha in {0, 1}
 # with lambda from 0 to 0.25 in steps of 0.005.
 model <- train(
   theFormula,
   data = trainset,
   method = 'glmnet',
   metric = "ROC",
   tuneGrid = expand.grid(
     .alpha = c(0, 1),
     .lambda = seq(0, .25, by = 0.005)
   ),
   trControl = MyTrainControl
 )

 # Class probabilities on the held-out rows and the resulting AUC.
 test <- predict(model, newdata = testset, type = "prob")

 colAUC(test, testset$Target)

1.3 Linear and RBF SVM with cross-validation

 #Load Required Packages
library('caTools')
library('caret')
library('glmnet')
library('ipred')
library('e1071')

#data reading and setup

# Load the raw data set (header row present).
Data <- read.csv("overfitting.csv", header = TRUE)

# Build the two-level class factor from Target_Practice: 'X1' where the
# value is 1, 'X0' elsewhere. The raw target columns are then removed.
Data$Target <- as.factor(ifelse(Data$Target_Practice == 1, 'X1', 'X0'))
Data$Target_Evaluate <- NULL
Data$Target_Leaderboard <- NULL
Data$Target_Practice <- NULL

# Predictor names: everything except the target, the id and the split flag.
vars <- setdiff(names(Data), c('Target', 'case_id', 'train'))

# Column order: target, id, split flag, then the predictors.
Data <- Data[, c('Target', 'case_id', 'train', vars)]

# Rows flagged train == 1 form the training set, train == 0 the test set.
trainset <- Data[Data$train == 1, ]
testset <- Data[Data$train == 0, ]

# Drop the id and split flag from the training data.
trainset$case_id <- NULL
trainset$train <- NULL

# 10-fold cross-validation repeated 5 times, scored with twoClassSummary
# on class probabilities.
MyTrainControl <- trainControl(
  # "repeatedcv" is the spelling documented by caret; using it avoids
  # depending on how the package normalises the method name internally.
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)


# Model formula built from every predictor name in vars.
predictorTerms <- paste(vars, collapse = "+")
FL <- as.formula(paste("Target ~ ", predictorTerms))

# Alternative: restrict the formula to the variables selected by RFE.
# FL <- as.formula(paste("Target ~ ", paste(NewVars, collapse= "+")))

library(kernlab)

# Linear SVM, tuned on ROC with the resampling scheme defined above.
# NOTE(review): `probability` looks like an e1071::svm argument; confirm that
# it has any effect for method 'svmLinear'.
model <- train(
  FL,
  data = trainset,
  method = 'svmLinear',
  metric = "ROC",
  probability = TRUE,
  tuneLength = 7,
  trControl = MyTrainControl
)

# Radial-kernel variant (disabled); swap it in for the call above to use it.
#model <- train(FL,data=trainset,method='svmRadial',
#        metric = "ROC",
#        probability=TRUE,
#        tuneLength=7,
#        trControl=MyTrainControl)

# Tuning profile, then class probabilities and AUC on the held-out rows.
plot(model, metric = "ROC")
test <- predict(model, newdata = testset, type = "prob")
colAUC(test, testset$Target)

1.4 Python code for generating some graphs

ROC curves drawn with Python look much nicer. Beware of the indentation when copying the code.

  

  import numpy as np
  import pylab as pl
  from sklearn import svm, datasets
  from sklearn.utils import shuffle
  from sklearn.metrics import roc_curve, auc
  
  # Compute the ROC curve and its AUC for each value of the C parameter of
  # the TSVM. Each entry pairs the suffix of a "classification.test.<suffix>"
  # score file with the C value shown in the legend; entries are listed in
  # the order the curves are drawn.
  # NOTE(review): `test` (the true labels passed to roc_curve) is not defined
  # anywhere in this snippet -- it must be loaded before this point.
  runs = [
      ("c0", "Auto"),
      ("c1", "1"),
      ("c3", "3"),
      ("c5", "5"),
      ("c7", "7"),
      ("c9", "9"),
      ("cd25", "0.25"),
      ("cd50", "0.50"),
  ]

  curves = []
  for suffix, c_label in runs:
      ar = np.loadtxt("classification.test." + suffix)
      fpr, tpr, _ = roc_curve(test, ar)
      curves.append((c_label, fpr, tpr, auc(fpr, tpr)))

  pl.clf()

  # One line per C value. Keeping each AUC next to its curve prevents the
  # mismatched variable names that previously made the last two plots fail.
  for c_label, fpr, tpr, roc_auc in curves:
      pl.plot(fpr, tpr,
              label='C=%s ROC curve (area = %0.2f)' % (c_label, roc_auc))

  # Diagonal reference line (a classifier no better than chance).
  pl.plot([0, 1], [0, 1], 'k--')
  pl.xlim([0.0, 1.0])
  pl.ylim([0.0, 1.0])
  pl.xlabel('False Positive Rate')
  pl.ylabel('True Positive Rate')
  pl.title('Receiver operating characteristic Transductive SVM')
  pl.legend(loc="lower right")
  pl.show()

Author: Miguel Fernando Cabrera <miguel.cabrera@tum.de>

Date: 2012-03-16 09:13:22 CET

HTML generated by org-mode 6.34trans in emacs 23