How to change the cost matrix in R using caret and C5.0Cost?

I am currently experimenting with caret and C5.0Cost in R. So far I have a basic model that works fine, but the cost settings give me some headaches.

It seems I cannot change the cost of false positives.

library(mlbench)
data(Sonar)

library(caret)

set.seed(990)
inTraining <- createDataPartition(Sonar$Class, p = .5, list = FALSE)
inTraining
training <- Sonar[inTraining,]
test <- Sonar[-inTraining,]

set.seed(990)
fitControl <- trainControl(method="repeatedcv", number=10, repeats=5)
## attempt to pass a full 2x2 cost matrix; note that expand.grid() treats the
## matrix as a plain vector, so this silently produces four grid rows with the
## scalar costs 0, 1, 2 and 0 rather than one row holding a matrix
statGrid <- expand.grid(trials = 1,
                        model = "tree",
                        winnow = FALSE,
                        cost = matrix(c(
                            0, 2,
                            1, 0
                        ), 2, 2, byrow=TRUE))

set.seed(825)
statFit <- train(Class~., data=training, method="C5.0Cost", trControl=fitControl, tuneGrid = statGrid, metric = "Accuracy")

statFit["finalModel"]

write(capture.output(summary(statFit)), "c50model.txt")

R version 3.2.1 (2015-06-18)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 8 x64 (build 9200)

locale:
[1] LC_COLLATE=German_Germany.1252  LC_CTYPE=German_Germany.1252    LC_MONETARY=German_Germany.1252
[4] LC_NUMERIC=C                    LC_TIME=German_Germany.1252

attached base packages:
[1] grid      graphics  stats     grDevices utils     datasets  methods   base

other attached packages:
[1] DMwR_0.4.1      plyr_1.8.3      C50_0.1.0-24    caret_6.0-52    ggplot2_1.0.1   lattice_0.20-31
[7] mlbench_2.1-1

loaded via a namespace (and not attached):
 [1] Rcpp_0.11.6         compiler_3.2.1      nloptr_1.0.4        bitops_1.0-6
 [5] xts_0.9-7           class_7.3-12        iterators_1.0.7     tools_3.2.1
 [9] rpart_4.1-9         partykit_1.0-3      digest_0.6.8        lme4_1.1-8
[13] nlme_3.1-120        gtable_0.1.2        mgcv_1.8-6          Matrix_1.2-1
[17] foreach_1.4.2       parallel_3.2.1      brglm_0.5-9         SparseM_1.6
[21] proto_0.3-10        e1071_1.6-7         BradleyTerry2_1.0-6 stringr_1.0.0
[25] caTools_1.17.1      gtools_3.5.0        stats4_3.2.1        nnet_7.3-9
[29] survival_2.38-1     gdata_2.17.0        minqa_1.2.4         ROCR_1.0-7
[33] TTR_0.23-0          reshape2_1.4.1      car_2.0-26          magrittr_1.5
[37] gplots_2.17.0       scales_0.2.5        codetools_0.2-11    MASS_7.3-40
[41] splines_3.2.1       …
[45] colorspace_1.2-6    quantreg_5.11       KernSmooth_2.23-14  stringi_0.5-5
[49] munsell_0.4.2       zoo_1.7-12

No matter what values I enter in the cost matrix, the false-positive cost seems to be ignored and the resulting model stays the same. I can see which cost matrix was actually used when I print statFit["finalModel"] to the R console. How can I set the cost of false positives?

@JimBoy If you look at the source of "C5.0Cost" on GitHub, you can see that only one of the two misclassification costs is exposed as a tuning parameter; the other is hard-coded to 1 (see cmat).
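
For reference, the cost handling in the stock "C5.0Cost" fit function looks roughly like this (a paraphrase of the caret sources around version 6.0-52, not a verbatim quote):

## caret's built-in C5.0Cost: 'cost' is the single tuning parameter and the
## opposite off-diagonal entry of the cost matrix is hard-coded to 1
cmat <- matrix(c(0, param$cost, 1, 0), ncol = 2)
rownames(cmat) <- colnames(cmat) <- levels(y)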

It is straightforward to copy that modelInfo and adapt it to your needs. In the version below, expand.grid takes a separate cost for false positives (costFP) and for false negatives (costFN) instead of a single cost value.

modelInfo <- list(label = "Cost-Sensitive C5.0",
            library = c("C50", "plyr"),
            loop = function(grid) {     
              loop <- ddply(grid, c("model", "winnow", "costFP","costFN"),
                            function(x) c(trials = max(x$trials)))                 

              submodels <- vector(mode = "list", length = nrow(loop))
              for(i in seq(along = loop$trials))
              {
                index <- which(grid$model == loop$model[i] & 
                                 grid$winnow == loop$winnow[i] &
                                 grid$costFP == loop$costFP[i] &
                                 grid$costFN == loop$costFN[i])
                trials <- grid[index, "trials"] 
                submodels[[i]] <- data.frame(trials = trials[trials != loop$trials[i]])
              }     
              list(loop = loop, submodels = submodels)
            },
            type = "Classification",
            parameters = data.frame(parameter = c('trials', 'model', 'winnow', "costFP","costFN"),
                                    class = c("numeric", "character", "logical", "numeric","numeric"),
                                    label = c('# Boosting Iterations', 'Model Type', 'Winnow', "CostFP","CostFN")),
            grid = function(x, y, len = NULL, search = "grid") {
              if(search == "grid") {
                c5seq <- if(len == 1)  1 else  c(1, 10*((2:min(len, 11)) - 1))
                out <- expand.grid(trials = c5seq, model = c("tree", "rules"), 
                                   winnow = c(TRUE, FALSE), costFP = 1:2, costFN = 1:2)
              } else {
                out <- data.frame(trials = sample(1:100, replace = TRUE, size = len),
                                  model = sample(c("tree", "rules"), replace = TRUE, size = len),
                                  winnow = sample(c(TRUE, FALSE), replace = TRUE, size = len),
                                  costFP = runif(len, min = 1, max = 20),
                                  costFN = runif(len, min = 1, max = 20))
              }
              out    
            },
            fit = function(x, y, wts, param, lev, last, classProbs, ...) { 
              theDots <- list(...)

              if(any(names(theDots) == "control"))
              {                           
                theDots$control$winnow <- param$winnow
              } else theDots$control <- C5.0Control(winnow = param$winnow)

              argList <- list(x = x, y = y, weights = wts, trials = param$trials,
                              rules = param$model == "rules")

              ## build the 2x2 cost matrix from the two tuning parameters;
              ## which off-diagonal entry acts as the FP vs. FN cost depends
              ## on the order of levels(y)
              cmat <- matrix(c(0, param$costFP, param$costFN, 0), ncol = 2)
              rownames(cmat) <- colnames(cmat) <- levels(y)
              ## guard against a user-supplied costs argument in '...'
              if(any(names(theDots) == "costs")){
                warning("For 'C5.0Cost', the costs are a tuning parameter")
                theDots$costs <- cmat
              } else argList$costs <- cmat

              argList <- c(argList, theDots)
              ## C5.0.default is not exported by C50, so call it via :::
              do.call(C50:::C5.0.default, argList)
            },
            predict = function(modelFit, newdata, submodels = NULL) {
              out <- predict(modelFit, newdata)

              if(!is.null(submodels))
              {
                tmp <- out
                out <- vector(mode = "list", length = nrow(submodels) + 1)
                out[[1]] <- tmp

                for(j in seq(along = submodels$trials))
                  out[[j+1]] <- as.character(predict(modelFit, newdata, trials = submodels$trials[j]))
              }
              out              
            },
            prob = NULL,
            predictors = function(x, ...) {
              vars <- C5imp(x, metric = "splits")
              rownames(vars)[vars$Overall > 0]
            },
            levels = function(x) x$obsLevels,
            varImp = function(object, ...) C5imp(object, ...),
            tags = c("Tree-Based Model", "Rule-Based Model", "Implicit Feature Selection",
                     "Boosting", "Ensemble Model", "Cost Sensitive Learning", "Two Class Only", 
                     "Handle Missing Predictor Data", "Accepts Case Weights"),
            sort = function(x){
              x$model <- factor(as.character(x$model), levels = c("rules", "tree"))
              x[order(x$trials, x$model, !x$winnow, x$costFP,x$costFN),]
            },
            trim = function(x) {
              x$boostResults <- NULL
              x$size <- NULL
              x$call <- NULL
              x$output <- NULL
              x
            })
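
Before plugging the custom spec into train(), you can sanity-check the tuning grid it generates. The grid function does not use x or y, so dummy values are fine:

## the generated grid should now contain separate costFP and costFN columns
head(modelInfo$grid(x = NULL, y = NULL, len = 2, search = "grid"))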

Using your example, training with the modified spec then looks like this:

## Example provided
library(mlbench)
data(Sonar)

library(caret)


set.seed(990)
inTraining <- createDataPartition(Sonar$Class, p = .5, list = FALSE)
inTraining
training <- Sonar[inTraining,]
test <- Sonar[-inTraining,]



set.seed(990)
fitControl <- trainControl(method="repeatedcv", number=10, repeats=5)


statGrid <-  expand.grid(trials = 3,
                         model = "tree",
                         winnow = FALSE,
                         cost = 2)

set.seed(825)


statFit <- train(Class~., data=training, method="C5.0Cost", trControl=fitControl, tuneGrid = statGrid, metric = "Accuracy")


## Example modified to include costs for both false positives and negatives
set.seed(825)
statGridMod <-  expand.grid(trials = 3,
                            model = "tree",
                            winnow = FALSE,
                            costFP = c(1,2,3), #new cost parameters
                            costFN = c(3,2,1)) #new cost parameters


statFit <- train(Class~., data=training, method=modelInfo, trControl=fitControl, tuneGrid = statGridMod, metric = "Accuracy")

statFit
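
To verify that both costs actually reached C5.0, inspect the winning model: summary() on a C5.0 fit prints the cost matrix that was used, and bestTune holds the selected parameter combination:

summary(statFit$finalModel)  # printed C5.0 output includes the cost matrix
statFit$bestTune             # selected trials/model/winnow/costFP/costFN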