Binning continuous variables by information value (IV) in R

I am building a logistic regression model in R and want to make the best use of the continuous predictors in relation to the target variable. I know of two approaches:

  • bin each continuous variable so that its IV (information value) is maximized (see the IV sketch after this list)

  • bin it so as to maximize the chi-square statistic in the two-way contingency table between the target (which takes the two values 0 and 1) and the binned variable (one row per bin)
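For context, the IV of a given binning is computed from the event/non-event shares per bin: IV = Σ (good%_i − bad%_i) · ln(good%_i / bad%_i), where ln(good%_i / bad%_i) is the bin's weight of evidence. A rough, untested sketch of that calculation (the function name and columns are only illustrative):

    # IV of one pre-binned variable; target is 0/1, bin is a factor of bin labels
    iv_of_binning <- function(bin, target) {
      tab  <- table(bin, target)             # bin counts split by target value
      good <- tab[, "0"] / sum(tab[, "0"])   # share of non-events per bin
      bad  <- tab[, "1"] / sum(tab[, "1"])   # share of events per bin
      woe  <- log(good / bad)                # weight of evidence per bin
      sum((good - bad) * woe)                # information value (Inf if a bin has zero counts)
    }

    # e.g. iv_of_binning(cut(df$x, breaks = 5), df$target)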

Does anyone know of any functions in R that can do this kind of binning?

Your help will be greatly appreciated.

3 answers

First of all, you can use weight of evidence (WoE) with the woeBinning package, which optimizes the number of bins for IV:

    library(woeBinning)

    # get the bin cut points from your dataframe
    cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
    woe.binning.plot(cutpoints)

    # apply the cut points to your dataframe
    dataset_woe <- woe.binning.deploy(dataset, cutpoints, add.woe.or.dum.var = "woe")

It returns your dataset with two additional columns:

  • Variable_name.binned, which holds the bin labels
  • Variable_name.woe.binned, which holds the replacement WoE values that can then be used in the regression instead of Variable_name (see the sketch below)
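If it helps, the WoE column can go straight into the logistic regression. A small, untested sketch reusing the names from the example above (the glm settings are just an assumption):

    # fit the logistic regression on the WoE-transformed predictor instead of the raw one
    model <- glm(target_name ~ Variable_name.woe.binned,
                 data   = dataset_woe,
                 family = binomial)
    summary(model)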

For the second point, on chi2, the discretization package seems to handle it, but I haven't tested it.
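I haven't tried it, but a ChiMerge-style call with that package would presumably look like this (assuming your data frame has the class label as its last column, which is what the package expects):

    library(discretization)

    # ChiMerge: merge adjacent intervals while the chi-square statistic against
    # the class label stays below the significance threshold alpha
    disc <- chiM(dat, alpha = 0.05)

    disc$cutp             # cut points, one list element per continuous variable
    head(disc$Disc.data)  # data with continuous columns replaced by bin indices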


You might consider the methods that regression splines use to choose knot locations; the rpart package probably has relevant code. You do need to penalize the resulting inferential statistics, since this strategy hides the degrees of freedom spent while moving the break points around to get the best fit. Another common method is to set breaks at equally spaced quantiles (quartiles or quintiles) within the subset where IV = 1. Something like this untested code:

    cont.var.vec <- # fill in: the names of all your continuous variables
    # quantile-based break points for a numeric vector
    breaks <- function(x, n) quantile(x, probs = seq(0, 1, length.out = n), na.rm = TRUE)
    # break points for each continuous variable within the subset where IV == 1
    lapply(dfrm[dfrm$IV == 1, cont.var.vec], breaks, n = 5)
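To actually apply those break points you could then hand them to cut(); an untested follow-on sketch (brk.list is my own name, dfrm and cont.var.vec as above):

    # turn each continuous variable into a factor using its quantile break points
    brk.list <- lapply(dfrm[dfrm$IV == 1, cont.var.vec], breaks, n = 5)
    for (v in cont.var.vec) {
      dfrm[[paste0(v, ".binned")]] <- cut(dfrm[[v]],
                                          breaks = unique(brk.list[[v]]),
                                          include.lowest = TRUE)
    }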

    setwd("D:")
    rm(list = ls())
    options(scipen = 999)

    read.csv("dummy_data.txt") -> dt
    head(dt)
    summary(dt)
    mydata <- dt
    head(mydata)
    summary(mydata)

    ## Capping of numeric variables at 1.5 * 75th percentile + 1
    for (i in 1:ncol(mydata)) {
      if (is.numeric(mydata[, i])) {
        val.quant <- unname(quantile(mydata[, i], probs = 0.75))
        mydata[, i] <- sapply(mydata[, i],
                              function(x) if (x > (1.5 * val.quant + 1)) 1.5 * val.quant + 1 else x)
      }
    }

    library(randomForest)
    x <- mydata[, !names(mydata) %in% c("Cust_Key", "Y")]
    y <- as.factor(mydata$Y)
    set.seed(21)
    fit <- randomForest(x, y, importance = TRUE, ntree = 70)
    mydata2 <- mydata[, !names(mydata) %in% c("Cust_Key")]
    mydata2$Y <- as.factor(mydata2$Y)
    fit$importance

    #### variable reduction: drop the least important variable on each pass ####
    vartoremove <- ncol(mydata2) - 20
    library(rminer)
    for (i in 1:vartoremove) {
      rf <- fit(Y ~ ., data = mydata2, model = "randomForest", mtry = 10, ntree = 100)
      varImportance <- Importance(rf, mydata2, method = "sensg")
      Z <- order(varImportance$imp, decreasing = FALSE)
      IND <- Z[2]
      var_to_remove <- names(mydata2[IND])
      mydata2[IND] <- NULL
      print(i)
    }

    #### binning with smbinning ####
    library(smbinning)
    as.data.frame(mydata2) -> inp
    summary(inp)
    attach(inp)
    rm(result)
    str(inp)
    inp$target <- as.numeric(inp$Y) * 1
    table(inp$target)
    ftable(inp$Y, inp$target)
    inp$target <- inp$target - 1
    result <- smbinning(df = inp, y = "target", x = "X37", p = 0.0005)
    result$ivtable
    smbinning.plot(result, option = "badrate", sub = "test")
    summary(inp)
    result$ivtable
    boxplot(inp$X2 ~ inp$Y, horizontal = TRUE, frame = FALSE, col = "red", main = "Distribution")

    ### train/test split
    require(caTools)
    inp$Y <- NULL
    sample <- sample.split(inp$target, SplitRatio = .7)
    train <- subset(inp, sample == TRUE)
    test <- subset(inp, sample == FALSE)
    head(train)
    nrow(train)

    ### logistic regression
    fit1 <- glm(train$target ~ ., data = train, family = binomial)
    summary(fit1)
    prediction1 <- data.frame(actual = test$target,
                              predicted = predict(fit1, test, type = "response"))
    result <- smbinning(df = prediction1, y = "actual", x = "predicted", p = 0.005)
    result$ivtable
    smbinning.plot(result, option = "badrate", sub = "test")
    tail(prediction1)
    write.csv(prediction1, "test_pred_logistic.csv")
    predict_train <- data.frame(actual = train$target,
                                predicted = predict(fit1, train, type = "response"))
    write.csv(predict_train, "train_pred_logistic.csv")
    result <- smbinning(df = predict_train, y = "actual", x = "predicted", p = 0.005)
    result$ivtable
    smbinning.plot(result, option = "badrate", sub = "train")

    #### random forest ####
    rf <- fit(target ~ ., data = train, model = "randomForest", mtry = 10, ntree = 200)
    prediction2 <- data.frame(actual = test$target, predicted = predict(rf, test))
    result <- smbinning(df = prediction2, y = "actual", x = "predicted", p = 0.005)
    result$ivtable
    smbinning.plot(result, option = "badrate", sub = "train")

    ########### IV
    library(devtools)
    install_github("riv", "tomasgreif")
    library(woe)

    ##### k-fold validation ########
    library(caret)
    cv_fold_count <- 2
    folds <- createFolds(mydata2$Y, cv_fold_count, list = TRUE)
    smpl <- folds[[i]]   # note: reuses i from the earlier loop
    g_train <- mydata2[-smpl, !names(mydata2) %in% c("Y")]
    g_test <- mydata2[smpl, !names(mydata2) %in% c("Y")]
    cost_train <- mydata2[-smpl, "Y"]
    cost_test <- mydata2[smpl, "Y"]
    rf <- randomForest(g_train, cost_train)
    logit.data <- cbind(cost_train, g_train)
    logit.fit <- glm(cost_train ~ ., data = logit.data, family = binomial)
    prediction <- data.frame(actual = test$Y, predicted = predict(rf, test))

