setwd("D:")
rm(list = ls())
options(scipen = 999)

read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)

## Capping: cap every numeric column at 1.5 * Q3 + 1 to blunt outliers
for (i in 1:ncol(mydata)) {
  if (is.numeric(mydata[, i])) {
    val.quant <- unname(quantile(mydata[, i], probs = 0.75))
    mydata[, i] <- sapply(mydata[, i],
                          function(x) if (x > (1.5 * val.quant + 1)) 1.5 * val.quant + 1 else x)
  }
}

library(randomForest)
x <- mydata[, !names(mydata) %in% c("Cust_Key", "Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x, y, importance = TRUE, ntree = 70)
mydata2 <- mydata[, !names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
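
## Added sketch (not in the original script): plot the importance scores
## computed above; varImpPlot() ships with randomForest, so nothing extra
## is assumed beyond the model object 'fit'.
varImpPlot(fit, n.var = min(20, ncol(x)), main = "Top variables by importance")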
etwd("D:") rm(list=ls()) options (scipen = 999) read.csv("dummy_data.txt") -> dt head(dt) summary(dt) mydata <- dt head(mydata) summary(mydata) ##Capping for(i in 1:ncol(mydata)){ if(is.numeric(mydata[,i])){ val.quant <- unname(quantile(mydata[,i],probs = 0.75)) mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}}) } } library(randomForest) x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")] y <- as.factor(mydata$Y) set.seed(21) fit <- randomForest(x,y,importance=T,ntree = 70) mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")] mydata2$Y <- as.factor(mydata2$Y) fit$importance ####var reduction##### vartoremove <- ncol(mydata2) - 20 library(rminer) ##### for(i in 1:vartoremove){ rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100) varImportance <- Importance(rf,mydata2,method="sensg") Z <- order(varImportance$imp,decreasing = FALSE) IND <- Z[2] var_to_remove <- names(mydata2[IND]) mydata2[IND] = NULL print(i) } ########### library(smbinning) as.data.frame(mydata2) -> inp summary(inp) attach(inp) rm(result) str(inp) inp$target <- as.numeric(inp$Y) *1 table(inp$target) ftable(inp$Y,inp$target) inp$target <- inp$target -1 result= smbinning(df=inp, y="target", x="X37", p=0.0005) result$ivtable smbinning.plot(result,option="badrate",sub="test") summary(inp) result$ivtable boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution") ###Sample require(caTools) inp$Y <- NULL sample = sample.split(inp$target, SplitRatio = .7) train = subset(inp, sample == TRUE) test = subset(inp, sample == FALSE) head(train) nrow(train) fit1 <- glm(train$target~.,data=train,family = binomial) summary(rf) prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") ) result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005) result$ivtable smbinning.plot(result,option="badrate",sub="test") tail(prediction1) write.csv(prediction1 , "test_pred_logistic.csv") predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") ) write.csv(predict_train , "train_pred_logistic.csv") result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005) result$ivtable smbinning.plot(result,option="badrate",sub="train") ####random forest rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200) prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train)) result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005) result$ivtable smbinning.plot(result,option="badrate",sub="train") ###########IV library(devtools) install_github("riv","tomasgreif") library(woe) ##### K-fold Validation ######## library(caret) cv_fold_count = 2 folds = createFolds(mydata2$Y,cv_fold_count,list=T); smpl = folds[[i]]; g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")]; g_test = mydata2[smpl,!names(mydata2) %in% c("Y")]; cost_train = mydata2[-smpl,"Y"]; cost_test = mydata2[smpl,"Y"]; rf <- randomForest(g_train,cost_train) logit.data <- cbind(cost_train,g_train) logit.fit <- glm(cost_train~.,data=logit.data,family = binomial) prediction <- data.f
##### K-fold Validation ########
library(caret)
cv_fold_count <- 2
folds <- createFolds(mydata2$Y, cv_fold_count, list = TRUE)
cv_predictions <- vector("list", cv_fold_count)  # added so each fold's result survives the loop
for (i in 1:cv_fold_count) {  # the original indexed folds[[i]] with a stale 'i'; loop over the folds instead
  smpl <- folds[[i]]
  g_train <- mydata2[-smpl, !names(mydata2) %in% c("Y")]
  g_test <- mydata2[smpl, !names(mydata2) %in% c("Y")]
  cost_train <- mydata2[-smpl, "Y"]
  cost_test <- mydata2[smpl, "Y"]
  rf <- randomForest(g_train, cost_train)
  logit.data <- cbind(cost_train, g_train)
  logit.fit <- glm(cost_train ~ ., data = logit.data, family = binomial)
  # predict() on the held-out fold, not forecast(); score g_test against cost_test
  prediction <- data.frame(actual = cost_test, predicted = predict(rf, g_test))
  cv_predictions[[i]] <- prediction
}
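
## Added sketch (not in the original script): fold-wise hold-out accuracy of the
## random forest, averaged over the folds collected above. The class-label
## predictions are compared directly, so no extra packages are needed.
cv_acc <- sapply(cv_predictions, function(p) mean(p$actual == p$predicted))
mean(cv_acc)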