1) Modelling with All Variables
model <- glm(Default_Payment~.-Customer.ID,data=train,family = binomial())
summary(model)
##
## Call:
## glm(formula = Default_Payment ~ . - Customer.ID, family = binomial(),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2580 -0.6014 -0.5133 -0.3207 3.4048
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -2.756e+00 5.378e-01
## Credit_Amount -1.347e-06 1.663e-07
## GenderFemale -1.266e-01 3.603e-02
## Academic_QualificationGraduate 1.400e-02 4.154e-02
## Academic_QualificationPostgraduate -2.270e-02 5.591e-02
## Academic_QualificationProfessional -1.136e+00 4.675e-01
## Academic_QualificationOthers -1.211e+00 2.759e-01
## Academic_QualificationUnknown -4.848e-01 4.487e-01
## MaritalMarried 1.328e+00 5.295e-01
## MaritalSingle 1.147e+00 5.297e-01
## MaritalDo not Prefer To say 1.335e+00 5.524e-01
## Age_Years 2.432e-03 2.191e-03
## Repayment_Status_JanPayment delay for one month 7.308e-01 5.614e-02
## Repayment_Status_JanPayment delay for two months 2.032e+00 6.212e-02
## Repayment_Status_JanPayment delay for three months 2.107e+00 1.705e-01
## Repayment_Status_JanPayment delay for four months 1.929e+00 3.353e-01
## Repayment_Status_JanPayment delay for five months 1.512e+00 5.428e-01
## Repayment_Status_JanPayment delay for six months 4.739e-01 8.049e-01
## Repayment_Status_FebPayment delay for one month -2.871e-01 5.798e-01
## Repayment_Status_FebPayment delay for two months 1.934e-01 6.731e-02
## Repayment_Status_FebPayment delay for three months 1.370e-01 1.695e-01
## Repayment_Status_FebPayment delay for four months -5.539e-01 3.511e-01
## Repayment_Status_FebPayment delay for five months 1.221e+00 7.676e-01
## Repayment_Status_FebPayment delay for six months 1.483e+00 1.034e+00
## Repayment_Status_MarchPayment delay for one month -1.126e+01 2.294e+02
## Repayment_Status_MarchPayment delay for two months 3.093e-01 6.641e-02
## Repayment_Status_MarchPayment delay for three months 2.577e-01 2.069e-01
## Repayment_Status_MarchPayment delay for four months -1.381e-01 4.525e-01
## Repayment_Status_MarchPayment delay for five months -5.632e-01 7.179e-01
## Repayment_Status_MarchPayment delay for six months -1.066e-01 7.876e-01
## Repayment_Status_AprilPayment delay for one month 1.323e+01 2.294e+02
## Repayment_Status_AprilPayment delay for two months 3.116e-01 7.292e-02
## Repayment_Status_AprilPayment delay for three months 1.558e-01 2.472e-01
## Repayment_Status_AprilPayment delay for four months 5.947e-01 4.741e-01
## Repayment_Status_AprilPayment delay for five months -8.922e-01 7.627e-01
## Repayment_Status_AprilPayment delay for six months -1.367e+01 1.467e+02
## Repayment_Status_MayPayment delay for one month 3.001e-01 8.007e-02
## Repayment_Status_MayPayment delay for two months 6.348e-02 2.402e-01
## Repayment_Status_MayPayment delay for three months -4.483e-01 5.046e-01
## Repayment_Status_MayPayment delay for four months 1.159e+00 9.376e-01
## Repayment_Status_MayPayment delay for five months 1.418e+01 1.467e+02
## Repayment_Status_JunePayment delay for one month 2.900e-01 7.045e-02
## Repayment_Status_JunePayment delay for two months 7.052e-01 2.447e-01
## Repayment_Status_JunePayment delay for three months 2.007e-01 5.298e-01
## Repayment_Status_JunePayment delay for four months -3.368e-02 7.087e-01
## Repayment_Status_JunePayment delay for five months 7.158e-01 8.157e-01
## Jan_Bill_Amount -8.311e-07 3.975e-07
## Feb_Bill_Amount 2.223e-06 5.898e-07
## March_Bill_Amount 8.233e-07 8.313e-07
## April_Bill_Amount -6.872e-07 8.271e-07
## May_Bill_Amount 1.766e-07 1.488e-06
## June_Bill_Amount -1.084e-06 1.318e-06
## Previous_Payment_Jan -1.363e-05 2.432e-06
## Previous_Payment_Feb -4.544e-06 1.541e-06
## Previous_Payment_March 3.301e-07 1.415e-06
## Previous_Payment_April -4.536e-06 1.932e-06
## Previous_Payment_May -1.208e-06 1.668e-06
## Previous_Payment_June -3.966e-06 1.538e-06
## z value Pr(>|z|)
## (Intercept) -5.126 2.96e-07 ***
## Credit_Amount -8.099 5.54e-16 ***
## GenderFemale -3.513 0.000443 ***
## Academic_QualificationGraduate 0.337 0.736197
## Academic_QualificationPostgraduate -0.406 0.684791
## Academic_QualificationProfessional -2.431 0.015063 *
## Academic_QualificationOthers -4.391 1.13e-05 ***
## Academic_QualificationUnknown -1.080 0.280002
## MaritalMarried 2.507 0.012167 *
## MaritalSingle 2.166 0.030316 *
## MaritalDo not Prefer To say 2.417 0.015631 *
## Age_Years 1.110 0.266971
## Repayment_Status_JanPayment delay for one month 13.017 < 2e-16 ***
## Repayment_Status_JanPayment delay for two months 32.713 < 2e-16 ***
## Repayment_Status_JanPayment delay for three months 12.358 < 2e-16 ***
## Repayment_Status_JanPayment delay for four months 5.754 8.74e-09 ***
## Repayment_Status_JanPayment delay for five months 2.785 0.005354 **
## Repayment_Status_JanPayment delay for six months 0.589 0.555989
## Repayment_Status_FebPayment delay for one month -0.495 0.620502
## Repayment_Status_FebPayment delay for two months 2.874 0.004059 **
## Repayment_Status_FebPayment delay for three months 0.808 0.418961
## Repayment_Status_FebPayment delay for four months -1.578 0.114638
## Repayment_Status_FebPayment delay for five months 1.591 0.111574
## Repayment_Status_FebPayment delay for six months 1.434 0.151547
## Repayment_Status_MarchPayment delay for one month -0.049 0.960851
## Repayment_Status_MarchPayment delay for two months 4.658 3.20e-06 ***
## Repayment_Status_MarchPayment delay for three months 1.245 0.213112
## Repayment_Status_MarchPayment delay for four months -0.305 0.760139
## Repayment_Status_MarchPayment delay for five months -0.785 0.432743
## Repayment_Status_MarchPayment delay for six months -0.135 0.892365
## Repayment_Status_AprilPayment delay for one month 0.058 0.954010
## Repayment_Status_AprilPayment delay for two months 4.274 1.92e-05 ***
## Repayment_Status_AprilPayment delay for three months 0.630 0.528678
## Repayment_Status_AprilPayment delay for four months 1.254 0.209707
## Repayment_Status_AprilPayment delay for five months -1.170 0.242053
## Repayment_Status_AprilPayment delay for six months -0.093 0.925741
## Repayment_Status_MayPayment delay for one month 3.748 0.000178 ***
## Repayment_Status_MayPayment delay for two months 0.264 0.791591
## Repayment_Status_MayPayment delay for three months -0.888 0.374332
## Repayment_Status_MayPayment delay for four months 1.236 0.216445
## Repayment_Status_MayPayment delay for five months 0.097 0.922980
## Repayment_Status_JunePayment delay for one month 4.117 3.84e-05 ***
## Repayment_Status_JunePayment delay for two months 2.882 0.003957 **
## Repayment_Status_JunePayment delay for three months 0.379 0.704805
## Repayment_Status_JunePayment delay for four months -0.048 0.962091
## Repayment_Status_JunePayment delay for five months 0.878 0.380179
## Jan_Bill_Amount -2.091 0.036533 *
## Feb_Bill_Amount 3.769 0.000164 ***
## March_Bill_Amount 0.990 0.321998
## April_Bill_Amount -0.831 0.406064
## May_Bill_Amount 0.119 0.905531
## June_Bill_Amount -0.822 0.411112
## Previous_Payment_Jan -5.606 2.07e-08 ***
## Previous_Payment_Feb -2.949 0.003191 **
## Previous_Payment_March 0.233 0.815527
## Previous_Payment_April -2.349 0.018845 *
## Previous_Payment_May -0.724 0.468770
## Previous_Payment_June -2.578 0.009942 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25365 on 23999 degrees of freedom
## Residual deviance: 20897 on 23942 degrees of freedom
## AIC: 21013
##
## Number of Fisher Scoring iterations: 11
Prediction and Model evaluation
pred <- predict(model,newdata = test,type = 'response')
test$prd <- factor(ifelse(pred>0.5,1,0))
cf <- confusionMatrix(test$prd,reference = test$Default_Payment,positive='1')
cf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4438 876
## 1 235 451
##
## Accuracy : 0.8148
## 95% CI : (0.8048, 0.8246)
## No Information Rate : 0.7788
## P-Value [Acc > NIR] : 3.914e-12
##
## Kappa : 0.3501
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.33986
## Specificity : 0.94971
## Pos Pred Value : 0.65743
## Neg Pred Value : 0.83515
## Prevalence : 0.22117
## Detection Rate : 0.07517
## Detection Prevalence : 0.11433
## Balanced Accuracy : 0.64479
##
## 'Positive' Class : 1
##
- Although Accuracy is high 82% but senstivity is low at 32%
- Model does not predict Positive Class(Default Payment=1, event) well enough as compared to non-event( Default Payment=0)
2) Model with removing non significant variables
fml <- formula(Default_Payment~Credit_Amount+Marital+Gender+Academic_Qualification+Age_Years+Repayment_Status_Jan+Previous_Payment_Jan)
model2 <- glm(fml,data=train,family = binomial())
summary(model2)
##
## Call:
## glm(formula = fml, family = binomial(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9237 -0.6151 -0.5200 -0.3541 3.4806
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -2.430e+00 5.160e-01
## Credit_Amount -1.988e-06 1.509e-07
## MaritalMarried 1.155e+00 5.081e-01
## MaritalSingle 9.693e-01 5.084e-01
## MaritalDo not Prefer To say 1.096e+00 5.320e-01
## GenderFemale -1.447e-01 3.546e-02
## Academic_QualificationGraduate 3.788e-02 4.076e-02
## Academic_QualificationPostgraduate -8.681e-03 5.489e-02
## Academic_QualificationProfessional -1.241e+00 4.728e-01
## Academic_QualificationOthers -1.352e+00 2.793e-01
## Academic_QualificationUnknown -6.452e-01 4.586e-01
## Age_Years 2.527e-03 2.158e-03
## Repayment_Status_JanPayment delay for one month 1.033e+00 4.539e-02
## Repayment_Status_JanPayment delay for two months 2.501e+00 5.226e-02
## Repayment_Status_JanPayment delay for three months 2.863e+00 1.527e-01
## Repayment_Status_JanPayment delay for four months 2.466e+00 2.901e-01
## Repayment_Status_JanPayment delay for five months 1.686e+00 4.443e-01
## Repayment_Status_JanPayment delay for six months 1.708e+00 3.633e-01
## Previous_Payment_Jan -1.348e-05 2.175e-06
## z value Pr(>|z|)
## (Intercept) -4.709 2.49e-06 ***
## Credit_Amount -13.175 < 2e-16 ***
## MaritalMarried 2.272 0.023084 *
## MaritalSingle 1.907 0.056558 .
## MaritalDo not Prefer To say 2.061 0.039319 *
## GenderFemale -4.081 4.48e-05 ***
## Academic_QualificationGraduate 0.930 0.352627
## Academic_QualificationPostgraduate -0.158 0.874335
## Academic_QualificationProfessional -2.625 0.008653 **
## Academic_QualificationOthers -4.839 1.30e-06 ***
## Academic_QualificationUnknown -1.407 0.159472
## Age_Years 1.171 0.241665
## Repayment_Status_JanPayment delay for one month 22.748 < 2e-16 ***
## Repayment_Status_JanPayment delay for two months 47.856 < 2e-16 ***
## Repayment_Status_JanPayment delay for three months 18.749 < 2e-16 ***
## Repayment_Status_JanPayment delay for four months 8.500 < 2e-16 ***
## Repayment_Status_JanPayment delay for five months 3.794 0.000149 ***
## Repayment_Status_JanPayment delay for six months 4.702 2.57e-06 ***
## Previous_Payment_Jan -6.197 5.75e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25365 on 23999 degrees of freedom
## Residual deviance: 21344 on 23981 degrees of freedom
## AIC: 21382
##
## Number of Fisher Scoring iterations: 5
Prediction and Model evaluation
pred2 <- predict(model2,newdata = test,type = 'response')
test$prd2 <- factor(ifelse(pred2>0.5,1,0))
cf2 <- confusionMatrix(test$prd2,reference = test$Default_Payment,positive='1')
cf2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4484 910
## 1 189 417
##
## Accuracy : 0.8168
## 95% CI : (0.8068, 0.8265)
## No Information Rate : 0.7788
## P-Value [Acc > NIR] : 2.381e-13
##
## Kappa : 0.3399
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3142
## Specificity : 0.9596
## Pos Pred Value : 0.6881
## Neg Pred Value : 0.8313
## Prevalence : 0.2212
## Detection Rate : 0.0695
## Detection Prevalence : 0.1010
## Balanced Accuracy : 0.6369
##
## 'Positive' Class : 1
##
- Model does not improve or worse
- Accuracy and senstivity are about the same at around 82% and 31% respectively
3)Over Sampling Data
- Handling class Imbalance problem using ROSE package
- Making proportion of default payment status ‘1’ same as ‘0’
newtrainsample <- ovun.sample(fml,data = train,method = 'over',N=37382)$data
summary(newtrainsample)
## Credit_Amount Gender Academic_Qualification
## Min. : 20000 Male :15270 Undergraduate:12389
## 1st Qu.: 50000 Female:22112 Graduate :18106
## Median : 200000 Postgraduate : 6429
## Mean : 178248 Professional : 111
## 3rd Qu.: 260000 Others : 268
## Max. :2000000 Unknown : 79
##
## Marital Age_Years
## Unknown : 62 Min. :21.00
## Married :17303 1st Qu.:28.00
## Single :19630 Median :34.00
## Do not Prefer To say: 387 Mean :35.48
## 3rd Qu.:42.00
## Max. :79.00
##
## Repayment_Status_Jan Previous_Payment_Jan Default_Payment
## Paid on time :25200 Min. : 0 0:18691
## Payment delay for one month : 5457 1st Qu.: 390 1:18691
## Payment delay for two months : 5640 Median : 3000
## Payment delay for three months: 797 Mean : 5367
## Payment delay for four months : 175 3rd Qu.: 4644
## Payment delay for five months : 38 Max. :606000
## Payment delay for six months : 75
- Now Proportion of Event(‘1’) is same as Non Event(‘1’)
Prediciton and Model EValuation
pred3 <- predict(model3,newdata = test,type = 'response')
test$prd3 <- factor(ifelse(pred3>0.5,1,0))
cf3 <- confusionMatrix(test$prd3,reference = test$Default_Payment,positive = '1')
cf3
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3958 606
## 1 715 721
##
## Accuracy : 0.7798
## 95% CI : (0.7691, 0.7903)
## No Information Rate : 0.7788
## P-Value [Acc > NIR] : 0.433185
##
## Kappa : 0.3792
##
## Mcnemar's Test P-Value : 0.002964
##
## Sensitivity : 0.5433
## Specificity : 0.8470
## Pos Pred Value : 0.5021
## Neg Pred Value : 0.8672
## Prevalence : 0.2212
## Detection Rate : 0.1202
## Detection Prevalence : 0.2393
## Balanced Accuracy : 0.6952
##
## 'Positive' Class : 1
##
- now this model has improved sensitivity of 55% significantly higher than previous models 31-32%
- However accuracy has reduced slightly to 78%