2 Data preparation
2.1 Convert to factor
# Columns that hold categorical codes and should be stored as factors
# rather than as plain numbers.
cols.to.factor <- c(
  "Gender", "Academic_Qualification", "Marital",
  "Repayment_Status_Jan", "Repayment_Status_Feb", "Repayment_Status_March",
  "Repayment_Status_April", "Repayment_Status_May", "Repayment_Status_June",
  "Default_Payment"
)

# Convert every listed column to a factor, writing the result back in place.
data[, cols.to.factor] <- lapply(data[, cols.to.factor], factor)

# Print the structure so the new column types and levels can be checked.
str(data)
## tibble [30,000 x 25] (S3: tbl_df/tbl/data.frame)
## $ Customer.ID : int [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
## $ Credit_Amount : num [1:30000] 20000 220000 90000 50000 50000 50000 500000 200000 240000 20000 ...
## $ Gender : Factor w/ 2 levels "1","2": 2 2 2 2 1 1 1 2 2 1 ...
## $ Academic_Qualification: Factor w/ 6 levels "1","2","3","4",..: 2 2 2 2 2 1 1 2 3 3 ...
## $ Marital : Factor w/ 4 levels "0","1","2","3": 2 3 3 2 2 3 3 3 2 3 ...
## $ Age_Years : int [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
## $ Repayment_Status_Jan : Factor w/ 7 levels "0","1","2","3",..: 3 1 1 1 1 1 1 1 1 1 ...
## $ Repayment_Status_Feb : Factor w/ 7 levels "0","1","2","3",..: 3 3 1 1 1 1 1 1 1 1 ...
## $ Repayment_Status_March: Factor w/ 7 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 3 1 ...
## $ Repayment_Status_April: Factor w/ 7 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Repayment_Status_May : Factor w/ 6 levels "0","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Repayment_Status_June : Factor w/ 6 levels "0","2","3","4",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ Jan_Bill_Amount : num [1:30000] 3933 3683 39339 46990 8637 ...
## $ Feb_Bill_Amount : num [1:30000] 3103 1735 14037 48333 5570 ...
## $ March_Bill_Amount : num [1:30000] 689 2682 23559 49292 35835 ...
## $ April_Bill_Amount : num [1:30000] 0 3272 24332 29324 20940 ...
## $ May_Bill_Amount : num [1:30000] 0 3455 14848 28858 18146 ...
## $ June_Bill_Amount : num [1:30000] 0 3261 15548 28547 18131 ...
## $ Previous_Payment_Jan : num [1:30000] 0 0 1619 3000 3000 ...
## $ Previous_Payment_Feb : num [1:30000] 679 2000 2500 2029 36672 ...
## $ Previous_Payment_March: num [1:30000] 0 1000 1000 1200 10000 657 59000 0 552 0 ...
## $ Previous_Payment_April: num [1:30000] 0 1000 1000 1100 9000 ...
## $ Previous_Payment_May : num [1:30000] 0 0 1000 1069 689 ...
## $ Previous_Payment_June : num [1:30000] 0 2000 5000 1000 679 ...
## $ Default_Payment : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
2.2 Re-labelling
# Replace the numeric codes of the demographic factors with readable labels.
# Labels are applied positionally, i.e. in the order of each factor's
# existing levels.
levels(data[["Gender"]]) <- c("Male", "Female")

levels(data[["Marital"]]) <- c(
  "Unknown",
  "Married",
  "Single",
  "Do not Prefer To say"
)

levels(data[["Academic_Qualification"]]) <- c(
  "Undergraduate",
  "Graduate",
  "Postgraduate",
  "Professional",
  "Others",
  "Unknown"
)
# Give the monthly repayment-status factors readable labels.
#
# Labels are looked up by the raw status code, NOT by level position.
# The str() output shows Repayment_Status_May and Repayment_Status_June have
# only 6 levels ("0","2","3","4",..) with no "1", so positional assignment
# would label code "2" as a one-month delay, "3" as two months, and so on.
repayment_cols <- paste0(
  "Repayment_Status_",
  c("Jan", "Feb", "March", "April", "May", "June")
)

# NOTE(review): assumes the status codes are "0".."6", matching the seven
# labels used originally -- confirm against the data dictionary.
repayment_labels <- c(
  "0" = "Paid on time",
  "1" = "Payment delay for one month",
  "2" = "Payment delay for two months",
  "3" = "Payment delay for three months",
  "4" = "Payment delay for four months",
  "5" = "Payment delay for five months",
  "6" = "Payment delay for six months"
)

for (status_col in repayment_cols) {
  codes <- levels(data[[status_col]])

  # Translate each known code to its label; anything else is kept unchanged.
  known <- codes %in% names(repayment_labels)
  new_levels <- codes
  new_levels[known] <- unname(repayment_labels[codes[known]])

  # Levels that are neither a known code nor an already-applied label are
  # reported rather than silently mislabelled (re-running this chunk is quiet).
  unexpected <- setdiff(codes[!known], repayment_labels)
  if (length(unexpected) > 0) {
    warning(
      "Unexpected repayment status code(s) in ", status_col, ": ",
      paste(unexpected, collapse = ", "),
      call. = FALSE
    )
  }

  levels(data[[status_col]]) <- new_levels

  # Give every month the same full label set (plus any unexpected codes),
  # as the original seven-label assignment did, so the columns stay comparable.
  data[[status_col]] <- factor(
    data[[status_col]],
    levels = union(unname(repayment_labels), new_levels)
  )
}