Loan Approval Analysis

Context

This dataset was sourced from Kaggle to serve as the basis for machine learning models that predict whether an applicant's loan will be approved or rejected. Predictors such as education level, income, and employment length can be used to determine whether or not an applicant will be able to repay their loan.

Import Libraries

# library() is preferred to require() in analysis scripts: it errors immediately
# if a package is missing instead of returning FALSE with a warning
library(tidyverse)
library(ggplot2)      # attached by tidyverse; kept for clarity
library(scales)
library(readr)        # attached by tidyverse; kept for clarity
library(data.table)
library(reshape2)
library(e1071)        # svm(), naiveBayes()
library(class)        # knn()
library(naivebayes)
library(randomForest)
library(kableExtra)

Importing Datasets

credit <- read_csv("credit_card/credit_record.csv")
app    <- read_csv("credit_card/application_record.csv")

Summary of Data

head(credit)
## # A tibble: 6 x 3
##        ID MONTHS_BALANCE STATUS
##     <dbl>          <dbl> <chr> 
## 1 5001711              0 X     
## 2 5001711             -1 0     
## 3 5001711             -2 0     
## 4 5001711             -3 0     
## 5 5001712              0 C     
## 6 5001712             -1 C
head(app)
## # A tibble: 6 x 18
##        ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL
##     <dbl> <chr>       <chr>        <chr>                  <dbl>            <dbl>
## 1 5008804 M           Y            Y                          0           427500
## 2 5008805 M           Y            Y                          0           427500
## 3 5008806 M           Y            Y                          0           112500
## 4 5008808 F           N            Y                          0           270000
## 5 5008809 F           N            Y                          0           270000
## 6 5008810 F           N            Y                          0           270000
## # ... with 12 more variables: NAME_INCOME_TYPE <chr>,
## #   NAME_EDUCATION_TYPE <chr>, NAME_FAMILY_STATUS <chr>,
## #   NAME_HOUSING_TYPE <chr>, DAYS_BIRTH <dbl>, DAYS_EMPLOYED <dbl>,
## #   FLAG_MOBIL <dbl>, FLAG_WORK_PHONE <dbl>, FLAG_PHONE <dbl>,
## #   FLAG_EMAIL <dbl>, OCCUPATION_TYPE <chr>, CNT_FAM_MEMBERS <dbl>

Renaming Columns

colnames(app) <- c("ID", "Gender", "Car", "Prop", "Num_Child", 
                   "Income", "Inc_Cat", "Education", 
                   "Marital_Stat", "Housing_Type", "Birthday", "Emp_Start", 
                   "Cell", "Work", "Home", "Email",
                   "Occupation", "Family")
colnames(credit) <- c("ID", "Month_Start", "Status")

Visualizing the Data

By visualizing the variables of the dataset we can gain insight into what the entire dataset looks like, rather than examining only the first few rows with the head() function.

Gender Distribution

gender_plot <- ggplot(app, aes(Gender)) + 
  geom_bar(color = "black", fill = "grey") +
  theme_classic() + 
  labs(x = "Gender", 
       y = "Count", 
       title = "Gender Distribution") +
  scale_y_continuous(labels = comma)
gender_plot

Age Distribution

age_plot <- app %>% 
  mutate(age = round(abs(Birthday)/365)) %>% 
  ggplot(aes(age)) +
  geom_bar(color = "black", fill = "grey") + 
  theme_classic() + 
  labs(x = "Age", 
       y = "Count", 
       title = "Age Distribution") +
  scale_y_continuous(labels = comma)
age_plot

Employment Length Distribution

employment_plot <- app %>%
  mutate(years = ifelse(Emp_Start >=0, 0, round(abs(Emp_Start)/365))) %>%
  filter(years > 0) %>%
  ggplot(aes(years)) + 
  geom_bar(color = "black", fill = "grey") +
  theme_classic() + 
  labs(x = "Years", 
       y = "Count", 
       title = "Employment Length Distribution") +
  scale_y_continuous(labels = comma)
employment_plot

Applicants with 0 years of employment are removed from this graph for visual clarity; according to the dataset description, those applicants are unemployed.

Education Type Distribution

education_plot <- app %>% ggplot(aes(Education)) + 
  geom_bar(color = "black", fill = "grey") +
  labs(x = "Education Type", 
       y = "Count", 
       title ="Education Type Distribution") + 
  theme_classic() +
  scale_y_continuous(labels = comma) + coord_flip()
education_plot

A majority of the applicants hold a high school diploma, college degree, or higher.

Credit Length

credit_plot <- credit %>% ggplot(aes(abs(Month_Start)/12)) +
  geom_bar(color = "black", fill = "grey") +
  labs(x = "Credit Length in Years", 
       y = "Count", 
       title = "Credit Length Distribution") + 
  theme_classic() +
  scale_y_continuous(labels = comma)
credit_plot

This plot does not accurately represent each applicant's credit history, since the table contains multiple monthly records per ID; when joining, we take the earliest month (i.e., the longest history) to best represent each applicant.

Join Loan Application to Applicant’s Info

# Month_Start is 0 for the extraction month and negative going back in time,
# so the minimum per ID marks the start of that applicant's credit history
begin_month <- credit %>% 
  group_by(ID) %>% 
  summarise(min_month = min(Month_Start))
new_data <- left_join(app, begin_month, by = c("ID"))

By joining the datasets we consolidate each applicant's credit history into a single loan-application record.

Determine Application Approval

# Status 2-5 marks a payment 60 or more days past due; flag those months
credit$at_risk <- NA
credit$at_risk[credit$Status %in% c('2', '3', '4', '5')] <- 'Yes'
# Count the non-NA flags per applicant (funs() is deprecated; use across())
counter <- credit %>% 
  group_by(ID) %>% 
  summarise(across(everything(), ~ sum(!is.na(.x))))
counter$at_risk[counter$at_risk > 0]  <- 'Yes'
counter$at_risk[counter$at_risk == 0] <- 'No'
counter$at_risk <- as.factor(counter$at_risk)
counter         <- counter[c(1, 4)]
new_data        <- inner_join(new_data, counter, by = 'ID')
new_data$target[new_data$at_risk == 'Yes'] <- 1
new_data$target[new_data$at_risk == 'No']  <- 0
counter %>% group_by(at_risk) %>% count()
## # A tibble: 2 x 2
## # Groups:   at_risk [2]
##   at_risk     n
##   <fct>   <int>
## 1 No      45318
## 2 Yes       667

Only a small number of applicants (667 of 45,985) are at risk of defaulting on a loan.
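
The imbalance is easy to quantify from the counts above (roughly 98.5% "No" versus 1.5% "Yes"):

prop.table(table(counter$at_risk))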

Omit NA values

full_table <- na.omit(new_data)

Binary Features

Gender

full_table$Gender <- factor(full_table$Gender, labels = c('F','M'))
full_table %>% group_by(Gender) %>% count()
## # A tibble: 2 x 2
## # Groups:   Gender [2]
##   Gender     n
##   <fct>  <int>
## 1 F      15630
## 2 M       9504

Car

full_table$Car <- factor(full_table$Car, labels = c('No', 'Yes'))
full_table %>% group_by(Car) %>% count()
## # A tibble: 2 x 2
## # Groups:   Car [2]
##   Car       n
##   <fct> <int>
## 1 No    14618
## 2 Yes   10516

Property

full_table$Prop <- factor(full_table$Prop, labels = c('No','Yes'))
full_table %>% group_by(Prop) %>% count()
## # A tibble: 2 x 2
## # Groups:   Prop [2]
##   Prop      n
##   <fct> <int>
## 1 No     8673
## 2 Yes   16461

Email

full_table$Email <- factor(full_table$Email, labels = c('No', 'Yes'))
full_table %>% group_by(Email) %>% count()
## # A tibble: 2 x 2
## # Groups:   Email [2]
##   Email     n
##   <fct> <int>
## 1 No    22604
## 2 Yes    2530

Work Phone

full_table$Work <- factor(full_table$Work, labels = c('No','Yes'))
full_table %>% group_by(Work) %>% count()
## # A tibble: 2 x 2
## # Groups:   Work [2]
##   Work      n
##   <fct> <int>
## 1 No    18252
## 2 Yes    6882

Home

full_table$Home <- factor(full_table$Home, labels = c('No', 'Yes'))
full_table %>% group_by(Home) %>% count()
## # A tibble: 2 x 2
## # Groups:   Home [2]
##   Home      n
##   <fct> <int>
## 1 No    17775
## 2 Yes    7359

Cell

full_table$Cell <- factor(full_table$Cell, labels = c('Yes'))
full_table %>% group_by(Cell) %>% count()
## # A tibble: 1 x 2
## # Groups:   Cell [1]
##   Cell      n
##   <fct> <int>
## 1 Yes   25134

Since all of the applicants indicated that they have a cellphone, we can drop this feature from our models: a constant column has no impact on a model's decision.

Continuous Features

To help improve our models, we bin continuous features, which reduces the influence of outliers. Features that already have enough rows at each value are left unbinned.

Number of Children

unique(full_table$Num_Child)
## [1]  0  3  1  2  4 14  5 19  7
full_table$Num_Child <- cut(full_table$Num_Child, 
                            breaks = c(0, 1, 2, 19),
                            include.lowest = TRUE,
                            labels = c('0','1','2+'))
full_table %>% 
  group_by(Num_Child) %>% 
  count()
## # A tibble: 3 x 2
## # Groups:   Num_Child [3]
##   Num_Child     n
##   <fct>     <int>
## 1 0         22026
## 2 1          2715
## 3 2+          393

Income

summary(full_table$Income)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   27000  135000  180000  194834  225000 1575000

Age

full_table$Age <- round(abs(full_table$Birthday)/365)
summary(full_table$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   33.00   40.00   40.54   48.00   67.00

Employment Length (Years)

full_table$Emp_Start <- round(full_table$Emp_Start/365, digits = 1)
full_table$Emp_Start[full_table$Emp_Start > 0] = 0
full_table$Emp_Start <- abs(full_table$Emp_Start)
summary(full_table$Emp_Start)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.700   5.300   7.191   9.500  43.000

Family Size - Binned

unique(full_table$Family)
##  [1]  2  1  5  3  4  6 15  7 20  9
full_table$Family <- cut(full_table$Family, 
                         breaks = c(1, 2, 3, 20),
                         include.lowest = TRUE,
                         labels = c('1','2','3+'))
full_table %>%
  group_by(Family) %>%
  count()
## # A tibble: 3 x 2
## # Groups:   Family [3]
##   Family     n
##   <fct>  <int>
## 1 1      16960
## 2 2       5216
## 3 3+      2958

Categorical Features

Income Category - Binning

unique(full_table$Inc_Cat)
## [1] "Working"              "Commercial associate" "State servant"       
## [4] "Student"              "Pensioner"
full_table %>% 
  group_by(Inc_Cat) %>%
  count()
## # A tibble: 5 x 2
## # Groups:   Inc_Cat [5]
##   Inc_Cat                  n
##   <chr>                <int>
## 1 Commercial associate  7052
## 2 Pensioner               13
## 3 State servant         2437
## 4 Student                 10
## 5 Working              15622
# Merge the rare Student (10 rows) and Pensioner (13 rows) categories
full_table$Inc_Cat[full_table$Inc_Cat %in% c('Student', 'Pensioner')] <- 'State servant'

Binned Income Categories

full_table$Inc_Cat <- factor(full_table$Inc_Cat)
full_table %>% 
  group_by(Inc_Cat) %>% 
  count()
## # A tibble: 3 x 2
## # Groups:   Inc_Cat [3]
##   Inc_Cat                  n
##   <fct>                <int>
## 1 Commercial associate  7052
## 2 State servant         2460
## 3 Working              15622

Occupation Type - Binning

unique(full_table$Occupation)
##  [1] "Security staff"        "Sales staff"           "Accountants"          
##  [4] "Laborers"              "Managers"              "Drivers"              
##  [7] "Core staff"            "High skill tech staff" "Cleaning staff"       
## [10] "Private service staff" "Cooking staff"         "Low-skill Laborers"   
## [13] "Medicine staff"        "Secretaries"           "Waiters/barmen staff" 
## [16] "HR staff"              "Realty agents"         "IT staff"
# Collapse the 18 occupation types into three broad groups
laborer   <- c('Laborers', 'Low-skill Laborers', 'Cleaning staff', 'Cooking staff',
               'Drivers', 'Security staff', 'Waiters/barmen staff')
office    <- c('Accountants', 'Core staff', 'HR staff', 'Medicine staff',
               'Private service staff', 'Realty agents', 'Sales staff', 'Secretaries')
high_tech <- c('Managers', 'High skill tech staff', 'IT staff')
full_table$Occupation[full_table$Occupation %in% laborer]   <- 'Laborer'
full_table$Occupation[full_table$Occupation %in% office]    <- 'Office'
full_table$Occupation[full_table$Occupation %in% high_tech] <- 'High Tech'

Binned Occupation Type

full_table$Occupation <- factor(full_table$Occupation)
full_table %>% 
  group_by(Occupation) %>% 
  count()
## # A tibble: 3 x 2
## # Groups:   Occupation [3]
##   Occupation     n
##   <fct>      <int>
## 1 High Tech   4455
## 2 Laborer    10496
## 3 Office     10183

Education Type

# Fold the rare Academic degree level into Higher education
full_table$Education[full_table$Education == 'Academic degree'] <- 'Higher education'
full_table$Education <- factor(full_table$Education)
full_table %>% 
  group_by(Education) %>% 
  count()
## # A tibble: 4 x 2
## # Groups:   Education [4]
##   Education                         n
##   <fct>                         <int>
## 1 Higher education               7146
## 2 Incomplete higher               993
## 3 Lower secondary                 187
## 4 Secondary / secondary special 16808

Housing Type

full_table$Housing_Type <- factor(full_table$Housing_Type)
full_table %>% 
  group_by(Housing_Type) %>%
  count()
## # A tibble: 6 x 2
## # Groups:   Housing_Type [6]
##   Housing_Type            n
##   <fct>               <int>
## 1 Co-op apartment       152
## 2 House / apartment   22102
## 3 Municipal apartment   812
## 4 Office apartment      199
## 5 Rented apartment      439
## 6 With parents         1430

Marital Status

full_table$Marital_Stat <- factor(full_table$Marital_Stat)
full_table %>% 
  group_by(Marital_Stat) %>% 
  count()
## # A tibble: 5 x 2
## # Groups:   Marital_Stat [5]
##   Marital_Stat             n
##   <fct>                <int>
## 1 Civil marriage        2133
## 2 Married              17509
## 3 Separated             1467
## 4 Single / not married  3445
## 5 Widow                  580

Examine Cleaned Data

# Drop ID, Birthday, Cell, min_month, and at_risk; target remains as the label
reduced_table <- full_table[-c(1, 11, 13, 19, 20)]
summary(reduced_table)
##  Gender     Car         Prop       Num_Child      Income       
##  F:15630   No :14618   No : 8673   0 :22026   Min.   :  27000  
##  M: 9504   Yes:10516   Yes:16461   1 : 2715   1st Qu.: 135000  
##                                    2+:  393   Median : 180000  
##                                               Mean   : 194834  
##                                               3rd Qu.: 225000  
##                                               Max.   :1575000  
##                  Inc_Cat                              Education    
##  Commercial associate: 7052   Higher education             : 7146  
##  State servant       : 2460   Incomplete higher            :  993  
##  Working             :15622   Lower secondary              :  187  
##                               Secondary / secondary special:16808  
##                                                                    
##                                                                    
##                Marital_Stat                Housing_Type     Emp_Start     
##  Civil marriage      : 2133   Co-op apartment    :  152   Min.   : 0.000  
##  Married             :17509   House / apartment  :22102   1st Qu.: 2.700  
##  Separated           : 1467   Municipal apartment:  812   Median : 5.300  
##  Single / not married: 3445   Office apartment   :  199   Mean   : 7.191  
##  Widow               :  580   Rented apartment   :  439   3rd Qu.: 9.500  
##                               With parents       : 1430   Max.   :43.000  
##   Work        Home       Email           Occupation    Family    
##  No :18252   No :17775   No :22604   High Tech: 4455   1 :16960  
##  Yes: 6882   Yes: 7359   Yes: 2530   Laborer  :10496   2 : 5216  
##                                      Office   :10183   3+: 2958  
##                                                                  
##                                                                  
##                                                                  
##      target             Age       
##  Min.   :0.00000   Min.   :21.00  
##  1st Qu.:0.00000   1st Qu.:33.00  
##  Median :0.00000   Median :40.00  
##  Mean   :0.01679   Mean   :40.54  
##  3rd Qu.:0.00000   3rd Qu.:48.00  
##  Max.   :1.00000   Max.   :67.00
ggplot(full_table, aes(x = at_risk, y = Income)) + 
  geom_boxplot() + 
  facet_wrap(~Gender) +
  labs(title = "Credit Risk Distribution Based on Gender and Income Distribuiton", 
       x = "Credit Risk", 
       y = "Income") +
  theme_classic()

ggplot(full_table, 
       aes(x = at_risk, y = Income, fill = at_risk)) +
  geom_violin() +
  labs(title = "Income Distribution by Credit Risk",
       x = "Credit Risk",
       y = "Income") +
  theme_classic()

Sampling Data

The dataset is split 70/30 at random. Because the number of at-risk applicants is so low, both sets will contain only a small share of the minority class.

set.seed(920)
sample <- sample.int(n = nrow(full_table), size = floor(.70 * nrow(full_table)), replace = FALSE)
train.set <- reduced_table[sample, ]
test.set  <- reduced_table[-sample, ]
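
A plain random split can let the already small share of at-risk applicants drift between the two sets. A stratified split would hold the class proportions fixed; a minimal sketch in base R (not used below, shown only as an alternative):

# Sample 70% within each target class so both sets keep the same at-risk share
idx_by_class <- split(seq_len(nrow(reduced_table)), reduced_table$target)
train_idx    <- unlist(lapply(idx_by_class,
                              function(i) sample(i, floor(0.7 * length(i)))))
strat.train  <- reduced_table[train_idx, ]
strat.test   <- reduced_table[-train_idx, ]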

Training Models for Classification

The methods implemented to classify and predict whether an applicant is eligible are Logistic Regression, Support Vector Machines, Naive Bayes, K-Nearest Neighbors, and Random Forests (ensembles of decision trees).

Logistic Regression

Logistic regression predicts a categorical outcome by modeling its relationship to the independent variables (predictors). It is mainly used in binary classification scenarios.
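
Concretely, the model estimates p(target = 1 | x) = 1 / (1 + exp(-(b0 + b1*x1 + ... + bk*xk))), and thresholding this probability (at 0.5 below) yields the predicted class.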

lm.model <- glm(formula = target ~ ., 
                family = binomial(link='logit'), 
                data = train.set)
summary(lm.model)
## 
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"), 
##     data = train.set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3873  -0.2007  -0.1780  -0.1564   3.2992  
## 
## Coefficients:
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                            -1.645e+01  2.310e+02  -0.071 0.943218
## GenderM                                 2.497e-01  1.448e-01   1.724 0.084666
## CarYes                                 -1.556e-01  1.288e-01  -1.208 0.227010
## PropYes                                -1.967e-01  1.260e-01  -1.561 0.118502
## Num_Child1                              6.338e-01  6.272e-01   1.010 0.312278
## Num_Child2+                             1.679e+00  7.346e-01   2.286 0.022277
## Income                                  1.222e-07  5.910e-07   0.207 0.836119
## Inc_CatState servant                    1.440e-01  2.127e-01   0.677 0.498360
## Inc_CatWorking                         -6.882e-02  1.348e-01  -0.510 0.609712
## EducationIncomplete higher              1.039e-01  2.853e-01   0.364 0.715826
## EducationLower secondary                2.661e-01  6.005e-01   0.443 0.657649
## EducationSecondary / secondary special -1.571e-01  1.401e-01  -1.121 0.262113
## Marital_StatMarried                     9.471e-02  2.249e-01   0.421 0.673728
## Marital_StatSeparated                  -2.916e-01  3.678e-01  -0.793 0.427937
## Marital_StatSingle / not married        4.642e-01  2.571e-01   1.806 0.070949
## Marital_StatWidow                       5.565e-01  4.074e-01   1.366 0.171959
## Housing_TypeHouse / apartment           1.275e+01  2.310e+02   0.055 0.955987
## Housing_TypeMunicipal apartment         1.301e+01  2.310e+02   0.056 0.955081
## Housing_TypeOffice apartment            1.293e+01  2.310e+02   0.056 0.955364
## Housing_TypeRented apartment            1.203e+01  2.310e+02   0.052 0.958459
## Housing_TypeWith parents                1.271e+01  2.310e+02   0.055 0.956103
## Emp_Start                              -4.078e-02  1.184e-02  -3.443 0.000576
## WorkYes                                 1.173e-01  1.419e-01   0.826 0.408662
## HomeYes                                -1.587e-02  1.380e-01  -0.115 0.908473
## EmailYes                                6.328e-02  1.903e-01   0.332 0.739530
## OccupationLaborer                      -1.267e-01  1.776e-01  -0.713 0.475767
## OccupationOffice                       -1.348e-01  1.733e-01  -0.778 0.436720
## Family2                                -1.365e-01  1.691e-01  -0.807 0.419426
## Family3+                               -7.504e-01  6.752e-01  -1.111 0.266381
## Age                                     1.173e-03  7.031e-03   0.167 0.867442
##                                           
## (Intercept)                               
## GenderM                                .  
## CarYes                                    
## PropYes                                   
## Num_Child1                                
## Num_Child2+                            *  
## Income                                    
## Inc_CatState servant                      
## Inc_CatWorking                            
## EducationIncomplete higher                
## EducationLower secondary                  
## EducationSecondary / secondary special    
## Marital_StatMarried                       
## Marital_StatSeparated                     
## Marital_StatSingle / not married       .  
## Marital_StatWidow                         
## Housing_TypeHouse / apartment             
## Housing_TypeMunicipal apartment           
## Housing_TypeOffice apartment              
## Housing_TypeRented apartment              
## Housing_TypeWith parents                  
## Emp_Start                              ***
## WorkYes                                   
## HomeYes                                   
## EmailYes                                  
## OccupationLaborer                         
## OccupationOffice                          
## Family2                                   
## Family3+                                  
## Age                                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3037.7  on 17592  degrees of freedom
## Residual deviance: 2984.2  on 17563  degrees of freedom
## AIC: 3044.2
## 
## Number of Fisher Scoring iterations: 15

Predict Logistic Regression Model Performance

log.predict <- predict(lm.model, test.set, type = "response")
log.prediction.rd <- ifelse(log.predict > 0.5, 1, 0)
print(paste('Accuracy:', 1-mean(log.prediction.rd != test.set$target)))
## [1] "Accuracy: 0.983821774300491"

The logistic regression model classifies applicants with high accuracy, although the heavily skewed class sizes mean this figure should be read with care.
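
Since roughly 98.3% of applicants are not at risk, a model that always predicts the majority class would score almost identically. A confusion matrix shows how many at-risk applicants are actually caught; a quick sketch (output not shown in the original):

# Cross-tabulate thresholded predictions against the actual test labels
table(Predicted = log.prediction.rd, Actual = test.set$target)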

Support Vector Machines (SVM)

Support Vector Machines represent observations as points in space and learn a decision boundary that separates the classes with the largest possible margin. SVMs remain efficient on data with high dimensionality.

svmfit <- svm(target ~ ., 
              data = train.set, 
              kernel = "linear", 
              type = "C-classification")
summary(svmfit)
## 
## Call:
## svm(formula = target ~ ., data = train.set, kernel = "linear", type = "C-classification")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  959
## 
##  ( 659 300 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

SVM Performance

pred       <- predict(svmfit, test.set)
svm.table0 <- table(test.set$target, pred)
paste("Accuracy:", sum(diag(svm.table0))/sum(svm.table0))
## [1] "Accuracy: 0.983821774300491"

The SVM classification technique also returned a comparable accuracy; model tuning and parameter adjustment may yield a slightly higher accuracy.
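
One way to search for better parameters is e1071's tune() helper, which grid-searches with cross-validation; a minimal sketch (the grid values are illustrative, and the search is slow on ~17,000 training rows):

# 10-fold cross-validated grid search over the cost parameter
svm.tune <- tune(svm, as.factor(target) ~ ., data = train.set,
                 kernel = "linear",
                 ranges = list(cost = c(0.01, 0.1, 1, 10)))
summary(svm.tune)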

Using an Optimal SVM

# Note: gamma is ignored by the linear kernel, so this fit matches svmfit above;
# the only change is casting the target to a factor explicitly
optimal.svm <- svm(as.factor(target) ~ .,
                   data = train.set,
                   type = "C-classification",
                   kernel = "linear",
                   gamma = 0.1,
                   cost = 1)
summary(optimal.svm)
## 
## Call:
## svm(formula = as.factor(target) ~ ., data = train.set, type = "C-classification", 
##     kernel = "linear", gamma = 0.1, cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  959
## 
##  ( 659 300 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Optimal SVM Performance

svm.predict <- predict(optimal.svm, test.set[, -16])   # column 16 is the target
svm.table <- table(svm.predict, test.set$target)
paste("Accuracy:", sum(diag(svm.table))/sum(svm.table))
## [1] "Accuracy: 0.983821774300491"

K-Nearest Neighbors (KNN)

K-nearest neighbors is a classification method that assigns each new observation the majority class among its k closest training points, as measured by distance.

# Use the three continuous features: Income (col 5), Emp_Start (10), Age (17)
train_l     <- train.set[c(5, 10, 17)]
test_l      <- test.set[c(5, 10, 17)]
train_label <- train.set$target
test_label  <- test.set$target
# k = 132 and 133 bracket sqrt(nrow(train.set)) ~ 132.6, a common rule of thumb
knn.model   <- knn(train = train_l, test = test_l, cl = train_label, k = 132)
knn.model2  <- knn(train = train_l, test = test_l, cl = train_label, k = 133)

knn.table   <- table(knn.model, test_label)
knn.table2  <- table(knn.model2, test_label)
paste("Accuracy for k = 132:", sum(diag(knn.table))/sum(knn.table))
## [1] "Accuracy for k = 132: 0.983821774300491"
paste("Accuracy for k = 133:", sum(diag(knn.table2))/sum(knn.table2))
## [1] "Accuracy for k = 133: 0.983821774300491"

Naive Bayes (NB)

Naive Bayes applies Bayes' theorem to classification by means of conditional probability, assuming the predictor variables are conditionally independent of one another given the class.

nb.model   <- naiveBayes(as.factor(target) ~ ., 
                         data = train.set)
nb.model
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##          0          1 
## 0.98294776 0.01705224 
## 
## Conditional probabilities:
##    Gender
## Y           F         M
##   0 0.6229110 0.3770890
##   1 0.5666667 0.4333333
## 
##    Car
## Y          No       Yes
##   0 0.5783265 0.4216735
##   1 0.5933333 0.4066667
## 
##    Prop
## Y          No       Yes
##   0 0.3471347 0.6528653
##   1 0.4000000 0.6000000
## 
##    Num_Child
## Y            0          1         2+
##   0 0.87555658 0.10958191 0.01486150
##   1 0.86666667 0.10000000 0.03333333
## 
##    Income
## Y       [,1]      [,2]
##   0 194922.9 104576.29
##   1 199291.5  96269.56
## 
##    Inc_Cat
## Y   Commercial associate State servant    Working
##   0           0.28022899    0.09680217 0.62296883
##   1           0.30000000    0.10666667 0.59333333
## 
##    Education
## Y   Higher education Incomplete higher Lower secondary
##   0      0.286879084       0.038281386     0.006997051
##   1      0.316666667       0.050000000     0.010000000
##    Education
## Y   Secondary / secondary special
##   0                   0.667842480
##   1                   0.623333333
## 
##    Marital_Stat
## Y   Civil marriage    Married  Separated Single / not married      Widow
##   0     0.08714509 0.69658243 0.05950385           0.13456312 0.02220552
##   1     0.07666667 0.65666667 0.04000000           0.19666667 0.03000000
## 
##    Housing_Type
## Y   Co-op apartment House / apartment Municipal apartment Office apartment
##   0     0.006129648       0.880240560         0.032093911      0.007806627
##   1     0.000000000       0.873333333         0.043333333      0.010000000
##    Housing_Type
## Y   Rented apartment With parents
##   0      0.016943272  0.056785983
##   1      0.010000000  0.063333333
## 
##    Emp_Start
## Y       [,1]     [,2]
##   0 7.188203 6.418510
##   1 5.832667 5.470499
## 
##    Work
## Y          No       Yes
##   0 0.7243393 0.2756607
##   1 0.7000000 0.3000000
## 
##    Home
## Y          No       Yes
##   0 0.7058347 0.2941653
##   1 0.7033333 0.2966667
## 
##    Email
## Y          No       Yes
##   0 0.8983404 0.1016596
##   1 0.8900000 0.1100000
## 
##    Occupation
## Y   High Tech   Laborer    Office
##   0 0.1764876 0.4173943 0.4061181
##   1 0.1966667 0.4200000 0.3833333
## 
##    Family
## Y           1         2        3+
##   0 0.6707916 0.2110102 0.1181981
##   1 0.6900000 0.1866667 0.1233333
## 
##    Age
## Y       [,1]     [,2]
##   0 40.51096 9.519697
##   1 39.64000 9.407732
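
To see the conditional-independence machinery in action, a posterior can be computed by hand from the probabilities printed above; a minimal sketch using only the Gender feature:

# A-priori and P(Gender = M | class) values taken from the model output above
prior  <- c(no = 0.98294776, yes = 0.01705224)
p_male <- c(no = 0.3770890,  yes = 0.4333333)
unnorm <- prior * p_male   # numerator of Bayes' theorem for each class
unnorm / sum(unnorm)       # posterior: shifts slightly toward 'yes' (at risk)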

NB Model Performance

nb.predict <- predict(nb.model, test.set)
nb.table   <- table(test.set$target, nb.predict)
paste("Accuracy:", sum(diag(nb.table))/sum(nb.table))
## [1] "Accuracy: 0.983821774300491"

Random Forest Model

Random forests are a classification method that trains a large number of decision trees and classifies each observation by the consensus (majority vote) of those trees.

rf.model   <- randomForest(as.factor(target) ~.,
                           data = train.set)
rf.model
## 
## Call:
##  randomForest(formula = as.factor(target) ~ ., data = train.set) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 1.85%
## Confusion matrix:
##       0  1 class.error
## 0 17233 60 0.003469612
## 1   266 34 0.886666667

Random Forest Model Performance

rf.predict <- predict(rf.model, test.set)
rf.table   <- table(test.set$target, rf.predict)
paste("Accuracy:", sum(diag(nb.table))/sum(nb.table))
## [1] "Accuracy: 0.983821774300491"

Overall, the differences in accuracy between the classification methods are negligible. Some of their parameters could be tuned to improve performance, particularly on the small at-risk class.