TopGear data

We start by loading the data, and selecting the relevant features. As pre-processing, we set values of accel==0 to missing because it is impossible. We also remove the incorrect weight of the Peugeot 107.

data(TopGear, package = "robustHD")
cars = TopGear; rm(TopGear)
dim(cars) # [1] 297  32

## [1] 297  32

rownames(cars) = paste(cars[,1],cars[,2])
# Now the rownames are make and model of the cars.
colnames(cars)

##  [1] "Maker"              "Model"              "Type"              
##  [4] "Fuel"               "Price"              "Cylinders"         
##  [7] "Displacement"       "DriveWheel"         "BHP"               
## [10] "Torque"             "Acceleration"       "TopSpeed"          
## [13] "MPG"                "Weight"             "Length"            
## [16] "Width"              "Height"             "AdaptiveHeadlights"
## [19] "AdjustableSteering" "AlarmSystem"        "Automatic"         
## [22] "Bluetooth"          "ClimateControl"     "CruiseControl"     
## [25] "ElectricSeats"      "Leather"            "ParkingSensors"    
## [28] "PowerSteering"      "SatNav"             "ESP"               
## [31] "Verdict"            "Origin"

colnames(cars)[4:17] = c("fuel","price","cyl","displ","drive",
                         "hp","torque","accel","topspeed",
                         "MPG","weight","length","width",
                         "height")
summary(cars$accel) # zero minimum is impossible

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.900   9.100   8.839  11.400  16.900

rownames(cars)[which(cars$accel == 0)]

## [1] "Citroen C5 Tourer" "Ford Mondeo"       "Lotus Elise"      
## [4] "Renault Twizy"     "Ssangyong Rodius"

# [1] "Citroen C5 Tourer" "Ford Mondeo"   "Lotus Elise"
# [4] "Renault Twizy"     "Ssangyong Rodius"
cars$accel[cars$accel == 0] = NA
summary(cars$accel) # OK

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2.50    6.20    9.20    8.99   11.50   16.90       5

cars$weight[order(cars$weight)[1:4]]

## [1] 210 450 490 550

rownames(cars)[order(cars$weight)[1:4]]

## [1] "Peugeot 107"      "Renault Twizy"    "Morgan 3 Wheeler" "Caterham Super 7"

cars$weight[cars$weight == 210] = NA
summary(cars$weight) # OK

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     450    1250    1496    1541    1778    2705      34

# save(cars, file = "Topgear.RData")
# load("Topgear.RData")

Numerical example in the introduction

summary(cars$topspeed) # in mph

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    50.0   112.0   126.0   132.7   151.0   252.0       4

summary(cars$displ)    # in cc

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     647    1560    1995    2504    2988    7993       9

summary(cars$length)   # in mm

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2337    4157    4464    4427    4766    5612      11

cars$length = cars$length/1000 # in meters

fit = lm(hp ~ topspeed + length + displ, data = cars)
summary(fit)

## 
## Call:
## lm(formula = hp ~ topspeed + length + displ, data = cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -142.034  -20.667   -2.715   15.966  174.020 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.069e+02  2.861e+01  -7.232 4.78e-12 ***
## topspeed     2.466e+00  1.373e-01  17.963  < 2e-16 ***
## length      -1.313e+01  6.187e+00  -2.122   0.0347 *  
## displ        6.255e-02  2.808e-03  22.275  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.72 on 274 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.9405, Adjusted R-squared:  0.9399 
## F-statistic:  1445 on 3 and 274 DF,  p-value: < 2.2e-16

predsplot(fit, main = "Top Gear data")

## 
## Prediction terms in the model:
##         prediction term   stdev up/down
##                topspeed  68.380      up
##                  length   5.817    down
##                   displ  91.790      up
##  Total prediction of hp 149.200

fact = 0.51
width = fact*10; height = fact*8
maxnchar = 6
main = "Top Gear data"
pdf(file = "topgear_numerical_predsplot.pdf",
    width = width, height = height)
predsplot(fit, main=main, maxchar.level = maxnchar)

## 
## Prediction terms in the model:
##         prediction term   stdev up/down
##                topspeed  68.380      up
##                  length   5.817    down
##                   displ  91.790      up
##  Total prediction of hp 149.200

dev.off()

## png 
##   2

predscor(fit, sort.by.stdev = FALSE)

## Correlation matrix of the prediction terms:
##          topspeed length displ
## topspeed     1.00  -0.45  0.80
## length      -0.45   1.00 -0.56
## displ        0.80  -0.56  1.00

# fact = 0.6
# width = fact*10; height = fact*8
# pdf(file = "topgear_numerical_predscor.pdf", 
#     width = height, height = height)
# predscor(fit, sort.by.stdev = FALSE)
# dev.off()

Figure 2 and TopGear figures in Supplementary Material:

cars$GPM = 1/cars$MPG

fit4 = lm(GPM ~ accel + drive + weight + fuel, data = cars)
summary(fit4)

## 
## Call:
## lm(formula = GPM ~ accel + drive + weight + fuel, data = cars)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.026997 -0.004301  0.000268  0.003660  0.055643 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.681e-02  4.884e-03   3.443 0.000677 ***
## accel       -1.356e-03  2.459e-04  -5.516 8.81e-08 ***
## driveFront  -2.927e-03  1.619e-03  -1.808 0.071853 .  
## driveRear   -2.745e-04  1.577e-03  -0.174 0.861992    
## weight       1.131e-05  1.841e-06   6.143 3.25e-09 ***
## fuelPetrol   8.427e-03  1.246e-03   6.761 9.98e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007973 on 245 degrees of freedom
##   (46 observations deleted due to missingness)
## Multiple R-squared:  0.6058, Adjusted R-squared:  0.5977 
## F-statistic: 75.29 on 5 and 245 DF,  p-value: < 2.2e-16

## Figure 2:
## 
fact = 0.52
width = fact*10; height = fact*8
maxnchar = 6
main = "Top Gear data"
# pdf(file = "topgear_4_predsplot.pdf",
#     width = width, height = height)
predsplot(fit4, main=main, maxchar.level = maxnchar)

## 
## Prediction terms in the model:
##          prediction term    stdev up/down
##                    accel 0.004329    down
##                    drive 0.001400        
##                   weight 0.004490      up
##                     fuel 0.004104        
##  Total prediction of GPM 0.009783

# dev.off()

Three figures in the Supplementary Material:

## 
# fact = 0.6
# width = fact*10; height = fact*8
# pdf(file = "topgear_4_predscor.pdf", 
#     width = height, height = height)
predscor(fit4, sort.by.stdev = FALSE)

## Correlation matrix of the prediction terms:
##        accel drive weight  fuel
## accel   1.00  0.65   0.49  0.31
## drive   0.65  1.00   0.62  0.08
## weight  0.49  0.62   1.00 -0.23
## fuel    0.31  0.08  -0.23  1.00

# dev.off()

car = "Bentley Continental"
# pdf(file = "topgear_4_predsplot_1_dens.pdf",
#     width = width, height = height)
predsplot(fit4, main = main, casetoshow=car, 
          displaytype = "density", 
          maxchar.level = maxnchar, 
          xlab = paste0("prediction for ",car))

## 
##  Prediction terms for case Bentley Continental:
##          prediction term    value
##                    accel +0.00621
##                    drive +0.00152
##                   weight +0.00857
##                     fuel +0.00322
##                      SUM +0.01952
##               centercept  0.02599
##  Total prediction of GPM  0.04551

# dev.off()

car = "Kia Rio"
# pdf(file = "topgear_4_predsplot_2_stairs.pdf",
#     width = width, height = height)
predsplot(fit4, main = main, casetoshow=car, 
          staircase = TRUE, maxchar.level = maxnchar, 
          xlab = paste0("prediction for ",car))

## 
##  Prediction terms for case Kia Rio:
##          prediction term    value
##                    accel -0.00884
##                    drive -0.00141
##                   weight -0.00420
##                     fuel -0.00520
##                      SUM -0.01966
##               centercept  0.02599
##  Total prediction of GPM  0.00633

# dev.off()

Test combination of expressions and types

We add a logical variable (AlarmSystem) as well as a character variable (SatNav)

summary(cars$AlarmSystem)

## no/optional    standard 
##         112         185

cars$alarm = (cars$AlarmSystem == "standard")
summary(cars$alarm)

##    Mode   FALSE    TRUE 
## logical     112     185

summary(cars$SatNav)

##       no optional standard 
##       86      116       95

cars$navig = "y"
cars$navig[cars$SatNav == "no"] = "n"
summary(cars$navig)

##    Length     Class      Mode 
##       297 character character

table(cars$navig)

## 
##   n   y 
##  86 211

class(cars$navig)

## [1] "character"

str(cars)

## 'data.frame':    297 obs. of  35 variables:
##  $ Maker             : Factor w/ 51 levels "Alfa Romeo","Aston Martin",..: 1 1 2 2 2 2 2 2 2 3 ...
##  $ Model             : Factor w/ 296 levels "1 Series","1 Series Convertible",..: 139 186 101 102 103 262 266 267 268 29 ...
##  $ Type              : Factor w/ 297 levels "107 1.0 68 Active 5d",..: 139 190 100 101 102 263 269 268 267 30 ...
##  $ fuel              : Factor w/ 2 levels "Diesel","Petrol": 1 2 2 2 2 2 2 2 2 2 ...
##  $ price             : num  21250 15155 30995 131995 141995 ...
##  $ cyl               : num  4 4 4 12 12 12 12 8 8 4 ...
##  $ displ             : num  1598 1368 1329 5935 5935 ...
##  $ drive             : Factor w/ 3 levels "4WD","Front",..: 2 2 2 3 3 3 3 3 3 2 ...
##  $ hp                : num  105 105 98 517 517 510 573 420 420 86 ...
##  $ torque            : num  236 95 92 457 457 420 457 346 346 118 ...
##  $ accel             : num  11.3 10.7 11.8 4.6 4.6 4.2 4.1 4.7 4.7 11.7 ...
##  $ topspeed          : num  115 116 106 183 183 190 183 180 180 112 ...
##  $ MPG               : num  64 49 56 19 19 17 19 20 20 55 ...
##  $ weight            : num  1385 1090 988 1785 1890 ...
##  $ length            : num  4.35 4.06 3.08 4.72 4.72 ...
##  $ width             : num  1798 1720 1680 NA NA ...
##  $ height            : num  1465 1446 1500 1282 1282 ...
##  $ AdaptiveHeadlights: Factor w/ 3 levels "no","optional",..: 2 2 1 3 3 1 3 1 1 2 ...
##  $ AdjustableSteering: Factor w/ 2 levels "no","standard": 2 2 2 2 2 2 2 2 2 2 ...
##  $ AlarmSystem       : Factor w/ 2 levels "no/optional",..: 2 2 1 1 1 2 2 2 2 2 ...
##  $ Automatic         : Factor w/ 3 levels "no","optional",..: 1 1 2 3 3 1 3 2 2 1 ...
##  $ Bluetooth         : Factor w/ 3 levels "no","optional",..: 3 3 3 3 3 3 3 2 2 3 ...
##  $ ClimateControl    : Factor w/ 3 levels "no","optional",..: 3 2 3 3 3 3 3 3 3 2 ...
##  $ CruiseControl     : Factor w/ 3 levels "no","optional",..: 3 3 3 3 3 3 3 2 2 2 ...
##  $ ElectricSeats     : Factor w/ 3 levels "no","optional",..: 2 1 1 3 3 3 3 3 3 1 ...
##  $ Leather           : Factor w/ 3 levels "no","optional",..: 2 2 1 3 3 3 3 3 3 2 ...
##  $ ParkingSensors    : Factor w/ 3 levels "no","optional",..: 2 3 1 3 3 3 3 3 3 2 ...
##  $ PowerSteering     : Factor w/ 2 levels "no","standard": 2 2 2 2 2 2 2 2 2 2 ...
##  $ SatNav            : Factor w/ 3 levels "no","optional",..: 2 2 3 3 3 3 3 2 2 2 ...
##  $ ESP               : Factor w/ 3 levels "no","optional",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Verdict           : num  6 5 7 7 7 7 7 8 7 6 ...
##  $ Origin            : Factor w/ 3 levels "Asia","Europe",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ GPM               : num  0.0156 0.0204 0.0179 0.0526 0.0526 ...
##  $ alarm             : logi  TRUE TRUE FALSE FALSE FALSE TRUE ...
##  $ navig             : chr  "y" "y" "y" "y" ...

fitcombin = lm(1/MPG ~ accel + log(weight) + accel:torque + alarm + navig, data = cars)
summary(fitcombin)

## 
## Call:
## lm(formula = 1/MPG ~ accel + log(weight) + accel:torque + alarm + 
##     navig, data = cars)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.026131 -0.005201 -0.001251  0.004847  0.056823 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.018e-01  3.347e-02  -3.040  0.00262 ** 
## accel        -2.115e-03  2.461e-04  -8.591 9.87e-16 ***
## log(weight)   2.135e-02  4.709e-03   4.534 9.01e-06 ***
## alarmTRUE    -2.450e-03  1.343e-03  -1.824  0.06932 .  
## navigy       -1.693e-03  1.367e-03  -1.239  0.21667    
## accel:torque -3.403e-06  1.394e-06  -2.442  0.01532 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.008684 on 247 degrees of freedom
##   (44 observations deleted due to missingness)
## Multiple R-squared:  0.5293, Adjusted R-squared:  0.5198 
## F-statistic: 55.56 on 5 and 247 DF,  p-value: < 2.2e-16

fact = 0.45
width = fact*10; height = fact*8
main = "Example with transformation, interaction, logical, character"
# pdf(file = "topgear_logical+char_predsplot.pdf", width = width, height = height)
predsplot(fitcombin, main=main)

## 
## Prediction terms in the model:
##            prediction term     stdev up/down
##                      accel 0.0067300    down
##                log(weight) 0.0054000      up
##                      alarm 0.0011630        
##                      navig 0.0007587        
##               accel:torque 0.0024290        
##  Total prediction of 1/MPG 0.0091180

# dev.off()  

fact = 0.6
width = fact*10; height = fact*8
# pdf(file = "topgear_logical+char_predscor.pdf",
#     width = height, height = height)
predscor(fitcombin)

## Correlation matrix of the prediction terms:
##              accel log(weight) accel:torque alarm navig
## accel         1.00        0.53        -0.09 -0.37 -0.33
## log(weight)   0.53        1.00        -0.75 -0.43 -0.35
## accel:torque -0.09       -0.75         1.00  0.26  0.18
## alarm        -0.37       -0.43         0.26  1.00  0.38
## navig        -0.33       -0.35         0.18  0.38  1.00

# dev.off()

Titanic data

We start by loading the data, turning the pclass and sex variables into factors.

data(data_titanic, package = "classmap")
titanic = data_titanic; rm(data_titanic)
dim(titanic) # 1309  13

## [1] 1309   13

# A data frame with 1309 observations on the following variables.
#   passengerId: a unique identifier for each passenger.
#   pclass: travel class of the passenger.
#   name: name of the passenger.
#   sex: sex of the passenger.
#   age: age of the passenger.
#   sibsp: number of siblings and spouses traveling with the passenger.
#   parch: number of parents and children traveling with the passenger.
#   ticket: ticket number of the passenger.
#   fare: fare paid for the ticket.
#   Cabin: cabin number of the passenger.
#   embarked: Port of embarkation. Takes the values
#             C (Cherbourg), Q (Queenstown) and
#             S (Southampton).
#   y: factor indicating casualty or survivor.
#   dataType: vector taking the values “train” or “test”
#             indicating whether the observation belongs
#             to training or test data.
#
colnames(titanic) = c("passengerId","pclass","name","sex",
                   "age", "sibsp", "parch", "ticket",
                   "fare", "cabin", "embarked", "survival",
                   "dataType")
str(titanic)

## 'data.frame':    1309 obs. of  13 variables:
##  $ passengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ sex        : chr  "male" "female" "female" "female" ...
##  $ age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ sibsp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ cabin      : chr  "" "C85" "" "C123" ...
##  $ embarked   : chr  "S" "C" "S" "S" ...
##  $ survival   : Factor w/ 2 levels "casualty","survived": 1 2 2 2 1 1 1 1 2 2 ...
##  $ dataType   : chr  "train" "train" "train" "train" ...

unique(titanic$pclass) # 3 1 2

## [1] 3 1 2

titanic$pclass = factor(titanic$pclass, levels = unique(titanic$pclass))
#

unique(titanic$sex)

## [1] "male"   "female"

titanic$sex = factor(titanic$sex, levels = unique(titanic$sex) )
titanic$sex = factor(titanic$sex, labels = c("M","F"))
head(titanic)

##   passengerId pclass                                                name sex
## 1           1      3                             Braund, Mr. Owen Harris   M
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer)   F
## 3           3      3                              Heikkinen, Miss. Laina   F
## 4           4      1        Futrelle, Mrs. Jacques Heath (Lily May Peel)   F
## 5           5      3                            Allen, Mr. William Henry   M
## 6           6      3                                    Moran, Mr. James   M
##   age sibsp parch           ticket    fare cabin embarked survival dataType
## 1  22     1     0        A/5 21171  7.2500              S casualty    train
## 2  38     1     0         PC 17599 71.2833   C85        C survived    train
## 3  26     0     0 STON/O2. 3101282  7.9250              S survived    train
## 4  35     1     0           113803 53.1000  C123        S survived    train
## 5  35     0     0           373450  8.0500              S casualty    train
## 6  NA     0     0           330877  8.4583              Q casualty    train

# save(titanic, file="Titanic.RData")
# load("Titanic.RData")

Now fit a logistic regression model.

fit5 <- glm(survival ~ sex + age + sibsp + parch + pclass,
            family=binomial, data = titanic)
summary(fit5)

## 
## Call:
## glm(formula = survival ~ sex + age + sibsp + parch + pclass, 
##     family = binomial, data = titanic)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.002093   0.221287  -4.528 5.94e-06 ***
## sexF         2.556861   0.173270  14.756  < 2e-16 ***
## age         -0.039489   0.006634  -5.952 2.64e-09 ***
## sibsp       -0.352914   0.105349  -3.350 0.000808 ***
## parch        0.074361   0.099907   0.744 0.456697    
## pclass1      2.352022   0.228803  10.280  < 2e-16 ***
## pclass2      0.985265   0.199384   4.942 7.75e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1414.62  on 1045  degrees of freedom
## Residual deviance:  970.12  on 1039  degrees of freedom
##   (263 observations deleted due to missingness)
## AIC: 984.12
## 
## Number of Fisher Scoring iterations: 4

Figure 3:

fact = 0.5
width = fact*10; height = fact*8
main = "Titanic data"
# pdf(file = "titanic_5_predsplot_dens.pdf",
#     width = width, height = height)
predsplot(fit5, main=main, displaytype = "density")

## 
## Prediction terms in the model:
##                      prediction term   stdev up/down
##                                  sex 1.23600        
##                                  age 0.56920    down
##                                sibsp 0.32190    down
##                                parch 0.06244      up
##                               pclass 0.98130        
##  Total linear prediction of survival 1.67000        
## 
## The glm fit used the logit link function.

# The glm fit used the logit link function.
# dev.off()

# With other options for density estimation:
predsplot(fit5, main=main, displaytype = "density",
          bw = "SJ", adjust = 0.5)

## 
## Prediction terms in the model:
##                      prediction term   stdev up/down
##                                  sex 1.23600        
##                                  age 0.56920    down
##                                sibsp 0.32190    down
##                                parch 0.06244      up
##                               pclass 0.98130        
##  Total linear prediction of survival 1.67000        
## 
## The glm fit used the logit link function.

Three figures in Supplementary Material:

# pdf(file = "titanic_5_predsplot_1.pdf",
#     width = width, height = height)
predsplot(fit5, main = main, casetoshow=1)

## 
##  Prediction terms for case 1:
##                                 prediction term    value
##                                             sex -0.94843
##                                             age +0.31122
##                                           sibsp -0.17544
##                                           parch -0.03128
##                                          pclass -0.88444
##                                             SUM -1.72839
##                                      centercept -0.49537
##             Total linear prediction of survival -2.22376
##  Total prediction of survival in response units  0.09764
## 
## The glm fit used the logit link function.

# dev.off()

# pdf(file = "titanic_5_predsplot_2_stairs.pdf",
#     width = width, height = height)
predsplot(fit5, main = main, casetoshow=2, staircase = TRUE)

## 
##  Prediction terms for case 2:
##                                 prediction term    value
##                                             sex +1.60843
##                                             age -0.32060
##                                           sibsp -0.17544
##                                           parch -0.03128
##                                          pclass +1.46758
##                                             SUM +2.54868
##                                      centercept -0.49537
##             Total linear prediction of survival  2.05331
##  Total prediction of survival in response units  0.88628
## 
## The glm fit used the logit link function.

# dev.off()

# fact = 0.6
# width = fact*10; height = fact*8
# pdf(file = "titanic_5_predscor.pdf", 
#     width = height, height = height)
predscor(fit5, sort.by.stdev = FALSE)

## Correlation matrix of the prediction terms:
##          sex   age sibsp parch pclass
## sex     1.00  0.06 -0.10  0.22   0.14
## age     0.06  1.00 -0.24  0.15  -0.41
## sibsp  -0.10 -0.24  1.00 -0.37   0.04
## parch   0.22  0.15 -0.37  1.00  -0.02
## pclass  0.14 -0.41  0.04 -0.02   1.00

# dev.off()

German credit data

data(german.credit, package = "fairml")
credit = german.credit; rm(german.credit)
dim(credit) # 1000 21

## [1] 1000   21

str(credit)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Account_status          : Factor w/ 4 levels "< 0 DM",">= 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ Duration                : num  6 48 12 42 24 36 24 36 12 30 ...
##  $ Credit_history          : Factor w/ 5 levels "all credits at this bank paid back duly",..: 2 4 2 4 3 4 4 4 4 2 ...
##  $ Purpose                 : Factor w/ 10 levels "business","car (new)",..: 8 8 5 6 2 5 6 3 8 2 ...
##  $ Credit_amount           : num  1169 5951 2096 7882 4870 ...
##  $ Savings_bonds           : Factor w/ 5 levels "< 100 DM",">= 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
##  $ Present_employment_since: Factor w/ 5 levels "< 1 year",">= 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
##  $ Installment_rate        : num  4 2 2 2 3 2 3 2 2 4 ...
##  $ Other_debtors_guarantors: Factor w/ 3 levels "co-applicant",..: 3 3 3 2 3 3 3 3 3 3 ...
##  $ Resident_since          : num  4 2 3 4 4 4 4 2 4 2 ...
##  $ Property                : Factor w/ 4 levels "building society savings agreement / life insurance",..: 3 3 3 1 4 4 1 2 3 2 ...
##  $ Age                     : num  67 22 49 45 53 35 53 35 61 28 ...
##  $ Other_installment_plans : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Housing                 : Factor w/ 3 levels "rent","own","for free": 2 2 2 3 3 3 2 1 2 2 ...
##  $ Existing_credits        : num  2 1 1 1 2 1 1 1 1 2 ...
##  $ Job                     : Factor w/ 4 levels "management / self-employed / highly qualified employee / officer",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ People_maintenance_for  : num  1 1 2 2 2 2 1 1 1 1 ...
##  $ Telephone               : Factor w/ 2 levels "none","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ Foreign_worker          : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Credit_risk             : Factor w/ 2 levels "BAD","GOOD": 2 1 2 2 1 2 2 2 2 1 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 1 1 1 1 1 1 1 1 ...

#
# Shorten variable names for plotting:
colnames(credit) <- c("checking","months","history","purpose","amount","savings","employment","rate","guarantors","residence","property","age","inst_plans","housing","nloans","job","nclients","phone","foreign","credit","sex")
#
# Give factors sex and purpose shorter labels for plotting:
credit$sex = factor(as.numeric(credit$sex), labels = c("F","M"))
levels(credit$purpose)

##  [1] "business"              "car (new)"             "car (used)"           
##  [4] "domestic appliances"   "education"             "furniture / equipment"
##  [7] "others"                "radio / television"    "repairs"              
## [10] "retrainin"

# [10] "retrainin"
levels(credit$purpose) = c("business","n.car","u.car","appliances","education","furniture","other","TV","repairs","retraining")
# save(credit, file = "German_credit.RData")
# load("German_credit.RData")

Now fit a logistic regression model

fit7 <- glm(credit ~ months + purpose + amount + rate + age +
            nclients + sex, family=binomial, data = credit)

Figure in Supplementary Material:

main = "German credit data"
# fact = 0.48
# width = fact*10; height = fact*8
# pdf(file = "germancredit_7_predsplot.pdf",
#     width = width, height = height)
predsplot(fit7, main = main)

## 
## Prediction terms in the model:
##                    prediction term   stdev up/down
##                             months 0.37690    down
##                            purpose 0.51700        
##                             amount 0.26380    down
##                               rate 0.27320    down
##                                age 0.24350      up
##                           nclients 0.07078    down
##                                sex 0.21110        
##  Total linear prediction of credit 0.80900        
## 
## The glm fit used the logit link function.

# dev.off()

Figure 4:

# fact = 0.48
# width = fact*10; height = fact*8
# pdf(file = "germancredit_7_predsplot_1.pdf",
#     width = width, height = height)
predsplot(fit7, main = main, casetoshow=1, 
          displaytype = "density")

## 
##  Prediction terms for case 1:
##                               prediction term    value
##                                        months +0.46584
##                                       purpose +0.37611
##                                        amount +0.19645
##                                          rate -0.25082
##                                           age +0.67325
##                                      nclients +0.03030
##                                           sex +0.14143
##                                           SUM +1.63255
##                                    centercept  0.95998
##             Total linear prediction of credit  2.59253
##  Total prediction of credit in response units  0.93038
## 
## The glm fit used the logit link function.

# dev.off()

Figure 5:

# pdf(file = "germancredit_7_predsplot_2_stairs.pdf",
#     width = width, height = height)
predsplot(fit7, main = main, casetoshow=2, staircase = TRUE)

## 
##  Prediction terms for case 2:
##                               prediction term    value
##                                        months -0.84700
##                                       purpose +0.37611
##                                        amount -0.25041
##                                          rate +0.23763
##                                           age -0.28994
##                                      nclients +0.03030
##                                           sex -0.31479
##                                           SUM -1.05811
##                                    centercept  0.95998
##             Total linear prediction of credit -0.09813
##  Total prediction of credit in response units  0.47549
## 
## The glm fit used the logit link function.

# dev.off()

Figure 6:

# fact = 0.6
# width = fact*10; height = fact*8
# pdf(file = "germancredit_7_predscor_equalsizes.pdf", 
#     width = height, height = height)
predscor(fit7, sort.by.stdev = FALSE, cell.length = "equal")

## Correlation matrix of the prediction terms:
##          months purpose amount  rate   age nclients   sex
## months     1.00   -0.12   0.62  0.07  0.04    -0.02 -0.08
## purpose   -0.12    1.00  -0.12 -0.01 -0.03     0.04  0.05
## amount     0.62   -0.12   1.00 -0.27 -0.03     0.02 -0.09
## rate       0.07   -0.01  -0.27  1.00 -0.06    -0.07 -0.09
## age        0.04   -0.03  -0.03 -0.06  1.00    -0.12  0.16
## nclients  -0.02    0.04   0.02 -0.07 -0.12     1.00 -0.20
## sex       -0.08    0.05  -0.09 -0.09  0.16    -0.20  1.00

# dev.off()
#
# pdf(file = "germancredit_7_predscor.pdf", 
#     width = height, height = height)
predscor(fit7, sort.by.stdev = FALSE)

## Correlation matrix of the prediction terms:
##          months purpose amount  rate   age nclients   sex
## months     1.00   -0.12   0.62  0.07  0.04    -0.02 -0.08
## purpose   -0.12    1.00  -0.12 -0.01 -0.03     0.04  0.05
## amount     0.62   -0.12   1.00 -0.27 -0.03     0.02 -0.09
## rate       0.07   -0.01  -0.27  1.00 -0.06    -0.07 -0.09
## age        0.04   -0.03  -0.03 -0.06  1.00    -0.12  0.16
## nclients  -0.02    0.04   0.02 -0.07 -0.12     1.00 -0.20
## sex       -0.08    0.05  -0.09 -0.09  0.16    -0.20  1.00

# dev.off()

Two figures in Supplementary Material:

## Prediction for a new case:
newc = list("u.car", 36, 2, 6000, 55, "F", 1)
names(newc) = c("purpose","months","rate","amount",
                "age","sex","nclients")
# fact = 0.48
# width = fact*10; height = fact*8
# pdf(file = "germancredit_7_predsplot_new_stairs.pdf",
#     width = width, height = height)
predsplot(fit7, main = main, casetoshow = newc, 
          staircase = TRUE)

## 
##  The case to show is: 
## $purpose
## [1] "u.car"
## 
## $months
## [1] 36
## 
## $rate
## [1] 2
## 
## $amount
## [1] 6000
## 
## $age
## [1] 55
## 
## $sex
## [1] "F"
## 
## $nclients
## [1] 1
## 
## 
##  Prediction terms for the new case:
##                               prediction term    value
##                                        months -0.47190
##                                       purpose +1.02816
##                                        amount -0.25499
##                                          rate +0.23763
##                                           age +0.41640
##                                      nclients +0.03030
##                                           sex +0.14143
##                                           SUM +1.12701
##                                    centercept  0.95998
##             Total linear prediction of credit  2.08699
##  Total prediction of credit in response units  0.88963
## 
## The glm fit used the logit link function.

# dev.off()

## Figure with profile:
# pdf(file = "germancredit_7_predsplot_1_dens_profile.pdf",
#     width = width, height = height)
predsplot(fit7, main = main, casetoshow=1, 
               displaytype = "density", profile = TRUE)

## 
##  Prediction terms for case 1:
##                               prediction term    value
##                                        months +0.46584
##                                       purpose +0.37611
##                                        amount +0.19645
##                                          rate -0.25082
##                                           age +0.67325
##                                      nclients +0.03030
##                                           sex +0.14143
##                                           SUM +1.63255
##                                    centercept  0.95998
##             Total linear prediction of credit  2.59253
##  Total prediction of credit in response units  0.93038
## 
## The glm fit used the logit link function.

# dev.off()

predsplot_examples

Rousseeuw, P.J.

2025-07-14