library(tidyverse)
library(ggplot2)The Iris flower data set or Fisher’s Iris data set is a multivariate data set introduced by the British statistician, eugenicist, and biologist Ronald Fisher in his 1936 paper The use of multiple measurements in taxonomic problems as an example of linear discriminant analysis.
The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Based on the combination of these four features, Fisher developed a linear discriminant model to distinguish the species from each other.
head(iris)iris_df = iris %>% rename_all(janitor::make_clean_names)
str(iris_df)## 'data.frame': 150 obs. of 5 variables:
## $ sepal_length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ sepal_width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ petal_length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ petal_width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
pairs(iris_df[,-5], pch = 21,
bg = c("#1b9e77", "#d95f02", "#7570b3")[unclass(iris_df$species)])
summary(iris_df)## sepal_length sepal_width petal_length petal_width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
plot_hist_by_species = function(col, colname) {
iris_df %>% ggplot(aes(x = species , y = {{col}}, color = species)) +
geom_boxplot() + geom_jitter( alpha = .5, position=position_jitter(w=0.1,h=0.1) ) +
theme(legend.position = "") + labs(title = colname)
}
p1 = plot_hist_by_species(sepal_width, "Sepal Width")
p2 = plot_hist_by_species(sepal_length, "Sepal Length")
p3 = plot_hist_by_species(petal_length, "Petal Length")
p4 = plot_hist_by_species(petal_width, "Petal Width")
gridExtra::grid.arrange(p1, p2, p3, p4)
library(VGAM)
model_1 = vglm( species ~ sepal_length + sepal_width + petal_length + petal_width, family=multinomial, iris_df)
summary(model_1)##
## Call:
## vglm(formula = species ~ sepal_length + sepal_width + petal_length +
## petal_width, family = multinomial, data = iris_df)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## log(mu[,1]/mu[,3]) -0.0003873 1.813e-10 6.194e-10 1.252e-06 0.0003643
## log(mu[,2]/mu[,3]) -1.9700374 -3.609e-04 -5.338e-06 4.763e-04 2.5601905
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 35.490 22666.953 NA NA
## (Intercept):2 42.638 25.708 1.659 0.0972 .
## sepal_length:1 9.495 6729.217 NA NA
## sepal_length:2 2.465 2.394 1.030 0.3032
## sepal_width:1 12.300 3143.611 NA NA
## sepal_width:2 6.681 4.480 1.491 0.1359
## petal_length:1 -22.975 4799.227 -0.005 0.9962
## petal_length:2 -9.429 4.737 NA NA
## petal_width:1 -33.843 7583.502 NA NA
## petal_width:2 -18.286 9.743 NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
##
## Residual deviance: 11.8985 on 290 degrees of freedom
##
## Log-likelihood: -5.9493 on 290 degrees of freedom
##
## Number of Fisher scoring iterations: 21
##
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):1', 'sepal_length:1', 'sepal_width:1', 'petal_length:2', 'petal_width:1', 'petal_width:2'
##
##
## Reference group is level 3 of the response
# reference group:virginica
levels(iris_df$species)## [1] "setosa" "versicolor" "virginica"
preds_probs = predict(model_1, type = "response")
preds = apply(preds_probs, MARGIN = 1, which.max)
species_fun = function(level) return(levels(iris$Species)[level])
preds_class = map_chr(preds, species_fun)
# confusion matrix
table("Predicted" = preds_class, "Actual" = iris_df$species )## Actual
## Predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 1 49
Description
10 variable subset of the 1996 American National Election Study. Missing values and “don’t know” responses have been listwise deleted. Respondents expressing a voting preference other than Clinton or Dole have been removed (Rosenstone, Kinder, and Miller (1997)).
Format
A data frame with 944 observations on the following 10 variables:
popul population of respondent’s location in 1000s of peopleTVnews days in the past week spent watching news on TVselfLR Left-Right self-placement of respondent: an ordered factor with levels extremely liberal, extLib < liberal, Lib < slightly liberal, sliLib < moderate, Mod < slightly conservative, sliCon < conservative, Con < extremely conservative, extConClinLR Left-Right placement of Bill Clinton (same scale as selfLR): an ordered factor with levels extLib < Lib < sliLib < Mod < sliCon < Con < extCon newhamp 67DoleLRLeft-Right placement of Bob Dole (same scale as selfLR): an ordered factor with levels extLib < Lib < sliLib < Mod < sliCon < Con < extConPID Party identification: an ordered factor with levels strong Democrat, strDem < weak Democrat, weakDem < independent Democrat, indDem < independent independentindind < indepedent Republican, indRep < waek Republican, weakRep < strong Republican, strRepage Respondent’s age in yearseduc Respondent’s education: an ordered factor with levels 8 years or less, MS < high school dropout, HSdrop < high school diploma or GED, HS < some College, Coll < Community or junior College degree, CCdeg < BA degree, BAdeg < postgraduate degree, MAdeg income Respondent’s family income: an ordered factor with levels $3Kminus < $3K-$5K < $5K-$7K < $7K-$9K < $9K-$10K < $10K-$11K < $11K-$12K < $12K-$13K < $13K-$14K < $14K-$15K < $15K-$17K < $17K-$20K < $20K-$22K < $22K-$25K < $25K-$30K < $30K-$35K < $35K-$40K < $40K-$45K < $45K-$50K < $50K-$60K < $60K-$75K < $75K-$90K < $90K-$105K < $105Kplus vote Expected vote in 1996 presidential election: a factor with levels Clinton and DoleSource: Faraway Book:
library(faraway)
data(nes96)
str(nes96)## 'data.frame': 944 obs. of 10 variables:
## $ popul : int 0 190 31 83 640 110 100 31 180 2800 ...
## $ TVnews: int 7 1 7 4 7 3 7 1 7 0 ...
## $ selfLR: Ord.factor w/ 7 levels "extLib"<"Lib"<..: 7 3 2 3 5 3 5 5 4 3 ...
## $ ClinLR: Ord.factor w/ 7 levels "extLib"<"Lib"<..: 1 3 2 4 6 4 6 4 6 3 ...
## $ DoleLR: Ord.factor w/ 7 levels "extLib"<"Lib"<..: 6 5 6 5 4 6 4 5 3 7 ...
## $ PID : Ord.factor w/ 7 levels "strDem"<"weakDem"<..: 7 2 2 2 1 2 2 5 4 1 ...
## $ age : int 36 20 24 28 68 21 77 21 31 39 ...
## $ educ : Ord.factor w/ 7 levels "MS"<"HSdrop"<..: 3 4 6 6 6 4 4 4 4 3 ...
## $ income: Ord.factor w/ 24 levels "$3Kminus"<"$3K-$5K"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ vote : Factor w/ 2 levels "Clinton","Dole": 2 1 1 1 1 1 1 1 1 1 ...
summary(nes96)## popul TVnews selfLR ClinLR DoleLR
## Min. : 0.0 Min. :0.000 extLib: 16 extLib:109 extLib: 13
## 1st Qu.: 1.0 1st Qu.:1.000 Lib :103 Lib :317 Lib : 31
## Median : 22.0 Median :3.000 sliLib:147 sliLib:236 sliLib: 43
## Mean : 306.4 Mean :3.728 Mod :256 Mod :160 Mod : 87
## 3rd Qu.: 110.0 3rd Qu.:7.000 sliCon:170 sliCon: 67 sliCon:195
## Max. :7300.0 Max. :7.000 Con :218 Con : 36 Con :460
## extCon: 34 extCon: 19 extCon:115
## PID age educ income vote
## strDem :200 Min. :19.00 MS : 13 $60K-$75K:103 Clinton:551
## weakDem:180 1st Qu.:34.00 HSdrop: 52 $50K-$60K:100 Dole :393
## indDem :108 Median :44.00 HS :248 $30K-$35K: 70
## indind : 37 Mean :47.04 Coll :187 $25K-$30K: 68
## indRep : 94 3rd Qu.:58.00 CCdeg : 90 $105Kplus: 68
## weakRep:150 Max. :91.00 BAdeg :227 $35K-$40K: 62
## strRep :175 MAdeg :127 (Other) :473
The response will be party identification of the respondent: Democrat, Independent or Republican. The original data involved more than three categories; They collapse this to three for simplicity.
nes96$PID_new = case_when(
nes96$PID %in% c("strDem", "weakDem") ~ "Democrat",
nes96$PID %in% c("indDem", "indind", "indRep") ~ "Independent",
nes96$PID %in% c("weakRep", "strRep") ~ "Republican"
)
table(nes96$PID_new)##
## Democrat Independent Republican
## 380 239 325
For simplicity, they consider only the age, education level and income group of the respondents.
# unique(nes96$income)
nes96 %>% group_by(income) %>% mutate(sum_inc = n()) %>% ungroup() %>% group_by(income, PID_new) %>%
summarise(n = n(), pct = n/ sum_inc) %>%
ggplot( aes( x= income, y= pct, col = PID_new, group = PID_new)) +
geom_point() + geom_line(alpha = .5, size =1.5) +
theme(axis.text.x = element_text(angle = 90))## `summarise()` has grouped output by 'income', 'PID_new'. You can override using the `.groups` argument.

In the book they converted income variable to a numberic variable by taking the midpoint of each range, lets do the same ( we don’t want to estimate too much parameters):
mid_points <- c(1.5,4,6,8,9.5,10.5,11.5,12.5,13.5,14.5,16,18.5,21,23.5,27.5,32.5,37.5,42.5,47.5,55,67.5,82.5,97.5,115)
nes96$numeric_income <- mid_points[unclass(nes96$income)]
summary(nes96$numeric_income)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.50 23.50 37.50 46.58 67.50 115.00
nes96 %>% group_by(educ) %>% mutate(sum_inc = n()) %>% ungroup() %>% group_by(educ, PID_new) %>%
summarise(n = n(), pct = n/ sum_inc) %>%
ggplot( aes( x= educ, y= pct, col = PID_new, group = PID_new)) +
geom_point() + geom_line(alpha = .5, size =1.5) ## `summarise()` has grouped output by 'educ', 'PID_new'. You can override using the `.groups` argument.

nes96 %>% ggplot( aes( age, fill = PID_new, color = PID_new)) +
geom_histogram(aes(y=..density..) , bins = 15, alpha = .8, color = 'white', position = 'dodge') +
geom_density(alpha=.15, fill = 'white')
library(VGAM)
model_2 = vglm( PID_new ~ age + numeric_income , family=multinomial, nes96)
summary(model_2)##
## Call:
## vglm(formula = PID_new ~ age + numeric_income, family = multinomial,
## data = nes96)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## log(mu[,1]/mu[,3]) -1.584 -0.8589 -0.4004 0.9952 2.218
## log(mu[,2]/mu[,3]) -1.094 -0.8435 -0.2753 1.3800 2.016
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 1.154791 0.273651 4.220 2.44e-05 ***
## (Intercept):2 -0.032387 0.310862 -0.104 0.917
## age:1 -0.004133 0.004700 -0.879 0.379
## age:2 -0.003907 0.005396 -0.724 0.469
## numeric_income:1 -0.017868 0.002667 -6.700 2.09e-11 ***
## numeric_income:2 -0.001744 0.002654 -0.657 0.511
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
##
## Residual deviance: 1984.539 on 1882 degrees of freedom
##
## Log-likelihood: -992.2695 on 1882 degrees of freedom
##
## Number of Fisher scoring iterations: 4
##
## No Hauck-Donner effect found in any of the estimates
##
##
## Reference group is level 3 of the response
# ref category - republican
levels(factor(nes96$PID_new))## [1] "Democrat" "Independent" "Republican"
predict(model_2,data.frame(age = 25, numeric_income=0),type="response")## Democrat Independent Republican
## 1 0.6037795 0.1852455 0.2109751
predict(model_2,data.frame(age = 25, numeric_income=median(nes96$numeric_income)),type="response")## Democrat Independent Republican
## 1 0.4455226 0.2502312 0.3042463
predict(model_2,data.frame(age = 25, numeric_income=mean(nes96$numeric_income)),type="response")## Democrat Independent Republican
## 1 0.4076149 0.2650188 0.3273663
predict(model_2,data.frame(age = 50, numeric_income=median(nes96$numeric_income)),type="response")## Democrat Independent Republican
## 1 0.4306501 0.2432487 0.3261012
predict(model_2,data.frame(age = 70, numeric_income=median(nes96$numeric_income)),type="response")## Democrat Independent Republican
## 1 0.4184309 0.2374178 0.3441513
model_3 = vglm( PID_new ~ age + numeric_income + educ , family=multinomial, nes96)
summary(model_3)##
## Call:
## vglm(formula = PID_new ~ age + numeric_income + educ, family = multinomial,
## data = nes96)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## log(mu[,1]/mu[,3]) -2.986 -0.8454 -0.3919 1.000 2.267
## log(mu[,2]/mu[,3]) -2.318 -0.7826 -0.2850 1.198 2.283
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 1.642650 0.331255 4.959 7.09e-07 ***
## (Intercept):2 0.445389 0.372285 1.196 0.2316
## age:1 -0.008194 0.004903 -1.671 0.0946 .
## age:2 -0.008041 0.005619 -1.431 0.1524
## numeric_income:1 -0.017247 0.002882 -5.985 2.17e-09 ***
## numeric_income:2 -0.001008 0.002917 -0.345 0.7297
## educ.L:1 -1.194114 0.650009 -1.837 0.0662 .
## educ.L:2 -1.130593 0.713072 -1.586 0.1128
## educ.Q:1 1.229265 0.603936 2.035 0.0418 *
## educ.Q:2 1.107555 0.660372 1.677 0.0935 .
## educ.C:1 -0.154448 0.486465 -0.317 0.7509
## educ.C:2 -0.042485 0.531864 -0.080 0.9363
## educ^4:1 0.028269 0.360477 0.078 0.9375
## educ^4:2 -0.048311 0.397628 -0.121 0.9033
## educ^5:1 0.122118 0.269581 0.453 0.6506
## educ^5:2 0.258208 0.296246 0.872 0.3834
## educ^6:1 0.037413 0.203183 0.184 0.8539
## educ^6:2 0.191694 0.226809 0.845 0.3980
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
##
## Residual deviance: 1968.332 on 1870 degrees of freedom
##
## Log-likelihood: -984.1663 on 1870 degrees of freedom
##
## Number of Fisher scoring iterations: 4
##
## No Hauck-Donner effect found in any of the estimates
##
##
## Reference group is level 3 of the response