Using binary logistic regression classified species of palmetto based on some covariates.
library(tidyverse) # Easily Install and Load the 'Tidyverse', CRAN v1.3.0
library(here) # A Simpler Way to Find Your Files, CRAN v1.0.1
library(broom) # Convert Statistical Objects into Tidy Tibbles, CRAN v0.7.4
library(ggExtra) # Add Marginal Histograms to 'ggplot2', and More 'ggplot2'Enhancements, CRAN v0.9
library(gghalves) # Compose Half-Half Plots Using Your Favourite Geoms, CRAN v0.1.1
library(kableExtra) # Construct Complex Table with 'kable' and Pipe Syntax, CRAN v1.3.1
palmetto <- read_csv(here("_texts", "logistic_regression",
'data', 'palmetto.csv')) # read in data
# From metadata found that in the `species` column, 1 = Serenoa repens and 2 = Sabal etonia
palmetto <- palmetto %>%
select(height:green_lvs, species) %>% # select for variables we are interested in
mutate(species = factor(species)) %>% # change species into a factor
mutate(species = fct_recode(species, # name the species accordingly
`Serenoa repens` = "1",
`Sabal etonia` = "2"))
my_colors <- c("chartreuse3", "royalblue1") # color palette
p1 <- ggplot(data = palmetto, aes(x = height, y = length, colour = species)) +
geom_point(alpha = 0.75) + # scatterplot
scale_colour_manual(values = my_colors) +
theme_minimal() +
labs(x = "Height",
y = "Length") +
theme(axis.title = element_text(face = "bold", size = 11),
axis.text = element_text(face = "bold", size = 10),
axis.text.x = element_text(face = "bold", size = 11),
legend.text = element_text(face = "bold", size = 10),
legend.title = element_text(face = "bold", size = 11),
plot.title = element_text(face = "bold", size = 11)
)
ggMarginal(p1, groupColour = TRUE, groupFill = TRUE) # add marginal density plots
Figure 1: Comparison of height and length of each palmetto tree differentiated by species.
ggplot(data = palmetto, aes(x = green_lvs, fill = species)) +
geom_bar(position = "dodge") + # bar graph
scale_fill_manual(values = my_colors) +
theme_minimal() +
labs(x = "Number of Green Leaves",
y = "Count") +
scale_x_continuous(breaks = seq(0,20,1)) +
scale_y_continuous(breaks = seq(0,1750,250)) +
theme(axis.title = element_text(face = "bold", size = 11),
axis.text.x = element_text(face = "bold", size = 10),
axis.text.y = element_text(face = "bold", size = 11),
legend.text = element_text(face = "bold", size = 10),
legend.title = element_text(face = "bold", size = 11),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
plot.title = element_text(face = "bold", size = 11)
)
Figure 2: Count of green leaves per palmetto tree differentiated by species.
ggplot(data = palmetto, aes(x = species, y = width, fill = species, color = species)) +
geom_half_violin(side = "r") +
geom_half_point(side = "l", alpha = 0.4) +
scale_fill_manual(values = my_colors) +
scale_color_manual(values = my_colors) +
theme_minimal() +
labs(x = "Species",
y = "Width") +
scale_y_continuous(breaks = seq(0,300,50)) +
theme(axis.title = element_text(face = "bold", size = 11),
axis.text.x = element_text(face = "bold", size = 10),
axis.text.y = element_text(face = "bold", size = 11),
legend.text = element_text(face = "bold", size = 10),
legend.title = element_text(face = "bold", size = 11),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
Figure 3: Plot showing widest width of the canopy perpendicular to the canopy length of palmetto trees differentiated by species.
palmetto_blr <- glm(species ~ .,
data = palmetto,
family = "binomial") # model
blr_tidy <- broom::tidy(palmetto_blr) # model in tidy format
blr_tidy %>% # table for model results
kbl(caption = "<b style='color:black;'><strong>Statistical
results showing estimates and standard errors of
coefficients along with z-values and p-values
corresponding to the z-values.<strong></b>",
escape = FALSE,
format = "html") %>%
kable_material(c("hover")) %>%
column_spec(1, bold = TRUE)
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 3.2266851 | 0.1420708 | 22.71180 | 0 |
height | -0.0292173 | 0.0023061 | -12.66984 | 0 |
length | 0.0458233 | 0.0018661 | 24.55600 | 0 |
width | 0.0394434 | 0.0021000 | 18.78227 | 0 |
green_lvs | -1.9084747 | 0.0388634 | -49.10728 | 0 |
blr_fitted <- palmetto_blr %>%
broom::augment(type.predict = "response") # obtained probabilities of each plant being classified
blr_fitted <- blr_fitted %>%
mutate(species_fitted =
case_when(.fitted >= 0.5 ~ "Sabal etonia",
.fitted < 0.5 ~ "Serenoa repens")) # added a column for what the model predicted
final_counts <- blr_fitted %>%
count(across(contains("species"))) # used `across` to obtain counts
# Final table construction with correct, incorrect and % correct columns
final_table <-
tibble(
species = c("Serenoa repens", "Sabal etonia"),
correct = c(
as.numeric(
final_counts %>%
filter(species == "Serenoa repens" &
species_fitted == "Serenoa repens") %>%
select(n)
),
as.numeric(
final_counts %>%
filter(species == "Sabal etonia"
& species_fitted == "Sabal etonia") %>%
select(n)
)
),
incorrect = c(
as.numeric(
final_counts %>%
filter(species == "Serenoa repens" &
species_fitted == "Sabal etonia") %>%
select(n)
),
as.numeric(
final_counts %>%
filter(species == "Sabal etonia" &
species_fitted == "Serenoa repens") %>%
select(n)
)
)
)
final_table <- final_table %>%
mutate(`% correct` = (correct / (correct + incorrect)) * 100) # add % correct column
final_table %>% # final table
kbl(caption = "<b style='color:black;'><strong>
Comparison of correctly, incorrectly, and percent
correctly classified species of trees from the
binary logistic regression model.
<strong></b>",
escape = FALSE,
format = "html") %>%
kable_material(c("hover")) %>%
column_spec(2:4, bold = TRUE) %>%
row_spec(1, color = "white", background = "#66CD00") %>%
row_spec(2, color = "white", background = "#4876FF") %>%
column_spec(1, bold = TRUE, italic = TRUE, color = "black")
species | correct | incorrect | % correct |
---|---|---|---|
Serenoa repens | 5548 | 564 | 90.77225 |
Sabal etonia | 5701 | 454 | 92.62388 |
For attribution, please cite this work as
Khanjian (2021, Jan. 26). Roupen Khanjian: Florida Palmetto Analysis. Retrieved from https://khanjian.github.io/roupen-website/texts/logistic_regression/
BibTeX citation
@misc{khanjian2021florida, author = {Khanjian, Roupen}, title = {Roupen Khanjian: Florida Palmetto Analysis}, url = {https://khanjian.github.io/roupen-website/texts/logistic_regression/}, year = {2021} }