Florida Palmetto Analysis

Logistic Regression R

Using binary logistic regression classified species of palmetto based on some covariates.

Roupen Khanjian true
01-26-2021
Code
library(tidyverse) # Easily Install and Load the 'Tidyverse', CRAN v1.3.0
library(here) # A Simpler Way to Find Your Files, CRAN v1.0.1
library(broom) # Convert Statistical Objects into Tidy Tibbles, CRAN v0.7.4
library(ggExtra) # Add Marginal Histograms to 'ggplot2', and More 'ggplot2'Enhancements, CRAN v0.9
library(gghalves) # Compose Half-Half Plots Using Your Favourite Geoms, CRAN v0.1.1
library(kableExtra) # Construct Complex Table with 'kable' and Pipe Syntax, CRAN v1.3.1


palmetto <- read_csv(here("_texts", "logistic_regression",
                          'data', 'palmetto.csv')) # read in data

# From metadata found that in the `species` column, 1 = Serenoa repens and 2 = Sabal etonia

palmetto <- palmetto %>% 
  select(height:green_lvs, species) %>% # select for variables we are interested in
  mutate(species = factor(species)) %>% # change species into a factor
  mutate(species = fct_recode(species, # name the species accordingly
                              `Serenoa repens` = "1",
                              `Sabal etonia` = "2"))

Exploratory Data Analysis

Code
my_colors <- c("chartreuse3", "royalblue1") # color palette

p1 <- ggplot(data = palmetto, aes(x = height, y = length, colour = species)) +
  geom_point(alpha = 0.75) + # scatterplot
  scale_colour_manual(values = my_colors) +
  theme_minimal() +
  labs(x = "Height",
       y = "Length") +
  theme(axis.title = element_text(face = "bold", size = 11),
        axis.text = element_text(face = "bold", size = 10),
        axis.text.x = element_text(face = "bold", size = 11),
        legend.text = element_text(face = "bold", size = 10),
        legend.title = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 11)
        )

ggMarginal(p1, groupColour = TRUE, groupFill = TRUE) # add marginal density plots
Comparison of height and length of each palmetto tree differentiated by species.

Figure 1: Comparison of height and length of each palmetto tree differentiated by species.

Code
ggplot(data = palmetto, aes(x = green_lvs, fill = species)) +
  geom_bar(position = "dodge") + # bar graph
  scale_fill_manual(values = my_colors) +
  theme_minimal() +
  labs(x = "Number of Green Leaves",
       y = "Count") +
  scale_x_continuous(breaks = seq(0,20,1)) +
  scale_y_continuous(breaks = seq(0,1750,250)) +
  theme(axis.title = element_text(face = "bold", size = 11),
        axis.text.x = element_text(face = "bold", size = 10),
        axis.text.y = element_text(face = "bold", size = 11),
        legend.text = element_text(face = "bold", size = 10),
        legend.title = element_text(face = "bold", size = 11),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(face = "bold", size = 11)
        )
Count of green leaves per palmetto tree differentiated by species.

Figure 2: Count of green leaves per palmetto tree differentiated by species.

Code
ggplot(data = palmetto, aes(x = species, y = width, fill = species, color = species)) +
  geom_half_violin(side = "r") + 
  geom_half_point(side = "l", alpha = 0.4) +
  scale_fill_manual(values = my_colors) +
  scale_color_manual(values = my_colors) +
  theme_minimal() +
  labs(x = "Species",
       y = "Width") +
  scale_y_continuous(breaks = seq(0,300,50)) +
  theme(axis.title = element_text(face = "bold", size = 11),
        axis.text.x = element_text(face = "bold", size = 10),
        axis.text.y = element_text(face = "bold", size = 11),
        legend.text = element_text(face = "bold", size = 10),
        legend.title = element_text(face = "bold", size = 11),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()
        )
Plot showing widest width of the canopy perpendicular to the canopy length of palmetto trees differentiated by species.

Figure 3: Plot showing widest width of the canopy perpendicular to the canopy length of palmetto trees differentiated by species.

Binary Logistic Regression Model

Code
palmetto_blr <- glm(species ~ ., 
                            data = palmetto, 
                            family = "binomial") # model


blr_tidy <- broom::tidy(palmetto_blr) # model in tidy format

blr_tidy %>% # table for model results
  kbl(caption = "<b style='color:black;'><strong>Statistical 
      results showing estimates and standard errors of 
      coefficients along with z-values and p-values 
      corresponding to the z-values.<strong></b>",
      escape = FALSE,
      format = "html")  %>% 
  kable_material(c("hover")) %>% 
  column_spec(1, bold = TRUE)
Table 1: Statistical results showing estimates and standard errors of coefficients along with z-values and p-values corresponding to the z-values.
term estimate std.error statistic p.value
(Intercept) 3.2266851 0.1420708 22.71180 0
height -0.0292173 0.0023061 -12.66984 0
length 0.0458233 0.0018661 24.55600 0
width 0.0394434 0.0021000 18.78227 0
green_lvs -1.9084747 0.0388634 -49.10728 0

Classificaton Accuracy of Model

Code
blr_fitted <- palmetto_blr %>% 
  broom::augment(type.predict = "response") # obtained probabilities of each plant being classified 

blr_fitted <- blr_fitted %>%
  mutate(species_fitted =
           case_when(.fitted >= 0.5 ~ "Sabal etonia",
                     .fitted < 0.5 ~ "Serenoa repens")) # added a column for what the model predicted

final_counts <- blr_fitted %>% 
  count(across(contains("species"))) # used `across` to obtain counts

# Final table construction with correct, incorrect and % correct columns
final_table <-
  tibble(
    species = c("Serenoa repens", "Sabal etonia"),
    correct = c(
      as.numeric(
        final_counts %>%
          filter(species == "Serenoa repens" &
                   species_fitted == "Serenoa repens") %>%
          select(n)
      ),
      as.numeric(
        final_counts %>%
          filter(species == "Sabal etonia"
                 & species_fitted == "Sabal etonia") %>%
          select(n)
      )
    ),
    incorrect = c(
      as.numeric(
        final_counts %>%
          filter(species == "Serenoa repens" &
                   species_fitted == "Sabal etonia") %>%
          select(n)
      ),
      as.numeric(
        final_counts %>%
          filter(species == "Sabal etonia" &
                   species_fitted == "Serenoa repens") %>%
          select(n)
      )
    )
  )

final_table <- final_table  %>%
  mutate(`% correct` = (correct / (correct + incorrect)) * 100) # add % correct column

final_table %>% # final table
  kbl(caption = "<b style='color:black;'><strong>
      Comparison of correctly, incorrectly, and percent 
      correctly classified species of trees from the 
      binary logistic regression model.
      <strong></b>",
      escape = FALSE,
      format = "html") %>% 
  kable_material(c("hover")) %>%
  column_spec(2:4, bold = TRUE) %>% 
  row_spec(1, color = "white", background = "#66CD00") %>% 
  row_spec(2,  color = "white", background = "#4876FF") %>% 
  column_spec(1, bold = TRUE, italic = TRUE, color = "black") 
Table 2: Comparison of correctly, incorrectly, and percent correctly classified species of trees from the binary logistic regression model.
species correct incorrect % correct
Serenoa repens 5548 564 90.77225
Sabal etonia 5701 454 92.62388

Citation

For attribution, please cite this work as

Khanjian (2021, Jan. 26). Roupen Khanjian: Florida Palmetto Analysis. Retrieved from https://khanjian.github.io/roupen-website/texts/logistic_regression/

BibTeX citation

@misc{khanjian2021florida,
  author = {Khanjian, Roupen},
  title = {Roupen Khanjian: Florida Palmetto Analysis},
  url = {https://khanjian.github.io/roupen-website/texts/logistic_regression/},
  year = {2021}
}