R Regression: Logistic Regression
# Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/
install.packages("mlbench")
# Load the Pima Indians diabetes data set
data("PimaIndiansDiabetes2", package = "mlbench")
# Inspect the data
head(PimaIndiansDiabetes2, 4)
library(tidyverse)
library(caret)
theme_set(theme_bw())
# PREPARE DATA
# Load the data and remove NAs
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
# Inspect the data
sample_n(PimaIndiansDiabetes2, 3)
# Split the data into training and test set
set.seed(123)
training.samples <- PimaIndiansDiabetes2$diabetes %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
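# Suggested addition (not in the source tutorial): a quick sanity check that the
# stratified split from createDataPartition() preserved the pos/neg class balance
prop.table(table(train.data$diabetes))
prop.table(table(test.data$diabetes))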
# Fit the model
# The R function glm(), for generalized linear models, can be used to compute logistic regression.
# You need to specify the option family = binomial, which tells R that we want to fit a logistic regression.
model <- glm(diabetes ~ ., data = train.data, family = binomial)
# Summarize the model
summary(model)
# Make predictions
probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Model accuracy
mean(predicted.classes == test.data$diabetes)
# Simple logistic regression
model <- glm(diabetes ~ glucose, data = train.data, family = binomial)
summary(model)$coef
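# Suggested addition (not in the source tutorial): exponentiating the coefficients
# gives odds ratios; exp(b1) is the factor by which the odds of diabetes change
# for each additional unit of glucose
exp(coef(model))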
# Prediction
newdata <- data.frame(glucose = c(20, 180))
probabilities <- model %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
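# Suggested addition (not in the source tutorial): the same probabilities can be
# reproduced by hand from the fitted coefficients with the inverse-logit function
# plogis(), i.e. p = 1 / (1 + exp(-(b0 + b1 * glucose)))
plogis(coef(model)[1] + coef(model)[2] * c(20, 180))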
# Plot the prediction function
train.data %>%
  mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
  ggplot(aes(glucose, prob)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  labs(
    title = "Logistic Regression Model",
    x = "Plasma Glucose Concentration",
    y = "Probability of being diabetes-positive"
  )
# Multiple logistic regression
model <- glm(diabetes ~ glucose + mass + pregnant, data = train.data, family = binomial)
summary(model)$coef
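# Suggested addition (not in the source tutorial): Wald 95% confidence intervals
# for the coefficients, also shown on the odds-ratio scale
confint.default(model)
exp(cbind(OR = coef(model), confint.default(model)))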
# Model summary for the full model with all predictors
model <- glm(diabetes ~ ., data = train.data, family = binomial)
summary(model)$coef
# Alternative ways to show the coefficients
coef(model)
summary(model)$coef
# Change the model: refit with a subset of the predictors
model <- glm(diabetes ~ pregnant + glucose + pressure + mass + pedigree,
             data = train.data, family = binomial)
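# Suggested addition (not in the source tutorial): refit the full model and compare
# it with the reduced model by AIC and a likelihood-ratio test; a lower AIC and a
# non-significant test suggest the dropped predictors were not needed
full.model <- glm(diabetes ~ ., data = train.data, family = binomial)
AIC(full.model, model)
anova(model, full.model, test = "Chisq")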
# Predict on the test set
probabilities <- model %>% predict(test.data, type = "response")
head(probabilities)
# Check how the diabetes classes are coded
contrasts(test.data$diabetes)
# Predict the class of individuals
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)
# Assessing model accuracy
mean(predicted.classes == test.data$diabetes)
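# Suggested addition (not in the source tutorial): caret's confusionMatrix() gives
# a fuller picture than raw accuracy (sensitivity, specificity, etc.); caret is
# already loaded above
confusionMatrix(
  factor(predicted.classes, levels = levels(test.data$diabetes)),
  test.data$diabetes,
  positive = "pos"
)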
# You can also fit generalized additive models (Chapter @ref(polynomial-and-spline-regression))
# when linearity of the predictor cannot be assumed. This can be done using the mgcv package.
library("mgcv")
# Fit the model
gam.model <- gam(diabetes ~ s(glucose) + mass + pregnant,
                 data = train.data, family = "binomial")
# Summarize the model
summary(gam.model)
# Make predictions
probabilities <- gam.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Model accuracy
mean(predicted.classes == test.data$diabetes)
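# Suggested addition (not in the source tutorial): visualize the estimated smooth
# term for glucose with mgcv's plot method
plot(gam.model, pages = 1)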