# R Regression: Logistic Regression
# Revision as of 16:07, 29 November 2019 by Onnowpurbo (talk | contribs)
# Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/
# Install mlbench only when it is missing, so re-running the script does not
# trigger a download and reinstall every time
if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench")
}
# Load the Pima Indians diabetes data set shipped with mlbench
data("PimaIndiansDiabetes2", package = "mlbench")
# Inspect the first four rows of the data
head(PimaIndiansDiabetes2, 4)
# Attach the packages used by the rest of the script (one statement per line;
# several statements on a single line without separators do not parse in R)
library(tidyverse)
library(caret)
# Make the black-and-white theme the default for all ggplot2 plots
theme_set(theme_bw())
# Data preparation ----
# Reload the data set, then keep only the rows without missing values
data(list = "PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- PimaIndiansDiabetes2 %>%
  na.omit()
# Print three sampled rows as a quick look at the cleaned data
PimaIndiansDiabetes2 %>%
  sample_n(3)
# Train/test split ----
# Fix the random seed so the partition is reproducible
set.seed(123)
# Row indices for the training set: p = 0.8 of the observations, partitioned
# on the outcome column; list = FALSE returns a matrix of indices
training.samples <- createDataPartition(
  PimaIndiansDiabetes2[["diabetes"]],
  p = 0.8,
  list = FALSE
)
# Selected rows form the training set; all remaining rows form the test set
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
# Fit the model
# The R function glm(), for generalized linear model, can be used to compute
# logistic regression. The option family = binomial tells R that we want to
# fit a logistic regression.
model <- glm(diabetes ~ ., data = train.data, family = binomial)
# Summarize the model
summary(model)
# Make predictions: predicted probabilities on the test set, turned into
# class labels with a 0.5 cut-off
probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Model accuracy: share of test rows whose predicted label matches the
# observed one
mean(predicted.classes == test.data$diabetes)
# Simple logistic regression: glucose is the only predictor
model <- glm(diabetes ~ glucose, data = train.data, family = binomial)
# Coefficient table (full element name; avoids `$` partial matching)
summary(model)$coefficients
# Prediction for two new glucose values
newdata <- data.frame(glucose = c(20, 180))
probabilities <- model %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
# Plot the prediction function ----
# Encode the outcome as 1 ("pos") / 0 (otherwise), plot it against glucose and
# overlay a binomial GLM smoother
ggplot(
  data = mutate(train.data, prob = ifelse(diabetes == "pos", 1, 0)),
  mapping = aes(x = glucose, y = prob)
) +
  geom_point(alpha = 0.2) +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial")
  ) +
  labs(
    title = "Logistic Regression Model",
    x = "Plasma Glucose Concentration",
    y = "Probability of being diabete-pos"
  )
#
# Multiple Logistic Regression
#
# Model the outcome with three predictors: glucose, mass and pregnant
model <- glm(
  diabetes ~ glucose + mass + pregnant,
  data = train.data,
  family = binomial
)
# Coefficient table; the element is named in full instead of relying on
# `$` partial matching ("coef" -> "coefficients")
summary(model)$coefficients
# Model summary: refit using every remaining column as a predictor
model <- glm(diabetes ~ ., data = train.data, family = binomial)
summary(model)$coefficients
# Alternative ways to show the coefficients
coef(model)
summary(model)$coefficients
# Change the model ----
# Refit with five predictors: pregnant, glucose, pressure, mass and pedigree
model <- glm(
  formula = diabetes ~ pregnant + glucose + pressure + mass + pedigree,
  family = binomial,
  data = train.data
)
# Predict: fitted probabilities for the test set
probabilities <- model %>% predict(test.data, type = "response")
head(probabilities)
# Check how the outcome's factor levels are dummy-coded, i.e. which class
# the predicted probability refers to
contrasts(test.data$diabetes)
# Predict the class of individuals using a 0.5 cut-off
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)
# Assess model accuracy: share of test rows predicted correctly
mean(predicted.classes == test.data$diabetes)
# Generalized additive model ----
# When linearity of a predictor cannot be assumed, a generalized additive
# model can be fitted instead (see the polynomial and spline regression
# chapter of the referenced tutorial). The mgcv package provides gam().
library("mgcv")
# Fit the model: smooth term for glucose, linear terms for mass and pregnant
gam.model <- gam(
  diabetes ~ s(glucose) + mass + pregnant,
  data = train.data,
  family = "binomial"
)
# Summarize the model
summary(gam.model)
# Make predictions on the test set and apply a 0.5 cut-off
probabilities <- predict(gam.model, newdata = test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Model accuracy: share of test rows predicted correctly
mean(predicted.classes == test.data$diabetes)