R Regression: Logistic Regression

From OnnoWiki
Jump to navigation Jump to search
# Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/


# Install mlbench only when it is missing, so that re-running the script
# does not download and reinstall the package every time
if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench")
}
# Load the PimaIndiansDiabetes2 data set shipped with mlbench
data("PimaIndiansDiabetes2", package = "mlbench")
# Inspect the first four rows
head(PimaIndiansDiabetes2, 4)


# tidyverse supplies the %>% pipe, the dplyr verbs and ggplot2 used below
library(tidyverse)
# caret supplies createDataPartition() for the train/test split
library(caret)
# Make the black-and-white theme the default for all ggplot2 plots
theme_set(theme_bw())


# ---- Data preparation ----
# Reload the data set and keep only the rows without missing values
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
# Print three randomly chosen rows as a quick sanity check
sample_n(PimaIndiansDiabetes2, size = 3)
# Fix the RNG seed so the train/test split below is reproducible
set.seed(123)
# Draw the row indices of an 80% training partition from the outcome column
training.samples <- createDataPartition(
  PimaIndiansDiabetes2$diabetes,
  p = 0.8,
  list = FALSE
)
# Selected rows form the training set; all remaining rows form the test set
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]


# ---- Full model ----
# glm() fits generalized linear models; family = binomial selects logistic
# regression. `diabetes ~ .` uses every other column as a predictor.
model <- glm(diabetes ~ ., data = train.data, family = binomial)
# Print the model summary
summary(model)
# Predicted probabilities for the held-out test rows
probabilities <- predict(model, newdata = test.data, type = "response")
# Label a row "pos" when its predicted probability exceeds 0.5, else "neg"
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Accuracy: share of test rows whose predicted label equals the observed one
mean(predicted.classes == test.data$diabetes)



# Simple Logistic Regression: diabetes status explained by glucose alone
model <- glm(diabetes ~ glucose, data = train.data, family = binomial)
# Spell out `coefficients`: `$coef` only works through partial matching of
# list element names, which is fragile
summary(model)$coefficients


# Prediction: classify two new observations given only their glucose value
newdata <- data.frame(glucose = c(20, 180))
probabilities <- predict(model, newdata = newdata, type = "response")
# Label "pos" when the predicted probability exceeds 0.5, else "neg"
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes


# ---- Plot of the fitted prediction function ----
# Recode the outcome as 1 ("pos") / 0 (otherwise), scatter it against
# glucose, and overlay a binomial glm smoother.
ggplot(
  data = mutate(train.data, prob = ifelse(diabetes == "pos", 1, 0)),
  mapping = aes(x = glucose, y = prob)
) +
  geom_point(alpha = 0.2) +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial")
  ) +
  labs(
    title = "Logistic Regression Model",
    x = "Plasma Glucose Concentration",
    y = "Probability of being diabete-pos"
  )



#
# Multiple Logistic Regression
#
# Model with three predictors: glucose, mass and pregnant
model <- glm(diabetes ~ glucose + mass + pregnant,
             data = train.data, family = binomial)
# Spell out `coefficients`: `$coef` only works through partial matching of
# list element names, which is fragile
summary(model)$coefficients
# Model summary: refit using every other column as a predictor
model <- glm(diabetes ~ ., data = train.data, family = binomial)
summary(model)$coefficients
# Alternative views: coef() returns the estimates only, while the summary
# table is the full coefficient table
coef(model)
summary(model)$coefficients



# ---- Change the model ----
# Refit with a hand-picked subset of predictors
model <- glm(
  diabetes ~ pregnant + glucose + pressure + mass + pedigree,
  data = train.data,
  family = binomial
)


# Predicted probabilities for the test rows; show the first few
probabilities <- predict(model, newdata = test.data, type = "response")
head(probabilities)
# Show how the levels of the outcome factor are coded
contrasts(test.data$diabetes)
# Turn probabilities into class labels using a 0.5 cut-off
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)


# Model accuracy: share of test rows whose predicted label matches the
# observed one
mean(predicted.classes == test.data$diabetes)



# ---- Generalized additive model ----
# When linearity of a predictor cannot be assumed, a generalized additive
# model can be fitted instead, using the mgcv package.
library("mgcv")
# Fit the model with a smooth term s() for glucose; mass and pregnant enter
# as ordinary terms
gam.model <- gam(
  diabetes ~ s(glucose) + mass + pregnant,
  data = train.data,
  family = "binomial"
)
# Print the model summary
summary(gam.model)
# Predicted probabilities for the test rows, then labels at a 0.5 cut-off
probabilities <- predict(gam.model, newdata = test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Model accuracy: share of test rows whose predicted label matches the
# observed one
mean(predicted.classes == test.data$diabetes)





Referensi

Pranala Menarik