R Regression: Logistic Regression


# Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/


install.packages("mlbench")
# Load the data set PIMA INDIAN
data("PimaIndiansDiabetes2", package = "mlbench")
# Inspect the data
head(PimaIndiansDiabetes2, 4)
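
# The data set has 768 rows and 9 columns: eight numeric predictors
# (pregnant, glucose, pressure, triceps, insulin, mass, pedigree, age)
# and the outcome factor diabetes, with levels "neg" and "pos"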


library(tidyverse)
library(caret)
theme_set(theme_bw())


# PREPARE DATA
# Load the data and remove NAs
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
# Inspect the data
sample_n(PimaIndiansDiabetes2, 3)
# Split the data into training and test set
set.seed(123)
training.samples <- PimaIndiansDiabetes2$diabetes %>% 
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
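
# Optional sanity check: createDataPartition() samples within each outcome
# class, so the neg/pos balance should be preserved in both sets
table(train.data$diabetes)
prop.table(table(train.data$diabetes))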


# Fit the model
# The R function glm(), for generalized linear models, can be used to compute logistic regression.
# You need to specify family = binomial, which tells R to fit a logistic regression model.
model <- glm( diabetes ~., data = train.data, family = binomial)
# Summarize the model
summary(model)
# Make predictions
probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Model accuracy
mean(predicted.classes == test.data$diabetes)
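
# Beyond overall accuracy, a confusion matrix separates the two error
# types (false positives vs. false negatives); one way is caret's
# confusionMatrix(), already loaded above
confusionMatrix(
  factor(predicted.classes, levels = levels(test.data$diabetes)),
  test.data$diabetes,
  positive = "pos"
)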



# Simple Logistic Regression
model <- glm( diabetes ~ glucose, data = train.data, family = binomial)
summary(model)$coef
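
# The coefficients are on the log-odds scale; exponentiating gives odds
# ratios, i.e. the multiplicative change in the odds of diabetes per
# one-unit increase in glucose
exp(coef(model))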


# Prediction
newdata <- data.frame(glucose = c(20,  180))
probabilities <- model %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
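
# The same probabilities can be computed by hand with the logistic
# function p = 1 / (1 + exp(-(b0 + b1*x))), which plogis() implements
b <- coef(model)
plogis(b[1] + b[2] * c(20, 180))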


# PLOT Prediction Function
train.data %>%
  mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
  ggplot(aes(glucose, prob)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  labs(
    title = "Logistic Regression Model", 
    x = "Plasma Glucose Concentration",
    y = "Probability of being diabete-pos"
  )



#
# Multiple Logistic Regression
#
model <- glm( diabetes ~ glucose + mass + pregnant, 
              data = train.data, family = binomial)
summary(model)$coef 
# Model Summary
model <- glm( diabetes ~., data = train.data, family = binomial)
summary(model)$coef
# Alternative ways to display the coefficients
coef(model)
summary(model)$coef
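
# One common heuristic: inspect the p-values (the 4th column of the
# coefficient table) and refit with only the informative predictors,
# as done in the reduced model below
summary(model)$coef[, 4]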



# CHANGE THE MODEL: refit with a subset of predictors
model <- glm( diabetes ~ pregnant + glucose + pressure + mass + pedigree, 
              data = train.data, family = binomial)


# Predict
probabilities <- model %>% predict(test.data, type = "response")
head(probabilities)
# Check how R dummy-codes the outcome classes ("pos" = 1)
contrasts(test.data$diabetes)
# Predict the class of individuals:
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)


# Assessing model accuracy
mean(predicted.classes == test.data$diabetes)



# You can also fit generalized additive models (Chapter @ref(polynomial-and-spline-regression))
# when the effect of a predictor cannot be assumed to be linear. This can be done using the mgcv package:
library("mgcv")
# Fit the model
gam.model <- gam(diabetes ~ s(glucose) + mass + pregnant,
                 data = train.data, family = "binomial")
# Summarize model
summary(gam.model)
# Make predictions
probabilities <- gam.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Model Accuracy
mean(predicted.classes == test.data$diabetes)
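
# Visualize the fitted smooth term for glucose; mgcv's plot method
# draws the estimated s(glucose) curve with confidence bands
plot(gam.model, select = 1, shade = TRUE)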





References

* http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/

Interesting Links