Difference between revisions of "R Regression: Logistic Regression"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with " # Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/ ==Referensi== * http://www.sthda.com/englis...") |
Onnowpurbo (talk | contribs) |
||
| Line 1: | Line 1: | ||
# Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/ | # Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/ | ||
| + | |||
| + | |||
| + | install.packages("mlbench") | ||
| + | # Load the data set PIMA INDIAN | ||
| + | data("PimaIndiansDiabetes2", package = "mlbench") | ||
| + | # Inspect the data | ||
| + | head(PimaIndiansDiabetes2, 4) | ||
| + | |||
| + | |||
| + | library(tidyverse) | ||
| + | library(caret) | ||
| + | theme_set(theme_bw()) | ||
| + | |||
| + | |||
| + | # PREPARE DATA | ||
| + | # Load the data and remove NAs | ||
| + | data("PimaIndiansDiabetes2", package = "mlbench") | ||
| + | PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2) | ||
| + | # Inspect the data | ||
| + | sample_n(PimaIndiansDiabetes2, 3) | ||
| + | # Split the data into training and test set | ||
| + | set.seed(123) | ||
| + | training.samples <- PimaIndiansDiabetes2$diabetes %>% | ||
| + | createDataPartition(p = 0.8, list = FALSE) | ||
| + | train.data <- PimaIndiansDiabetes2[training.samples, ] | ||
| + | test.data <- PimaIndiansDiabetes2[-training.samples, ] | ||
| + | |||
| + | |||
| + | |||
| + | # Fit the model | ||
| + | # The R function glm(), for generalized linear model, can be used to compute logistic regression. | ||
| + | # You need to specify the option family = binomial, which tells to R that we want to fit logistic regression. | ||
| + | model <- glm( diabetes ~., data = train.data, family = binomial) | ||
| + | # Summarize the model | ||
| + | summary(model) | ||
| + | # Make predictions | ||
| + | probabilities <- model %>% predict(test.data, type = "response") | ||
| + | predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg") | ||
| + | # Model accuracy | ||
| + | mean(predicted.classes == test.data$diabetes) | ||
| + | |||
| + | |||
| + | |||
| + | |||
| + | # Simole Logistic Regression | ||
| + | model <- glm( diabetes ~ glucose, data = train.data, family = binomial) | ||
| + | summary(model)$coef | ||
| + | |||
| + | |||
| + | |||
| + | # Prediction | ||
| + | newdata <- data.frame(glucose = c(20, 180)) | ||
| + | probabilities <- model %>% predict(newdata, type = "response") | ||
| + | predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg") | ||
| + | predicted.classes | ||
| + | |||
| + | |||
| + | # PLOT Prediction Function | ||
| + | train.data %>% | ||
| + | mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>% | ||
| + | ggplot(aes(glucose, prob)) + | ||
| + | geom_point(alpha = 0.2) + | ||
| + | geom_smooth(method = "glm", method.args = list(family = "binomial")) + | ||
| + | labs( | ||
| + | title = "Logistic Regression Model", | ||
| + | x = "Plasma Glucose Concentration", | ||
| + | y = "Probability of being diabete-pos" | ||
| + | ) | ||
| + | |||
| + | |||
| + | |||
| + | |||
| + | # | ||
| + | # Multiple Logistic Regression | ||
| + | # | ||
| + | model <- glm( diabetes ~ glucose + mass + pregnant, | ||
| + | data = train.data, family = binomial) | ||
| + | summary(model)$coef | ||
| + | |||
| + | # Model Summary | ||
| + | model <- glm( diabetes ~., data = train.data, family = binomial) | ||
| + | summary(model)$coef | ||
| + | # alternate show | ||
| + | coef(model) | ||
| + | summary(model )$coef | ||
| + | |||
| + | |||
| + | |||
| + | |||
| + | |||
| + | # UBAH MODEL | ||
| + | model <- glm( diabetes ~ pregnant + glucose + pressure + mass + pedigree, | ||
| + | data = train.data, family = binomial) | ||
| + | |||
| + | |||
| + | |||
| + | # Predict | ||
| + | probabilities <- model %>% predict(test.data, type = "response") | ||
| + | head(probabilities) | ||
| + | # check class | ||
| + | contrasts(test.data$diabetes) | ||
| + | |||
| + | # Predict the class of individuals: | ||
| + | predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg") | ||
| + | head(predicted.classes) | ||
| + | |||
| + | |||
| + | # Assessing model accuracy | ||
| + | mean(predicted.classes == test.data$diabetes) | ||
| + | |||
| + | |||
| + | |||
| + | |||
| + | |||
| + | # You can also fit generalized additive models (Chapter @ref(polynomial-and-spline-regression)), | ||
| + | # when linearity of the predictor cannot be assumed. This can be done using the mgcv package: | ||
| + | library("mgcv") | ||
| + | # Fit the model | ||
| + | gam.model <- gam(diabetes ~ s(glucose) + mass + pregnant, | ||
| + | data = train.data, family = "binomial") | ||
| + | # Summarize model | ||
| + | summary(gam.model ) | ||
| + | # Make predictions | ||
| + | probabilities <- gam.model %>% predict(test.data, type = "response") | ||
| + | predicted.classes <- ifelse(probabilities> 0.5, "pos", "neg") | ||
| + | # Model Accuracy | ||
| + | mean(predicted.classes == test.data$diabetes) | ||
| + | |||
Latest revision as of 16:07, 29 November 2019
# Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/
install.packages("mlbench")
# Load the data set PIMA INDIAN
data("PimaIndiansDiabetes2", package = "mlbench")
# Inspect the data
head(PimaIndiansDiabetes2, 4)
library(tidyverse) library(caret) theme_set(theme_bw())
# PREPARE DATA
# Load the data and remove NAs
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
# Inspect the data
sample_n(PimaIndiansDiabetes2, 3)
# Split the data into training and test set
set.seed(123)
training.samples <- PimaIndiansDiabetes2$diabetes %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
# Fit the model # The R function glm(), for generalized linear model, can be used to compute logistic regression. # You need to specify the option family = binomial, which tells to R that we want to fit logistic regression. model <- glm( diabetes ~., data = train.data, family = binomial) # Summarize the model summary(model) # Make predictions probabilities <- model %>% predict(test.data, type = "response") predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg") # Model accuracy mean(predicted.classes == test.data$diabetes)
# Simole Logistic Regression model <- glm( diabetes ~ glucose, data = train.data, family = binomial) summary(model)$coef
# Prediction newdata <- data.frame(glucose = c(20, 180)) probabilities <- model %>% predict(newdata, type = "response") predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg") predicted.classes
# PLOT Prediction Function
train.data %>%
mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
ggplot(aes(glucose, prob)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "glm", method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
#
# Multiple Logistic Regression
#
model <- glm( diabetes ~ glucose + mass + pregnant,
data = train.data, family = binomial)
summary(model)$coef
# Model Summary model <- glm( diabetes ~., data = train.data, family = binomial) summary(model)$coef # alternate show coef(model) summary(model )$coef
# UBAH MODEL
model <- glm( diabetes ~ pregnant + glucose + pressure + mass + pedigree,
data = train.data, family = binomial)
# Predict probabilities <- model %>% predict(test.data, type = "response") head(probabilities) # check class contrasts(test.data$diabetes)
# Predict the class of individuals: predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg") head(predicted.classes)
# Assessing model accuracy mean(predicted.classes == test.data$diabetes)
# You can also fit generalized additive models (Chapter @ref(polynomial-and-spline-regression)),
# when linearity of the predictor cannot be assumed. This can be done using the mgcv package:
library("mgcv")
# Fit the model
gam.model <- gam(diabetes ~ s(glucose) + mass + pregnant,
data = train.data, family = "binomial")
# Summarize model
summary(gam.model )
# Make predictions
probabilities <- gam.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities> 0.5, "pos", "neg")
# Model Accuracy
mean(predicted.classes == test.data$diabetes)