R Regression: Logistic Regression
 # Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/
 install.packages("mlbench")
 # Load the PIMA INDIAN data set
 data("PimaIndiansDiabetes2", package = "mlbench")
 # Inspect the data
 head(PimaIndiansDiabetes2, 4)
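In PimaIndiansDiabetes2, physiologically impossible zero values have been recoded as NA, which is why the preparation step below removes incomplete rows. As a quick optional check (my addition, not part of the original tutorial), you can count the missing values per column first:

 # Count missing values per column before removing them
 colSums(is.na(PimaIndiansDiabetes2))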
 library(tidyverse)
 library(caret)
 theme_set(theme_bw())
 # PREPARE DATA
 # Load the data and remove NAs
 data("PimaIndiansDiabetes2", package = "mlbench")
 PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
 # Inspect the data
 sample_n(PimaIndiansDiabetes2, 3)
 # Split the data into training and test set
 set.seed(123)
 training.samples <- PimaIndiansDiabetes2$diabetes %>%
   createDataPartition(p = 0.8, list = FALSE)
 train.data <- PimaIndiansDiabetes2[training.samples, ]
 test.data <- PimaIndiansDiabetes2[-training.samples, ]
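Because createDataPartition() samples within each outcome class, the 80/20 split should preserve the proportion of positive and negative cases in both sets. A one-line sanity check (my addition, not in the source tutorial):

 # Class balance should be similar in both sets
 prop.table(table(train.data$diabetes))
 prop.table(table(test.data$diabetes))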
 # Fit the model
 # The R function glm(), for generalized linear model, can be used to compute logistic regression.
 # You need to specify the option family = binomial, which tells R that we want to fit logistic regression.
 model <- glm(diabetes ~ ., data = train.data, family = binomial)
 # Summarize the model
 summary(model)
 # Make predictions
 probabilities <- model %>% predict(test.data, type = "response")
 predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
 # Model accuracy
 mean(predicted.classes == test.data$diabetes)
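Accuracy alone hides which class the model gets wrong. Since caret is already loaded, one optional way to break the errors down (a sketch; the tutorial itself stops at accuracy) is a confusion matrix:

 # Cross-tabulate predictions against the truth; "pos" is treated as the event
 confusionMatrix(
   data      = factor(predicted.classes, levels = levels(test.data$diabetes)),
   reference = test.data$diabetes,
   positive  = "pos"
 )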
 # Simple Logistic Regression
 model <- glm(diabetes ~ glucose, data = train.data, family = binomial)
 summary(model)$coef
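The coefficients are on the log-odds scale, so exponentiating them gives odds ratios: for this simple model, the exponentiated glucose coefficient is the multiplicative change in the odds of diabetes per one-unit increase in glucose. A short illustration (my addition):

 # Convert log-odds coefficients to odds ratios
 exp(coef(model))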
 # Prediction
 newdata <- data.frame(glucose = c(20, 180))
 probabilities <- model %>% predict(newdata, type = "response")
 predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
 predicted.classes
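predict(type = "response") is simply the logistic function applied to the linear predictor, p = 1 / (1 + exp(-(b0 + b1*x))). Reproducing the two probabilities by hand makes the link explicit (a sketch, not part of the original tutorial):

 # Same probabilities computed directly from the fitted coefficients
 b <- coef(model)
 plogis(b[1] + b[2] * c(20, 180))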
 # PLOT Prediction Function
 train.data %>%
   mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
   ggplot(aes(glucose, prob)) +
   geom_point(alpha = 0.2) +
   geom_smooth(method = "glm", method.args = list(family = "binomial")) +
   labs(
     title = "Logistic Regression Model",
     x = "Plasma Glucose Concentration",
     y = "Probability of being diabetes-positive"
   )
 #
 # Multiple Logistic Regression
 #
 model <- glm(diabetes ~ glucose + mass + pregnant,
              data = train.data, family = binomial)
 summary(model)$coef
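For the multiple-predictor model, odds ratios with 95% confidence intervals are often easier to read than raw coefficients. A hedged sketch (my addition; note that confint() on a glm profiles the likelihood, so it prints a "Waiting for profiling to be done..." message and can take a moment):

 # Odds ratios and 95% confidence intervals for each predictor
 exp(cbind(OR = coef(model), confint(model)))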
 # Model Summary
 model <- glm(diabetes ~ ., data = train.data, family = binomial)
 summary(model)$coef
 # Alternative ways to show the coefficients
 coef(model)
 summary(model)$coef
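The last column of summary(model)$coef holds the Wald-test p-values, which is what motivates dropping predictors in the next step. A small screening sketch (one crude but common rule of thumb, not a substitute for proper model selection):

 # List the predictors with Wald p-value below 0.05
 coefs <- summary(model)$coef
 coefs[coefs[, "Pr(>|z|)"] < 0.05, ]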
 # CHANGE THE MODEL: keep only selected predictors
 model <- glm(diabetes ~ pregnant + glucose + pressure + mass + pedigree,
              data = train.data, family = binomial)
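Whether the reduced model actually loses anything can be checked against the full model, for example with AIC or a likelihood-ratio test. A sketch (assumption: the full model is refit here just for the comparison):

 # Refit the full model and compare with the reduced one
 full.model <- glm(diabetes ~ ., data = train.data, family = binomial)
 AIC(model, full.model)                    # lower is better
 anova(model, full.model, test = "Chisq")  # likelihood-ratio test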
 # Predict
 probabilities <- model %>% predict(test.data, type = "response")
 head(probabilities)
 # Check how the outcome factor is coded: the level labelled 1 ("pos")
 # is the event the predicted probabilities refer to
 contrasts(test.data$diabetes)
 # Predict the class of individuals:
 predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
 head(predicted.classes)
 # Assessing model accuracy
 mean(predicted.classes == test.data$diabetes)
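Accuracy depends on the arbitrary 0.5 cutoff. A threshold-free view is the ROC curve and its AUC; the sketch below assumes the pROC package is installed (it is not used in the original tutorial):

 # ROC curve and AUC for the test-set probabilities
 library(pROC)
 roc.obj <- roc(response = test.data$diabetes, predictor = probabilities)
 auc(roc.obj)
 plot(roc.obj)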
 # You can also fit generalized additive models (see the chapter on polynomial
 # and spline regression) when linearity of the predictor cannot be assumed.
 # This can be done using the mgcv package:
 library("mgcv")
 # Fit the model
 gam.model <- gam(diabetes ~ s(glucose) + mass + pregnant,
                  data = train.data, family = "binomial")
 # Summarize model
 summary(gam.model)
 # Make predictions
 probabilities <- gam.model %>% predict(test.data, type = "response")
 predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
 # Model Accuracy
 mean(predicted.classes == test.data$diabetes)
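To see whether the spline actually departs from a straight line, mgcv can plot the fitted smooth; an effective degrees of freedom (edf) near 1 in summary(gam.model) indicates an almost-linear glucose effect. A short follow-up sketch (my addition):

 # Plot the fitted smooth term for glucose with a confidence band
 plot(gam.model, select = 1, shade = TRUE)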