suppressPackageStartupMessages({ #Esta funcionsuprime los 'warnings' en la carga de las librerias
library(tidyverse)
library(tidymodels)
library(lubridate)
library(paletteer)
library(glmnet)
library(randomForest)
library(xgboost)
library(DT)
})
Modelos de Regresión para Ciencia de Datos
1 Carga de las librerías
2 Importación de datos
Para este ejemplo se utilizará una base de datos de renta de bicicletas que se puede encontrar en la liga: https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv
<- read_csv(file = "https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv", show_col_types = FALSE) bike_data
Los datos incluidos en la base de datos son los siguientes:
instant: A unique row identifier.
dteday: The date on which the data was observed. In this case, the data was collected daily, so there’s one row per date.
season: A numerically encoded value that indicates the season. The value 1 represents spring, 2 represents summer, 3 represents fall, and 4 represents winter.
yr: The year of the study in which the observation was made. The study took place over two years. Year 0 represents 2011, and year 1 represents 2012.
mnth: The calendar month in which the observation was made. The value 1 represents January and continues through 12 for December.
holiday: A binary value indicates whether the observation was made on a public holiday.
weekday: The day of the week on which the observation was made. The value 0 represents Sunday and continues through 6 for Saturday.
workingday: A binary value indicates whether the day is a working day. It’s not a weekend or holiday.
weathersit: A categorical value indicates the weather situation. The value 1 represents clear, 2 represents mist and clouds, 3 represents light rain and snow, and 4 represents heavy rain, hail, snow, and fog.
temp: The temperature in Celsius (normalized).
atemp: The apparent, or “feels-like,” temperature in Celsius (normalized).
hum: The humidity level (normalized).
windspeed: The windspeed (normalized).
rentals: The number of bicycle rentals recorded.
3 Extracción de los días
La columna ‘dteday’ contiene la fecha en la que se levantó cada observación. Cada fila correpsonde a un día diferente. Se va a extraer el número de día del mes a partir de la fecha de levantamiento.
<-bike_data %>%
bike_datamutate(dteday=mdy(dteday)) %>%
mutate(day=day(dteday))
4 Selección de las variables relevantes para el análisis
<-bike_data %>%
bike_selectselect(c(season, mnth, holiday, weekday, workingday, weathersit, temp,
%>%
atemp, hum, windspeed, rentals)) mutate(across(1:6, factor))
5 Partición de la base de datos en las submuestras de entrenamiento y prueba
Se utilizará una proporción de 80% para la muestra de entrenamiento y del 20% para la muestra de prueba
set.seed(1234)
<- bike_select %>%
bike_split initial_split(prop = 0.8,strata = holiday) #se hará la partición de manera proporcional de acuerdo a la variable 'holiday'
#generación de las dos submuestras
<- training(bike_split)
bike_train <- testing(bike_split) bike_test
6 Especificación de los parámetros de evaluación de los modelos de regresión
En el siguiente chunk se determina el conjunto de medidas de desempeño de los modelos de regresion: la raiz de los errores al cuadrado y el coeficiente de determinacion (R2)
<- metric_set(rmse, rsq) #raiz de los errores al cuadrado y la R cuadrada eval_metrics
7 Número de observaciones en cada submuestra
cat("La muestra de entrenamiento tiene ", nrow(bike_train), "observaciones",
"\nLa muestra de prueba tiene ", nrow(bike_test), "observaciones")
La muestra de entrenamiento tiene 584 observaciones
La muestra de prueba tiene 147 observaciones
8 Especificación de los modelos de regresión:
8.1 Regresion lineal
Especificación del modelo
<-
lm_spec # tipo de regresion
linear_reg() %>%
# metodo de calculo
set_engine("lm") %>%
# Modo
set_mode("regression")
Entrenamiento del modelo
<- lm_spec %>%
lm_mod fit(rentals ~ ., data = bike_train)
# Resultados del modelo
lm_mod
parsnip model object
Call:
stats::lm(formula = rentals ~ ., data = data)
Coefficients:
(Intercept) season2 season3 season4 mnth2 mnth3
917.567 125.096 -1.973 64.748 -2.643 268.056
mnth4 mnth5 mnth6 mnth7 mnth8 mnth9
297.293 291.410 21.795 -38.721 94.059 267.876
mnth10 mnth11 mnth12 holiday1 weekday1 weekday2
313.275 111.611 -3.491 580.699 -778.240 -826.688
weekday3 weekday4 weekday5 weekday6 workingday1 weathersit2
-856.519 -833.420 -675.972 90.363 NA -57.948
weathersit3 temp atemp hum windspeed
-332.805 2579.758 -586.472 -734.162 -1146.455
Evaluación del modelo entrenado
# Predicciones del modelo sobre la muestra de prueba
<- lm_mod %>%
pred predict(new_data = bike_test)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
# Vista delas predicciones
%>%
pred slice_head(n = 5)
# A tibble: 5 × 1
.pred
<dbl>
1 -23.0
2 -8.62
3 581.
4 473.
5 807.
# Fusión de las predicciones del modelo con los datos reales de la muestra de prueba
<- bike_test %>%
results_lin bind_cols(lm_mod %>%
# Prediccion de las rentas
predict(new_data = bike_test) %>%
rename(predictions = .pred))
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
# Comparación de las predicciones
%>%
results_lin select(c(rentals, predictions)) %>%
slice_head(n = 10)
# A tibble: 10 × 2
rentals predictions
<dbl> <dbl>
1 82 -23.0
2 88 -8.62
3 68 581.
4 54 473.
5 251 807.
6 86 -221.
7 34 -800.
8 15 -177.
9 354 901.
10 397 907.
Visualización de los resultados
%>%
results_lin ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(size = 1.5, color = "navyblue") +
# linea de regresion
geom_smooth(method = "lm", se = F, color = 'red', size=1.1) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
`geom_smooth()` using formula = 'y ~ x'
Medidas de desempeño del modelo
<-eval_metrics(data = results_lin,
(linear_metricstruth = rentals,
estimate = predictions))
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 380.
2 rsq standard 0.684
8.2 Modelo LASSO (Least Absolute Shrinkage and Selection Operator)
Especificación del modelo
<- linear_reg(
lasso_spec engine = "glmnet",
mode = "regression",
penalty = 1,
mixture = 1)
Entrenamiento del modelo
<- lasso_spec %>%
lasso_mod fit(rentals ~ ., data = bike_train)
Evaluación del modelo entrenado
<- bike_test %>%
results_lasso bind_cols(lasso_mod %>% predict(new_data = bike_test) %>%
rename(predictions = .pred))
<- eval_metrics(data = results_lasso,
(lasso_metrics truth = rentals,
estimate = predictions))
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 378.
2 rsq standard 0.687
Visualización de los resultados
theme_set(theme_light())
<- results_lasso %>%
lasso_plt ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(size = 1.5, color = 'darkorchid') +
geom_smooth(method = 'lm', color = 'black', se = F, size=1.1) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))
lasso_plt
`geom_smooth()` using formula = 'y ~ x'
8.3 Arbol de Decisión (Decision tree)
Especificación del modelo
<- decision_tree(
tree_spec engine = "rpart",
mode = "regression")
Entrenamiento del modelo
<- tree_spec %>%
tree_mod fit(rentals ~ ., data = bike_train)
tree_mod
parsnip model object
n= 584
node), split, n, deviance, yval
* denotes terminal node
1) root 584 277850900 853.5685
2) temp< 0.41875 223 26321840 375.4798
4) temp< 0.321703 125 3676634 229.8480 *
5) temp>=0.321703 98 16612650 561.2347
10) weekday=1,2,3,4,5 65 6011070 402.4308 *
11) weekday=0,6 33 5733607 874.0303 *
3) temp>=0.41875 361 169072100 1148.8980
6) workingday=1 256 24052250 807.0938
12) hum>=0.7368205 64 5498455 561.9531 *
13) hum< 0.7368205 192 13425780 888.8073 *
7) workingday=0 105 42191810 1982.2480
14) hum>=0.8322915 7 1597155 1200.1430 *
15) hum< 0.8322915 98 36006990 2038.1120
30) atemp>=0.710244 8 1713222 1450.5000 *
31) atemp< 0.710244 90 31285930 2090.3440
62) mnth=8,11 12 4321939 1639.9170 *
63) mnth=3,4,5,6,7,9,10 78 24154810 2159.6410 *
Evaluación del modelo entrenado
<- tree_mod %>%
results_tree augment(new_data = bike_test) %>%
rename(predictions = .pred)
<- eval_metrics(data = results_tree,
(tree_metrics truth = rentals,
estimate = predictions))
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 380.
2 rsq standard 0.680
Visualización del modelo entrenado
<- results_tree %>%
tree_plt ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(color = 'tomato') +
geom_smooth(method = 'lm', color = 'steelblue', se = F) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))
tree_plt
`geom_smooth()` using formula = 'y ~ x'
8.4 Random Forest
Especificación del modelo
set.seed(1234)
<- rand_forest() %>%
rf_spec set_engine('randomForest') %>%
set_mode('regression')
Entrenamiento del modelo
<- rf_spec %>%
rf_mod fit(rentals ~ ., data = bike_train)
rf_mod
parsnip model object
Call:
randomForest(x = maybe_data_frame(x), y = y)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 3
Mean of squared residuals: 102528.2
% Var explained: 78.45
Evaluación del modelo entrenado
<- rf_mod %>%
results_rf augment(new_data = bike_test) %>%
rename(predictions = .pred)
<- eval_metrics(data = results_rf,
(rf_metrics truth = rentals,
estimate = predictions))
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 304.
2 rsq standard 0.796
Visualización del modelo entrenado
<- results_rf %>%
rf_plt ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(color = '#6CBE50FF') +
# overlay regression line
geom_smooth(method = 'lm', color = '#2B7FF9FF', se = F) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))
rf_plt
`geom_smooth()` using formula = 'y ~ x'
8.5 Modelo Gradient Boost (xgboost)
Especificación del modelo
set.seed(1234)
<- boost_tree() %>%
boost_spec set_engine('xgboost') %>%
set_mode('regression')
Entrenamiento del modelo entrenado
<- boost_spec %>%
boost_mod fit(rentals ~ ., data = bike_train)
boost_mod
parsnip model object
##### xgb.Booster
raw: 58 Kb
call:
xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0,
colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1,
subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist,
verbose = 0, nthread = 1, objective = "reg:squarederror")
params (as set within xgb.train):
eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", nthread = "1", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 34
niter: 15
nfeatures : 34
evaluation_log:
iter training_rmse
1 801.8523
2 597.8338
---
14 123.0786
15 118.6168
Evaluación dle modelo entrenado
<- boost_mod %>%
results_boost augment(new_data = bike_test) %>%
rename(predictions = .pred)
<- eval_metrics(data = results_boost,
(boost_metrics truth = rentals,
estimate = predictions))
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 320.
2 rsq standard 0.776
Visualización del modelo entrenado
<- results_boost %>%
boost_plt ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(color = 'springgreen2', size=1.9) +
geom_smooth(method = 'lm', color = 'green4', se = F) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))
boost_plt
`geom_smooth()` using formula = 'y ~ x'
9 Resumen de resultados
<-cbind(
summaryround(linear_metrics[3],2),
round(lasso_metrics[3],2),
round(tree_metrics[3],2),
round(rf_metrics[3],2),
round(boost_metrics[3],2))
rownames(summary)<-c("RMSE","R2")
colnames(summary)<-c("Modelo Lineal", "Modelo LASSO", "Arbol de Decisión", "Random Forest", "Gradient Boost")
datatable(summary) %>%
formatStyle(columns="Modelo Lineal", backgroundColor = "deepskyblue") %>%
formatStyle(columns="Modelo LASSO", backgroundColor = "aliceblue") %>%
formatStyle(columns="Arbol de Decisión", backgroundColor = "deepskyblue") %>%
formatStyle(columns="Random Forest", backgroundColor = "aliceblue") %>%
formatStyle(columns="Gradient Boost", backgroundColor = "deepskyblue")