suppressPackageStartupMessages({ #Esta funcionsuprime los 'warnings' en la carga de las librerias
library(tidyverse)
library(tidymodels)
library(lubridate)
library(paletteer)
library(glmnet)
library(randomForest)
library(xgboost)
library(DT)
})Modelos de Regresión para Ciencia de Datos
1 Carga de las librerías
2 Importación de datos
Para este ejemplo se utilizará una base de datos de renta de bicicletas que se puede encontrar en la liga: https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv
bike_data <- read_csv(file = "https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv", show_col_types = FALSE)Los datos incluidos en la base de datos son los siguientes:
instant: A unique row identifier.
dteday: The date on which the data was observed. In this case, the data was collected daily, so there’s one row per date.
season: A numerically encoded value that indicates the season. The value 1 represents spring, 2 represents summer, 3 represents fall, and 4 represents winter.
yr: The year of the study in which the observation was made. The study took place over two years. Year 0 represents 2011, and year 1 represents 2012.
mnth: The calendar month in which the observation was made. The value 1 represents January and continues through 12 for December.
holiday: A binary value indicates whether the observation was made on a public holiday.
weekday: The day of the week on which the observation was made. The value 0 represents Sunday and continues through 6 for Saturday.
workingday: A binary value indicates whether the day is a working day. It’s not a weekend or holiday.
weathersit: A categorical value indicates the weather situation. The value 1 represents clear, 2 represents mist and clouds, 3 represents light rain and snow, and 4 represents heavy rain, hail, snow, and fog.
temp: The temperature in Celsius (normalized).
atemp: The apparent, or “feels-like,” temperature in Celsius (normalized).
hum: The humidity level (normalized).
windspeed: The windspeed (normalized).
rentals: The number of bicycle rentals recorded.
3 Extracción de los días
La columna ‘dteday’ contiene la fecha en la que se levantó cada observación. Cada fila correpsonde a un día diferente. Se va a extraer el número de día del mes a partir de la fecha de levantamiento.
bike_data<-bike_data %>%
mutate(dteday=mdy(dteday)) %>%
mutate(day=day(dteday))4 Selección de las variables relevantes para el análisis
bike_select<-bike_data %>%
select(c(season, mnth, holiday, weekday, workingday, weathersit, temp,
atemp, hum, windspeed, rentals)) %>%
mutate(across(1:6, factor))5 Partición de la base de datos en las submuestras de entrenamiento y prueba
Se utilizará una proporción de 80% para la muestra de entrenamiento y del 20% para la muestra de prueba
set.seed(1234)
bike_split <- bike_select %>%
initial_split(prop = 0.8,strata = holiday) #se hará la partición de manera proporcional de acuerdo a la variable 'holiday'
#generación de las dos submuestras
bike_train <- training(bike_split)
bike_test <- testing(bike_split)6 Especificación de los parámetros de evaluación de los modelos de regresión
En el siguiente chunk se determina el conjunto de medidas de desempeño de los modelos de regresion: la raiz de los errores al cuadrado y el coeficiente de determinacion (R2)
eval_metrics <- metric_set(rmse, rsq) #raiz de los errores al cuadrado y la R cuadrada7 Número de observaciones en cada submuestra
cat("La muestra de entrenamiento tiene ", nrow(bike_train), "observaciones",
"\nLa muestra de prueba tiene ", nrow(bike_test), "observaciones")La muestra de entrenamiento tiene 584 observaciones
La muestra de prueba tiene 147 observaciones
8 Especificación de los modelos de regresión:
8.1 Regresion lineal
Especificación del modelo
lm_spec <-
# tipo de regresion
linear_reg() %>%
# metodo de calculo
set_engine("lm") %>%
# Modo
set_mode("regression")Entrenamiento del modelo
lm_mod <- lm_spec %>%
fit(rentals ~ ., data = bike_train)
# Resultados del modelo
lm_modparsnip model object
Call:
stats::lm(formula = rentals ~ ., data = data)
Coefficients:
(Intercept) season2 season3 season4 mnth2 mnth3
917.567 125.096 -1.973 64.748 -2.643 268.056
mnth4 mnth5 mnth6 mnth7 mnth8 mnth9
297.293 291.410 21.795 -38.721 94.059 267.876
mnth10 mnth11 mnth12 holiday1 weekday1 weekday2
313.275 111.611 -3.491 580.699 -778.240 -826.688
weekday3 weekday4 weekday5 weekday6 workingday1 weathersit2
-856.519 -833.420 -675.972 90.363 NA -57.948
weathersit3 temp atemp hum windspeed
-332.805 2579.758 -586.472 -734.162 -1146.455
Evaluación del modelo entrenado
# Predicciones del modelo sobre la muestra de prueba
pred <- lm_mod %>%
predict(new_data = bike_test)Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
# Vista delas predicciones
pred %>%
slice_head(n = 5)# A tibble: 5 × 1
.pred
<dbl>
1 -23.0
2 -8.62
3 581.
4 473.
5 807.
# Fusión de las predicciones del modelo con los datos reales de la muestra de prueba
results_lin <- bike_test %>%
bind_cols(lm_mod %>%
# Prediccion de las rentas
predict(new_data = bike_test) %>%
rename(predictions = .pred))Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
# Comparación de las predicciones
results_lin %>%
select(c(rentals, predictions)) %>%
slice_head(n = 10)# A tibble: 10 × 2
rentals predictions
<dbl> <dbl>
1 82 -23.0
2 88 -8.62
3 68 581.
4 54 473.
5 251 807.
6 86 -221.
7 34 -800.
8 15 -177.
9 354 901.
10 397 907.
Visualización de los resultados
results_lin %>%
ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(size = 1.5, color = "navyblue") +
# linea de regresion
geom_smooth(method = "lm", se = F, color = 'red', size=1.1) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
`geom_smooth()` using formula = 'y ~ x'

Medidas de desempeño del modelo
(linear_metrics<-eval_metrics(data = results_lin,
truth = rentals,
estimate = predictions))# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 380.
2 rsq standard 0.684
8.2 Modelo LASSO (Least Absolute Shrinkage and Selection Operator)
Especificación del modelo
lasso_spec <- linear_reg(
engine = "glmnet",
mode = "regression",
penalty = 1,
mixture = 1)Entrenamiento del modelo
lasso_mod <- lasso_spec %>%
fit(rentals ~ ., data = bike_train)Evaluación del modelo entrenado
results_lasso <- bike_test %>%
bind_cols(lasso_mod %>% predict(new_data = bike_test) %>%
rename(predictions = .pred))
(lasso_metrics <- eval_metrics(data = results_lasso,
truth = rentals,
estimate = predictions))# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 378.
2 rsq standard 0.687
Visualización de los resultados
theme_set(theme_light())
lasso_plt <- results_lasso %>%
ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(size = 1.5, color = 'darkorchid') +
geom_smooth(method = 'lm', color = 'black', se = F, size=1.1) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))
lasso_plt`geom_smooth()` using formula = 'y ~ x'

8.3 Arbol de Decisión (Decision tree)
Especificación del modelo
tree_spec <- decision_tree(
engine = "rpart",
mode = "regression")Entrenamiento del modelo
tree_mod <- tree_spec %>%
fit(rentals ~ ., data = bike_train)
tree_modparsnip model object
n= 584
node), split, n, deviance, yval
* denotes terminal node
1) root 584 277850900 853.5685
2) temp< 0.41875 223 26321840 375.4798
4) temp< 0.321703 125 3676634 229.8480 *
5) temp>=0.321703 98 16612650 561.2347
10) weekday=1,2,3,4,5 65 6011070 402.4308 *
11) weekday=0,6 33 5733607 874.0303 *
3) temp>=0.41875 361 169072100 1148.8980
6) workingday=1 256 24052250 807.0938
12) hum>=0.7368205 64 5498455 561.9531 *
13) hum< 0.7368205 192 13425780 888.8073 *
7) workingday=0 105 42191810 1982.2480
14) hum>=0.8322915 7 1597155 1200.1430 *
15) hum< 0.8322915 98 36006990 2038.1120
30) atemp>=0.710244 8 1713222 1450.5000 *
31) atemp< 0.710244 90 31285930 2090.3440
62) mnth=8,11 12 4321939 1639.9170 *
63) mnth=3,4,5,6,7,9,10 78 24154810 2159.6410 *
Evaluación del modelo entrenado
results_tree <- tree_mod %>%
augment(new_data = bike_test) %>%
rename(predictions = .pred)
(tree_metrics <- eval_metrics(data = results_tree,
truth = rentals,
estimate = predictions))# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 380.
2 rsq standard 0.680
Visualización del modelo entrenado
tree_plt <- results_tree %>%
ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(color = 'tomato') +
geom_smooth(method = 'lm', color = 'steelblue', se = F) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))
tree_plt`geom_smooth()` using formula = 'y ~ x'

8.4 Random Forest
Especificación del modelo
set.seed(1234)
rf_spec <- rand_forest() %>%
set_engine('randomForest') %>%
set_mode('regression')Entrenamiento del modelo
rf_mod <- rf_spec %>%
fit(rentals ~ ., data = bike_train)
rf_modparsnip model object
Call:
randomForest(x = maybe_data_frame(x), y = y)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 3
Mean of squared residuals: 102528.2
% Var explained: 78.45
Evaluación del modelo entrenado
results_rf <- rf_mod %>%
augment(new_data = bike_test) %>%
rename(predictions = .pred)
(rf_metrics <- eval_metrics(data = results_rf,
truth = rentals,
estimate = predictions))# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 304.
2 rsq standard 0.796
Visualización del modelo entrenado
rf_plt <- results_rf %>%
ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(color = '#6CBE50FF') +
# overlay regression line
geom_smooth(method = 'lm', color = '#2B7FF9FF', se = F) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))
rf_plt`geom_smooth()` using formula = 'y ~ x'

8.5 Modelo Gradient Boost (xgboost)
Especificación del modelo
set.seed(1234)
boost_spec <- boost_tree() %>%
set_engine('xgboost') %>%
set_mode('regression')Entrenamiento del modelo entrenado
boost_mod <- boost_spec %>%
fit(rentals ~ ., data = bike_train)
boost_modparsnip model object
##### xgb.Booster
raw: 58 Kb
call:
xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0,
colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1,
subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist,
verbose = 0, nthread = 1, objective = "reg:squarederror")
params (as set within xgb.train):
eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", nthread = "1", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 34
niter: 15
nfeatures : 34
evaluation_log:
iter training_rmse
1 801.8523
2 597.8338
---
14 123.0786
15 118.6168
Evaluación dle modelo entrenado
results_boost <- boost_mod %>%
augment(new_data = bike_test) %>%
rename(predictions = .pred)
(boost_metrics <- eval_metrics(data = results_boost,
truth = rentals,
estimate = predictions))# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 320.
2 rsq standard 0.776
Visualización del modelo entrenado
boost_plt <- results_boost %>%
ggplot(mapping = aes(x = rentals, y = predictions)) +
geom_point(color = 'springgreen2', size=1.9) +
geom_smooth(method = 'lm', color = 'green4', se = F) +
ggtitle("Predicción de las rentas diarias de bicicletas") +
xlab("Rentas reales") +
ylab("Rentas estimadas") +
theme(plot.title = element_text(hjust = 0.5))
boost_plt`geom_smooth()` using formula = 'y ~ x'

9 Resumen de resultados
summary<-cbind(
round(linear_metrics[3],2),
round(lasso_metrics[3],2),
round(tree_metrics[3],2),
round(rf_metrics[3],2),
round(boost_metrics[3],2))
rownames(summary)<-c("RMSE","R2")
colnames(summary)<-c("Modelo Lineal", "Modelo LASSO", "Arbol de Decisión", "Random Forest", "Gradient Boost")
datatable(summary) %>%
formatStyle(columns="Modelo Lineal", backgroundColor = "deepskyblue") %>%
formatStyle(columns="Modelo LASSO", backgroundColor = "aliceblue") %>%
formatStyle(columns="Arbol de Decisión", backgroundColor = "deepskyblue") %>%
formatStyle(columns="Random Forest", backgroundColor = "aliceblue") %>%
formatStyle(columns="Gradient Boost", backgroundColor = "deepskyblue")