Boosted Ensembles#

Continuing our work with classifiers and ensembles, today we introduce boosted models: a powerful family of tree-based models whose members learn in sequence, each new tree correcting the errors of the ensemble built so far.

# !pip install xgboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
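To make "learn in sequence" concrete, here is a minimal hand-rolled sketch of gradient boosting for squared error on synthetic data. It is illustrative only (real libraries add regularization, subsampling, and fast histogram-based split finding): each round fits a small tree to the current residuals, then adds a damped copy of its predictions to the ensemble.

from sklearn.tree import DecisionTreeRegressor

# Synthetic toy data -- a noisy sine wave
rng = np.random.default_rng(0)
X_toy = rng.uniform(0, 6, size=(200, 1))
y_toy = np.sin(X_toy.ravel()) + rng.normal(scale=0.2, size=200)

prediction = np.full_like(y_toy, y_toy.mean())  # stage 0: predict the mean
learning_rate = 0.1
trees = []
for _ in range(100):
    residuals = y_toy - prediction              # what the ensemble still gets wrong
    tree = DecisionTreeRegressor(max_depth=2).fit(X_toy, residuals)
    prediction += learning_rate * tree.predict(X_toy)  # small corrective step
    trees.append(tree)

print(np.mean((y_toy - prediction) ** 2))       # training MSE shrinks as trees accumulate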

Gradient Boosted Models#

from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import fetch_california_housing, load_breast_cancer
from sklearn.datasets import fetch_openml
bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True).frame
bikes.head()
season year month hour holiday weekday workingday weather temp feel_temp humidity windspeed count
0 spring 0 1 0 False 6 False clear 9.84 14.395 0.81 0.0 16
1 spring 0 1 1 False 6 False clear 9.02 13.635 0.80 0.0 40
2 spring 0 1 2 False 6 False clear 9.02 13.635 0.80 0.0 32
3 spring 0 1 3 False 6 False clear 9.84 14.395 0.75 0.0 13
4 spring 0 1 4 False 6 False clear 9.84 14.395 0.75 0.0 1
df = bikes[bikes['year'] == 0]
fig, ax = plt.subplots(figsize = (20, 5))
average_bike_rentals = df.groupby(
    ["year", "season", "weekday", "hour"], observed=True
).mean(numeric_only=True)["count"]
average_bike_rentals.groupby(['season'], observed = True).plot(legend = True)
plt.xticks([], [])
plt.xlabel('Year 0 (training data)')
plt.grid();
[Figure: average hourly bike rentals by season, year 0 (training data)]
bikes.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      17379 non-null  category
 1   year        17379 non-null  int64   
 2   month       17379 non-null  int64   
 3   hour        17379 non-null  int64   
 4   holiday     17379 non-null  category
 5   weekday     17379 non-null  int64   
 6   workingday  17379 non-null  category
 7   weather     17379 non-null  category
 8   temp        17379 non-null  float64 
 9   feel_temp   17379 non-null  float64 
 10  humidity    17379 non-null  float64 
 11  windspeed   17379 non-null  float64 
 12  count       17379 non-null  int64   
dtypes: category(4), float64(4), int64(5)
memory usage: 1.3 MB
X = bikes.drop(columns = 'count')
y = bikes['count']
train = bikes[bikes['year'] == 0]
test = bikes[bikes['year'] == 1]
X_train = train.drop(columns = 'count')
y_train = train['count']
X_test = test.drop(columns = 'count')
y_test = test['count']
reg = XGBRegressor(enable_categorical=True)
reg.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=True, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
reg.score(X_train, y_train)
0.9789959788322449
reg.score(X_test, y_test)
0.6463150978088379
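The imports above also included scikit-learn's HistGradientBoostingRegressor, a histogram-based booster in the same family. A quick comparison sketch; passing categorical_features='from_dtype' to infer categorical columns from the pandas dtypes assumes a recent scikit-learn version:

hgb = HistGradientBoostingRegressor(categorical_features='from_dtype', random_state=0)
hgb.fit(X_train, y_train)
print(hgb.score(X_train, y_train), hgb.score(X_test, y_test))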

PROBLEM#

The model fits the training year almost perfectly (train R² ≈ 0.98) but drops sharply on the held-out year (test R² ≈ 0.65), a combination of overfitting and year-over-year shift in demand. What should we do if we want to improve performance? See the parameter-tuning notes in the XGBoost documentation.
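One starting point is a small randomized search over the usual knobs (number of trees, learning rate, tree depth, subsampling, regularization). This is a sketch: the grid values are illustrative rather than tuned, and plain K-fold cross-validation ignores the time ordering of this data, so a time-aware split would be more faithful.

from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 1.0],
    'reg_lambda': [1.0, 10.0],
}
search = RandomizedSearchCV(
    XGBRegressor(enable_categorical=True),
    param_dist,
    n_iter=10,
    cv=5,
    random_state=0,
)
search.fit(X_train, y_train)
print(search.best_params_)
print(search.score(X_test, y_test))  # R^2 on the held-out year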

Inspecting Results#

reg.feature_importances_
array([0.06592979, 0.        , 0.03411739, 0.27398434, 0.01827372,
       0.0154194 , 0.28702936, 0.06249612, 0.08871585, 0.12923817,
       0.01873765, 0.00605831], dtype=float32)
pd.DataFrame({'feature': X_train.columns.tolist(), 'importance': reg.feature_importances_}).sort_values(by = 'importance', ascending = False)
feature importance
6 workingday 0.287029
3 hour 0.273984
9 feel_temp 0.129238
8 temp 0.088716
0 season 0.065930
7 weather 0.062496
2 month 0.034117
10 humidity 0.018738
4 holiday 0.018274
5 weekday 0.015419
11 windspeed 0.006058
1 year 0.000000
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
X_test.head()
season year month hour holiday weekday workingday weather temp feel_temp humidity windspeed
8645 spring 1 1 0 False 0 False clear 14.76 18.940 0.66 0.0000
8646 spring 1 1 1 False 0 False clear 14.76 17.425 0.66 8.9981
8647 spring 1 1 2 False 0 False clear 13.12 17.425 0.76 0.0000
8648 spring 1 1 3 False 0 False clear 12.30 16.665 0.81 0.0000
8649 spring 1 1 4 False 0 False clear 11.48 15.150 0.81 6.0032
fig, ax = plt.subplots(figsize = (20, 5))
PartialDependenceDisplay.from_estimator(reg, X_test, features = ['hour', 'temp', 'windspeed', 'humidity'], n_cols = 4, ax = ax)
<sklearn.inspection._plot.partial_dependence.PartialDependenceDisplay at 0x16099b680>
[Figure: one-way partial dependence of predicted rentals on hour, temp, windspeed, and humidity]
fig, ax = plt.subplots(figsize = (20, 5))
PartialDependenceDisplay.from_estimator(reg, X_test, features = ['hour', 'temp', 'windspeed', 'humidity'],
                                        kind = 'both',
                                        n_cols = 4, ax = ax,
                                        subsample = 30)
<sklearn.inspection._plot.partial_dependence.PartialDependenceDisplay at 0x1775c7200>
[Figure: partial dependence with individual ICE curves (kind='both') for hour, temp, windspeed, and humidity]

A partial dependence plot shows the average prediction as one feature is varied while the others are held at their observed values; the ICE curves in the second figure trace that same sweep for individual samples. We can also explore two-variable PDP plots to understand interactions between features. Note the wiggles in the ICE curves above for temperature and humidity, which hint that their effects depend on other features.

# Slower to plot the 2-way PDP
# PartialDependenceDisplay.from_estimator(reg, X_train, [('temp', 'humidity')], subsample = 50, n_jobs = -1, grid_resolution=20, kind = 'average' )

Permutation Feature Importance#

Permute each feature's values individually while holding the others fixed, and measure the resulting drop in the model's score; a large drop means the model relied heavily on that feature.
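To see what a single permutation repeat does, here is the computation by hand (illustrative; 'temp' is just an example column):

rng = np.random.default_rng(0)
X_perm = X_train.copy()
X_perm['temp'] = rng.permutation(X_perm['temp'].to_numpy())  # scramble one column
print(reg.score(X_train, y_train) - reg.score(X_perm, y_train))  # drop in R^2 = importance of temp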

results = permutation_importance(reg, X_train, y_train, n_repeats=10)
results
{'importances_mean': array([0.06408446, 0.        , 0.05608611, 1.38907729, 0.00536676,
        0.03095871, 0.26588249, 0.03003684, 0.09342833, 0.06718443,
        0.09206399, 0.01775954]),
 'importances_std': array([0.00146776, 0.        , 0.00142307, 0.02330278, 0.00026406,
        0.00068182, 0.00639445, 0.00058722, 0.00220706, 0.00162217,
        0.00358808, 0.0007023 ]),
 'importances': array([[0.06300265, 0.06402278, 0.06463879, 0.06582814, 0.06204641,
         0.06684893, 0.06517816, 0.06249613, 0.06286907, 0.06391352],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.05552417, 0.05325413, 0.05561483, 0.05599797, 0.05715764,
         0.05666196, 0.05552161, 0.05509174, 0.05717289, 0.05886412],
        [1.39215404, 1.38278419, 1.39891392, 1.43035001, 1.39482099,
         1.3678748 , 1.39885134, 1.41269845, 1.36795563, 1.34436959],
        [0.00556594, 0.00533265, 0.00586772, 0.00505203, 0.00564623,
         0.005108  , 0.00524527, 0.00502574, 0.00529748, 0.00552654],
        [0.03143972, 0.03074974, 0.03096807, 0.03105378, 0.03102279,
         0.03210676, 0.03005022, 0.0318892 , 0.02998865, 0.03031814],
        [0.26693612, 0.27388686, 0.26138693, 0.26471162, 0.25932431,
         0.27060401, 0.25526071, 0.27479178, 0.26027721, 0.27164531],
        [0.03051323, 0.02920526, 0.03001463, 0.03068423, 0.02960652,
         0.02993691, 0.03099948, 0.02955091, 0.0305323 , 0.02932495],
        [0.0908969 , 0.09123766, 0.09405136, 0.09543508, 0.09107119,
         0.09412605, 0.09672433, 0.0918116 , 0.09207493, 0.09685421],
        [0.06753367, 0.06542045, 0.06594509, 0.06789696, 0.06580669,
         0.06691229, 0.06712925, 0.06528443, 0.06977129, 0.07014418],
        [0.09755111, 0.0880959 , 0.08526027, 0.08980298, 0.09112072,
         0.09527755, 0.09177947, 0.09575665, 0.09144682, 0.0945484 ],
        [0.01812655, 0.01650363, 0.01809645, 0.01843345, 0.01672405,
         0.01816964, 0.01718265, 0.01865005, 0.01822078, 0.01748812]])}
pd.DataFrame(results['importances'], index = X_train.columns.tolist()).T.plot(kind = 'box', vert = False)
<Axes: >
[Figure: box plots of permutation importances for each feature]
pd.DataFrame(results['importances_mean'], index = X_train.columns.tolist()).sort_values(0, ascending = False)
0
hour 1.389077
workingday 0.265882
temp 0.093428
humidity 0.092064
feel_temp 0.067184
season 0.064084
month 0.056086
weekday 0.030959
weather 0.030037
windspeed 0.017760
holiday 0.005367
year 0.000000
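The importances above were computed on the training data. Since year 1 is shifted relative to year 0, it can be informative to repeat the computation on the held-out year; a quick sketch:

test_results = permutation_importance(reg, X_test, y_test, n_repeats=10, random_state=0)
pd.Series(test_results['importances_mean'], index = X_test.columns).sort_values(ascending = False)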

Comparing Boosted Models#

Returning to the food marketing data from last class:

  • Can you tune a boosted model to do better than the baseline and optimize towards precision? (A starter sketch follows the train/test split below.)

  • What features were important in making these predictions, and how would you suggest targeting future customers?

food_marketing = pd.read_csv('https://raw.githubusercontent.com/jfkoehler/nyu_bootcamp_fa25/refs/heads/main/data/food_data.csv')
food_marketing.head()
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
0 5524 1957 Graduation Single 58138.0 0 0 2012-09-04 58 635 88 546 172 88 88 3 8 10 4 7 0 0 0 0 0 0 3 11 1
1 2174 1954 Graduation Single 46344.0 1 1 2014-03-08 38 11 1 6 2 1 6 2 1 1 2 5 0 0 0 0 0 0 3 11 0
2 4141 1965 Graduation Together 71613.0 0 0 2013-08-21 26 426 49 127 111 21 42 1 8 2 10 4 0 0 0 0 0 0 3 11 0
3 6182 1984 Graduation Together 26646.0 1 0 2014-02-10 26 11 4 20 10 3 5 2 2 0 4 6 0 0 0 0 0 0 3 11 0
4 5324 1981 PhD Married 58293.0 1 0 2014-01-19 94 173 43 118 46 27 15 5 5 3 6 5 0 0 0 0 0 0 3 11 0
food_marketing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   int64  
 16  NumWebPurchases      2240 non-null   int64  
 17  NumCatalogPurchases  2240 non-null   int64  
 18  NumStorePurchases    2240 non-null   int64  
 19  NumWebVisitsMonth    2240 non-null   int64  
 20  AcceptedCmp3         2240 non-null   int64  
 21  AcceptedCmp4         2240 non-null   int64  
 22  AcceptedCmp5         2240 non-null   int64  
 23  AcceptedCmp1         2240 non-null   int64  
 24  AcceptedCmp2         2240 non-null   int64  
 25  Complain             2240 non-null   int64  
 26  Z_CostContact        2240 non-null   int64  
 27  Z_Revenue            2240 non-null   int64  
 28  Response             2240 non-null   int64  
dtypes: float64(1), int64(25), object(3)
memory usage: 507.6+ KB
food_marketing[['Education', 'Marital_Status']] = food_marketing[['Education', 'Marital_Status']].astype('category')
food_marketing.drop(columns = 'Dt_Customer', inplace = True)
X = food_marketing.drop(columns = 'Response')
y = food_marketing['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=10)
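A starter sketch for the first question: a small grid search scored on precision, as asked. The grid values are illustrative rather than tuned, and columns like ID and the Z_ fields (which look constant in the sample above) are probably worth dropping before fitting.

from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 300],
    'learning_rate': [0.05, 0.1],
    'max_depth': [2, 4, 6],
}
clf_search = GridSearchCV(
    XGBClassifier(enable_categorical=True),
    params,
    scoring='precision',  # optimize toward precision, per the prompt
    cv=5,
)
clf_search.fit(X_train, y_train)
print(clf_search.best_params_, clf_search.best_score_)
print(clf_search.score(X_test, y_test))  # precision on the held-out split

For the second question, the same inspection tools from above apply, e.g. pd.Series(clf_search.best_estimator_.feature_importances_, index = X_train.columns).sort_values(ascending = False).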

PROBLEM#

Below, a dataset from an email marketing campaign is loaded and displayed. Explore the description of the task (this is Kevin Hillstrom's MineThatData E-Mail Analytics And Data Mining Challenge, the source of the dataset) and consider a strategy for model building and evaluation; one possible approach is sketched after the data loading below.

# !pip install scikit-uplift
from sklift.datasets import fetch_hillstrom
dataset = fetch_hillstrom(target_col='conversion')
data, target, treatment = dataset.data, dataset.target, dataset.treatment
data.head()
recency history_segment history mens womens zip_code newbie channel
0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone
1 6 3) $200 - $350 329.08 1 1 Rural 1 Web
2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web
3 9 5) $500 - $750 675.83 1 0 Rural 1 Web
4 2 1) $0 - $100 45.34 1 0 Urban 0 Web
target.head()
0    0
1    0
2    0
3    0
4    0
Name: conversion, dtype: int64
treatment.head()
0    Womens E-Mail
1        No E-Mail
2    Womens E-Mail
3      Mens E-Mail
4    Womens E-Mail
Name: segment, dtype: object
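One possible strategy, sketched below: because customers were randomly assigned to treatments, this is an uplift-modeling problem; we want to predict the difference a mailing makes to conversion, not conversion itself. The sketch binarizes the treatment (any e-mail vs. none) and uses scikit-uplift's SoloModel, which adds the treatment flag as a feature of a single classifier; the encoding, model, and metric here are one choice among several.

from sklift.models import SoloModel
from sklift.metrics import uplift_at_k

# Binarize: did the customer receive any e-mail?
treat_binary = (treatment != 'No E-Mail').astype(int)

# One-hot encode the object columns so the booster sees numeric input
data_enc = pd.get_dummies(data, columns=['history_segment', 'zip_code', 'channel'])

X_tr, X_te, y_tr, y_te, t_tr, t_te = train_test_split(
    data_enc, target, treat_binary, stratify=treat_binary, random_state=10
)

uplift_model = SoloModel(XGBClassifier())  # single model, treatment as an extra feature
uplift_model.fit(X_tr, y_tr, t_tr)
uplift_scores = uplift_model.predict(X_te)  # estimated uplift per customer

# Average uplift among the 30% of customers the model ranks highest
print(uplift_at_k(y_te, uplift_scores, t_te, strategy='overall', k=0.3))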