Boosted Ensembles#

Continuing our work with classifiers and ensembles, today we introduce boosted models: a powerful family of tree-based models whose members learn in sequence, each new tree correcting the errors of the ensemble built so far.

# !pip install xgboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
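To make "learn in sequence" concrete, here is a minimal hand-rolled sketch of gradient boosting for squared error on synthetic data. It is illustrative only (real libraries add regularization, subsampling, and fast histogram-based split finding): each round fits a small tree to the current residuals, then adds a damped copy of its predictions to the ensemble.

from sklearn.tree import DecisionTreeRegressor

# Synthetic toy data -- a noisy sine wave
rng = np.random.default_rng(0)
X_toy = rng.uniform(0, 6, size=(200, 1))
y_toy = np.sin(X_toy.ravel()) + rng.normal(scale=0.2, size=200)

prediction = np.full_like(y_toy, y_toy.mean())  # stage 0: predict the mean
learning_rate = 0.1
trees = []
for _ in range(100):
    residuals = y_toy - prediction              # what the ensemble still gets wrong
    tree = DecisionTreeRegressor(max_depth=2).fit(X_toy, residuals)
    prediction += learning_rate * tree.predict(X_toy)  # small corrective step
    trees.append(tree)

print(np.mean((y_toy - prediction) ** 2))       # training MSE shrinks as trees accumulate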

Gradient Boosted Models#

from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import fetch_california_housing, load_breast_cancer
from sklearn.datasets import fetch_openml
bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True).frame
bikes.head()
season year month hour holiday weekday workingday weather temp feel_temp humidity windspeed count
0 spring 0 1 0 False 6 False clear 9.84 14.395 0.81 0.0 16
1 spring 0 1 1 False 6 False clear 9.02 13.635 0.80 0.0 40
2 spring 0 1 2 False 6 False clear 9.02 13.635 0.80 0.0 32
3 spring 0 1 3 False 6 False clear 9.84 14.395 0.75 0.0 13
4 spring 0 1 4 False 6 False clear 9.84 14.395 0.75 0.0 1
df = bikes[bikes['year'] == 0]
fig, ax = plt.subplots(figsize = (20, 5))
average_bike_rentals = df.groupby(
    ["year", "season", "weekday", "hour"], observed=True
).mean(numeric_only=True)["count"]
average_bike_rentals.groupby(['season'], observed = True).plot(legend = True)
plt.xticks([], [])
plt.xlabel('Year 0 (training data)')
plt.grid();
[Figure: average hourly bike rentals by season, year 0 (training data)]
bikes.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      17379 non-null  category
 1   year        17379 non-null  int64   
 2   month       17379 non-null  int64   
 3   hour        17379 non-null  int64   
 4   holiday     17379 non-null  category
 5   weekday     17379 non-null  int64   
 6   workingday  17379 non-null  category
 7   weather     17379 non-null  category
 8   temp        17379 non-null  float64 
 9   feel_temp   17379 non-null  float64 
 10  humidity    17379 non-null  float64 
 11  windspeed   17379 non-null  float64 
 12  count       17379 non-null  int64   
dtypes: category(4), float64(4), int64(5)
memory usage: 1.3 MB
X = bikes.drop(columns = 'count')
y = bikes['count']
train = bikes[bikes['year'] == 0]
test = bikes[bikes['year'] == 1]
X_train = train.drop(columns = 'count')
y_train = train['count']
X_test = test.drop(columns = 'count')
y_test = test['count']
reg = XGBRegressor(enable_categorical=True)
reg.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=True, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
reg.score(X_train, y_train)
0.9789959788322449
reg.score(X_test, y_test)
0.6463150978088379
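The imports above also included scikit-learn's HistGradientBoostingRegressor, a histogram-based booster in the same family. A quick comparison sketch; passing categorical_features='from_dtype' to infer categorical columns from the pandas dtypes assumes a recent scikit-learn version:

hgb = HistGradientBoostingRegressor(categorical_features='from_dtype', random_state=0)
hgb.fit(X_train, y_train)
print(hgb.score(X_train, y_train), hgb.score(X_test, y_test))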

PROBLEM#

The model fits the training year almost perfectly (train R² ≈ 0.98) but drops sharply on the held-out year (test R² ≈ 0.65), a combination of overfitting and year-over-year shift in demand. What should we do if we want to improve performance? See the parameter-tuning notes in the XGBoost documentation.
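One starting point is a small randomized search over the usual knobs (number of trees, learning rate, tree depth, subsampling, regularization). This is a sketch: the grid values are illustrative rather than tuned, and plain K-fold cross-validation ignores the time ordering of this data, so a time-aware split would be more faithful.

from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 1.0],
    'reg_lambda': [1.0, 10.0],
}
search = RandomizedSearchCV(
    XGBRegressor(enable_categorical=True),
    param_dist,
    n_iter=10,
    cv=5,
    random_state=0,
)
search.fit(X_train, y_train)
print(search.best_params_)
print(search.score(X_test, y_test))  # R^2 on the held-out year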

Inspecting Results#

reg.feature_importances_
array([0.06592979, 0.        , 0.03411739, 0.27398434, 0.01827372,
       0.0154194 , 0.28702936, 0.06249612, 0.08871585, 0.12923817,
       0.01873765, 0.00605831], dtype=float32)
pd.DataFrame({'feature': X_train.columns.tolist(), 'importance': reg.feature_importances_}).sort_values(by = 'importance', ascending = False)
feature importance
6 workingday 0.287029
3 hour 0.273984
9 feel_temp 0.129238
8 temp 0.088716
0 season 0.065930
7 weather 0.062496
2 month 0.034117
10 humidity 0.018738
4 holiday 0.018274
5 weekday 0.015419
11 windspeed 0.006058
1 year 0.000000
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
X_test.head()
season year month hour holiday weekday workingday weather temp feel_temp humidity windspeed
8645 spring 1 1 0 False 0 False clear 14.76 18.940 0.66 0.0000
8646 spring 1 1 1 False 0 False clear 14.76 17.425 0.66 8.9981
8647 spring 1 1 2 False 0 False clear 13.12 17.425 0.76 0.0000
8648 spring 1 1 3 False 0 False clear 12.30 16.665 0.81 0.0000
8649 spring 1 1 4 False 0 False clear 11.48 15.150 0.81 6.0032
fig, ax = plt.subplots(figsize = (20, 5))
PartialDependenceDisplay.from_estimator(reg, X_test, features = ['hour', 'temp', 'windspeed', 'humidity'], n_cols = 4, ax = ax)
<sklearn.inspection._plot.partial_dependence.PartialDependenceDisplay at 0x16099b680>
[Figure: one-way partial dependence of predicted rentals on hour, temp, windspeed, and humidity]
fig, ax = plt.subplots(figsize = (20, 5))
PartialDependenceDisplay.from_estimator(reg, X_test, features = ['hour', 'temp', 'windspeed', 'humidity'],
                                        kind = 'both',
                                        n_cols = 4, ax = ax,
                                        subsample = 30)
<sklearn.inspection._plot.partial_dependence.PartialDependenceDisplay at 0x1775c7200>
[Figure: partial dependence with individual ICE curves (kind='both') for hour, temp, windspeed, and humidity]

A partial dependence plot shows the average prediction as one feature is varied while the others are held at their observed values; the ICE curves in the second figure trace that same sweep for individual samples. We can also explore two-variable PDP plots to understand interactions between features. Note the wiggles in the ICE curves above for temperature and humidity, which hint that their effects depend on other features.

# Slower to plot the 2-way PDP
# PartialDependenceDisplay.from_estimator(reg, X_train, [('temp', 'humidity')], subsample = 50, n_jobs = -1, grid_resolution=20, kind = 'average' )

Permutation Feature Importance#

Permute each feature's values individually while holding the others fixed, and measure the resulting drop in the model's score; a large drop means the model relied heavily on that feature.
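To see what a single permutation repeat does, here is the computation by hand (illustrative; 'temp' is just an example column):

rng = np.random.default_rng(0)
X_perm = X_train.copy()
X_perm['temp'] = rng.permutation(X_perm['temp'].to_numpy())  # scramble one column
print(reg.score(X_train, y_train) - reg.score(X_perm, y_train))  # drop in R^2 = importance of temp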

results = permutation_importance(reg, X_train, y_train, n_repeats=10)
results
{'importances_mean': array([0.06408446, 0.        , 0.05608611, 1.38907729, 0.00536676,
        0.03095871, 0.26588249, 0.03003684, 0.09342833, 0.06718443,
        0.09206399, 0.01775954]),
 'importances_std': array([0.00146776, 0.        , 0.00142307, 0.02330278, 0.00026406,
        0.00068182, 0.00639445, 0.00058722, 0.00220706, 0.00162217,
        0.00358808, 0.0007023 ]),
 'importances': array([[0.06300265, 0.06402278, 0.06463879, 0.06582814, 0.06204641,
         0.06684893, 0.06517816, 0.06249613, 0.06286907, 0.06391352],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.05552417, 0.05325413, 0.05561483, 0.05599797, 0.05715764,
         0.05666196, 0.05552161, 0.05509174, 0.05717289, 0.05886412],
        [1.39215404, 1.38278419, 1.39891392, 1.43035001, 1.39482099,
         1.3678748 , 1.39885134, 1.41269845, 1.36795563, 1.34436959],
        [0.00556594, 0.00533265, 0.00586772, 0.00505203, 0.00564623,
         0.005108  , 0.00524527, 0.00502574, 0.00529748, 0.00552654],
        [0.03143972, 0.03074974, 0.03096807, 0.03105378, 0.03102279,
         0.03210676, 0.03005022, 0.0318892 , 0.02998865, 0.03031814],
        [0.26693612, 0.27388686, 0.26138693, 0.26471162, 0.25932431,
         0.27060401, 0.25526071, 0.27479178, 0.26027721, 0.27164531],
        [0.03051323, 0.02920526, 0.03001463, 0.03068423, 0.02960652,
         0.02993691, 0.03099948, 0.02955091, 0.0305323 , 0.02932495],
        [0.0908969 , 0.09123766, 0.09405136, 0.09543508, 0.09107119,
         0.09412605, 0.09672433, 0.0918116 , 0.09207493, 0.09685421],
        [0.06753367, 0.06542045, 0.06594509, 0.06789696, 0.06580669,
         0.06691229, 0.06712925, 0.06528443, 0.06977129, 0.07014418],
        [0.09755111, 0.0880959 , 0.08526027, 0.08980298, 0.09112072,
         0.09527755, 0.09177947, 0.09575665, 0.09144682, 0.0945484 ],
        [0.01812655, 0.01650363, 0.01809645, 0.01843345, 0.01672405,
         0.01816964, 0.01718265, 0.01865005, 0.01822078, 0.01748812]])}
pd.DataFrame(results['importances'], index = X_train.columns.tolist()).T.plot(kind = 'box', vert = False)
<Axes: >
[Figure: box plots of permutation importances for each feature]
pd.DataFrame(results['importances_mean'], index = X_train.columns.tolist()).sort_values(0, ascending = False)
0
hour 1.389077
workingday 0.265882
temp 0.093428
humidity 0.092064
feel_temp 0.067184
season 0.064084
month 0.056086
weekday 0.030959
weather 0.030037
windspeed 0.017760
holiday 0.005367
year 0.000000
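The importances above were computed on the training data. Since year 1 is shifted relative to year 0, it can be informative to repeat the computation on the held-out year; a quick sketch:

test_results = permutation_importance(reg, X_test, y_test, n_repeats=10, random_state=0)
pd.Series(test_results['importances_mean'], index = X_test.columns).sort_values(ascending = False)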

Comparing Boosted Models#

Returning to the food marketing data from last class:

  • Can you tune a boosted model to do better than the baseline and optimize towards precision? (A starter sketch follows the train/test split below.)

  • What features were important in making these predictions, and how would you suggest targeting future customers?

food_marketing = pd.read_csv('https://raw.githubusercontent.com/jfkoehler/nyu_bootcamp_fa25/refs/heads/main/data/food_data.csv')
food_marketing.head()
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
0 5524 1957 Graduation Single 58138.0 0 0 2012-09-04 58 635 88 546 172 88 88 3 8 10 4 7 0 0 0 0 0 0 3 11 1
1 2174 1954 Graduation Single 46344.0 1 1 2014-03-08 38 11 1 6 2 1 6 2 1 1 2 5 0 0 0 0 0 0 3 11 0
2 4141 1965 Graduation Together 71613.0 0 0 2013-08-21 26 426 49 127 111 21 42 1 8 2 10 4 0 0 0 0 0 0 3 11 0
3 6182 1984 Graduation Together 26646.0 1 0 2014-02-10 26 11 4 20 10 3 5 2 2 0 4 6 0 0 0 0 0 0 3 11 0
4 5324 1981 PhD Married 58293.0 1 0 2014-01-19 94 173 43 118 46 27 15 5 5 3 6 5 0 0 0 0 0 0 3 11 0
food_marketing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   int64  
 16  NumWebPurchases      2240 non-null   int64  
 17  NumCatalogPurchases  2240 non-null   int64  
 18  NumStorePurchases    2240 non-null   int64  
 19  NumWebVisitsMonth    2240 non-null   int64  
 20  AcceptedCmp3         2240 non-null   int64  
 21  AcceptedCmp4         2240 non-null   int64  
 22  AcceptedCmp5         2240 non-null   int64  
 23  AcceptedCmp1         2240 non-null   int64  
 24  AcceptedCmp2         2240 non-null   int64  
 25  Complain             2240 non-null   int64  
 26  Z_CostContact        2240 non-null   int64  
 27  Z_Revenue            2240 non-null   int64  
 28  Response             2240 non-null   int64  
dtypes: float64(1), int64(25), object(3)
memory usage: 507.6+ KB
food_marketing[['Education', 'Marital_Status']] = food_marketing[['Education', 'Marital_Status']].astype('category')
food_marketing.drop(columns = 'Dt_Customer', inplace = True)
X = food_marketing.drop(columns = 'Response')
y = food_marketing['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=10)
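A starter sketch for the first question: a small grid search scored on precision, as asked. The grid values are illustrative rather than tuned, and columns like ID and the Z_ fields (which look constant in the sample above) are probably worth dropping before fitting.

from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 300],
    'learning_rate': [0.05, 0.1],
    'max_depth': [2, 4, 6],
}
clf_search = GridSearchCV(
    XGBClassifier(enable_categorical=True),
    params,
    scoring='precision',  # optimize toward precision, per the prompt
    cv=5,
)
clf_search.fit(X_train, y_train)
print(clf_search.best_params_, clf_search.best_score_)
print(clf_search.score(X_test, y_test))  # precision on the held-out split

For the second question, the same inspection tools from above apply, e.g. pd.Series(clf_search.best_estimator_.feature_importances_, index = X_train.columns).sort_values(ascending = False).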

PROBLEM#

Below, a dataset from an email marketing campaign is loaded and displayed. Explore the description of the task (this is Kevin Hillstrom's MineThatData E-Mail Analytics And Data Mining Challenge, the source of the dataset) and consider a strategy for model building and evaluation; one possible approach is sketched after the data loading below.

# !pip install scikit-uplift
from sklift.datasets import fetch_hillstrom
dataset = fetch_hillstrom(target_col='conversion')
data, target, treatment = dataset.data, dataset.target, dataset.treatment
data.head()
recency history_segment history mens womens zip_code newbie channel
0 10 2) $100 - $200 142.44 1 0 Surburban 0 Phone
1 6 3) $200 - $350 329.08 1 1 Rural 1 Web
2 7 2) $100 - $200 180.65 0 1 Surburban 1 Web
3 9 5) $500 - $750 675.83 1 0 Rural 1 Web
4 2 1) $0 - $100 45.34 1 0 Urban 0 Web
target.head()
0    0
1    0
2    0
3    0
4    0
Name: conversion, dtype: int64
treatment.head()
0    Womens E-Mail
1        No E-Mail
2    Womens E-Mail
3      Mens E-Mail
4    Womens E-Mail
Name: segment, dtype: object
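One possible strategy, sketched below: because customers were randomly assigned to treatments, this is an uplift-modeling problem; we want to predict the difference a mailing makes to conversion, not conversion itself. The sketch binarizes the treatment (any e-mail vs. none) and uses scikit-uplift's SoloModel, which adds the treatment flag as a feature of a single classifier; the encoding, model, and metric here are one choice among several.

from sklift.models import SoloModel
from sklift.metrics import uplift_at_k

# Binarize: did the customer receive any e-mail?
treat_binary = (treatment != 'No E-Mail').astype(int)

# One-hot encode the object columns so the booster sees numeric input
data_enc = pd.get_dummies(data, columns=['history_segment', 'zip_code', 'channel'])

X_tr, X_te, y_tr, y_te, t_tr, t_te = train_test_split(
    data_enc, target, treat_binary, stratify=treat_binary, random_state=10
)

uplift_model = SoloModel(XGBClassifier())  # single model, treatment as an extra feature
uplift_model.fit(X_tr, y_tr, t_tr)
uplift_scores = uplift_model.predict(X_te)  # estimated uplift per customer

# Average uplift among the 30% of customers the model ranks highest
print(uplift_at_k(y_te, uplift_scores, t_te, strategy='overall', k=0.3))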