Introduction to Decision Trees#

OBJECTIVES

  • Understand how a decision tree is built for classification and regression

  • Fit decision tree models using scikit-learn

  • Compare and evaluate classifiers

import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.inspection import DecisionBoundaryDisplay

Homework

from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

from sklearn import set_config
set_config(transform_output="pandas")
kidney = fetch_openml(data_id=42972).frame
kidney.head()
id age bp sg al su rbc pc pcc ba ... pcv wc rc htn dm cad appet pe ane classification
0 0 48.0 80.0 1.020 1.0 0.0 NaN normal notpresent notpresent ... 44 7800 5.2 yes yes no good no no ckd
1 1 7.0 50.0 1.020 4.0 0.0 NaN normal notpresent notpresent ... 38 6000 NaN no no no good no no ckd
2 2 62.0 80.0 1.010 2.0 3.0 normal normal notpresent notpresent ... 31 7500 NaN no yes no poor no yes ckd
3 3 48.0 70.0 1.005 4.0 0.0 normal abnormal present notpresent ... 32 6700 3.9 yes no no poor yes yes ckd
4 4 51.0 80.0 1.010 2.0 0.0 normal normal notpresent notpresent ... 35 7300 4.6 no no no good no no ckd

5 rows × 26 columns

kidney.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    object 
 18  rc              270 non-null    object 
 19  htn             398 non-null    object 
 20  dm              398 non-null    object 
 21  cad             398 non-null    object 
 22  appet           399 non-null    object 
 23  pe              399 non-null    object 
 24  ane             399 non-null    object 
 25  classification  400 non-null    object 
dtypes: float64(11), int64(1), object(14)
memory usage: 81.4+ KB
#clean up stray string codes in the count columns and convert to float
kidney['rc'] = kidney['rc'].replace({"'t?'": np.nan, "'t43'": np.nan}).astype('float')
kidney['pcv'] = kidney['pcv'].replace({"'t?'": np.nan, "'t43'": np.nan}).astype('float')
kidney['wc'] = kidney['wc'].replace({"'t6200'": np.nan, "'t8400'": np.nan, "'t?'": np.nan}).astype('float')

kidney.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             328 non-null    float64
 17  wc              293 non-null    float64
 18  rc              269 non-null    float64
 19  htn             398 non-null    object 
 20  dm              398 non-null    object 
 21  cad             398 non-null    object 
 22  appet           399 non-null    object 
 23  pe              399 non-null    object 
 24  ane             399 non-null    object 
 25  classification  400 non-null    object 
dtypes: float64(14), int64(1), object(11)
memory usage: 81.4+ KB
cat_cols = kidney.select_dtypes('object').columns.tolist()
cat_cols = cat_cols[:-1]  # drop the target column 'classification'

num_cols = kidney.select_dtypes('number').columns.tolist()


ohe = OneHotEncoder(sparse_output=False)
imp_num = SimpleImputer(strategy = 'mean')
imp_cat = SimpleImputer(strategy = 'most_frequent')
scale = StandardScaler()

imputer = make_column_transformer((imp_num, num_cols),
                             (imp_cat, cat_cols),
                                verbose_feature_names_out=False,
                             remainder = 'passthrough')
ohe = make_column_transformer((ohe, cat_cols),
                              remainder = scale)


X_train, X_test, y_train, y_test = train_test_split(kidney.iloc[:, :-1], kidney.iloc[:, -1])
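
As a minimal sketch of how these pieces could be chained into a single model (the step names and the max_depth value are illustrative choices, not part of the assignment):

#sketch only: chain the imputation and encoding transformers with a tree;
#`imputer`, `ohe`, and the train/test split come from the cells above
hw_pipe = Pipeline([('impute', imputer),
                    ('encode', ohe),
                    ('model', DecisionTreeClassifier(max_depth=3))])

hw_pipe.fit(X_train, y_train)
hw_pipe.score(X_test, y_test)
#(if the test split contains a category unseen in training, set
# handle_unknown='ignore' on the OneHotEncoder above)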

Customer Churn#

churn = pd.read_csv('https://raw.githubusercontent.com/jfkoehler/nyu_bootcamp_fa24/refs/heads/main/data/cell_phone_churn.csv')
churn.head()
state account_length area_code intl_plan vmail_plan vmail_message day_mins day_calls day_charge eve_mins eve_calls eve_charge night_mins night_calls night_charge intl_mins intl_calls intl_charge custserv_calls churn
0 KS 128 415 no yes 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 False
1 OH 107 415 no yes 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 False
2 NJ 137 415 no no 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 False
3 OH 84 408 yes no 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 False
4 OK 75 415 yes no 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 False

Problem

  • Can you determine a value of day_mins that separates churn from not churn? What about custserv_calls?

plt.figure(figsize = (15, 10))
sns.scatterplot(data = churn, x = 'day_mins', y = 'custserv_calls', hue = 'churn', s = 100)
plt.grid();
[Figure: scatterplot of day_mins vs. custserv_calls, colored by churn]
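
Before fitting a tree, one quick sanity check is to compare churn rates on either side of a candidate cut point. The thresholds below (240 minutes, 3.5 calls) are illustrative guesses:

#churn rate on each side of a hand-picked threshold (illustrative values)
for col, cut in [('day_mins', 240), ('custserv_calls', 3.5)]:
    print(f'churn rate by ({col} > {cut}):')
    print(churn.groupby(churn[col] > cut)['churn'].mean(), '\n')
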
tree = DecisionTreeClassifier(max_depth = 2, criterion='entropy')
X = churn[['day_mins', 'custserv_calls']]
y = churn['churn']
tree.fit(X, y)
DecisionTreeClassifier(criterion='entropy', max_depth=2)
fig, ax = plt.subplots(1, 2, figsize = (20, 5))
DecisionBoundaryDisplay.from_estimator(tree, X, ax = ax[0])
sns.scatterplot(data = X, x = 'day_mins', y = 'custserv_calls', hue = y, ax = ax[0])
ax[0].grid()
ax[0].set_title('Decision Boundary for Tree Model');
plot_tree(tree, feature_names=X.columns,
         filled = True,
         fontsize = 12,
         ax = ax[1],
         rounded = True,
         class_names = ['no churn', 'churn'])
ax[1].set_title('Decision Process for Customer Churn');
[Figure: decision boundary (left) and fitted decision tree (right) for the churn model]

Titanic Dataset#

#load the data
titanic = sns.load_dataset('titanic')
titanic.head(5) #shows first five rows of data
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
#subset the data to binary columns
data = titanic.loc[:4, ['alone', 'adult_male', 'survived']]
data
alone adult_male survived
0 False True 0
1 False False 1
2 True False 1
3 False False 1
4 True True 0

Suppose you want to use a single column to predict whether a passenger survived. Which column does a better job of predicting survival in the sample dataset above?

Entropy#

One way to quantify the quality of a split is a quantity called entropy, defined by:

\[H = - \sum_i p_i \log_2 p_i \]

where the sum runs over the classes and \(p_i\) is the proportion of observations in class \(i\). With a decision tree, the idea is to select the split whose resulting groups have the lowest weighted entropy.

#all the same -- probability = 1
1*np.log2(1)
0.0
#half and half -- probability = .5
-(1/2*np.log2(1/2) + 1/2*np.log2(1/2))
1.0
#plot of entropy
def entropy(p):
    return -(p*np.log2(p) + (1-p)*np.log2(1 - p))
p = np.linspace(0.0001, .99999, 100)
plt.plot(p, entropy(p))
plt.title('Entropy')
plt.xlabel('probability');
[Figure: entropy as a function of the class probability]
#subset the data to age, pclass, and survived five rows
data = titanic.loc[:4, ['age', 'pclass', 'survived']]
data
age pclass survived
0 22.0 3 0
1 38.0 1 1
2 26.0 3 1
3 35.0 1 1
4 35.0 3 0
data.loc[data['pclass'] == 1]
age pclass survived
1 38.0 1 1
3 35.0 1 1
#compute entropy for pclass
#first class entropy
first_class_entropy = -(2/2*np.log2(2/2))
first_class_entropy
-0.0
data.loc[data['pclass'] != 1]
age pclass survived
0 22.0 3 0
2 26.0 3 1
4 35.0 3 0
#third class entropy
third_class_entropy = -(1/3*np.log2(1/3) + 2/3*np.log2(2/3))
third_class_entropy
0.9182958340544896
#weighted sum of these
pclass_entropy = 2/5*first_class_entropy + 3/5*third_class_entropy
pclass_entropy
0.5509775004326937
data
age pclass survived
0 22.0 3 0
1 38.0 1 1
2 26.0 3 1
3 35.0 1 1
4 35.0 3 0
#splitting on age < 30
entropy_left = -(1/2*np.log2(1/2) + 1/2*np.log2(1/2))
entropy_right = -(1/3*np.log2(1/3) + 2/3*np.log2(2/3))
entropy_age = 2/5*entropy_left + 3/5*entropy_right
entropy_age
0.9509775004326937
data
age pclass survived
0 22.0 3 0
1 38.0 1 1
2 26.0 3 1
3 35.0 1 1
4 35.0 3 0
#original entropy
original_entropy = -((3/5)*np.log2(3/5) + (2/5)*np.log2(2/5))
original_entropy 
0.9709505944546686
# improvement based on pclass
original_entropy - pclass_entropy
0.4199730940219749
#improvement based on age < 30
original_entropy - entropy_age
0.01997309402197489
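
The same bookkeeping can be wrapped in a small helper; this is a sketch to consolidate the hand calculations above (the function names are not from scikit-learn):

#entropy of a label array, in bits
def entropy_of(labels):
    probs = pd.Series(labels).value_counts(normalize=True)
    return -(probs * np.log2(probs)).sum()

#reduction in entropy when splitting the labels by a boolean mask
def information_gain(labels, mask):
    n = len(labels)
    left, right = labels[mask], labels[~mask]
    weighted = len(left)/n * entropy_of(left) + len(right)/n * entropy_of(right)
    return entropy_of(labels) - weighted

information_gain(data['survived'], data['pclass'] == 1)  # ~0.42, as computed above
information_gain(data['survived'], data['age'] < 30)     # ~0.02, as computed above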

Using sklearn#

The DecisionTreeClassifier can use entropy as its splitting criterion to build a full decision tree model.

X = data[['age', 'pclass']]
y = data['survived']
from sklearn.tree import DecisionTreeClassifier
#DecisionTreeClassifier?
#instantiate
tree2 = DecisionTreeClassifier(criterion='entropy', max_depth = 1)
#fit
tree2.fit(X, y)
DecisionTreeClassifier(criterion='entropy', max_depth=1)
#score it
tree2.score(X, y)
0.8
#predictions
tree2.predict(X)
array([0, 1, 0, 1, 0])

Visualizing the results#

The plot_tree function will plot the decision tree after fitting. There are many options you can use to control how the resulting tree is drawn.

from sklearn.tree import plot_tree
#plot_tree
fig, ax = plt.subplots(figsize = (5,5))
plot_tree(tree2, 
          feature_names=X.columns, 
          class_names=['died', 'survived'], filled = True);
[Figure: depth-1 decision tree fit to the Titanic subset]

PROBLEM

Build a Logistic Regression pipeline and a Decision Tree pipeline to predict purchase. Assign the positive-class predicted probabilities to the variables plgr and ptree, respectively.
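
A sketch of one possible approach is below; it is not the only way to set this up. The preprocessing mirrors the cells that follow, the data is reloaded so the cell stands alone, and max_iter=1000 and max_depth=5 are arbitrary choices:

#sketch: logistic regression and decision tree pipelines for the purchase data
import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

purchase_data = pd.read_csv('https://raw.githubusercontent.com/jfkoehler/nyu_bootcamp_fa25/refs/heads/main/data/bank-direct-marketing-campaigns.csv')
X, y = purchase_data.drop(columns='y'), purchase_data['y']
cat_cols = X.select_dtypes('object').columns.tolist()

prep = make_column_transformer((OneHotEncoder(sparse_output=False), cat_cols),
                               remainder=StandardScaler())

lgr_pipe = Pipeline([('transform', clone(prep)), ('model', LogisticRegression(max_iter=1000))]).fit(X, y)
dt_pipe = Pipeline([('transform', clone(prep)), ('model', DecisionTreeClassifier(max_depth=5))]).fit(X, y)

#positive-class ('yes') probabilities
plgr = lgr_pipe.predict_proba(X)[:, 1]
ptree = dt_pipe.predict_proba(X)[:, 1]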

purchase_data = pd.read_csv('https://raw.githubusercontent.com/jfkoehler/nyu_bootcamp_fa25/refs/heads/main/data/bank-direct-marketing-campaigns.csv')
purchase_data.head()
age job marital education default housing loan contact month day_of_week campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
0 56 housemaid married basic.4y no no no telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
1 57 services married high.school unknown no no telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
2 37 services married high.school no yes no telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
3 40 admin. married basic.6y no no no telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
4 56 services married high.school no no yes telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
purchase_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  campaign        41188 non-null  int64  
 11  pdays           41188 non-null  int64  
 12  previous        41188 non-null  int64  
 13  poutcome        41188 non-null  object 
 14  emp.var.rate    41188 non-null  float64
 15  cons.price.idx  41188 non-null  float64
 16  cons.conf.idx   41188 non-null  float64
 17  euribor3m       41188 non-null  float64
 18  nr.employed     41188 non-null  float64
 19  y               41188 non-null  object 
dtypes: float64(5), int64(4), object(11)
memory usage: 6.3+ MB
cat_cols = purchase_data.select_dtypes('object').columns.tolist()
cat_cols
['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome',
 'y']
cat_cols.remove('y')
cat_cols
['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome']
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
encoder = make_column_transformer((OneHotEncoder(sparse_output=False), cat_cols),
                                 remainder = StandardScaler(),
                                 verbose_feature_names_out=False)

knn_pipe = Pipeline([('transform', encoder), ('model', KNeighborsClassifier(n_neighbors=3))])
X = purchase_data.iloc[:, :-1]
y = purchase_data['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, stratify = y)
knn_pipe.fit(X, y)  # note: fit here on the full dataset; the train/test split is used later for the grid search
Pipeline(steps=[('transform',
                 ColumnTransformer(remainder=StandardScaler(),
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['job', 'marital',
                                                   'education', 'default',
                                                   'housing', 'loan', 'contact',
                                                   'month', 'day_of_week',
                                                   'poutcome'])],
                                   verbose_feature_names_out=False)),
                ('model', KNeighborsClassifier(n_neighbors=3))])
knn_pipe.score(X, y)
0.9249295911430514
y.value_counts(normalize = True)
y
no     0.887346
yes    0.112654
Name: proportion, dtype: float64
ConfusionMatrixDisplay.from_estimator(knn_pipe, X, y)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x135b6f0e0>
[Figure: confusion matrix for the KNN pipeline]
prob_positive = knn_pipe.predict_proba(X)[:, 1]
prob_positive
array([0.        , 0.        , 0.        , ..., 0.33333333, 0.33333333,
       0.66666667])
results = pd.DataFrame({'y': y, 'prob_pos': prob_positive})
results.head()
y prob_pos
0 no 0.0
1 no 0.0
2 no 0.0
3 no 0.0
4 no 0.0
results = results.sort_values(by = 'prob_pos', ascending = False)
results.reset_index(inplace = True, drop = True)
results.head()
y prob_pos
0 yes 1.0
1 yes 1.0
2 yes 1.0
3 yes 1.0
4 yes 1.0
results.iloc[:41188//10, :]['y'].value_counts()
y
yes    2634
no     1484
Name: count, dtype: int64
y.value_counts()
y
no     36548
yes     4640
Name: count, dtype: int64
2634/4640
0.5676724137931034
results.iloc[:(41188//10)*2, :]['y'].value_counts()
y
no     4265
yes    3971
Name: count, dtype: int64
3971/4640
0.8558189655172413

What does this mean? If we use the model to contact the 20% of customers it ranks as most likely to subscribe, we reach roughly 86% of all the actual subscribers.
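
The same calculation can be wrapped in a small convenience function (a sketch, not a scikit-learn utility); exact values can shift slightly when predicted probabilities are tied:

#fraction of all positives captured in the top `frac` of customers,
#ranked by predicted probability of subscribing
def lift_capture(y_true, prob_pos, frac=0.2, positive='yes'):
    ranked = pd.DataFrame({'y': y_true, 'p': prob_pos}).sort_values('p', ascending=False)
    top = ranked.iloc[:int(len(ranked) * frac)]
    return (top['y'] == positive).sum() / (y_true == positive).sum()

lift_capture(y, prob_positive, frac=0.1)  # ~0.57
lift_capture(y, prob_positive, frac=0.2)  # ~0.86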

Problem

Build a DecisionTreeClassifier pipeline to predict purchase, grid searching the max_depth parameter. Is it better than the KNN model in terms of lift when restricted to the 20% of customers most likely to purchase?

tree_pipe = Pipeline([('transform', encoder), ('model', DecisionTreeClassifier())])
params = {'model__max_depth': [1, 2, 3, 4, 5, 6, 7]}
tree_grid = GridSearchCV(tree_pipe, param_grid=params)
tree_grid.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('transform',
                                        ColumnTransformer(remainder=StandardScaler(),
                                                          transformers=[('onehotencoder',
                                                                         OneHotEncoder(sparse_output=False),
                                                                         ['job',
                                                                          'marital',
                                                                          'education',
                                                                          'default',
                                                                          'housing',
                                                                          'loan',
                                                                          'contact',
                                                                          'month',
                                                                          'day_of_week',
                                                                          'poutcome'])],
                                                          verbose_feature_names_out=False)),
                                       ('model', DecisionTreeClassifier())]),
             param_grid={'model__max_depth': [1, 2, 3, 4, 5, 6, 7]})
ConfusionMatrixDisplay.from_estimator(tree_grid, X, y)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x13593ba10>
[Figure: confusion matrix for the tuned decision tree]
plt.figure(figsize = (20, 10))
plot_tree(tree_grid.best_estimator_['model'], feature_names=tree_pipe['transform'].get_feature_names_out(),
         class_names = ['no purchase', 'purchase'],
         filled = True,
         fontsize=7);
[Figure: best decision tree found by the grid search]
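
The second half of the problem above asks about lift; a minimal sketch of that check for the tuned tree (variable names here are illustrative):

#positives captured in the top 20% of customers ranked by the tree's
#predicted probability of 'yes'
tree_prob = tree_grid.predict_proba(X)[:, 1]
ranked = pd.DataFrame({'y': y, 'p': tree_prob}).sort_values('p', ascending=False)
top20 = ranked.iloc[:len(ranked) // 5]
(top20['y'] == 'yes').sum() / (y == 'yes').sum()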

Problem

For next class, read the chapter “Decision Analytic Thinking: What’s a Good Model?” from Data Science for Business. Pay special attention to the cost benefit analysis. You will be asked to use cost/benefit analysis to determine the “best” classifier for an example dataset next class.

#example cost-benefit matrix, laid out like sklearn's confusion_matrix
#(rows = actual class, columns = predicted class): a contact that does not
#convert costs 1, and a contact that converts is worth 99
cost_benefits = np.array([ [0, -1], [0, 99]])
cost_benefits
array([[ 0, -1],
       [ 0, 99]])

The expected profit is computed by:

\[\text{Expected Profit} = p(Y,p)\,b(Y,p) + p(N,p)\,b(N,p) + p(N,n)\,b(N,n) + p(Y,n)\,b(Y,n)\]

Use this to calculate the expected profit of the Decision Tree and KNN models. Which is better? Post your expected value and argument in Slack.

from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y, tree_grid.predict(X))
mat
array([[35983,   565],
       [ 3465,  1175]])
mat.sum()
41188
mat/mat.sum()
array([[0.87362824, 0.01371759],
       [0.08412644, 0.02852773]])
(mat/mat.sum())*cost_benefits
array([[ 0.        , -0.01371759],
       [ 0.        ,  2.82424493]])
#expected profit for decision tree model
(((mat/mat.sum())*cost_benefits)).sum()
2.810527338059629
knn_mat = confusion_matrix(y, knn_pipe.predict(X))
knn_mat/knn_mat.sum()
array([[0.87030203, 0.0170438 ],
       [0.05802661, 0.05462756]])
#expected profit for knn model
((knn_mat/knn_mat.sum())*cost_benefits).sum()
5.391084781975333
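
The step-by-step cells above can be consolidated into a single helper; this is a sketch, with the cost-benefit matrix laid out like sklearn's confusion_matrix (rows = actual, columns = predicted):

#expected profit per customer for a fitted classifier
def expected_profit(model, X, y, cost_benefit):
    rates = confusion_matrix(y, model.predict(X), normalize='all')
    return (rates * cost_benefit).sum()

expected_profit(tree_grid, X, y, cost_benefits)  # ~2.81
expected_profit(knn_pipe, X, y, cost_benefits)   # ~5.39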

EXIT TICKET: here