In [3]:
from dsc80_utils import *

Lecture 18 – Classifier Evaluation, Model Fairness¶

DSC 80, Fall 2024¶

Announcements 📣¶

  • Lab 9 deadline moved to Monday Dec 2 since Thanksgiving is this week.

  • Guest lecture on Thursday Dec 5, 1:30pm-3pm in the HDSI MPR: Dr. Mohammad Ramezanali, an AI lead from Salesforce, will be talking about LLMs and how he uses them in industry.

    • No regular lecture on Dec 5.
    • If you attend the guest lecture, you will get lecture attendance credit and 1% extra credit on your final exam grade.
    • If you can't make it, we'll record the talk and you can get attendance + extra credit by making a post on Ed with a few paragraphs about the talk (details to come).
  • The Final Project is due on Friday Dec 6.

    • No slip days allowed!
  • The Final Exam is on Saturday, Dec 7 from 11:30am-2:30pm in PODEM 1A18 and 1A19.

    • Practice by working through old exams at practice.dsc80.com.
    • You can bring two double-sided note sheets.
    • More details will be posted on Ed.
  • Thursday's class will start with career advice; the rest of the time will be exam review!

Final Exam 📝¶

  • Saturday, Dec 7 from 11:30am-2:30pm in PODEM 1A18 and 1A19.
    • The exam is written to take about 2 hours, so you'll have a lot of time to double-check your work.
  • Two 8.5"x11" cheat sheets of your own creation are allowed (handwritten on a tablet, then printed, is okay).
  • Covers every lecture, lab, and project.
  • Similar format to the midterm: mix of fill-in-the-blank, multiple choice, and free response.
    • I use pandas fill-in-the-blank questions to test your ability to read and complete existing code, not just write code from scratch, which is why they can feel trickier.
  • Questions on final about pre-Midterm material will be marked as "M". Your Midterm grade will be the higher of your (z-score adjusted) grades on the Midterm and the questions marked as "M" on the final.

Agenda 📆¶

  • Classifier evaluation.
  • Logistic regression.
  • Model fairness.

Aside: MLU Explain is a great resource with visual explanations of many of our recent topics (cross-validation, random forests, precision and recall, etc.).

Random Forests¶

In [4]:
diabetes = pd.read_csv(Path('data') / 'diabetes.csv')
display_df(diabetes, cols=9)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.63 50 1
1 1 85 66 29 0 26.6 0.35 31 0
2 8 183 64 0 0 23.3 0.67 32 1
... ... ... ... ... ... ... ... ... ...
765 5 121 72 23 112 26.2 0.24 30 0
766 1 126 60 0 0 30.1 0.35 47 1
767 1 93 70 31 0 30.4 0.32 23 0

768 rows × 9 columns

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = (
    train_test_split(diabetes[['Glucose', 'BMI']], diabetes['Outcome'], random_state=1)
)
In [6]:
fig = (
    X_train.assign(Outcome=y_train.astype(str))
            .plot(kind='scatter', x='Glucose', y='BMI', color='Outcome', 
                  color_discrete_map={'0': 'orange', '1': 'blue'},
                  title='Relationship between Glucose, BMI, and Diabetes')
)
fig

Random Forests¶

Main idea:¶

Train a bunch of decision trees, then have them vote on a prediction!

  • Problem: If you use the same training data, you will always get the same tree.
  • Solution: Introduce randomness into training procedure to get different trees.

Idea 1: Bootstrap the training data¶

  • We can bootstrap the training data $T$ times, then train one tree on each resample.
  • Also known as bagging (Bootstrap AGgregating). In general, combining different predictors together is a useful technique called ensemble learning.
  • For decision trees, though, bagging alone doesn't make the trees different enough from each other (e.g. if you have one really strong predictor, it'll always be the first split).

Idea 2: Only use a subset of features¶

  • At each split, take a random subset of $ m $ features instead of choosing from all $ d $ of them.

  • Rule of thumb: $ m \approx \sqrt d $ seems to work well.

  • Key idea: For ensemble learning, you want the individual predictors to have low bias, high variance, and be uncorrelated with each other. That way, when you average them together, you have low bias AND low variance.

  • Random forest algorithm: Fit $ T $ trees by using bagging and a random subset of features at each split. Predict by taking a vote from the $ T $ trees.
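
To make the connection to sklearn concrete, here is a minimal sketch (using the Glucose/BMI X_train, X_test, y_train, y_test defined above): n_estimators controls $T$, bootstrap=True resamples the training data for each tree, and max_features='sqrt' implements the $m \approx \sqrt d$ rule of thumb.

In [ ]:
from sklearn.ensemble import RandomForestClassifier

# T = 100 trees, each fit on a bootstrap resample of the training data,
# with ~sqrt(d) features considered at each split.
forest = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    max_features='sqrt',
    random_state=42,
)
forest.fit(X_train, y_train)
forest.score(X_test, y_test)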

Question 🤔 (Answer at dsc80.com/q)

Code: bvm

How will increasing $ m $ affect the bias / variance of each decision tree?

Example¶

In [7]:
# Let's use more features for prediction
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = (
    train_test_split(diabetes.drop(columns=['Outcome']), diabetes['Outcome'], random_state=1)
)
In [8]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_train, y_train)
Out[8]:
1.0
In [9]:
clf.score(X_test, y_test)
Out[9]:
0.8020833333333334

Compared to our previous best decision tree with depth 4:

In [12]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=4, criterion='entropy')
dt.fit(X_train, y_train)
dt.score(X_train, y_train)
Out[12]:
0.7829861111111112
In [13]:
dt.score(X_test, y_test)
Out[13]:
0.7395833333333334

Example: Modeling using text features¶

Example: Fake news¶

We have a dataset containing news articles and labels for whether the article was deemed "fake" or "real". Credit to https://github.com/KaiDMML/FakeNewsNet.

In [19]:
news = pd.read_csv('data/fake_news_training.csv')
news
Out[19]:
baseurl content label
0 twitter.com \njavascript is not available.\n\nwe’ve detect... real
1 whitehouse.gov remarks by the president at campaign event -- ... real
2 web.archive.org the committee on energy and commerce\nbarton: ... real
... ... ... ...
658 politico.com full text: jeff flake on trump speech transcri... fake
659 pol.moveon.org moveon.org political action: 10 things to know... real
660 uspostman.com uspostman.com is for sale\nyes, you can transf... fake

661 rows × 3 columns

Goal: Use an article's content to predict its label.

In [20]:
news['label'].value_counts(normalize=True)
Out[20]:
label
real    0.55
fake    0.45
Name: proportion, dtype: float64

Question: What is the worst possible accuracy we should expect from a classifier, given the above distribution?

Aside: CountVectorizer¶

Entries in the 'content' column are not currently quantitative! We can use the bag of words encoding to create quantitative features out of each 'content'.

Instead of performing a bag of words encoding manually as we did before, we can rely on sklearn's CountVectorizer. (There is also a TfidfVectorizer.)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
In [22]:
example_corp = ['hey hey hey my name is billy', 
                'hey billy how is your dog billy']
In [23]:
count_vec = CountVectorizer()
count_vec.fit(example_corp)
Out[23]:
CountVectorizer()

count_vec learned a vocabulary from the corpus we fit it on.

In [24]:
count_vec.vocabulary_
Out[24]:
{'hey': 2,
 'my': 5,
 'name': 6,
 'is': 4,
 'billy': 0,
 'how': 3,
 'your': 7,
 'dog': 1}
In [25]:
count_vec.transform(example_corp).toarray()
Out[25]:
array([[1, 0, 3, 0, 1, 1, 1, 0],
       [2, 1, 1, 1, 1, 0, 0, 1]])

Note that the values in count_vec.vocabulary_ correspond to the positions of the columns in count_vec.transform(example_corp).toarray(), i.e. 'billy' is the first column and 'your' is the last column.

In [26]:
example_corp
Out[26]:
['hey hey hey my name is billy', 'hey billy how is your dog billy']
In [27]:
pd.DataFrame(count_vec.transform(example_corp).toarray(),
             columns=pd.Series(count_vec.vocabulary_).sort_values().index)
Out[27]:
billy dog hey how is my name your
0 1 0 3 0 1 1 1 0
1 2 1 1 1 1 0 0 1
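
In recent versions of sklearn, count_vec.get_feature_names_out() returns the column names already in transformed-array order, so we don't have to sort vocabulary_ ourselves. A minimal equivalent sketch:

In [ ]:
# Same DataFrame as above, using the built-in method to recover column names.
pd.DataFrame(count_vec.transform(example_corp).toarray(),
             columns=count_vec.get_feature_names_out())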

Creating an initial Pipeline¶

Let's build a Pipeline that takes in article content and:

  • Uses CountVectorizer to quantitatively encode the article content.

  • Fits a RandomForestClassifier to the data.

But first, a train-test split (like always).

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
In [29]:
X = news['content']
y = news['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

To start, we'll create a random forest with 100 trees (n_estimators) each of which has a maximum depth of 3 (max_depth).

In [30]:
pl = Pipeline([
    ('cv', CountVectorizer()), 
    ('clf', RandomForestClassifier(
        max_depth=3,
        n_estimators=100, # Uses 100 separate decision trees!
        random_state=42,
    )) 
])
In [31]:
pl.fit(X_train, y_train)
Out[31]:
Pipeline(steps=[('cv', CountVectorizer()),
                ('clf', RandomForestClassifier(max_depth=3, random_state=42))])
In [32]:
# Training accuracy.
pl.score(X_train, y_train)
Out[32]:
0.7393939393939394
In [33]:
# Testing accuracy.
pl.score(X_test, y_test)
Out[33]:
0.7108433734939759

The accuracy of our random forest on the test set is about 71%. How much better does it do than a classifier that predicts "real" every time?

In [34]:
y_train.value_counts(normalize=True)
Out[34]:
label
real    0.54
fake    0.46
Name: proportion, dtype: float64
In [35]:
# Distribution of predicted ys in the training set:

# stops scientific notation for pandas
pd.set_option('display.float_format', '{:.3f}'.format)
pd.Series(pl.predict(X_train)).value_counts(normalize=True)
Out[35]:
fake   0.689
real   0.311
Name: proportion, dtype: float64
In [36]:
len(pl.named_steps['cv'].vocabulary_) # Lots of features!
Out[36]:
23527

Choosing tree depth via GridSearchCV¶

We arbitrarily chose max_depth=3 before, but it seems like that isn't working well. Let's perform a grid search to find the max_depth with the best generalization performance.

In [37]:
# Note that we've used the key clf__max_depth, not max_depth
# because max_depth is a hyperparameter of clf, not of pl.

hyperparameters = {
    'clf__max_depth': np.arange(2, 200, 20)
}

Note that while pl has already been fit, we can still give it to GridSearchCV, which will repeatedly re-fit it during cross-validation.

In [38]:
%%time

# Takes a few seconds to run – how many trees are being trained?
from sklearn.model_selection import GridSearchCV
grids = GridSearchCV(
    pl,
    n_jobs=-1, # Use multiple processors to parallelize
    param_grid=hyperparameters,
    return_train_score=True
)
grids.fit(X_train, y_train)
CPU times: user 1.24 s, sys: 313 ms, total: 1.55 s
Wall time: 6.12 s
Out[38]:
GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('clf',
                                        RandomForestClassifier(max_depth=3,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': array([  2,  22,  42,  62,  82, 102, 122, 142, 162, 182])},
             return_train_score=True)
In [39]:
grids.best_params_
Out[39]:
{'clf__max_depth': np.int64(42)}

Recall that fit GridSearchCV objects are estimators in their own right. This means we can compute the training and testing accuracies of the "best" random forest directly:

In [40]:
# Training accuracy.
grids.score(X_train, y_train)
Out[40]:
0.9959595959595959
In [41]:
# Testing accuracy.
grids.score(X_test, y_test)
Out[41]:
0.8373493975903614

That's over 10 percentage points higher test accuracy!

Training and validation accuracy vs. depth¶

Below, we plot how training and validation accuracy varied with tree depth. Note that the $y$-axis here is accuracy, and that larger accuracies are better (unlike with RMSE, where smaller was better).

In [42]:
index = grids.param_grid['clf__max_depth']
train = grids.cv_results_['mean_train_score']
valid = grids.cv_results_['mean_test_score']
In [43]:
pd.DataFrame({'train': train, 'valid': valid}, index=index).plot().update_layout(
    xaxis_title='max_depth', yaxis_title='Accuracy'
)

Question 🤔 (Answer at dsc80.com/q)

Code: fa23102

(Fa23 Final Q10.2)

Suppose we write the following code:

hyperparameters = {
    'n_estimators': [10, 100, 1000], # number of trees per forest
    'max_depth': [None, 100, 10]     # max depth of each tree
}
grids = GridSearchCV(
    RandomForestClassifier(), param_grid=hyperparameters,
    cv=3, # 3-fold cross-validation
)
grids.fit(X_train, y_train)

Answer the following questions with a single number.

  1. How many random forests are fit in total?
  2. How many decision trees are fit in total?
  3. How many times in total is the first point in X_train used to train a decision tree?
In [ ]:
 

Classifier Evaluation¶

Accuracy isn't everything!¶

$$ \text{accuracy} = \frac{\text{\# data points classified correctly}}{\text{\# data points}} $$

  • Accuracy is defined as the proportion of predictions that are correct.

  • It weighs all correct predictions the same, and weighs all incorrect predictions the same.

  • But some incorrect predictions may be worse than others!

    • Example: Suppose you take a COVID test 🦠. Which is worse:
      • The test saying you have COVID, when you really don't, or
      • The test saying you don't have COVID, when you really do?

The Boy Who Cried Wolf 👦😭🐺¶

(source)

A shepherd boy gets bored tending the town's flock. To have some fun, he cries out, "Wolf!" even though no wolf is in sight. The villagers run to protect the flock, but then get really mad when they realize the boy was playing a joke on them.

Repeat the previous paragraph many, many times.

One night, the shepherd boy sees a real wolf approaching the flock and calls out, "Wolf!" The villagers refuse to be fooled again and stay in their houses. The hungry wolf turns the flock into lamb chops. The town goes hungry. Panic ensues.

The wolf classifier¶

  • Predictor: Shepherd boy.
  • Positive prediction: "There is a wolf."
  • Negative prediction: "There is no wolf."

Some questions to think about:

  • What is an example of an incorrect, positive prediction?
  • Was there a correct, negative prediction?
  • There are four possibilities. What are the consequences of each?
    • (predict yes, predict no) x (actually yes, actually no).

The wolf classifier¶

Below, we present a confusion matrix, which summarizes the four possible outcomes of the wolf classifier.

[Image: confusion matrix for the wolf classifier]

Outcomes in binary classification¶

When performing binary classification, there are four possible outcomes.

(Note: A "positive prediction" is a prediction of 1, and a "negative prediction" is a prediction of 0.)

Outcome of Prediction Definition True Class
True positive (TP) ✅ The predictor correctly predicts the positive class. P
False negative (FN) ❌ The predictor incorrectly predicts the negative class. P
True negative (TN) ✅ The predictor correctly predicts the negative class. N
False positive (FP) ❌ The predictor incorrectly predicts the positive class. N
⬇️
Predicted Negative Predicted Positive
Actually Negative TN ✅ FP ❌
Actually Positive FN ❌ TP ✅

The confusion matrix above is organized the same way that sklearn's confusion matrices are (but differently than in the wolf example).

Note that in the four acronyms – TP, FN, TN, FP – the first letter is whether the prediction is correct, and the second letter is what the prediction is.
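
To see sklearn's layout concretely, here's a minimal sketch with made-up labels (the actual and predicted lists below are hypothetical, not from any dataset in this lecture):

In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are actual classes, columns are predicted classes:
# [[TN, FP],
#  [FN, TP]]
actual    = [0, 0, 0, 1, 1, 1]
predicted = [0, 1, 0, 1, 0, 1]
confusion_matrix(actual, predicted)  # array([[2, 1], [1, 2]])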

Example: COVID testing 🦠¶

  • UCSD Health administers hundreds of COVID tests a day. The tests are not fully accurate.

  • Each test comes back either

    • positive, indicating that the individual has COVID, or
    • negative, indicating that the individual does not have COVID.
  • Question: What is a TP in this scenario? FP? TN? FN?

  • TP: The test predicted that the individual has COVID, and they do ✅.

  • FP: The test predicted that the individual has COVID, but they don't ❌.

  • TN: The test predicted that the individual doesn't have COVID, and they don't ✅.

  • FN: The test predicted that the individual doesn't have COVID, but they do ❌.

Accuracy of COVID tests¶

The results of 100 UCSD Health COVID tests are given below.

Predicted Negative Predicted Positive
Actually Negative TN = 90 ✅ FP = 1 ❌
Actually Positive FN = 8 ❌ TP = 1 ✅
UCSD Health test results

🤔 Question: What is the accuracy of the test?

🙋 Answer: $$\text{accuracy} = \frac{TP + TN}{TP + FP + FN + TN} = \frac{1 + 90}{100} = 0.91$$

  • Followup: At first, the test seems good. But, suppose we build a classifier that predicts that nobody has COVID. What would its accuracy be?

  • Answer to followup: Also 0.91! There is severe class imbalance in the dataset, meaning that most of the data points are in the same class (no COVID). Accuracy doesn't tell the full story.
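
A quick way to get this kind of baseline in code is sklearn's DummyClassifier, which ignores the features entirely. A minimal sketch (the X_train, y_train, X_test, y_test names here are hypothetical stand-ins for any feature matrix and labels):

In [ ]:
from sklearn.dummy import DummyClassifier

# Always predicts the most common class seen in y_train.
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(X_train, y_train)
baseline.score(X_test, y_test)  # Accuracy of the constant classifier.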

Recall¶

Predicted Negative Predicted Positive
Actually Negative TN = 90 ✅ FP = 1 ❌
Actually Positive FN = 8 ❌ TP = 1 ✅
UCSD Health test results

🤔 Question: What proportion of individuals who actually have COVID did the test identify?

🙋 Answer: $\frac{1}{1 + 8} = \frac{1}{9} \approx 0.11$

More generally, the recall of a binary classifier is the proportion of actually positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.

$$\text{recall} = \frac{TP}{\text{\# actually positive}} = \frac{TP}{TP + FN}$$

To compute recall, look at the bottom (positive) row of the above confusion matrix.
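
As a sanity check, here's the same arithmetic in code, using the counts from the UCSD Health confusion matrix above:

In [ ]:
TP, FN = 1, 8
TP / (TP + FN)  # 1/9 ≈ 0.11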

Recall isn't everything, either!¶

$$\text{recall} = \frac{TP}{TP + FN}$$

🤔 Question: Can you design a "COVID test" with perfect recall?

🙋 Answer: Yes – just predict that everyone has COVID!

Predicted Negative Predicted Positive
Actually Negative TN = 0 ✅ FP = 91 ❌
Actually Positive FN = 0 ❌ TP = 9 ✅
everyone-has-COVID classifier

$$\text{recall} = \frac{TP}{TP + FN} = \frac{9}{9 + 0} = 1$$

Like accuracy, recall on its own is not a perfect metric. Even though the classifier we just created has perfect recall, it has 91 false positives!

Precision¶

Predicted Negative Predicted Positive
Actually Negative TN = 0 ✅ FP = 91 ❌
Actually Positive FN = 0 ❌ TP = 9 ✅
everyone-has-COVID classifier

The precision of a binary classifier is the proportion of predicted positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.

$$\text{precision} = \frac{TP}{\text{\# predicted positive}} = \frac{TP}{TP + FP}$$

To compute precision, look at the right (positive) column of the above confusion matrix.

  • Tip: A good way to remember the difference between precision and recall is that in the denominator for 🅿️recision, both terms have 🅿️ in them (TP and FP).

  • Note that the "everyone-has-COVID" classifier has perfect recall, but a precision of $\frac{9}{9 + 91} = 0.09$, which is quite low.

  • 🚨 Key idea: There is a "tradeoff" between precision and recall. Ideally, you want both to be high. For a particular prediction task, one may be more important than the other.
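
In code, using the counts from the everyone-has-COVID confusion matrix above:

In [ ]:
TP, FP, FN = 9, 91, 0
precision = TP / (TP + FP)  # 9/100 = 0.09
recall = TP / (TP + FN)     # 9/9 = 1.0
precision, recall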

Precision and recall¶

[Image: diagram illustrating precision and recall]
(source)

Precision and recall¶

$$\text{precision} = \frac{TP}{TP + FP} \: \: \: \: \: \: \: \: \text{recall} = \frac{TP}{TP + FN}$$

Question 🤔 (Answer at dsc80.com/q)

Code: pvsr

🤔 When might high precision be more important than high recall?

🤔 When might high recall be more important than high precision?

Question 🤔 (Answer at dsc80.com/q)

Code: billy22

Taken from the Spring 2022 Final Exam.

After fitting a BillyClassifier, we use it to make predictions on an unseen test set. Our results are summarized in the following confusion matrix.

Predicted Negative Predicted Positive
Actually Negative ??? 30
Actually Positive 66 105

Part 1: What is the recall of our classifier? Give your answer as a fraction (it does not need to be simplified).

Part 2: The accuracy of our classifier is $\frac{69}{117}$. How many true negatives did our classifier have? Give your answer as an integer.

Part 3: True or False: In order for a binary classifier's precision and recall to be equal, the number of mistakes it makes must be an even number.

Part 4: Suppose we are building a classifier that listens to an audio source (say, from your phone’s microphone) and predicts whether or not it is Soulja Boy’s 2008 classic “Kiss Me thru the Phone." Our classifier is pretty good at detecting when the input stream is ”Kiss Me thru the Phone", but it often incorrectly predicts that similar sounding songs are also “Kiss Me thru the Phone."

Complete the sentence: Our classifier has...

  • low precision and low recall.
  • low precision and high recall.
  • high precision and low recall.
  • high precision and high recall.

Logistic regression¶

Wisconsin breast cancer dataset¶

The Wisconsin breast cancer dataset (WBCD) is a commonly-used dataset for demonstrating binary classification. It is built into sklearn.datasets.

In [44]:
from sklearn.datasets import load_breast_cancer
loaded = load_breast_cancer() # explore the value of `loaded`!
data = loaded['data']
labels = 1 - loaded['target']
cols = loaded['feature_names']
bc = pd.DataFrame(data, columns=cols)
In [45]:
bc.head()
Out[45]:
mean radius mean texture mean perimeter mean area ... worst concavity worst concave points worst symmetry worst fractal dimension
0 17.990 10.380 122.800 1001.000 ... 0.712 0.265 0.460 0.119
1 20.570 17.770 132.900 1326.000 ... 0.242 0.186 0.275 0.089
2 19.690 21.250 130.000 1203.000 ... 0.450 0.243 0.361 0.088
3 11.420 20.380 77.580 386.100 ... 0.687 0.258 0.664 0.173
4 20.290 14.340 135.100 1297.000 ... 0.400 0.163 0.236 0.077

5 rows × 30 columns

1 stands for "malignant", i.e. cancerous, and 0 stands for "benign", i.e. safe.

In [46]:
labels
Out[46]:
array([1, 1, 1, ..., 1, 1, 0])
In [47]:
pd.Series(labels).value_counts(normalize=True)
Out[47]:
0   0.627
1   0.373
Name: proportion, dtype: float64

Our goal is to use the features in bc to predict labels.

Logistic regression¶

Logistic regression is a linear classification technique that builds upon linear regression. It models the probability of belonging to class 1, given a feature vector:

$$P(y = 1 | \vec{x}) = \sigma (\underbrace{w_0 + w_1 x^{(1)} + w_2 x^{(2)} + ... + w_d x^{(d)}}_{\text{linear regression model}})$$

Here, $\sigma(t) = \frac{1}{1 + e^{-t}}$ is the sigmoid function; its outputs are between 0 and 1 (which means they can be interpreted as probabilities).
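
A minimal sketch of the sigmoid, just to see that its outputs always land strictly between 0 and 1:

In [ ]:
def sigmoid(t):
    return 1 / (1 + np.exp(-t))

# Large negative inputs map close to 0; large positive inputs map close to 1.
sigmoid(np.array([-5, -1, 0, 1, 5]))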

🤔 Question: Suppose our logistic regression model predicts the probability that a tumor is malignant is 0.75. What class do we predict – malignant or benign? What if the predicted probability is 0.3?

🙋 Answer: We have to pick a threshold (e.g. 0.5)!

  • If the predicted probability is above the threshold, we predict malignant (1).
  • Otherwise, we predict benign (0).
  • In practice, we use cross-validation to decide this threshold.

Fitting a logistic regression model¶

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
In [49]:
X_train, X_test, y_train, y_test = train_test_split(bc, labels)
In [50]:
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train, y_train)
Out[50]:
LogisticRegression(max_iter=10000)

How did clf come up with 1s and 0s?

In [51]:
clf.predict(X_test)
Out[51]:
array([1, 0, 0, ..., 0, 1, 0])

It turns out that the predicted labels come from applying a threshold of 0.5 to the predicted probabilities. We can access the predicted probabilities using the predict_proba method:

In [52]:
# [:, 1] refers to the predicted probabilities for class 1.
clf.predict_proba(X_test)
Out[52]:
array([[0.  , 1.  ],
       [1.  , 0.  ],
       [0.98, 0.02],
       ...,
       [1.  , 0.  ],
       [0.  , 1.  ],
       [1.  , 0.  ]])
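
As a quick sanity check, thresholding the class-1 probabilities at 0.5 reproduces clf.predict:

In [ ]:
# Predicted probability of class 1, thresholded at 0.5.
manual_preds = (clf.predict_proba(X_test)[:, 1] >= 0.5).astype(int)
np.all(manual_preds == clf.predict(X_test))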

Note that our model still has $w^*$s:

In [53]:
clf.intercept_
Out[53]:
array([-30.45])
In [54]:
clf.coef_
Out[54]:
array([[-0.97, -0.09,  0.28, ...,  0.48,  0.64,  0.07]])
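
To connect these $w^*$s back to the formula above, here's a minimal sketch that recomputes $P(y = 1 | \vec{x})$ by hand, passing the linear combination $w_0 + w_1 x^{(1)} + ... + w_d x^{(d)}$ through the sigmoid and comparing it to predict_proba:

In [ ]:
def sigmoid(t):
    return 1 / (1 + np.exp(-t))

# One linear combination per test point, then the sigmoid.
manual_probs = sigmoid(X_test.to_numpy() @ clf.coef_.ravel() + clf.intercept_[0])
np.allclose(manual_probs, clf.predict_proba(X_test)[:, 1])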

Evaluating our model¶

Let's see how well our model does on the test set.

In [55]:
from sklearn import metrics
In [56]:
y_pred = clf.predict(X_test)

Which metric is more important for this task – precision or recall?

In [57]:
metrics.confusion_matrix(y_test, y_pred)
Out[57]:
array([[91,  1],
       [ 6, 45]])
In [58]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test);
plt.grid(False)
In [59]:
metrics.accuracy_score(y_test, y_pred)
Out[59]:
0.951048951048951
In [60]:
metrics.precision_score(y_test, y_pred)
Out[60]:
np.float64(0.9782608695652174)
In [61]:
metrics.recall_score(y_test, y_pred)
Out[61]:
np.float64(0.8823529411764706)

What if we choose a different threshold?¶

🤔 Question: Suppose we choose a threshold higher than 0.5. What will happen to our model's precision and recall?

🙋 Answer: Precision will increase, while recall will decrease*.

  • If the "bar" is higher to predict 1, then we will have fewer positives in general, and thus fewer false positives.
  • The denominator in $\text{precision} = \frac{TP}{TP + FP}$ will get smaller, and so precision will increase.
  • However, the number of false negatives will increase, as we are being more "strict" about what we classify as positive, and so $\text{recall} = \frac{TP}{TP + FN}$ will decrease.
  • *It is possible for either or both to stay the same, if changing the threshold slightly (e.g. from 0.5 to 0.500001) doesn't change any predictions.

Similarly, if we decrease our threshold, our model's precision will decrease, while its recall will increase.

Trying several thresholds¶

The classification threshold is not actually a hyperparameter of LogisticRegression, because the threshold doesn't change the coefficients ($w^*$s) of the logistic regression model itself (see this article for more details).

  • Still, the threshold affects our decision rule, so we can tune it using cross-validation (which is not what we're doing below).
  • It's also useful to plot how our metrics change as we change the threshold.
In [62]:
thresholds = np.arange(0.01, 1.01, 0.01)
precisions = np.array([])
recalls = np.array([])

for t in thresholds:
    y_pred = clf.predict_proba(X_test)[:, 1] >= t
    precisions = np.append(precisions, metrics.precision_score(y_test, y_pred, zero_division=1))
    recalls = np.append(recalls, metrics.recall_score(y_test, y_pred))

Let's visualize the results.

In [63]:
px.line(x=thresholds, y=precisions,
        labels={'x': 'Threshold', 'y': 'Precision'}, title='Precision vs. Threshold', width=1000, height=600)
In [64]:
px.line(x=thresholds, y=recalls, 
        labels={'x': 'Threshold', 'y': 'Recall'}, title='Recall vs. Threshold', width=1000, height=600)
In [65]:
px.line(x=recalls, y=precisions, hover_name=thresholds, 
        labels={'x': 'Recall', 'y': 'Precision'}, title='Precision vs. Recall')

The above curve is called a precision-recall (or PR) curve.

🤔 Question: Based on the PR curve above, what threshold would you choose?

Combining precision and recall¶

If we care equally about a model's precision $PR$ and recall $RE$, we can combine the two using a single metric called the F1-score:

$$\text{F1-score} = \text{harmonic mean}(PR, RE) = 2\frac{PR \cdot RE}{PR + RE}$$

In [66]:
pr = metrics.precision_score(y_test, clf.predict(X_test))
re = metrics.recall_score(y_test, clf.predict(X_test))

2 * pr * re / (pr + re)
Out[66]:
np.float64(0.9278350515463919)
In [67]:
metrics.f1_score(y_test, clf.predict(X_test))
Out[67]:
np.float64(0.9278350515463918)

Both F1-score and accuracy are overall measures of a binary classifier's performance. But remember, accuracy is misleading in the presence of class imbalance, and doesn't take into account the kinds of errors the classifier makes.

In [68]:
metrics.accuracy_score(y_test, clf.predict(X_test))
Out[68]:
0.951048951048951

Other evaluation metrics for binary classifiers¶

We just scratched the surface! This excellent table from Wikipedia summarizes the many other metrics that exist.

[Image: Wikipedia's table of evaluation metrics derived from the confusion matrix]

If you're interested in exploring further, a good next metric to look at is true negative rate (i.e. specificity), which is the analogue of recall for true negatives.
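
As a minimal sketch, specificity can be read off the confusion matrix of the breast cancer classifier from above (using clf, X_test, and y_test from that example):

In [ ]:
# True negative rate (specificity) = TN / (TN + FP).
tn, fp, fn, tp = metrics.confusion_matrix(y_test, clf.predict(X_test)).ravel()
tn / (tn + fp)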

Model fairness¶

Fairness: why do we care?¶

  • Sometimes, a model performs better for certain groups than others; in such cases we say the model is unfair.
  • Since ML models are now used in processes that significantly affect human lives, it is important that they are fair!
    • Job applications and college admissions.
    • Criminal sentencing and parole grants.
    • Predictive policing.
    • Credit and loans.

Model fairness¶

  • We'd like to build a model that is fair, meaning that it performs the same for individuals within a group and individuals outside of the group.
  • What do we mean by "perform"? What do we mean by "the same"?

Parity measures for classifiers¶

Suppose $C$ is a classifier we've already trained, and $A$ is some binary attribute that denotes whether an individual is a member of a sensitive group – that is, a group we want to protect from discrimination (e.g. $A = \text{age is less than 25}$).

  • $C$ achieves accuracy parity if $C$ has the same accuracy for individuals in $A$ and individuals not in $A$.
    • Example: $C$ is a binary classifier that determines whether someone receives a loan.
      • If the classifier predicts correctly, then either $C$ approves the loan and it is paid off, or $C$ denies the loan and it would have defaulted.
      • If $C$ achieves accuracy parity, then the proportion of correctly classified loans should be the same for those under 25 and those over 25.
  • $C$ achieves precision (or recall) parity if $C$ has the same precision (or recall) for individuals in $A$ and individuals not in $A$.
    • Recall parity is often called "true positive rate parity."
  • $C$ achieves demographic parity if the proportion of predictions that are positive is equal for individuals in $A$ and individuals not in $A$.
  • With the exception of demographic parity, the parity measures above all involve checking whether some evaluation metric from Lecture 17 is equal across two groups.

More on parity measures¶

  • Which parity metric should you care about? It depends on your specific dataset and what types of errors are important!
  • Many of these parity measures are impossible to satisfy simultaneously!
  • The classifier parity metrics mentioned on the previous slide are only a few of the many possible parity metrics. See these DSC 167 notes for more details, including more formal explanations.
  • These don't apply for regression models; for those, we may care about RMSE parity or $R^2$ parity. There is also a notion of demographic parity for regression models, but it is outside of the scope of DSC 80.

Example: Loan approval¶

As you know from Project 2, LendingClub was a "peer-to-peer lending company"; they used to publish a dataset describing the loans that they approved.

  • 'tag': whether loan was repaid in full (1.0) or defaulted (0.0).
  • 'loan_amnt': amount of the loan in dollars.
  • 'emp_length': number of years employed.
  • 'home_ownership': whether borrower owns (1.0) or rents (0.0).
  • 'inq_last_6mths': number of credit inquiries in last six months.
  • 'revol_bal': revolving balance on the borrower's accounts.
  • 'age': age in years of the borrower (protected attribute).
In [69]:
loans = pd.read_csv(Path('data') / 'loan_vars1.csv', index_col=0)
loans.head()
Out[69]:
loan_amnt emp_length home_ownership inq_last_6mths revol_bal age tag
268309 6400.000 0.000 1.000 1.000 899.000 22.000 0.000
301093 10700.000 10.000 1.000 0.000 29411.000 19.000 0.000
1379211 15000.000 10.000 1.000 2.000 9911.000 48.000 0.000
486795 15000.000 10.000 1.000 2.000 15883.000 35.000 0.000
1481134 22775.000 3.000 1.000 0.000 17008.000 39.000 0.000

The total amount of money loaned was over 5 billion dollars!

In [70]:
loans['loan_amnt'].sum()
Out[70]:
np.float64(5706507225.0)
In [71]:
loans.shape[0]
Out[71]:
386772

Predicting 'tag'¶

Let's build a classifier that predicts whether or not a loan was paid in full. If we were a bank, we could use our trained classifier to determine whether to approve someone for a loan!

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
In [73]:
X = loans.drop('tag', axis=1)
y = loans.tag
X_train, X_test, y_train, y_test = train_test_split(X, y)
In [74]:
clf = RandomForestClassifier(n_estimators=50)
clf.fit(X_train, y_train)
Out[74]:
RandomForestClassifier(n_estimators=50)

Recall, a prediction of 1 means that we predict that the loan will be paid in full.

In [75]:
y_pred = clf.predict(X_test)
y_pred
Out[75]:
array([0., 1., 0., ..., 1., 1., 0.])
In [76]:
clf.score(X_test, y_test)
Out[76]:
0.7142088879236346
In [77]:
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test);
plt.grid(False)

Precision¶

$$\text{precision} = \frac{TP}{TP+FP}$$

Precision describes the proportion of loans that were approved that would have been paid back.

In [78]:
metrics.precision_score(y_test, y_pred)
Out[78]:
np.float64(0.7727873911605864)

If we subtract the precision from 1, we get the proportion of loans that were approved that would not have been paid back. This is known as the false discovery rate.

$$\frac{FP}{TP + FP} = 1 - \text{precision}$$

In [79]:
1 - metrics.precision_score(y_test, y_pred)
Out[79]:
np.float64(0.22721260883941363)

Recall¶

$$\text{recall} = \frac{TP}{TP + FN}$$

Recall describes the proportion of loans that would have been paid back that were actually approved.

In [80]:
metrics.recall_score(y_test, y_pred)
Out[80]:
np.float64(0.7337530958942338)

If we subtract the recall from 1, we get the proportion of loans that would have been paid back that were denied. This is known as the false negative rate.

$$\frac{FN}{TP + FN} = 1 - \text{recall}$$

In [81]:
1 - metrics.recall_score(y_test, y_pred)
Out[81]:
np.float64(0.26624690410576624)

From both the perspective of the bank and the lendee, a high false negative rate is bad!

  • The bank left money on the table – the lendee would have paid back the loan, but they weren't approved for a loan.
  • The lendee deserved the loan, but wasn't given one.

False negative rate by age¶

In [92]:
results = X_test.copy()  # Copy so we don't modify X_test itself.
results['age_bracket'] = results['age'].apply(lambda x: 5 * (x // 5 + 1))
results['prediction'] = y_pred
results['tag'] = y_test

(
    results
    .groupby('age_bracket')
    [['tag', 'prediction']]
    .apply(lambda x: 1 - metrics.recall_score(x['tag'], x['prediction']))
    .plot(kind='bar', title='False Negative Rate by Age Group')
)

Computing parity measures¶

  • $C$: Our random forest classifier (1 if we approved the loan, 0 if we denied it).
  • $A$: Whether or not they were under 25 (1 if under, 0 if above).
In [83]:
results['is_young'] = (results['age'] < 25).replace({True: 'young', False: 'old'})

First, let's compute the proportion of loans that were approved in each group. If these two numbers are the same, $C$ achieves demographic parity.

In [84]:
results.groupby('is_young')['prediction'].mean()
Out[84]:
is_young
old     0.686
young   0.295
Name: prediction, dtype: float64

$C$ evidently does not achieve demographic parity – older people are approved for loans far more often! Note that this doesn't factor in whether they were correctly approved or incorrectly approved.

Now, let's compute the accuracy of $C$ in each group. If these two numbers are the same, $C$ achieves accuracy parity.

In [85]:
compute_accuracy = lambda x: metrics.accuracy_score(x['tag'], x['prediction'])
In [93]:
(
    results
    .groupby('is_young')
    [['tag', 'prediction']]
    .apply(compute_accuracy)
    .rename('accuracy')
)
Out[93]:
is_young
old     0.730
young   0.679
Name: accuracy, dtype: float64

Hmm... These numbers look much more similar than before!

Is this difference in accuracy significant?¶

Let's run a permutation test to see if the difference in accuracy is significant.

  • Null Hypothesis: The classifier's accuracy is the same for both young people and old people, and any differences are due to chance.
  • Alternative Hypothesis: The classifier's accuracy is higher for old people.
  • Test statistic: Difference in accuracy (young minus old).
  • Significance level: 0.01.
In [95]:
obs = (results
       .groupby('is_young')
       [['tag', 'prediction']]
       .apply(compute_accuracy)
       .diff()
       .iloc[-1])
obs
Out[95]:
np.float64(-0.05127926116930481)
In [96]:
diff_in_acc = []
for _ in range(500):
    s = (
        results[['is_young', 'prediction', 'tag']]
        .assign(is_young=np.random.permutation(results['is_young']))
        .groupby('is_young')
        [['tag', 'prediction']]
        .apply(compute_accuracy)
        .diff()
        .iloc[-1]
    )
    
    diff_in_acc.append(s)
In [97]:
fig = pd.Series(diff_in_acc).plot(kind='hist', histnorm='probability', nbins=20,
                            title='Difference in Accuracy (Young - Old)')
fig.add_vline(x=obs, line_color='red')
fig.update_layout(xaxis_range=[-0.1, 0.05])

It seems like the difference in accuracy across the two groups is significant, despite being only ~5%. Thus, $C$ likely does not achieve accuracy parity.

Ethical questions of fairness¶

Question 🤔 (Answer at dsc80.com/q)

Code: fair

  • Question: Is it "fair" to deny loans to younger people at a higher rate?
  • Make an argument for "yes", then make an argument for "no".
  • Federal law prevents age from being used as a determining factor in denying a loan.

Not only should we not use 'age' to determine whether or not to approve a loan, but we also shouldn't use other features that are strongly correlated with 'age', like 'emp_length'.

In [90]:
loans
Out[90]:
loan_amnt emp_length home_ownership inq_last_6mths revol_bal age tag
268309 6400.000 0.000 1.000 1.000 899.000 22.000 0.000
301093 10700.000 10.000 1.000 0.000 29411.000 19.000 0.000
1379211 15000.000 10.000 1.000 2.000 9911.000 48.000 0.000
... ... ... ... ... ... ... ...
1150493 5000.000 1.000 1.000 0.000 3842.000 52.000 1.000
686485 6000.000 10.000 0.000 0.000 6529.000 36.000 1.000
342901 15000.000 8.000 1.000 1.000 16060.000 39.000 1.000

386772 rows × 7 columns

Summary, next time¶

Summary¶

  • A logistic regression model makes classifications by first predicting a probability and then thresholding that probability.
    • The default threshold is 0.5; by moving the threshold, we change the balance between precision and recall.
  • To assess the parity of your model:
    • Choose an evaluation metric, e.g. precision, recall, or accuracy for classifiers, or RMSE or $R^2$ for regressors.
    • Choose a sensitive binary attribute, e.g. "age < 25" or "is data science major", that divides your data into two groups.
    • Conduct a permutation test to check whether your model's evaluation metric is similar for individuals in both groups.

Next time¶

Career advice and exam review!