Lecture 27 – Classifier Evaluation and Fairness

DSC 80, Spring 2022

Announcements

Agenda

Classifier evaluation

Recall

|                   | Predicted Negative | Predicted Positive |
|-------------------|--------------------|--------------------|
| Actually Negative | TN = 90 ✅         | FP = 1 ❌          |
| Actually Positive | FN = 8 ❌          | TP = 1 ✅          |

UCSD Health test results

🤔 Question: What proportion of individuals who actually have COVID did the test identify?

🙋 Answer: $\frac{1}{1 + 8} = \frac{1}{9} \approx 0.11$

More generally, the recall of a binary classifier is the proportion of actually positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.

$$\text{recall} = \frac{TP}{TP + FN}$$

To compute recall, look at the bottom (positive) row of the above confusion matrix.

Recall isn't everything, either!

$$\text{recall} = \frac{TP}{TP + FN}$$

🤔 Question: Can you design a "COVID test" with perfect recall?

🙋 Answer: Yes – just predict that everyone has COVID!

|                   | Predicted Negative | Predicted Positive |
|-------------------|--------------------|--------------------|
| Actually Negative | TN = 0 ✅          | FP = 91 ❌         |
| Actually Positive | FN = 0 ❌          | TP = 9 ✅          |

everyone-has-COVID classifier
$$\text{recall} = \frac{TP}{TP + FN} = \frac{9}{9 + 0} = 1$$

Like accuracy, recall on its own is not a perfect metric. Even though the classifier we just created has perfect recall, it has 91 false positives!

Precision

|                   | Predicted Negative | Predicted Positive |
|-------------------|--------------------|--------------------|
| Actually Negative | TN = 0 ✅          | FP = 91 ❌         |
| Actually Positive | FN = 0 ❌          | TP = 9 ✅          |

everyone-has-COVID classifier

The precision of a binary classifier is the proportion of predicted positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.

$$\text{precision} = \frac{TP}{TP + FP}$$

To compute precision, look at the right (positive) column of the above confusion matrix.

Precision and recall

[Figure: illustration of precision and recall] (source)

Precision and recall

$$\text{precision} = \frac{TP}{TP + FP} \: \: \: \: \: \: \: \: \text{recall} = \frac{TP}{TP + FN}$$

🤔 Question: When might high precision be more important than high recall?

🙋 Answer: For instance, in deciding whether or not someone committed a crime. Here, false positives are really bad – they mean that an innocent person is charged!

🤔 Question: When might high recall be more important than high precision?

🙋 Answer: For instance, in medical tests. Here, false negatives are really bad – they mean that someone's disease goes undetected!

Discussion Question

Consider the confusion matrix shown below.

|                   | Predicted Negative | Predicted Positive |
|-------------------|--------------------|--------------------|
| Actually Negative | TN = 22 ✅         | FP = 2 ❌          |
| Actually Positive | FN = 23 ❌         | TP = 18 ✅         |

What is the accuracy of the above classifier? The precision? The recall?


After calculating all three on your own, check your answers against the table below.

| Metric    | Calculation                              |
|-----------|------------------------------------------|
| Accuracy  | (22 + 18) / (22 + 2 + 23 + 18) = 40 / 65 |
| Precision | 18 / (18 + 2) = 9 / 10                   |
| Recall    | 18 / (18 + 23) = 18 / 41                 |
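These can also be checked in code; the following is a quick sketch that computes all three metrics directly from the counts above:

```python
# Counts from the confusion matrix above.
tn, fp, fn, tp = 22, 2, 23, 18

accuracy = (tp + tn) / (tp + tn + fp + fn)   # 40 / 65 ≈ 0.62
precision = tp / (tp + fp)                   # 9 / 10 = 0.9
recall = tp / (tp + fn)                      # 18 / 41 ≈ 0.44
```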

End of Final Exam content! 🎉

(Note that the remaining content is still relevant for Project 5.)

Example: Tumor malignancy prediction (via logistic regression)

Wisconsin breast cancer dataset

The Wisconsin breast cancer dataset (WBCD) is a commonly-used dataset for demonstrating binary classification. It is built into sklearn.datasets.

1 stands for "malignant", i.e. cancerous, and 0 stands for "benign", i.e. safe.

Our goal is to use the features in bc to predict labels.
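The notebook's loading code isn't reproduced here; one way to build the DataFrame bc is sketched below. Note that sklearn encodes the labels the other way around (0 for malignant, 1 for benign), so this sketch flips them to match the description above; the column name is_malignant is an assumption.

```python
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the Wisconsin breast cancer dataset as a DataFrame of features.
loaded = load_breast_cancer(as_frame=True)
bc = loaded['data']

# sklearn uses 0 = malignant, 1 = benign; flip so that 1 = malignant.
bc['is_malignant'] = 1 - loaded['target']
bc.head()
```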

Aside: Logistic regression

Logistic regression is a linear classification technique that builds upon linear regression. It models the probability of belonging to class 1, given a feature vector:

$$P(y = 1 | \vec{x}) = \sigma (\underbrace{w_0 + w_1 x^{(1)} + w_2 x^{(2)} + ... + w_d x^{(d)}}_{\text{linear regression model}})$$

Here, $\sigma(t) = \frac{1}{1 + e^{-t}}$ is the sigmoid function; its outputs are between 0 and 1 (which means they can be interpreted as probabilities).
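As a quick sketch of this behavior:

```python
import numpy as np

def sigmoid(t):
    # sigma(t) = 1 / (1 + e^(-t)), always strictly between 0 and 1.
    return 1 / (1 + np.exp(-t))

sigmoid(np.array([-3, 0, 3]))   # array([0.047..., 0.5, 0.952...])
```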

🤔 Question: Suppose our logistic regression model predicts the probability that a tumor is malignant is 0.75. What class do we predict – malignant or benign? What if the predicted probability is 0.3?

🙋 Answer: We have to pick a threshold (e.g. 0.5)!

Fitting a logistic regression model

How did clf come up with 1s and 0s?

It turns out that the predicted labels come from applying a threshold of 0.5 to the predicted probabilities. We can access the predicted probabilities via the predict_proba method:

Note that our model still has $w^*$s:
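The notebook's code cells aren't shown above; a minimal sketch of the fitting step is below. The names X_train, X_test, y_train, y_test, and clf are assumptions based on the surrounding text, and bc comes from the loading sketch earlier.

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    bc.drop(columns=['is_malignant']), bc['is_malignant'], random_state=1
)

# Fit a logistic regression classifier.
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train, y_train)

# Predicted probabilities of class 1 (malignant), one per test example.
probs = clf.predict_proba(X_test)[:, 1]

# Predicted labels: equivalent to thresholding probs at 0.5.
preds = clf.predict(X_test)

# The fitted model still has an intercept (w_0) and one weight per feature.
clf.intercept_, clf.coef_
```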

Evaluating our model

Let's see how well our model does on the test set.
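Continuing the sketch above, the test-set metrics can be computed with sklearn.metrics:

```python
from sklearn import metrics

# Compare the model's test-set predictions to the true labels.
metrics.accuracy_score(y_test, preds)
metrics.precision_score(y_test, preds)
metrics.recall_score(y_test, preds)

# Rows are actual classes, columns are predicted classes.
metrics.confusion_matrix(y_test, preds)
```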

Which metric is more important for this task – precision or recall?

What if we choose a different threshold?

🤔 Question: Suppose we choose a threshold higher than 0.5. What will happen to our model's precision and recall?

🙋 Answer: Precision will typically increase, while recall will decrease. (Recall can only decrease or stay the same as the threshold increases, since raising the threshold never creates new true positives; precision usually increases, but isn't guaranteed to do so monotonically.)

Similarly, if we decrease our threshold, our model's precision will decrease, while its recall will increase.

Trying several thresholds

The classification threshold is not actually a hyperparameter of LogisticRegression, because the threshold doesn't change the coefficients ($w^*$s) of the logistic regression model itself (see this article for more details).

As such, if we want to imagine how our predicted classes would change with thresholds other than 0.5, we need to manually threshold.

Let's visualize the results in plotly, which is interactive.
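The original plotting code isn't shown here; a sketch of the idea, continuing with probs and y_test from the earlier sketches, is:

```python
import numpy as np
import plotly.express as px
from sklearn import metrics

thresholds = np.arange(0.01, 1.00, 0.01)
precisions, recalls = [], []

for t in thresholds:
    # Manually threshold the predicted probabilities at t.
    preds_at_t = (probs >= t).astype(int)
    # zero_division=0 avoids a warning if no positives are predicted.
    precisions.append(metrics.precision_score(y_test, preds_at_t, zero_division=0))
    recalls.append(metrics.recall_score(y_test, preds_at_t))

# Each point corresponds to one threshold; hovering shows which one.
px.line(x=recalls, y=precisions, hover_name=thresholds,
        labels={'x': 'Recall', 'y': 'Precision'},
        title='Precision vs. Recall at Various Thresholds')
```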

The above curve is called a precision-recall (or PR) curve.

🤔 Question: Based on the PR curve above, what threshold would you choose?

Combining precision and recall

If we care equally about a model's precision $PR$ and recall $RE$, we can combine the two using a single metric called the F1-score:

$$\text{F1-score} = \text{harmonic mean}(PR, RE) = 2\frac{PR \cdot RE}{PR + RE}$$
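For example, with the precision of 9/10 and recall of 18/41 from the discussion question, the F1-score works out to about 0.59; sklearn.metrics.f1_score computes the same quantity from true and predicted labels. A quick sketch:

```python
from sklearn import metrics

pr, re = 9 / 10, 18 / 41

# Harmonic mean of precision and recall.
f1_by_hand = 2 * pr * re / (pr + re)   # ≈ 0.59

# The same metric, computed directly from labels (here, for our tumor classifier).
f1_from_labels = metrics.f1_score(y_test, preds)
```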

Both F1-score and accuracy are overall measures of a binary classifier's performance. But remember, accuracy is misleading in the presence of class imbalance, and doesn't take into account the kinds of errors the classifier makes.

Other evaluation metrics for binary classifiers

We just scratched the surface! This excellent table from Wikipedia summarizes the many other metrics that exist.

If you're interested in exploring further, a good next metric to look at is true negative rate (i.e. specificity), which is the analogue of recall for true negatives.
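For instance, using the counts from the discussion question earlier:

```python
# Specificity (true negative rate): the proportion of actually negative
# instances that are correctly classified.
tn, fp = 22, 2
specificity = tn / (tn + fp)   # 22 / 24 ≈ 0.92
```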

Fairness

Recall, from Lecture 1

Fairness: why do we care?

Example: COMPAS and recidivism prediction

COMPAS (Correctional Offender Management Profiling for Alternative Sanctions) is a "black-box" model that estimates the likelihood that someone who has committed a crime will recidivate (commit another crime).


ProPublica found that the model's false positive rate is higher for African-Americans than it is for White Americans, and that its false negative rate is lower for African-Americans than it is for White Americans.

Example: Facial recognition

Note:

$$PPV = \text{precision} = \frac{TP}{TP+FP},\:\:\:\:\:\: TPR = \text{recall} = \frac{TP}{TP + FN}, \:\:\:\:\:\: FPR = \frac{FP}{FP+TN}$$
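Disparities like the ones described above can be checked by computing these rates separately for each group. A minimal sketch, using a hypothetical results table with columns group, y (true label), and pred (predicted label):

```python
import pandas as pd

# Hypothetical results: one row per individual.
results = pd.DataFrame({
    'group': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'y':     [1,   0,   1,   0,   1,   0,   1,   0],
    'pred':  [1,   0,   0,   0,   1,   1,   1,   0],
})

def group_rates(df):
    # Confusion-matrix counts within one group.
    tp = ((df['pred'] == 1) & (df['y'] == 1)).sum()
    fp = ((df['pred'] == 1) & (df['y'] == 0)).sum()
    tn = ((df['pred'] == 0) & (df['y'] == 0)).sum()
    fn = ((df['pred'] == 0) & (df['y'] == 1)).sum()
    return pd.Series({
        'PPV': tp / (tp + fp),
        'TPR': tp / (tp + fn),
        'FPR': fp / (fp + tn),
    })

# Comparing the rates across groups reveals disparities like the ones above.
results.groupby('group').apply(group_rates)
```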

How does bias occur?

Remember, our models learn patterns from the training data. Various sources of bias may be present within training data:

Example: Gender associations

soldier, teacher, nurse, doctor, dog, cat, president, nanny

Example: Gender associations

Example: Image searches

A 2015 study examined image search queries for various vocations and the gender makeup of the search results. The behavior of Google Images has improved since 2015.

In 2015, a Google Images search for "nurse" returned...

Search for "nurse" now, what do you see?

In 2015, a Google Images search for "doctor" returned...

Search for "doctor" now, what do you see?

Ethics: What gender ratio should we expect in the results?

Excerpts:

"male-dominated professions tend to have even more men in their results than would be expected if the proportions reflected real-world distributions.

"People’s existing perceptions of gender ratios in occupations are quite accurate, but that manipulated search results have an effect on perceptions."

How did this unequal representation occur?

Summary, next time

Summary