from dsc80_utils import *

from sklearn.datasets import load_breast_cancer
loaded = load_breast_cancer() # explore the value of `loaded`!
data = loaded['data']
labels = 1 - loaded['target']
cols = loaded['feature_names']
bc = pd.DataFrame(data, columns=cols)

bc.head()

labels

array([1, 1, 1, ..., 1, 1, 0])

pd.Series(labels).value_counts(normalize=True)

0    0.63
1    0.37
dtype: float64

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(bc, labels)

clf = LogisticRegression(max_iter=10000)
clf.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

clf.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0])

# [:, 1] refers to the predicted probabilities for class 1.
clf.predict_proba(X_test)

array([[1.  , 0.  ],
       [1.  , 0.  ],
       [0.91, 0.09],
       ...,
       [0.  , 1.  ],
       [0.92, 0.08],
       [1.  , 0.  ]])

clf.intercept_

array([-37.04])

clf.coef_

array([[-0.61, -0.33,  0.2 , ...,  0.49,  0.68,  0.08]])

from sklearn import metrics

y_pred = clf.predict(X_test)

metrics.confusion_matrix(y_test, y_pred)

array([[93,  1],
       [ 7, 42]])

from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test);
plt.grid(False)

metrics.accuracy_score(y_test, y_pred)

0.9440559440559441

metrics.precision_score(y_test, y_pred)

0.9767441860465116

metrics.recall_score(y_test, y_pred)

0.8571428571428571

thresholds = np.arange(0.01, 1.01, 0.01)
precisions = np.array([])
recalls = np.array([])

for t in thresholds:
    y_pred = clf.predict_proba(X_test)[:, 1] >= t
    precisions = np.append(precisions, metrics.precision_score(y_test, y_pred, zero_division=1))
    recalls = np.append(recalls, metrics.recall_score(y_test, y_pred))

px.line(x=thresholds, y=precisions,
        labels={'x': 'Threshold', 'y': 'Precision'}, title='Precision vs. Threshold', width=1000, height=600)

px.line(x=thresholds, y=recalls, 
        labels={'x': 'Threshold', 'y': 'Recall'}, title='Recall vs. Threshold', width=1000, height=600)

px.line(x=recalls, y=precisions, hover_name=thresholds, 
        labels={'x': 'Recall', 'y': 'Precision'}, title='Precision vs. Recall')

pr = metrics.precision_score(y_test, clf.predict(X_test))
re = metrics.recall_score(y_test, clf.predict(X_test))

2 * pr * re / (pr + re)

0.9130434782608695

metrics.f1_score(y_test, clf.predict(X_test))

0.9130434782608695

metrics.accuracy_score(y_test, clf.predict(X_test))

0.9440559440559441

loans = pd.read_csv(Path('data') / 'loan_vars1.csv', index_col=0)
loans.head()

loans['loan_amnt'].sum()

5706507225.0

loans.shape[0]

386772

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X = loans.drop('tag', axis=1)
y = loans.tag
X_train, X_test, y_train, y_test = train_test_split(X, y)

clf = RandomForestClassifier(n_estimators=50)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=50)

y_pred = clf.predict(X_test)
y_pred

array([0., 0., 0., ..., 0., 0., 1.])

clf.score(X_test, y_test)

0.7114268871583258

ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test);
plt.grid(False)

metrics.precision_score(y_test, y_pred)

0.7709923664122137

1 - metrics.precision_score(y_test, y_pred)

0.2290076335877863

metrics.recall_score(y_test, y_pred)

0.7314890579571371

1 - metrics.recall_score(y_test, y_pred)

0.26851094204286285

results = X_test
results['age_bracket'] = results['age'].apply(lambda x: 5 * (x // 5 + 1))
results['prediction'] = y_pred
results['tag'] = y_test

(
    results
    .groupby('age_bracket')
    .apply(lambda x: 1 - metrics.recall_score(x['tag'], x['prediction']))
    .plot(kind='bar', title='False Negative Rate by Age Group')
)

results['is_young'] = (results['age'] < 25).replace({True: 'young', False: 'old'})

results.groupby('is_young')['prediction'].mean()

is_young
old      0.69
young    0.30
Name: prediction, dtype: float64

compute_accuracy = lambda x: metrics.accuracy_score(x['tag'], x['prediction'])

(
    results
    .groupby('is_young')
    .apply(compute_accuracy)
    .rename('accuracy')
)

is_young
old      0.73
young    0.68
Name: accuracy, dtype: float64

obs = results.groupby('is_young').apply(compute_accuracy).diff().iloc[-1]
obs

-0.0464439332813531

diff_in_acc = []
for _ in range(500):
    s = (
        results[['is_young', 'prediction', 'tag']]
        .assign(is_young=np.random.permutation(results['is_young']))
        .groupby('is_young')
        .apply(compute_accuracy)
        .diff()
        .iloc[-1]
    )
    
    diff_in_acc.append(s)

fig = pd.Series(diff_in_acc).plot(kind='hist', histnorm='probability', nbins=20,
                            title='Difference in Accuracy (Young - Old)')
fig.add_vline(x=obs, line_color='red')
fig.update_layout(xaxis_range=[-0.1, 0.05])

loans

	Predicted Negative	Predicted Positive
Actually Negative	???	30
Actually Positive	66	105

	mean radius	mean texture	mean perimeter	mean area	...	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	17.99	10.38	122.80	1001.0	...	0.71	0.27	0.46	0.12
1	20.57	17.77	132.90	1326.0	...	0.24	0.19	0.28	0.09
2	19.69	21.25	130.00	1203.0	...	0.45	0.24	0.36	0.09
3	11.42	20.38	77.58	386.1	...	0.69	0.26	0.66	0.17
4	20.29	14.34	135.10	1297.0	...	0.40	0.16	0.24	0.08

	loan_amnt	emp_length	home_ownership	inq_last_6mths	revol_bal	age
268309	6400.0	0.0	1.0	1.0	899.0	22.0
301093	10700.0	10.0	1.0	0.0	29411.0	19.0
1379211	15000.0	10.0	1.0	2.0	9911.0	48.0
486795	15000.0	10.0	1.0	2.0	15883.0	35.0
1481134	22775.0	3.0	1.0	0.0	17008.0	39.0

	loan_amnt	emp_length	home_ownership	inq_last_6mths	revol_bal	age	tag
268309	6400.0	0.0	1.0	1.0	899.0	22.0	0.0
301093	10700.0	10.0	1.0	0.0	29411.0	19.0	0.0
1379211	15000.0	10.0	1.0	2.0	9911.0	48.0	0.0
...	...	...	...	...	...	...	...
1150493	5000.0	1.0	1.0	0.0	3842.0	52.0	1.0
686485	6000.0	10.0	0.0	0.0	6529.0	36.0	1.0
342901	15000.0	8.0	1.0	1.0	16060.0	39.0	1.0

Lecture 18 – Classifier Evaluation, Model Fairness¶

DSC 80, Spring 2024¶

Announcements 📣¶

Agenda 📆¶

Classifier evaluation¶

Precision and recall¶

Question 🤔 (Answer at q.dsc80.com)

Logistic regression¶

Wisconsin breast cancer dataset¶

Logistic regression¶

Fitting a logistic regression model¶

Evaluating our model¶

What if we choose a different threshold?¶

Trying several thresholds¶

Combining precision and recall¶

Other evaluation metrics for binary classifiers¶

Model fairness¶

Fairness: why do we care?¶

Example: Google's Gemini¶

Model fairness¶

Parity measures for classifiers¶

More on parity measures¶

Example: Loan approval¶

Predicting `'tag'`¶

Precision¶

Recall¶

False negative rate by age¶

Computing parity measures¶

Is this difference in accuracy significant?¶

Ethical questions of fairness¶

Summary, next time¶

Summary¶

Next time¶

Lecture 18 – Classifier Evaluation, Model Fairness¶

DSC 80, Spring 2024¶

Announcements 📣¶

Agenda 📆¶

Classifier evaluation¶

Precision and recall¶

Question 🤔 (Answer at q.dsc80.com)

Logistic regression¶

Wisconsin breast cancer dataset¶

Logistic regression¶

Fitting a logistic regression model¶

Evaluating our model¶

What if we choose a different threshold?¶

Trying several thresholds¶

Combining precision and recall¶

Other evaluation metrics for binary classifiers¶

Model fairness¶

Fairness: why do we care?¶

Example: Google's Gemini¶

Model fairness¶

Parity measures for classifiers¶

More on parity measures¶

Example: Loan approval¶

Predicting 'tag'¶

Precision¶

Recall¶

False negative rate by age¶

Computing parity measures¶

Is this difference in accuracy significant?¶

Ethical questions of fairness¶

Summary, next time¶

Summary¶

Next time¶

Predicting `'tag'`¶