from dsc80_utils import *

# Load the diabetes dataset; its `Outcome` column is the prediction target below.
diabetes = pd.read_csv('data/diabetes.csv')
# Preview the table (display_df is presumably provided by the dsc80_utils star import).
display_df(diabetes, cols=9)

from sklearn.model_selection import train_test_split
# Hold out a test set, using only Glucose and BMI as features.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes[['Glucose', 'BMI']],
    diabetes['Outcome'],
    random_state=1,
)

# Scatter the training points, colored by diabetes outcome. The labels are
# cast to strings so they line up with the string keys of color_discrete_map.
fig = X_train.assign(Outcome=y_train.astype(str)).plot(
    kind='scatter',
    x='Glucose',
    y='BMI',
    color='Outcome',
    color_discrete_map={'0': 'orange', '1': 'blue'},
    title='Relationship between Glucose, BMI, and Diabetes',
)
fig

from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree that picks splits by entropy; max_depth is not set here,
# so the tree's depth is not capped by this call.
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

# Training set accuracy of the tree fit without a depth cap.
dt.score(X_train, y_train)

0.9913194444444444

# Low test set accuracy! A large gap below the training accuracy is the
# signature of a tree that has overfit its training data.
dt.score(X_test, y_test)

0.7239583333333334

from sklearn.tree import DecisionTreeClassifier

# Refit with the depth capped at 4, limiting how closely the tree can follow
# the training data.
dt = DecisionTreeClassifier(max_depth=4, criterion='entropy')
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

# Much lower training set accuracy than the uncapped tree, but...
dt.score(X_train, y_train)

0.7864583333333334

# Much better test set accuracy! The shallower tree generalizes better.
dt.score(X_test, y_test)

0.765625

# Let's use more features for prediction: every column except the label.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.drop(columns=['Outcome']),
    diabetes['Outcome'],
    random_state=1,
)

from sklearn.ensemble import RandomForestClassifier

# Random forest built with no arguments (library defaults), fit on all features.
# NOTE(review): no random_state is passed, so these scores can change from run to run.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
# Training accuracy of the forest.
clf.score(X_train, y_train)

1.0

# Testing accuracy of the random forest.
clf.score(X_test, y_test)

0.796875

# For comparison with the forest: a single depth-4 tree on the same all-feature split.
dt = DecisionTreeClassifier(max_depth=4, criterion='entropy')
dt.fit(X_train, y_train)
# Training accuracy of the single tree.
dt.score(X_train, y_train)

0.7829861111111112

# Testing accuracy of the single tree.
dt.score(X_test, y_test)

0.734375

# Load the fake news training data; `content` (article text) and `label` are
# the columns used for modeling below.
news = pd.read_csv('data/fake_news_training.csv')
news

# Proportion of rows carrying each label (normalize=True gives fractions, not counts).
news['label'].value_counts(normalize=True)

real    0.55
fake    0.45
Name: label, dtype: float64

from sklearn.feature_extraction.text import CountVectorizer

# A tiny two-document corpus for demonstrating CountVectorizer.
example_corp = ['hey hey hey my name is billy', 
                'hey billy how is your dog billy']

# fit() learns the vocabulary (the distinct tokens) from the corpus.
count_vec = CountVectorizer()
count_vec.fit(example_corp)

CountVectorizer()

# Mapping from each learned token to its column index in the count matrix.
count_vec.vocabulary_

{'hey': 2,
 'my': 5,
 'name': 6,
 'is': 4,
 'billy': 0,
 'how': 3,
 'your': 7,
 'dog': 1}

# Count matrix: one row per document, one column per vocabulary token; each
# entry is how many times that token occurs in that document.
count_vec.transform(example_corp).toarray()

array([[1, 0, 3, 0, 1, 1, 1, 0],
       [2, 1, 1, 1, 1, 0, 0, 1]])

# Show the corpus again so the counts can be checked against the raw text.
example_corp

['hey hey hey my name is billy', 'hey billy how is your dog billy']

# Label the count matrix's columns with the tokens they count.
# get_feature_names_out() returns the tokens already ordered by column index,
# so there is no need to sort vocabulary_ by its values by hand.
pd.DataFrame(count_vec.transform(example_corp).toarray(),
             columns=count_vec.get_feature_names_out())

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Features are the raw article text; targets are the labels.
X = news['content']
y = news['label']
# Seed the split (as the diabetes splits do) so that the accuracies computed
# from it are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Two-step pipeline: token counts from the text, then a shallow random forest
# trained on those counts.
pl = Pipeline(steps=[
    ('cv', CountVectorizer()),
    (
        'clf',
        RandomForestClassifier(
            n_estimators=100,  # Uses 100 separate decision trees!
            max_depth=3,
            random_state=42,
        ),
    ),
])

# Fit both pipeline steps on the training text and labels.
pl.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('clf', RandomForestClassifier(max_depth=3, random_state=42))])

# Training accuracy of the full pipeline (vectorizer + forest).
pl.score(X_train, y_train)

0.7575757575757576

# Testing accuracy of the full pipeline (vectorizer + forest).
pl.score(X_test, y_test)

0.7530120481927711

# Distribution of the true labels in the training set, as a baseline for
# judging the distribution of the model's predictions.
y_train.value_counts(normalize=True)

real    0.53
fake    0.47
Name: label, dtype: float64

# Distribution of predicted ys in the training set:

# Display floats with exactly 3 decimal places (which also stops scientific notation).
# NOTE: this is a global pandas display option, so it also affects later outputs.
pd.set_option('display.float_format', '{:.3f}'.format)
pd.Series(pl.predict(X_train)).value_counts(normalize=True)

fake   0.695
real   0.305
dtype: float64

# Number of distinct tokens learned by the pipeline's fitted CountVectorizer.
len(pl.named_steps['cv'].vocabulary_) # Lots of features!

25495

# The key is clf__max_depth rather than max_depth: the double underscore
# routes the value to the pipeline step named 'clf', because max_depth is a
# hyperparameter of that step and not of pl itself.
hyperparameters = {'clf__max_depth': np.arange(start=2, stop=200, step=20)}

%%time

# Takes a few seconds to run – how many trees are being trained?
# (Every max_depth candidate is fit once per cross-validation fold, and each
# of those fits trains a forest of n_estimators=100 trees.)
from sklearn.model_selection import GridSearchCV
grids = GridSearchCV(
    pl,
    n_jobs=-1, # Use multiple processors to parallelize
    param_grid=hyperparameters,
    return_train_score=True # Also record training scores in cv_results_
)
grids.fit(X_train, y_train)

CPU times: user 727 ms, sys: 246 ms, total: 973 ms
Wall time: 6.34 s

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('clf',
                                        RandomForestClassifier(max_depth=3,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': array([  2,  22,  42,  62,  82, 102, 122, 142, 162, 182])},
             return_train_score=True)

# The hyperparameter setting the grid search selected as best.
grids.best_params_

{'clf__max_depth': 42}

# Training accuracy, scored through the fitted grid search object
# (presumably using the best pipeline it found — confirm against the refit setting).
grids.score(X_train, y_train)

0.9959595959595959

# Testing accuracy, scored through the fitted grid search object
# (presumably using the best pipeline it found — confirm against the refit setting).
grids.score(X_test, y_test)

0.8493975903614458

# Mean training and validation scores from cross-validation, one value per
# max_depth candidate, plotted against the candidate depths.
cv_results = grids.cv_results_
index = grids.param_grid['clf__max_depth']
train = cv_results['mean_train_score']
valid = cv_results['mean_test_score']

scores_by_depth = pd.DataFrame({'train': train, 'valid': valid}, index=index)
scores_by_depth.plot().update_layout(xaxis_title='max_depth', yaxis_title='Accuracy')

from sklearn.datasets import load_breast_cancer
loaded = load_breast_cancer() # explore the value of `loaded`!
data = loaded['data']
# Flip the 0/1 encoding of the target.
# NOTE(review): presumably so that 1 marks the malignant class — confirm via loaded['target_names'].
labels = 1 - loaded['target']
cols = loaded['feature_names']
# Feature table with one named column per feature.
bc = pd.DataFrame(data, columns=cols)

bc.head()

# First five (flipped) labels.
labels[:5]

array([1, 1, 1, 1, 1])

# Class balance of the (flipped) labels, as proportions.
pd.Series(labels).value_counts(normalize=True)

0   0.627
1   0.373
dtype: float64

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Seed the split (as the diabetes splits do) so the fitted model and every
# metric computed from it are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(bc, labels, random_state=1)

# max_iter is set high, presumably to give the solver enough iterations to
# converge on these features.
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

# Predicted class (0 or 1) for each test row.
clf.predict(X_test)

array([0, 0, 1, ..., 1, 0, 0])

# Each row holds the predicted probabilities for class 0 and class 1, in that
# order; indexing the result with [:, 1] selects the class-1 probabilities.
clf.predict_proba(X_test)

array([[1.  , 0.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       ...,
       [0.45, 0.55],
       [0.99, 0.01],
       [1.  , 0.  ]])

# Fitted intercept term of the logistic regression model.
clf.intercept_

array([-29.98])

# Fitted coefficients of the logistic regression model, one per feature.
clf.coef_

array([[-0.81, -0.26,  0.41, ...,  0.38,  0.51,  0.08]])

from sklearn import metrics

# Class predictions on the test set, reused by the metric calls that follow.
y_pred = clf.predict(X_test)

# Fraction of test rows whose predicted class matches the true class.
metrics.accuracy_score(y_test, y_pred)

0.9440559440559441

# Precision: of the rows predicted as class 1, the fraction that truly are, TP / (TP + FP).
metrics.precision_score(y_test, y_pred)

0.9245283018867925

# Recall: of the rows that truly are class 1, the fraction predicted as such, TP / (TP + FN).
metrics.recall_score(y_test, y_pred)

0.9245283018867925

# Counts of each (true class, predicted class) pair: rows are the true class,
# columns the predicted class.
metrics.confusion_matrix(y_test, y_pred)

array([[86,  4],
       [ 4, 49]])

from sklearn.metrics import ConfusionMatrixDisplay
# Plot the confusion matrix of clf's predictions on the test set; the trailing
# semicolon keeps the notebook from also printing the returned object.
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test);
# Turn grid lines off for this plot.
plt.grid(False)

# Sweep decision thresholds from 0 to 1 and record the test-set precision and
# recall obtained when predicting class 1 whenever P(class 1) >= threshold.
thresholds = np.arange(0, 1.01, 0.01)

# The predicted probabilities do not depend on the threshold, so compute them
# once rather than on every iteration.
proba_class_1 = clf.predict_proba(X_test)[:, 1]

# Build each array in a single pass instead of growing it with np.append,
# which copies the whole array on every call. Thresholding inline also keeps
# the sweep from overwriting `y_pred`, the default-threshold predictions.
precisions = np.array([metrics.precision_score(y_test, proba_class_1 >= t) for t in thresholds])
recalls = np.array([metrics.recall_score(y_test, proba_class_1 >= t) for t in thresholds])

# Precision as a function of the decision threshold.
px.line(
    x=thresholds,
    y=precisions,
    labels={'x': 'Threshold', 'y': 'Precision'},
    title='Precision vs. Threshold',
    width=1000,
    height=600,
)

# Recall as a function of the decision threshold.
px.line(
    x=thresholds,
    y=recalls,
    labels={'x': 'Threshold', 'y': 'Recall'},
    title='Recall vs. Threshold',
    width=1000,
    height=600,
)

# Precision against recall; hovering over a point shows the threshold that produced it.
px.line(
    x=recalls,
    y=precisions,
    hover_name=thresholds,
    labels={'x': 'Recall', 'y': 'Precision'},
    title='Precision vs. Recall',
)

# F1 score computed by hand: the harmonic mean of precision and recall.
# NOTE(review): `re` shares its name with the standard-library regex module.
test_predictions = clf.predict(X_test)
pr = metrics.precision_score(y_test, test_predictions)
re = metrics.recall_score(y_test, test_predictions)

(2 * pr * re) / (pr + re)

0.9245283018867925

# The library's F1 score, for comparison with the manual harmonic-mean calculation.
metrics.f1_score(y_test, clf.predict(X_test))

0.9245283018867925

# Accuracy of the same predictions, for comparison with the F1 score.
metrics.accuracy_score(y_test, clf.predict(X_test))

0.9440559440559441

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.63	50	1
1	1	85	66	29	0	26.6	0.35	31	0
2	8	183	64	0	0	23.3	0.67	32	1
...	...	...	...	...	...	...	...	...	...
765	5	121	72	23	112	26.2	0.24	30	0
766	1	126	60	0	0	30.1	0.35	47	1
767	1	93	70	31	0	30.4	0.32	23	0

	baseurl	content	label
0	twitter.com	\njavascript is not available.\n\nwe’ve detect...	real
1	whitehouse.gov	remarks by the president at campaign event -- ...	real
2	web.archive.org	the committee on energy and commerce\nbarton: ...	real
...	...	...	...
658	politico.com	full text: jeff flake on trump speech transcri...	fake
659	pol.moveon.org	moveon.org political action: 10 things to know...	real
660	uspostman.com	uspostman.com is for sale\nyes, you can transf...	fake

Outcome of Prediction	Definition	True Class
True positive (TP) ✅	The predictor correctly predicts the positive class.	P
False negative (FN) ❌	The predictor incorrectly predicts the negative class.	P
True negative (TN) ✅	The predictor correctly predicts the negative class.	N
False positive (FP) ❌	The predictor incorrectly predicts the positive class.	N

	Predicted Negative	Predicted Positive
Actually Negative	TN ✅	FP ❌
Actually Positive	FN ❌	TP ✅

	Predicted Negative	Predicted Positive
Actually Negative	TN = 90 ✅	FP = 1 ❌
Actually Positive	FN = 8 ❌	TP = 1 ✅

	Predicted Negative	Predicted Positive
Actually Negative	TN = 0 ✅	FP = 91 ❌
Actually Positive	FN = 0 ❌	TP = 9 ✅

	Predicted Negative	Predicted Positive
Actually Negative	TN = 22 ✅	FP = 2 ❌
Actually Positive	FN = 23 ❌	TP = 18 ✅

	mean radius	mean texture	mean perimeter	mean area	...	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	17.990	10.380	122.800	1001.000	...	0.712	0.265	0.460	0.119
1	20.570	17.770	132.900	1326.000	...	0.242	0.186	0.275	0.089
2	19.690	21.250	130.000	1203.000	...	0.450	0.243	0.361	0.088
3	11.420	20.380	77.580	386.100	...	0.687	0.258	0.664	0.173
4	20.290	14.340	135.100	1297.000	...	0.400	0.163	0.236	0.077

Lecture 17 – Random Forests, Classifier Evaluation¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

📝 Final Exam¶

🙋🙋🏽‍♀️ Questions?¶

Random Forests¶

Example: Diabetes¶

Review: Decision Trees¶

Decision tree pros and cons¶

Ideas for reducing decision tree variance¶

Random Forests¶

Another idea:¶

Idea 1: Bootstrap the training data¶

Idea 2: Only use a subset of features¶

Practice exam question:¶

🙋🙋🏽‍♀️ Questions?¶

Example¶

Example: Modeling using text features¶

Example: Fake news¶

Aside: CountVectorizer¶

Creating an initial Pipeline¶

Choosing tree depth via GridSearchCV¶

Training and validation accuracy vs. depth¶

🙋🙋🏽‍♀️ Questions?¶

Classifier Evaluation¶

Accuracy isn't everything!¶

The Boy Who Cried Wolf 👦😭🐺¶

The wolf classifier¶

The wolf classifier¶

Outcomes in binary classification¶

Example: COVID testing 🦠¶

Accuracy of COVID tests¶

Recall¶

Recall isn't everything, either!¶

Precision¶

Precision and recall¶

Precision and recall¶

Discussion Question¶

Example: Tumor malignancy prediction (via logistic regression)¶

Wisconsin breast cancer dataset¶

Aside: Logistic regression¶

Fitting a logistic regression model¶

Evaluating our model¶

What if we choose a different threshold?¶

Trying several thresholds¶

Combining precision and recall¶

Other evaluation metrics for binary classifiers¶

🙋🙋🏽‍♀️ Questions?¶

Aside: `CountVectorizer`¶

Creating an initial `Pipeline`¶

Choosing tree depth via `GridSearchCV`¶