from dsc80_utils import *
Lecture 17 – Random Forests¶
diabetes = pd.read_csv(Path("data") / "diabetes.csv")
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
diabetes[["Glucose", "BMI"]], diabetes["Outcome"], random_state=1
)
from sklearn.tree import DecisionTreeClassifier
Grid search¶
GridSearchCV takes in:

- an unfit instance of an estimator, and
- a dictionary of hyperparameter values to try,

and performs $k$-fold cross-validation to find the combination of hyperparameters with the best average validation performance.
from sklearn.model_selection import GridSearchCV
The following dictionary contains the values we're considering for each hyperparameter. (We're using GridSearchCV with 3 hyperparameters, but we could use it with even just a single hyperparameter.)
hyperparameters = {
"max_depth": [2, 3, 4, 5, 7, 10, 13, 15, 18, None],
"min_samples_split": [2, 5, 10, 20, 50, 100, 200],
"criterion": ["gini", "entropy"],
}
Note that there are 140 combinations of hyperparameters we need to try. We need to find the best combination of hyperparameters, not the best value for each hyperparameter individually.
len(hyperparameters["max_depth"]) * len(
hyperparameters["min_samples_split"]
) * len(hyperparameters["criterion"])
140
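To make concrete what GridSearchCV automates, here's a minimal sketch (not from the original notebook) that loops over every combination in the grid, runs $k$-fold cross-validation for each, and keeps the combination with the best mean validation accuracy. manual_grid_search is a hypothetical helper, not part of sklearn.
from itertools import product
from sklearn.model_selection import cross_val_score

def manual_grid_search(estimator_cls, param_grid, X, y, cv=5):
    # Try every combination of hyperparameter values in param_grid.
    best_score, best_params = float("-inf"), None
    names = list(param_grid)
    for combo in product(*param_grid.values()):
        params = dict(zip(names, combo))
        # k-fold cross-validation for this combination.
        scores = cross_val_score(estimator_cls(**params), X, y, cv=cv)
        if scores.mean() > best_score:
            best_score, best_params = scores.mean(), params
    return best_params, best_score

manual_grid_search(DecisionTreeClassifier, hyperparameters, X_train, y_train)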
GridSearchCV needs to be instantiated and fit.
searcher = GridSearchCV(DecisionTreeClassifier(), hyperparameters, cv=5)
%%time
searcher.fit(X_train, y_train)
CPU times: user 780 ms, sys: 16.8 ms, total: 797 ms Wall time: 797 ms
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None],
                         'min_samples_split': [2, 5, 10, 20, 50, 100, 200]})
After being fit, the best_params_ attribute provides us with the best combination of hyperparameters to use.
searcher.best_params_
{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 50}
All of the intermediate results – validation accuracies for each fold, mean validation accuracies, etc. – are stored in the cv_results_ attribute:
searcher.cv_results_["mean_test_score"] # Array of length 140.
array([0.73, 0.73, 0.73, ..., 0.75, 0.74, 0.72])
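As a quick check (not in the original notebook), the index of the largest mean validation accuracy in cv_results_ corresponds to the combination reported by best_params_:
best_idx = searcher.cv_results_["mean_test_score"].argmax()
searcher.cv_results_["params"][best_idx]  # Same as searcher.best_params_.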
# Rows correspond to folds, columns correspond to hyperparameter combinations.
pd.DataFrame(
np.vstack([searcher.cv_results_[f"split{i}_test_score"] for i in range(5)])
)
| | 0 | 1 | 2 | 3 | ... | 136 | 137 | 138 | 139 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.71 | 0.71 | 0.71 | 0.71 | ... | 0.70 | 0.70 | 0.71 | 0.73 |
| 1 | 0.77 | 0.77 | 0.77 | 0.77 | ... | 0.82 | 0.83 | 0.77 | 0.76 |
| 2 | 0.74 | 0.74 | 0.74 | 0.74 | ... | 0.68 | 0.72 | 0.74 | 0.73 |
| 3 | 0.70 | 0.70 | 0.70 | 0.70 | ... | 0.77 | 0.79 | 0.76 | 0.70 |
| 4 | 0.72 | 0.72 | 0.72 | 0.72 | ... | 0.70 | 0.71 | 0.72 | 0.70 |
5 rows × 140 columns
Note that the above DataFrame tells us that 5 * 140 = 700 models were trained in total!
Now that we've found the best combination of hyperparameters, we should fit a decision tree instance using those hyperparameters on our entire training set.
searcher.best_params_
{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 50}
final_tree = DecisionTreeClassifier(**searcher.best_params_)
final_tree
DecisionTreeClassifier(max_depth=4, min_samples_split=50)
final_tree.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=4, min_samples_split=50)
# Training accuracy.
final_tree.score(X_train, y_train)
0.7881944444444444
# Testing accuracy.
# A bit lower than the `dt` tree we fit above!
final_tree.score(X_test, y_test)
0.765625
Remember, searcher itself is a model object (we had to fit it). After performing $k$-fold cross-validation, behind the scenes, searcher is trained on the entire training set using the optimal combination of hyperparameters.
In other words, searcher makes the same predictions that final_tree does!
searcher.score(X_train, y_train)
0.7881944444444444
searcher.score(X_test, y_test)
0.765625
Choosing possible hyperparameter values¶
A full grid search can take a long time.
- In our previous example, we tried 140 combinations of hyperparameters.
- Since we performed 5-fold cross-validation, we trained 700 decision trees under the hood.
Question: How do we pick the possible hyperparameter values to try?
Answer: Trial and error.
- If the "best" choice of a hyperparameter was at an extreme, try increasing the range.
  - For instance, if you try max_depths from 32 to 128 and 32 was the best, try including max_depths under 32 (see the sketch below).
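Here's a hypothetical illustration (using the grid-search setup from earlier) of widening the grid when the best value sits at the boundary:
# If the best max_depth found is at the edge of the grid, search again with a
# grid that extends past that edge.
coarse = GridSearchCV(DecisionTreeClassifier(), {"max_depth": [32, 64, 128]}, cv=5)
coarse.fit(X_train, y_train)

if coarse.best_params_["max_depth"] == 32:  # Best value is at the boundary.
    finer = GridSearchCV(DecisionTreeClassifier(), {"max_depth": [2, 4, 8, 16, 32]}, cv=5)
    finer.fit(X_train, y_train)
    print(finer.best_params_)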
Key takeaways¶
- Decision trees are trained by finding the best questions to ask using the features in the training data. A good question is one that isolates classes as much as possible.
- Decision trees have a tendency to overfit to training data. One way to mitigate this is by restricting the maximum depth of the tree.
- To efficiently find hyperparameters through cross-validation, use GridSearchCV.
  - Specify which values to try for each hyperparameter, and GridSearchCV will try all unique combinations of hyperparameters and return the combination with the best average validation performance.
  - GridSearchCV is not the only solution – see RandomizedSearchCV if you're curious.
Decision tree pros and cons¶
Pros:
- Fast to fit (usually log-linear time w.r.t. size of design matrix)
- Fast to predict (usually log time)
- Interpretable
- Robust to irrelevant features (think about why!)
- Linear transformations on features don't affect predictions (think about why!)
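A quick sanity check of the last point (not from the original notebook) – scaling every feature by a positive constant shouldn't change a decision tree's predictions, since splits only depend on the ordering of feature values:
# Fit one tree on the original features and one on features scaled by 100;
# with the same random_state, their predictions should match exactly.
tree_a = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)
tree_b = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train * 100, y_train)
(tree_a.predict(X_test) == tree_b.predict(X_test * 100)).all()  # Should be True.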
Cons:
- High variance: complete tree (no depth limit) will almost always overfit!
- Aren't the best at prediction in general.
Random Forests¶
Another idea:¶
Train a bunch of decision trees, then have them vote on a prediction!
- Problem: If you use the same training data, you will always get the same tree.
- Solution: Introduce randomness into training procedure to get different trees.
Idea 1: Bootstrap the training data¶
- We can bootstrap the training data $T$ times, then train one tree on each resample.
- Also known as bagging (Bootstrap AGgregating). In general, combining different predictors together is a useful technique called ensemble learning.
- For decision trees though, doesn't make trees different enough from each other (e.g. if you have one really strong predictor, it'll always be the first split).
Idea 2: Only use a subset of features¶
At each split, take a random subset of $ m $ features instead of choosing from all $ d $ of them.
Rule of thumb: $ m \approx \sqrt d $ seems to work well.
Key idea: For ensemble learning, you want the individual predictors to have low bias, high variance, and be uncorrelated with each other. That way, when you average them together, you have low bias AND low variance.
Random forest algorithm: Fit $ T $ trees by using bagging and a random subset of features at each split. Predict by taking a vote from the $ T $ trees.
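To make the algorithm concrete, here's a bare-bones sketch (not the lecture's code – fit_forest and predict_forest are hypothetical helpers). In practice, use sklearn's RandomForestClassifier, shown in the next example; its default max_features="sqrt" implements the $m \approx \sqrt d$ rule.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def fit_forest(X, y, T=100, random_state=0):
    rng = np.random.default_rng(random_state)
    trees = []
    for _ in range(T):
        rows = rng.integers(0, len(X), size=len(X))        # Bootstrap resample of the rows.
        tree = DecisionTreeClassifier(max_features="sqrt")  # Random subset of features per split.
        trees.append(tree.fit(X.iloc[rows], y.iloc[rows]))
    return trees

def predict_forest(trees, X):
    votes = np.stack([tree.predict(X) for tree in trees])  # Shape (T, number of rows).
    return (votes.mean(axis=0) >= 0.5).astype(int)          # Majority vote for 0/1 labels.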
Question 🤔
How will increasing $ m $ affect the bias / variance of each decision tree?
Example¶
# Let's use more features for prediction
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
diabetes.drop(columns=["Outcome"]), diabetes["Outcome"], random_state=1
)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_train, y_train)
1.0
clf.score(X_test, y_test)
0.8177083333333334
Compared to our previous best decision tree with depth 4:
dt = DecisionTreeClassifier(max_depth=4, criterion="entropy")
dt.fit(X_train, y_train)
dt.score(X_train, y_train)
0.7829861111111112
dt.score(X_test, y_test)
0.734375
Example: Modeling using text features¶
Example: Fake news¶
We have a dataset containing news articles and labels for whether the article was deemed "fake" or "real". Credit to https://github.com/KaiDMML/FakeNewsNet.
news = pd.read_csv("data/fake_news_training.csv")
news
| | baseurl | content | label |
|---|---|---|---|
| 0 | twitter.com | \njavascript is not available.\n\nwe’ve detect... | real |
| 1 | whitehouse.gov | remarks by the president at campaign event -- ... | real |
| 2 | web.archive.org | the committee on energy and commerce\nbarton: ... | real |
| ... | ... | ... | ... |
| 658 | politico.com | full text: jeff flake on trump speech transcri... | fake |
| 659 | pol.moveon.org | moveon.org political action: 10 things to know... | real |
| 660 | uspostman.com | uspostman.com is for sale\nyes, you can transf... | fake |
661 rows × 3 columns
Goal: Use an article's content to predict its label.
news["label"].value_counts(normalize=True)
label
real    0.55
fake    0.45
Name: proportion, dtype: float64
Question: What is the worst possible accuracy we should expect from a classifier, given the above distribution?
Aside: CountVectorizer¶
Entries in the 'content' column are not currently quantitative! We can use the bag of words encoding to create quantitative features out of each 'content'.
Instead of performing a bag of words encoding manually as we did before, we can rely on sklearn's CountVectorizer. (There is also a TfidfVectorizer.)
from sklearn.feature_extraction.text import CountVectorizer
example_corp = [
"hey hey hey my name is billy",
"hey billy how is your dog billy",
]
count_vec = CountVectorizer()
count_vec.fit(example_corp)
CountVectorizer()
count_vec learned a vocabulary from the corpus we fit it on.
count_vec.vocabulary_
{'hey': 2,
'my': 5,
'name': 6,
'is': 4,
'billy': 0,
'how': 3,
'your': 7,
'dog': 1}
count_vec.transform(example_corp).toarray()
array([[1, 0, 3, 0, 1, 1, 1, 0],
[2, 1, 1, 1, 1, 0, 0, 1]])
Note that the values in count_vec.vocabulary_ correspond to the positions of the columns in count_vec.transform(example_corp).toarray(), i.e. 'billy' is the first column and 'your' is the last column.
example_corp
['hey hey hey my name is billy', 'hey billy how is your dog billy']
pd.DataFrame(
count_vec.transform(example_corp).toarray(),
columns=pd.Series(count_vec.vocabulary_).sort_values().index,
)
| | billy | dog | hey | how | is | my | name | your |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | 0 | 1 | 1 | 1 | 0 |
| 1 | 2 | 1 | 1 | 1 | 1 | 0 | 0 | 1 |
Creating an initial Pipeline¶
Let's build a Pipeline that:

1. Uses CountVectorizer to quantitatively encode each article's content.
2. Fits a RandomForestClassifier to the data.
But first, a train-test split (like always).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
X = news["content"]
y = news["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y)
To start, we'll create a random forest with 100 trees (n_estimators), each of which has a maximum depth of 3 (max_depth).
pl = make_pipeline(
CountVectorizer(),
RandomForestClassifier(
max_depth=3,
n_estimators=100, # Uses 100 separate decision trees!
random_state=42,
),
)
pl.fit(X_train, y_train)
Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=3, random_state=42))])
# Training accuracy.
pl.score(X_train, y_train)
0.7515151515151515
# Testing accuracy.
pl.score(X_test, y_test)
0.7469879518072289
The accuracy of our random forest is about 75% on the test set. How much better does it do compared to a classifier that predicts "real" every time?
y_train.value_counts(normalize=True)
label
real    0.55
fake    0.45
Name: proportion, dtype: float64
# Distribution of predicted ys in the training set:
# stops scientific notation for pandas
pd.set_option("display.float_format", "{:.3f}".format)
pd.Series(pl.predict(X_train)).value_counts(normalize=True)
fake    0.642
real    0.358
Name: proportion, dtype: float64
pl.named_steps
{'countvectorizer': CountVectorizer(),
'randomforestclassifier': RandomForestClassifier(max_depth=3, random_state=42)}
len(pl.named_steps["countvectorizer"].vocabulary_) # Lots of features!
26096
Choosing tree depth via GridSearchCV¶
We arbitrarily chose max_depth=3 before, but it seems like that isn't working well. Let's perform a grid search to find the max_depth with the best generalization performance.
# Note that we've used the key randomforestclassifier__max_depth,
# not max_depth because max_depth is a hyperparameter of randomforestclassifier,
# not of pl.
hyperparameters = {"randomforestclassifier__max_depth": np.arange(2, 200, 20)}
Note that while pl has already been fit, we can still give it to GridSearchCV, which will repeatedly re-fit it during cross-validation.
%%time
# Takes a few seconds to run – how many trees are being trained?
from sklearn.model_selection import GridSearchCV
grids = GridSearchCV(
pl,
n_jobs=-1, # Use multiple processors to parallelize
param_grid=hyperparameters,
return_train_score=True,
)
grids.fit(X_train, y_train)
CPU times: user 1.04 s, sys: 247 ms, total: 1.29 s Wall time: 6.15 s
GridSearchCV(estimator=Pipeline(steps=[('countvectorizer', CountVectorizer()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(max_depth=3,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': array([  2,  22,  42,  62,  82, 102, 122, 142, 162, 182])},
             return_train_score=True)
grids.best_params_
{'randomforestclassifier__max_depth': np.int64(42)}
Recall, fit GridSearchCV objects are estimators on their own as well. This means we can compute the training and testing accuracies of the "best" random forest directly:
# Training accuracy.
grids.score(X_train, y_train)
0.9919191919191919
# Testing accuracy.
grids.score(X_test, y_test)
0.8674698795180723
~12% better test set accuracy!
Training and validation accuracy vs. depth¶
Below, we plot how training and validation accuracy varied with tree depth. Note that the $y$-axis here is accuracy, and that larger accuracies are better (unlike with RMSE, where smaller was better).
index = grids.param_grid["randomforestclassifier__max_depth"]
train = grids.cv_results_["mean_train_score"]
valid = grids.cv_results_["mean_test_score"]
pd.DataFrame(
{"train": train, "valid": valid}, index=index
).plot().update_layout(xaxis_title="max_depth", yaxis_title="Accuracy")
Question 🤔
Suppose we write the following code:
hyperparameters = {
'n_estimators': [10, 100, 1000], # number of trees per forest
'max_depth': [None, 100, 10] # max depth of each tree
}
grids = GridSearchCV(
RandomForestClassifier(), param_grid=hyperparameters,
cv=3, # 3-fold cross-validation
)
grids.fit(X_train, y_train)
Answer the following questions with a single number.
- How many random forests are fit in total?
- How many decision trees are fit in total?
- How many times in total is the first point in X_train used to train a decision tree?
Classifier Evaluation¶
Accuracy isn't everything!¶
Accuracy is defined as the proportion of predictions that are correct.

$$\text{accuracy} = \frac{\text{\# data points classified correctly}}{\text{\# data points}}$$
It weighs all correct predictions the same, and weighs all incorrect predictions the same.
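(A toy illustration with made-up labels: sklearn's accuracy_score computes exactly this fraction.)
from sklearn.metrics import accuracy_score
accuracy_score([1, 0, 1, 1], [1, 0, 0, 1])  # 3 out of 4 predictions correct -> 0.75.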
But some incorrect predictions may be worse than others!
- Example: Suppose you take a COVID test 🦠. Which is worse:
- The test saying you have COVID, when you really don't, or
- The test saying you don't have COVID, when you really do?
The parable of the boy who cried wolf 🐺¶
A shepherd boy gets bored tending the town's flock. To have some fun, he cries out, "Wolf!" even though there is no wolf in sight. The villagers run to protect the flock, but get angry when they realize the boy was playing a joke on them.
Repeat the previous paragraph many, many times.
One night, the shepherd boy sees a real wolf approaching the flock and calls out, "Wolf!" The villagers refuse to be fooled again and stay in their houses. The hungry wolf turns the flock into lamb chops. The town goes hungry. Panic ensues.
The wolf classifier¶
- Predictor: Shepherd boy.
- Positive prediction: "There is a wolf."
- Negative prediction: "There is no wolf."
Some questions to think about:
- What is an example of an incorrect, positive prediction?
- Was there a correct, negative prediction?
- There are four possibilities. What are the consequences of each?
- (predict yes, predict no) x (actually yes, actually no).
The wolf classifier¶
Below, we present a confusion matrix, which summarizes the four possible outcomes of the wolf classifier.
Outcomes in binary classification¶
When performing binary classification, there are four possible outcomes.
(Note: A "positive prediction" is a prediction of 1, and a "negative prediction" is a prediction of 0.)
| Outcome of Prediction | Definition | True Class |
|---|---|---|
| True positive (TP) ✅ | The predictor correctly predicts the positive class. | P |
| False negative (FN) ❌ | The predictor incorrectly predicts the negative class. | P |
| True negative (TN) ✅ | The predictor correctly predicts the negative class. | N |
| False positive (FP) ❌ | The predictor incorrectly predicts the positive class. | N |
| | Predicted Negative | Predicted Positive |
|---|---|---|
| Actually Negative | TN ✅ | FP ❌ |
| Actually Positive | FN ❌ | TP ✅ |
This is also how sklearn's confusion matrices are oriented (though it differs from the wolf example). Note that in the four acronyms – TP, FN, TN, FP – the first letter indicates whether the prediction is correct, and the second letter indicates what the prediction is.
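(A toy illustration with made-up labels: sklearn's confusion_matrix puts actual classes on the rows and predicted classes on the columns, with the negative class (0) first – the same layout as the table above.)
from sklearn.metrics import confusion_matrix
y_actual = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]
confusion_matrix(y_actual, y_pred)
# array([[1, 1],    <- [TN, FP]
#        [1, 2]])   <- [FN, TP]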
Example: COVID testing 🦠¶
UCSD Health administers hundreds of COVID tests a day. The tests are not fully accurate.
Each test comes back either
- positive, indicating that the individual has COVID, or
- negative, indicating that the individual does not have COVID.
Question: What is a TP in this scenario? FP? TN? FN?
TP: The test predicted that the individual has COVID, and they do ✅.
FP: The test predicted that the individual has COVID, but they don't ❌.
TN: The test predicted that the individual doesn't have COVID, and they don't ✅.
FN: The test predicted that the individual doesn't have COVID, but they do ❌.
Accuracy of COVID tests¶
The results of 100 UCSD Health COVID tests are given below.
| | Predicted Negative | Predicted Positive |
|---|---|---|
| Actually Negative | TN = 90 ✅ | FP = 1 ❌ |
| Actually Positive | FN = 8 ❌ | TP = 1 ✅ |
🤔 Question: What is the accuracy of the test?
🙋 Answer: $$\text{accuracy} = \frac{TP + TN}{TP + FP + FN + TN} = \frac{1 + 90}{100} = 0.91$$
Followup: At first, the test seems good. But, suppose we build a classifier that predicts that nobody has COVID. What would its accuracy be?
Answer to followup: Also 0.91! There is severe class imbalance in the dataset, meaning that most of the data points are in the same class (no COVID). Accuracy doesn't tell the full story.
Recall¶
| | Predicted Negative | Predicted Positive |
|---|---|---|
| Actually Negative | TN = 90 ✅ | FP = 1 ❌ |
| Actually Positive | FN = 8 ❌ | TP = 1 ✅ |
🤔 Question: What proportion of individuals who actually have COVID did the test identify?
🙋 Answer: $\frac{1}{1 + 8} = \frac{1}{9} \approx 0.11$
More generally, the recall of a binary classifier is the proportion of actually positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.
$$\text{recall} = \frac{TP}{\text{\# actually positive}} = \frac{TP}{TP + FN}$$

To compute recall, look at the bottom (positive) row of the above confusion matrix.
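(Computing it directly from the table's entries:)
TP, FN = 1, 8
TP / (TP + FN)  # Recall ≈ 0.11.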
Recall isn't everything, either!¶
$$\text{recall} = \frac{TP}{TP + FN}$$

🤔 Question: Can you design a "COVID test" with perfect recall?
🙋 Answer: Yes – just predict that everyone has COVID!
| | Predicted Negative | Predicted Positive |
|---|---|---|
| Actually Negative | TN = 0 ✅ | FP = 91 ❌ |
| Actually Positive | FN = 0 ❌ | TP = 9 ✅ |
Like accuracy, recall on its own is not a perfect metric. Even though the classifier we just created has perfect recall, it has 91 false positives!
Precision¶
| | Predicted Negative | Predicted Positive |
|---|---|---|
| Actually Negative | TN = 0 ✅ | FP = 91 ❌ |
| Actually Positive | FN = 0 ❌ | TP = 9 ✅ |
The precision of a binary classifier is the proportion of predicted positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.
$$\text{precision} = \frac{TP}{\text{\# predicted positive}} = \frac{TP}{TP + FP}$$

To compute precision, look at the right (positive) column of the above confusion matrix.
Tip: A good way to remember the difference between precision and recall is that in the denominator for 🅿️recision, both terms have 🅿️ in them (TP and FP).
Note that the "everyone-has-COVID" classifier has perfect recall, but a precision of $\frac{9}{9 + 91} = 0.09$, which is quite low.
🚨 Key idea: There is a "tradeoff" between precision and recall. Ideally, you want both to be high. For a particular prediction task, one may be more important than the other.
Precision and recall¶
$$\text{precision} = \frac{TP}{TP + FP} \qquad \text{recall} = \frac{TP}{TP + FN}$$

Question 🤔
🤔 When might high precision be more important than high recall?
🤔 When might high recall be more important than high precision?
Question 🤔
Consider the confusion matrix shown below.
| | Predicted Negative | Predicted Positive |
|---|---|---|
| Actually Negative | TN = 22 ✅ | FP = 2 ❌ |
| Actually Positive | FN = 23 ❌ | TP = 18 ✅ |
What is the accuracy of the above classifier? The precision? The recall?
