import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(7, 5))
plt.rc('font', size=12)
import warnings
warnings.simplefilter('ignore')
When we one-hot encode categorical features, we create several redundant columns.
tips = sns.load_dataset('tips')
tips_features = tips.drop('tip', axis=1)
tips_features.head()
| | total_bill | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
0 | 16.99 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | Female | No | Sun | Dinner | 4 |
Aside: You can use pd.get_dummies in EDA, but don't use it for modeling (instead, use OneHotEncoder, which works with Pipelines).
X = pd.get_dummies(tips_features)
X.head()
| | total_bill | size | sex_Male | sex_Female | smoker_Yes | smoker_No | day_Thur | day_Fri | day_Sat | day_Sun | time_Lunch | time_Dinner |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16.99 | 2 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
1 | 10.34 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
2 | 21.01 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
3 | 23.68 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
4 | 24.59 | 4 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
Remember that under the hood, LinearRegression() creates a design matrix that has a column of all ones (for the intercept term). Let's add that column above for demonstration.
X['all_ones'] = 1
X.head()
| | total_bill | size | sex_Male | sex_Female | smoker_Yes | smoker_No | day_Thur | day_Fri | day_Sat | day_Sun | time_Lunch | time_Dinner | all_ones |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16.99 | 2 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
1 | 10.34 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
2 | 21.01 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
3 | 23.68 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
4 | 24.59 | 4 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
Now, many of the above columns can be written as linear combinations of other columns!
- 'sex_Male' – its value is just 'all_ones' - 'sex_Female'.
- 'smoker_Yes' – its value is just 'all_ones' - 'smoker_No'.
- 'time_Lunch' – its value is just 'all_ones' - 'time_Dinner'.
- 'day_Thur' – its value is just 'all_ones' - ('day_Fri' + 'day_Sat' + 'day_Sun').

Note that if we get rid of the four redundant columns above, the rank of our design matrix – that is, the number of linearly independent columns it has – does not change (and so the "predictive power" of our features doesn't change either).
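For instance, here's a quick sanity check of the first dependency above, using the X defined in this section (it should evaluate to True):
# 'sex_Male' should equal 'all_ones' minus 'sex_Female' in every row.
(X['sex_Male'] == X['all_ones'] - X['sex_Female']).all()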
np.linalg.matrix_rank(X)
9
np.linalg.matrix_rank(X.drop(columns=['sex_Male', 'smoker_Yes', 'time_Lunch', 'day_Thur']))
9
However, without the redundant columns, there is a unique set of optimal parameters $w^*$, and the multicollinearity is no more.
Aside: Most one-hot encoding techniques (including OneHotEncoder) have a built-in drop argument, which allows you to specify that you'd like to drop one column per categorical feature.
pd.get_dummies(tips_features, drop_first=True)
| | total_bill | size | sex_Female | smoker_No | day_Fri | day_Sat | day_Sun | time_Dinner |
---|---|---|---|---|---|---|---|---|
0 | 16.99 | 2 | 1 | 1 | 0 | 0 | 1 | 1 |
1 | 10.34 | 3 | 0 | 1 | 0 | 0 | 1 | 1 |
2 | 21.01 | 3 | 0 | 1 | 0 | 0 | 1 | 1 |
3 | 23.68 | 2 | 0 | 1 | 0 | 0 | 1 | 1 |
4 | 24.59 | 4 | 1 | 1 | 0 | 0 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 3 | 0 | 1 | 0 | 1 | 0 | 1 |
240 | 27.18 | 2 | 1 | 0 | 0 | 1 | 0 | 1 |
241 | 22.67 | 2 | 0 | 0 | 0 | 1 | 0 | 1 |
242 | 17.82 | 2 | 0 | 1 | 0 | 1 | 0 | 1 |
243 | 18.78 | 2 | 1 | 1 | 0 | 0 | 0 | 1 |
244 rows × 8 columns
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first')
ohe.fit_transform(tips_features[['sex', 'smoker', 'day', 'time']]).toarray()
array([[0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       ...,
       [1., 1., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]])
ohe.get_feature_names()
array(['x0_Male', 'x1_Yes', 'x2_Sat', 'x2_Sun', 'x2_Thur', 'x3_Lunch'], dtype=object)
The above array only has $(2-1) + (2-1) + (4-1) + (2-1) = 6$ columns, rather than $2 + 2 + 4 + 2 = 10$, since we dropped 1 per categorical column in tips_features.
reviews = pd.read_json(open('data/reviews.json'), lines=True)
reviews.head()
| | reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime |
---|---|---|---|---|---|---|---|---|---|
0 | A1JZFGZEZVWQPY | B00002N674 | Carter H "1amazonreviewer@gmail . com" | [4, 4] | Good USA company that stands behind their prod... | 4 | Great Hoses | 1308614400 | 06 21, 2011 |
1 | A32JCI4AK2JTTG | B00002N674 | Darryl Bennett "Fuzzy342" | [0, 0] | This is a high quality 8 ply hose. I have had ... | 5 | Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch ... | 1402272000 | 06 9, 2014 |
2 | A3N0P5AAMP6XD2 | B00002N674 | H B | [2, 3] | It's probably one of the best hoses I've ever ... | 4 | Very satisfied! | 1336176000 | 05 5, 2012 |
3 | A2QK7UNJ857YG | B00002N674 | Jason | [0, 0] | I probably should have bought something a bit ... | 5 | Very high quality | 1373846400 | 07 15, 2013 |
4 | AS0CYBAN6EM06 | B00002N674 | jimmy | [1, 1] | I bought three of these 5/8-inch Flexogen hose... | 5 | Good Hoses | 1375660800 | 08 5, 2013 |
Goal: Use a review's 'summary' to predict its 'overall' rating.
Note that there are five possible 'overall' rating values – 1, 2, 3, 4, 5 – not just two. As such, this is an instance of multiclass classification.
reviews['overall'].value_counts(normalize=True)
5    0.530214
4    0.254973
3    0.125000
2    0.050708
1    0.039105
Name: overall, dtype: float64
Question: What is the worst possible accuracy we should expect from a ratings classifier, given the above distribution?
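One way to think about it (a quick sketch using the reviews DataFrame loaded above): a constant classifier that always predicts the most common rating is right about 53% of the time, so any trained classifier should do at least that well.
# Accuracy of always predicting the most common rating (5 stars).
(reviews['overall'] == reviews['overall'].value_counts().idxmax()).mean()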
CountVectorizer

Entries in the 'summary' column are not currently quantitative! We can use the bag-of-words encoding to create quantitative features out of each 'summary'. Instead of performing a bag-of-words encoding manually as we did before, we can rely on sklearn's CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
example_corp = ['hey hey hey my name is billy',
'hey billy how is your dog billy']
count_vec = CountVectorizer()
count_vec.fit(example_corp)
CountVectorizer()
count_vec learned a vocabulary from the corpus we fit it on.
count_vec.vocabulary_
{'hey': 2, 'my': 5, 'name': 6, 'is': 4, 'billy': 0, 'how': 3, 'your': 7, 'dog': 1}
count_vec.transform(example_corp).toarray()
array([[1, 0, 3, 0, 1, 1, 1, 0], [2, 1, 1, 1, 1, 0, 0, 1]])
Note that the values in count_vec.vocabulary_ correspond to the positions of the columns in count_vec.transform(example_corp).toarray(), i.e. 'billy' is the first column and 'your' is the last column.
example_corp
['hey hey hey my name is billy', 'hey billy how is your dog billy']
pd.DataFrame(count_vec.transform(example_corp).toarray(),
columns=pd.Series(count_vec.vocabulary_).sort_values().index)
| | billy | dog | hey | how | is | my | name | your |
---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0 | 1 | 1 | 1 | 0 |
1 | 2 | 1 | 1 | 1 | 1 | 0 | 0 | 1 |
Pipeline

Let's build a Pipeline that takes in summaries and overall ratings and:

- Uses a CountVectorizer to quantitatively encode summaries.
- Fits a RandomForestClassifier to the data.

But first, a train-test split (like always).
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
X = reviews['summary']
y = reviews['overall']
X_train, X_test, y_train, y_test = train_test_split(X, y)
pl = Pipeline([
('cv', CountVectorizer()),
('clf', RandomForestClassifier(max_depth=8, n_estimators=7)) # Uses 7 separate decision trees
])
pl.fit(X_train, y_train)
Pipeline(steps=[('cv', CountVectorizer()), ('clf', RandomForestClassifier(max_depth=8, n_estimators=7))])
# Training accuracy
pl.score(X_train, y_train)
0.5306409483624673
# Testing accuracy
pl.score(X_test, y_test)
0.5343580470162749
The accuracy of our random forest is just above 50%, on both the training and testing sets. This doesn't seem much better than just predicting "5 stars" every time!
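For a concrete comparison, here's what the constant "always predict 5 stars" strategy would score on the same test set (a quick check using the y_test defined above):
# Accuracy on the test set of always predicting 5 stars.
(y_test == 5).mean()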
len(pl.named_steps['cv'].vocabulary_) # Many features, but we are not asking many questions!
5275
GridSearchCV

We arbitrarily chose max_depth=8 before, but it seems like that isn't working well. Let's perform a grid search to find the max_depth with the best generalization performance.
pl.named_steps
{'cv': CountVectorizer(), 'clf': RandomForestClassifier(max_depth=8, n_estimators=7)}
# Note that we've used the key clf__max_depth, not max_depth
# because max_depth is a hyperparameter of clf, not of pl
hyperparameters = {
'clf__max_depth': np.arange(2, 500, 20)
}
Note that while pl has already been fit, we can still give it to GridSearchCV, which will repeatedly re-fit it during cross-validation.
from sklearn.model_selection import GridSearchCV
# Takes 10+ seconds to run – how many trees are being trained?
grids = GridSearchCV(pl, param_grid=hyperparameters, return_train_score=True)
grids.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()), ('clf', RandomForestClassifier(max_depth=8, n_estimators=7))]), param_grid={'clf__max_depth': array([ 2, 22, 42, 62, 82, 102, 122, 142, 162, 182, 202, 222, 242, 262, 282, 302, 322, 342, 362, 382, 402, 422, 442, 462, 482])}, return_train_score=True)
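To answer the question in the comment above: the grid contains 25 candidate values of max_depth, and (assuming GridSearchCV's default 5-fold cross-validation) each candidate is fit 5 times, with each fit training 7 trees. That's roughly
$$25 \times 5 \times 7 = 875$$
trees, plus 7 more when the best pipeline is re-fit on the full training set.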
grids.best_params_
{'clf__max_depth': 162}
Recall that fit GridSearchCV objects are estimators on their own as well. This means we can compute the training and testing accuracies of the "best" random forest directly:
# Training accuracy
grids.score(X_train, y_train)
0.8393610608800482
# Testing accuracy
grids.score(X_test, y_test)
0.5732368896925859
Still not much better on the testing set! 🤷
Below, we plot how training and validation accuracy varied with tree depth. Note that the $y$-axis here is accuracy, and that larger accuracies are better (unlike with RMSE, where smaller was better).
index = grids.param_grid['clf__max_depth']
train = grids.cv_results_['mean_train_score']
valid = grids.cv_results_['mean_test_score']
pd.DataFrame({'train': train, 'valid': valid}, index=index).plot()
plt.xlabel('Depth')
plt.ylabel('Accuracy');
Unsurprisingly, training accuracy kept increasing, while validation accuracy leveled off around a depth of ~100.
Repeat the previous paragraph many, many times.
One night, the shepherd boy sees a real wolf approaching the flock and calls out, "Wolf!" The villagers refuse to be fooled again and stay in their houses. The hungry wolf turns the flock into lamb chops. The town goes hungry. Panic ensues.
Some questions to think about:
Below, we present a confusion matrix, which summarizes the four possible outcomes of the wolf classifier.
When performing binary classification, there are four possible outcomes.
(Note: A "positive prediction" is a prediction of 1, and a "negative prediction" is a prediction of 0.)
Outcome of Prediction | Definition | True Class |
---|---|---|
True positive (TP) ✅ | The predictor correctly predicts the positive class. | P |
False negative (FN) ❌ | The predictor incorrectly predicts the negative class. | P |
True negative (TN) ✅ | The predictor correctly predicts the negative class. | N |
False positive (FP) ❌ | The predictor incorrectly predicts the positive class. | N |
| | Predicted Negative | Predicted Positive |
---|---|---|
Actually Negative | TN ✅ | FP ❌ |
Actually Positive | FN ❌ | TP ✅ |
The above matrix is organized the same way that sklearn's confusion matrices are (but differently than in the wolf example). Note that in the four acronyms – TP, FN, TN, FP – the first letter is whether the prediction is correct, and the second letter is what the prediction is.
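Aside: A minimal sketch of that orientation using sklearn.metrics.confusion_matrix, with a small made-up set of binary labels (not the review data from earlier):
from sklearn.metrics import confusion_matrix

# Rows correspond to actual classes and columns to predicted classes,
# so for 0/1 labels the layout is [[TN, FP], [FN, TP]].
y_actual = [0, 0, 0, 1, 1, 1]
y_pred = [0, 1, 0, 1, 1, 0]
confusion_matrix(y_actual, y_pred)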
The results of 100 UCSD Health COVID tests are given below.
| | Predicted Negative | Predicted Positive |
---|---|---|
Actually Negative | TN = 90 ✅ | FP = 1 ❌ |
Actually Positive | FN = 8 ❌ | TP = 1 ✅ |
🤔 Question: What is the accuracy of the test?
🙋 Answer: $$\text{accuracy} = \frac{TP + TN}{TP + FP + FN + TN} = \frac{1 + 90}{100} = 0.91$$
| | Predicted Negative | Predicted Positive |
---|---|---|
Actually Negative | TN = 90 ✅ | FP = 1 ❌ |
Actually Positive | FN = 8 ❌ | TP = 1 ✅ |
🤔 Question: What proportion of individuals who actually have COVID did the test identify?
🙋 Answer: $\frac{1}{1 + 8} = \frac{1}{9} \approx 0.11$
More generally, the recall of a binary classifier is the proportion of actually positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.
$$\text{recall} = \frac{TP}{TP + FN}$$

To compute recall, look at the bottom (positive) row of the above confusion matrix.
🤔 Question: Can you design a "COVID test" with perfect recall?
🙋 Answer: Yes – just predict that everyone has COVID!
| | Predicted Negative | Predicted Positive |
---|---|---|
Actually Negative | TN = 0 ✅ | FP = 91 ❌ |
Actually Positive | FN = 0 ❌ | TP = 9 ✅ |
Like accuracy, recall on its own is not a perfect metric. Even though the classifier we just created has perfect recall, it has 91 false positives!
| | Predicted Negative | Predicted Positive |
---|---|---|
Actually Negative | TN = 0 ✅ | FP = 91 ❌ |
Actually Positive | FN = 0 ❌ | TP = 9 ✅ |
The precision of a binary classifier is the proportion of predicted positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.
$$\text{precision} = \frac{TP}{TP + FP}$$

To compute precision, look at the right (positive) column of the above confusion matrix.
🤔 Question: When might high precision be more important than high recall?
🙋 Answer: For instance, in deciding whether or not someone committed a crime. Here, false positives are really bad – they mean that an innocent person is charged!
🤔 Question: When might high recall be more important than high precision?
🙋 Answer: For instance, in medical tests. Here, false negatives are really bad – they mean that someone's disease goes undetected!
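As a sketch of how these metrics could be computed with sklearn, here's the UCSD Health COVID example from above reconstructed from its counts (TN = 90, FP = 1, FN = 8, TP = 1); the label arrays below are made up to match those counts:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

# 91 actually-negative people (90 predicted negative, 1 predicted positive),
# then 9 actually-positive people (8 predicted negative, 1 predicted positive).
y_actual = np.array([0] * 91 + [1] * 9)
y_pred = np.array([0] * 90 + [1] + [0] * 8 + [1])

# accuracy = 0.91, precision = 1/2 = 0.5, recall = 1/9 ≈ 0.11
accuracy_score(y_actual, y_pred), precision_score(y_actual, y_pred), recall_score(y_actual, y_pred)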
Consider the confusion matrix shown below.
| | Predicted Negative | Predicted Positive |
---|---|---|
Actually Negative | TN = 22 ✅ | FP = 2 ❌ |
Actually Positive | FN = 23 ❌ | TP = 18 ✅ |
What is the accuracy of the above classifier? The precision? The recall?
After calculating all three on your own, check your answers against the values worked out below.
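For reference, here are the three values computed from the table (TP = 18, TN = 22, FP = 2, FN = 23):

$$\text{accuracy} = \frac{18 + 22}{65} \approx 0.615 \qquad \text{precision} = \frac{18}{18 + 2} = 0.9 \qquad \text{recall} = \frac{18}{18 + 23} \approx 0.439$$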
The CountVectorizer transformer can be used to perform the bag-of-words encoding.